## Stage 0: Count vectorizer search
Search based on simple word counts in the query and the target texts.

In [None]:
import pandas as pd
import numpy as np
import os
import pickle

from scipy.sparse import save_npz, load_npz
from sklearn.feature_extraction.text import CountVectorizer

import preprocessing
import my_tfidf

In [None]:
def count_vectorize(documents, pickle_path=None, save_files_prefix=""):
    """Fit a CountVectorizer on ``documents`` and build its term-document matrix.

    Input:
    documents: Series or List of strings to vectorize
    pickle_path: path of directory to save vectorizer and term-document matrix,
        e.g. 'data/processed/'. A trailing path separator is optional and the
        directory is created if it does not exist. If None (the default),
        nothing is written to disk.
    save_files_prefix: prefix for saved files. For example, passing "01" will
        produce files '01_vectorizer.pkl' and '01_tdm.npz'

    Output: Fit vectorizer and term-document matrix

    If either output file already exists, the user is prompted through
    ``input()`` for another prefix until both target names are free, so
    previously saved results are not overwritten.
    """

    vectorizer = CountVectorizer()
    # fit_transform learns the vocabulary and builds the matrix in a single
    # pass over the documents instead of tokenizing everything twice.
    tdm = vectorizer.fit_transform(documents)

    if pickle_path is None:  # nothing to persist
        return vectorizer, tdm

    def _output_paths(prefix):
        # os.path.join copes with a directory given with or without a
        # trailing separator, unlike plain string concatenation.
        return (
            os.path.join(pickle_path, prefix + "_vectorizer.pkl"),
            os.path.join(pickle_path, prefix + "_tdm.npz"),
        )

    vec_path, tdm_path = _output_paths(save_files_prefix)

    # if files by that name already exist, prompt user to choose another prefix.
    # Both outputs are checked so neither file can be silently overwritten.
    while os.path.exists(vec_path) or os.path.exists(tdm_path):
        save_files_prefix = input("Files by that name already exist. Enter another prefix...")
        vec_path, tdm_path = _output_paths(save_files_prefix)

    if pickle_path:  # an empty string means the current directory
        os.makedirs(pickle_path, exist_ok=True)

    with open(vec_path, 'wb') as file:  # pickle vectorizer
        pickle.dump(vectorizer, file)
    print('Vectorizer pickled at ', vec_path)

    save_npz(tdm_path, tdm)  # save term-document matrix
    print('Term-document matrix saved at ', tdm_path)

    return vectorizer, tdm

In [None]:
# Load the cleaned metadata table into `df`. The file is parsed as
# tab-separated (sep='\t') even though its name ends in .csv.gz.
path = 'data/processed/metadata_clean.csv.gz'
df = pd.read_csv(path, sep='\t', low_memory=False)

In [None]:
# Number of rows in the loaded metadata table (shown as the cell's output).
len(df)

### Vectorize search texts

In [None]:
# Vectorize the `search_text` column of every record and persist the fitted
# vectorizer and matrix under results/final_models/ with the "cv" file prefix.
documents = df["search_text"].tolist()
path = "results/final_models/"
vectorizer, tdm = count_vectorize(
    documents,
    pickle_path=path,
    save_files_prefix="cv",
)

### Perform searches

In [None]:
# Reload the persisted count vectorizer and its term-document matrix.
path = "results/final_models/cv_vectorizer.pkl"
vectorizer = my_tfidf.load_vectorizer(path)
tdm = load_npz("results/final_models/cv_tdm.npz")

# Queries come from the `question` column of the tab-separated questions file.
questions = pd.read_csv(
    "data/processed/questions_expert.csv",
    sep="\t",
)["question"].tolist()

# cord_uid values of the metadata rows, in row order.
index = df["cord_uid"].tolist()

In [None]:
# Directory passed to my_tfidf.write_details for the per-question records.
directory='results/final_models/cv_2021/'

# Run each question through the search helper. my_tfidf.tfidf_search is
# reused here with the count-based vectorizer and matrix loaded above.
# enumerate supplies the question number used in the record file prefix.
for i, query in enumerate(questions):
    uids = my_tfidf.tfidf_search(query, vectorizer, tdm, index, df)
    my_tfidf.write_details(query, uids, df,
                           record_file_prefix=f'cv_{i}',
                           directory=directory)