### Count vectorizer search
Search system based on simple word counts in query and targets

In [1]:
import pandas as pd
import numpy as np
import os
import pickle

from scipy.sparse import save_npz, load_npz
from sklearn.feature_extraction.text import CountVectorizer

import preprocessing
import tfidf_search

In [2]:
def count_vectorize(documents, pickle_path=None, save_files_prefix=""):
    """Input:
    documents: Series or List of strings to vectorize
    pickle_path: path of directory to save vectorizer and term-document matrix, e.g. 'data/processed/'
    save_files_prefix: prefix for saved files. For example, passing "01" will produce files '01_vectorizer.pkl' and '01_tdm.npz'
    
    Output: Fit vectorizer and term-document matrix"""
    
    vectorizer = CountVectorizer()
    vectorizer.fit(documents)
    tdm = vectorizer.transform(documents)
    
    if pickle_path is not None: # save vectorizer and term-document matrix
        
        # if files by that name already exist, prompt user to choose another prefix. Repeats if new input still exists
        while os.path.exists(pickle_path + save_files_prefix + "_vectorizer.pkl"):
            save_files_prefix = input("Files by that name already exist. Enter another prefix...")
        
        vec_path = pickle_path + save_files_prefix + "_vectorizer.pkl"
        
        with open(vec_path, 'wb') as file: # pickle vectorizer
            pickle.dump(vectorizer, file)
        print('Vectorizer pickled at ', vec_path)
        
        tdm_path = pickle_path + save_files_prefix + "_tdm.npz"
        save_npz(tdm_path, tdm) # save term-document matrix
        print('Term-document matrix saved at ', tdm_path)

    return vectorizer, tdm

In [3]:
path = 'data/processed/04_jan21_full_table.csv'
df = pd.read_csv(path, sep='\t', index_col=0, low_memory=False)

In [4]:
len(df)

72326

### Vectorize search texts

In [5]:
documents = df.search_text.tolist()
path = 'data/processed/'
vectorizer, tdm = count_vectorize(documents, pickle_path=path, save_files_prefix="cv_jan21")

Files by that name already exist. Enter another prefix... cv_jan21


Vectorizer pickled at  data/processed/cv_jan21_vectorizer.pkl
Term-document matrix saved at  data/processed/cv_jan21_tdm.npz


### Perform searches

In [6]:
# load vectorizer and tdm
path = 'data/processed/cv_jan21_vectorizer.pkl'
vectorizer = tfidf_search.load_vectorizer(path)
tdm = load_npz('data/processed/cv_jan21_tdm.npz')
questions = pd.read_csv('data/processed/questions_expert.csv', sep='\t').question.tolist()
index = df.cord_uid.tolist()

In [None]:
directory='results/cv_jan21/'

for i in range(len(questions)):
    query = questions[i]
    uids = tfidf_search.tfidf_search(query, vectorizer, tdm, index)
    tfidf_search.write_details(query, uids, df,
                               record_file_prefix=f'cv_{i}', 
                               directory='results/cv_jan21/')

  0%|          | 79/72326 [00:00<01:31, 788.55it/s]

Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [00:59<00:00, 1222.18it/s]
  0%|          | 121/72326 [00:00<00:59, 1205.09it/s]

Complete
Returned top 5 results.
Search results saved to results/cv_jan21/cv_0_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [01:07<00:00, 1068.05it/s]
  0%|          | 118/72326 [00:00<01:01, 1176.87it/s]

Complete
Returned top 5 results.
Search results saved to results/cv_jan21/cv_1_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [01:06<00:00, 1082.02it/s]
  0%|          | 107/72326 [00:00<01:07, 1065.32it/s]

Complete
Returned top 5 results.
Search results saved to results/cv_jan21/cv_2_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [01:03<00:00, 1146.40it/s]
  0%|          | 117/72326 [00:00<01:01, 1166.96it/s]

Complete
Returned top 5 results.
Search results saved to results/cv_jan21/cv_3_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [01:04<00:00, 1127.76it/s]
  0%|          | 113/72326 [00:00<01:04, 1127.54it/s]

Complete
Returned top 5 results.
Search results saved to results/cv_jan21/cv_4_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [01:05<00:00, 1110.22it/s]
  0%|          | 98/72326 [00:00<01:13, 979.41it/s]

Complete
Returned top 5 results.
Search results saved to results/cv_jan21/cv_5_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [01:02<00:00, 1148.41it/s]
  0%|          | 60/72326 [00:00<02:00, 599.76it/s]

Complete
Returned top 5 results.
Search results saved to results/cv_jan21/cv_6_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [01:09<00:00, 1043.73it/s]
  0%|          | 100/72326 [00:00<01:12, 996.73it/s]

Complete
Returned top 5 results.
Search results saved to results/cv_jan21/cv_7_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [01:06<00:00, 1086.89it/s]
  0%|          | 103/72326 [00:00<01:10, 1024.58it/s]

Complete
Returned top 5 results.
Search results saved to results/cv_jan21/cv_8_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [01:02<00:00, 1155.58it/s]
  0%|          | 87/72326 [00:00<01:24, 855.08it/s]

Complete
Returned top 5 results.
Search results saved to results/cv_jan21/cv_9_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [01:09<00:00, 1040.11it/s]
  0%|          | 88/72326 [00:00<01:22, 873.36it/s]

Complete
Returned top 5 results.
Search results saved to results/cv_jan21/cv_10_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [01:02<00:00, 1148.82it/s]
  0%|          | 117/72326 [00:00<01:01, 1169.69it/s]

Complete
Returned top 5 results.
Search results saved to results/cv_jan21/cv_11_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [01:03<00:00, 1132.34it/s]
  0%|          | 107/72326 [00:00<01:07, 1068.11it/s]

Complete
Returned top 5 results.
Search results saved to results/cv_jan21/cv_12_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [01:03<00:00, 1143.32it/s]
  0%|          | 123/72326 [00:00<00:58, 1229.06it/s]

Complete
Returned top 5 results.
Search results saved to results/cv_jan21/cv_13_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [01:00<00:00, 1193.01it/s]
  0%|          | 123/72326 [00:00<00:59, 1220.95it/s]

Complete
Returned top 5 results.
Search results saved to results/cv_jan21/cv_14_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [01:00<00:00, 1186.87it/s]
  0%|          | 120/72326 [00:00<01:00, 1189.90it/s]

Complete
Returned top 5 results.
Search results saved to results/cv_jan21/cv_15_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [01:00<00:00, 1187.49it/s]
  0%|          | 120/72326 [00:00<01:00, 1198.34it/s]

Complete
Returned top 5 results.
Search results saved to results/cv_jan21/cv_16_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [01:01<00:00, 1171.85it/s]
  0%|          | 119/72326 [00:00<01:01, 1183.26it/s]

Complete
Returned top 5 results.
Search results saved to results/cv_jan21/cv_17_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [01:09<00:00, 1043.91it/s]
  0%|          | 87/72326 [00:00<01:23, 869.19it/s]

Complete
Returned top 5 results.
Search results saved to results/cv_jan21/cv_18_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [01:05<00:00, 1100.77it/s]
  0%|          | 117/72326 [00:00<01:02, 1162.76it/s]

Complete
Returned top 5 results.
Search results saved to results/cv_jan21/cv_19_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [01:05<00:00, 1108.07it/s]
  0%|          | 117/72326 [00:00<01:01, 1168.22it/s]

Complete
Returned top 5 results.
Search results saved to results/cv_jan21/cv_20_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [01:04<00:00, 1129.62it/s]
  0%|          | 113/72326 [00:00<01:04, 1117.01it/s]

Complete
Returned top 5 results.
Search results saved to results/cv_jan21/cv_21_search_record.txt
Vectorized search query
Computing document similarity...


 31%|███▏      | 22605/72326 [00:19<00:41, 1202.31it/s]