In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from scipy.sparse import save_npz, load_npz

from scipy.spatial.distance import cosine

import preprocessing
import tfidf_search

In [2]:
# Read the tab-separated full table from disk; column 0 is used as the index.
df = pd.read_csv(
    'data/processed/01_full_table.csv',
    sep='\t',
    index_col=0,
    low_memory=False,
)

In [3]:
# For now, simply keep the rows whose search_text is present (non-null).
df = df.loc[df['search_text'].notna()]

In [31]:
# select only papers published in 2021
# Parse the publication dates; values that cannot be parsed become NaT
# because of errors='coerce'.
df['publish_time'] = pd.to_datetime(df['publish_time'], errors='coerce')
# Use the vectorised .dt accessor instead of a per-row Python lambda.
# NaT rows have no year, compare False against 2021, and are therefore
# excluded -- the same rows are selected as with the element-wise version.
df_jan21 = df[df['publish_time'].dt.year == 2021]
# Last expression of the cell: display how many papers were selected.
len(df_jan21)

98473

In [35]:
# Build the vectorizer and the term-document matrix from the search_text of
# the selected papers via tfidf_search.tfidf_vectorize, passing the output
# directory and the file prefix to use when saving.
path = 'data/processed/'
documents = df_jan21['search_text']
vectorizer, tdm = tfidf_search.tfidf_vectorize(
    documents,
    pickle_path=path,
    save_files_prefix="03_jan21",
)

Files by that name already exist. Enter another prefix... 03_jan21


Vectorizer pickled at  data/processed/03_jan21_vectorizer.pkl
Term-document matrix saved at  data/processed/03_jan21_tdm.npz


In [43]:
# Read the expert questions (tab-separated, column 0 as index) and use the
# cord_uid values of the selected papers as the uid index for the search.
query_df = pd.read_csv(
    'data/processed/questions_expert.csv',
    index_col=0,
    sep='\t',
)
index = df_jan21.cord_uid.values

In [44]:
# Run the search for every expert question and write one record per question;
# the question's position in the array is used in the output file prefix.
queries = query_df['question'].values
for i, query in enumerate(queries):
    uids = tfidf_search.tfidf_search(query, vectorizer, tdm, index)
    tfidf_search.write_details(
        query, uids, df_jan21, f'03_jan21_q{i}', 'data/processed/'
    )

  0%|          | 122/98473 [00:00<01:20, 1217.59it/s]

Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:04<00:00, 1529.52it/s]
  0%|          | 156/98473 [00:00<01:03, 1556.62it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_q0_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:05<00:00, 1492.20it/s]
  0%|          | 139/98473 [00:00<01:10, 1389.20it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_q1_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:06<00:00, 1474.56it/s]
  0%|          | 148/98473 [00:00<01:06, 1473.28it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_q2_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:15<00:00, 1308.87it/s]
  0%|          | 88/98473 [00:00<01:53, 870.35it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_q3_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:01<00:00, 1597.38it/s]
  0%|          | 142/98473 [00:00<01:09, 1413.58it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_q4_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:01<00:00, 1589.56it/s]
  0%|          | 149/98473 [00:00<01:06, 1485.12it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_q5_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:01<00:00, 1605.82it/s]
  0%|          | 150/98473 [00:00<01:05, 1493.56it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_q6_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:01<00:00, 1611.39it/s]
  0%|          | 147/98473 [00:00<01:07, 1462.72it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_q7_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:01<00:00, 1611.36it/s]
  0%|          | 153/98473 [00:00<01:04, 1523.77it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_q8_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:00<00:00, 1615.09it/s]
  0%|          | 151/98473 [00:00<01:05, 1507.93it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_q9_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:01<00:00, 1605.86it/s]
  0%|          | 151/98473 [00:00<01:05, 1507.79it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_q10_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:06<00:00, 1477.85it/s]
  0%|          | 0/98473 [00:00<?, ?it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_q11_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:58<00:00, 831.38it/s]
  0%|          | 0/98473 [00:00<?, ?it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_q12_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:54<00:00, 858.89it/s]
  0%|          | 0/98473 [00:00<?, ?it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_q13_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:55<00:00, 851.90it/s] 
  0%|          | 0/98473 [00:00<?, ?it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_q14_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:59<00:00, 822.60it/s]
  0%|          | 0/98473 [00:00<?, ?it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_q15_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:55<00:00, 855.17it/s]
  0%|          | 0/98473 [00:00<?, ?it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_q16_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:54<00:00, 856.47it/s]
  0%|          | 0/98473 [00:00<?, ?it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_q17_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:54<00:00, 857.20it/s] 
  0%|          | 0/98473 [00:00<?, ?it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_q18_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:54<00:00, 859.42it/s]
  0%|          | 0/98473 [00:00<?, ?it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_q19_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:55<00:00, 855.67it/s] 
  0%|          | 0/98473 [00:00<?, ?it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_q20_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:55<00:00, 853.63it/s]
  0%|          | 0/98473 [00:00<?, ?it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_q21_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:55<00:00, 855.55it/s] 
  0%|          | 0/98473 [00:00<?, ?it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_q22_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [02:19<00:00, 706.51it/s] 
  0%|          | 0/98473 [00:00<?, ?it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_q23_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:57<00:00, 838.11it/s]
  0%|          | 0/98473 [00:00<?, ?it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_q24_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:55<00:00, 850.14it/s]
  0%|          | 0/98473 [00:00<?, ?it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_q25_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:55<00:00, 849.76it/s]
  0%|          | 0/98473 [00:00<?, ?it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_q26_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:56<00:00, 847.96it/s]
  0%|          | 0/98473 [00:00<?, ?it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_q27_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:55<00:00, 852.35it/s]
  0%|          | 0/98473 [00:00<?, ?it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_q28_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:55<00:00, 849.24it/s]


Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_q29_search_record.txt


## Consumer questions

In [46]:
# Read the consumer questions (tab-separated, column 0 as index), rebuild the
# uid index from the selected papers, and pull out the question strings.
query_consumers = pd.read_csv(
    'data/processed/questions_consumer.csv',
    index_col=0,
    sep='\t',
)
index = df_jan21.cord_uid.values
queries = query_consumers['question'].values

In [48]:
# Run the search for every consumer question and write one record per
# question; the question's position is used in the output file prefix.
for i, query in enumerate(queries):
    uids = tfidf_search.tfidf_search(query, vectorizer, tdm, index)
    tfidf_search.write_details(
        query, uids, df_jan21, f'03_jan21_consumer_q{i}', 'data/processed/'
    )

  0%|          | 80/98473 [00:00<02:04, 793.18it/s]

Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:11<00:00, 1381.01it/s]
  0%|          | 170/98473 [00:00<00:58, 1691.22it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q0_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:00<00:00, 1637.72it/s]
  0%|          | 134/98473 [00:00<01:13, 1337.26it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q1_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:00<00:00, 1630.62it/s]
  0%|          | 168/98473 [00:00<00:58, 1678.23it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q2_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:02<00:00, 1567.32it/s]
  0%|          | 183/98473 [00:00<00:53, 1827.05it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q3_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [00:55<00:00, 1758.54it/s]
  0%|          | 180/98473 [00:00<00:54, 1798.01it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q4_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:01<00:00, 1593.60it/s]
  0%|          | 170/98473 [00:00<00:57, 1698.29it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q5_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [00:56<00:00, 1754.21it/s]
  0%|          | 182/98473 [00:00<00:54, 1815.10it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q6_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [00:55<00:00, 1781.05it/s]
  0%|          | 187/98473 [00:00<00:52, 1855.36it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q7_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [00:57<00:00, 1720.26it/s]
  0%|          | 183/98473 [00:00<00:53, 1822.48it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q8_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [00:59<00:00, 1655.01it/s]
  0%|          | 154/98473 [00:00<01:03, 1537.64it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q9_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:48<00:00, 911.78it/s] 
  0%|          | 0/98473 [00:00<?, ?it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q10_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:09<00:00, 1423.10it/s]
  0%|          | 154/98473 [00:00<01:03, 1536.30it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q11_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:02<00:00, 1568.74it/s]
  0%|          | 144/98473 [00:00<01:08, 1436.00it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q12_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:04<00:00, 1516.39it/s]
  0%|          | 168/98473 [00:00<00:58, 1679.41it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q13_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [00:59<00:00, 1664.70it/s]
  0%|          | 157/98473 [00:00<01:02, 1562.75it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q14_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [00:59<00:00, 1658.09it/s]
  0%|          | 175/98473 [00:00<00:56, 1746.58it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q15_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:03<00:00, 1540.44it/s]
  0%|          | 181/98473 [00:00<00:54, 1807.95it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q16_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:15<00:00, 1304.61it/s]
  0%|          | 120/98473 [00:00<01:22, 1195.84it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q17_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:07<00:00, 1458.20it/s]
  0%|          | 152/98473 [00:00<01:04, 1518.10it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q18_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:06<00:00, 1491.27it/s]
  0%|          | 111/98473 [00:00<01:28, 1105.77it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q19_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:01<00:00, 1612.29it/s]
  0%|          | 159/98473 [00:00<01:01, 1585.84it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q20_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [00:59<00:00, 1655.57it/s]
  0%|          | 176/98473 [00:00<00:55, 1756.54it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q21_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:01<00:00, 1607.31it/s]
  0%|          | 161/98473 [00:00<01:01, 1606.84it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q22_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:03<00:00, 1543.96it/s]
  0%|          | 156/98473 [00:00<01:03, 1553.08it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q23_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:17<00:00, 1274.97it/s]
  0%|          | 0/98473 [00:00<?, ?it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q24_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:08<00:00, 1435.53it/s]
  0%|          | 120/98473 [00:00<01:22, 1186.95it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q25_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:06<00:00, 1485.11it/s]
  0%|          | 144/98473 [00:00<01:08, 1432.54it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q26_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:05<00:00, 1500.64it/s]
  0%|          | 151/98473 [00:00<01:05, 1502.58it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q27_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:03<00:00, 1553.27it/s]
  0%|          | 150/98473 [00:00<01:05, 1497.36it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q28_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:04<00:00, 1517.57it/s]


Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q29_search_record.txt
