In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from scipy.sparse import save_npz, load_npz

from scipy.spatial.distance import cosine

import preprocessing
import tfidf_search

In [2]:
# Read the tab-separated full table; the first column becomes the index.
df = pd.read_csv(
    'data/processed/02_full_table.csv',
    sep='\t',
    index_col=0,
    low_memory=False,
)

In [3]:
# For now, simply discard every row that has no search_text.
has_search_text = df['search_text'].notna()
df = df.loc[has_search_text]

In [4]:
# Keep only papers whose publish_time falls in calendar year 2021.
# Unparseable dates are coerced to NaT; their year never equals 2021, so
# those rows are dropped by the filter as well.
# NOTE(review): the frame is named df_jan21 although the filter covers all of
# 2021 — presumably the data snapshot ends in January 2021; confirm.
df['publish_time'] = pd.to_datetime(df['publish_time'], errors='coerce')
# Vectorized .dt accessor instead of calling a Python lambda once per row.
df_jan21 = df[df['publish_time'].dt.year == 2021]
len(df_jan21)

93736

In [5]:
def show_duplicated(df, col):
    """Return all rows of ``df`` whose ``col`` value occurs more than once.

    Every occurrence of a repeated value is included (not just the later
    ones), and the result is sorted by ``col`` so that matching rows sit
    next to each other.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    col : str
        Name of the column checked for repeated values.

    Returns
    -------
    pandas.DataFrame
        The rows with a repeated ``col`` value, ordered by ``col``.
    """
    repeated = df[col].duplicated(keep=False)
    return df.loc[repeated].sort_values(by=col)

In [6]:
# Drop repeated search_text entries, keeping the first occurrence of each;
# the row count is printed before and after the filter.
print(len(df_jan21))
mask = ~df_jan21['search_text'].duplicated(keep='first')
df_jan21 = df_jan21.loc[mask]
print(len(df_jan21))

93736
72326


### Vectorize

In [7]:
# Create the vectorizer and the term-document matrix from the search texts.
# The helper reports where it pickles the vectorizer and saves the matrix
# (under `path`, using the given file prefix).
path = 'data/processed/'
documents = df_jan21['search_text']
vectorizer, tdm = tfidf_search.tfidf_vectorize(
    documents,
    pickle_path=path,
    save_files_prefix="04_jan21",
)

Vectorizer pickled at  data/processed/04_jan21_vectorizer.pkl
Term-document matrix saved at  data/processed/04_jan21_tdm.npz


In [8]:
# Read the expert questions (tab-separated, first column as index) and take
# the cord_uid column of the corpus as the uid index passed to the search.
query_df = pd.read_csv(
    'data/processed/questions_expert.csv',
    sep='\t',
    index_col=0,
)
index = df_jan21.cord_uid.values

In [9]:
# Run every expert question through the TF-IDF search and write one
# search-record file per question, numbered by the question's position.
queries = query_df.question.values
for i, query in enumerate(queries):
    uids = tfidf_search.tfidf_search(query, vectorizer, tdm, index)
    tfidf_search.write_details(query, uids, df_jan21, f'04_jan21_q{i}', 'data/processed/')

  0%|          | 157/72326 [00:00<00:45, 1569.28it/s]

Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [00:45<00:00, 1577.43it/s]
  0%|          | 131/72326 [00:00<00:55, 1303.09it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/04_jan21_q0_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [00:48<00:00, 1505.99it/s]
  0%|          | 148/72326 [00:00<00:48, 1474.20it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/04_jan21_q1_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [00:48<00:00, 1503.50it/s]
  0%|          | 134/72326 [00:00<00:54, 1334.69it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/04_jan21_q2_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [00:48<00:00, 1505.65it/s]
  0%|          | 150/72326 [00:00<00:48, 1494.97it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/04_jan21_q3_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [00:48<00:00, 1503.51it/s]
  0%|          | 149/72326 [00:00<00:48, 1485.22it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/04_jan21_q4_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [00:48<00:00, 1506.17it/s]
  0%|          | 140/72326 [00:00<00:51, 1394.09it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/04_jan21_q5_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [00:48<00:00, 1476.85it/s]
  0%|          | 148/72326 [00:00<00:48, 1473.52it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/04_jan21_q6_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [00:48<00:00, 1488.71it/s]
  0%|          | 156/72326 [00:00<00:46, 1553.18it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/04_jan21_q7_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [00:46<00:00, 1552.11it/s]
  0%|          | 137/72326 [00:00<00:52, 1368.05it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/04_jan21_q8_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [00:46<00:00, 1557.23it/s]
  0%|          | 139/72326 [00:00<00:52, 1384.09it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/04_jan21_q9_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [00:46<00:00, 1547.89it/s]
  0%|          | 157/72326 [00:00<00:46, 1567.96it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/04_jan21_q10_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [00:46<00:00, 1551.99it/s]
  0%|          | 140/72326 [00:00<00:51, 1389.08it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/04_jan21_q11_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [00:47<00:00, 1530.01it/s]
  0%|          | 158/72326 [00:00<00:45, 1574.29it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/04_jan21_q12_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [00:45<00:00, 1606.49it/s]
  0%|          | 166/72326 [00:00<00:43, 1655.76it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/04_jan21_q13_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [00:43<00:00, 1664.93it/s]
  0%|          | 152/72326 [00:00<00:47, 1513.18it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/04_jan21_q14_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [00:43<00:00, 1663.89it/s]
  0%|          | 153/72326 [00:00<00:47, 1526.69it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/04_jan21_q15_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [00:43<00:00, 1666.95it/s]
  0%|          | 159/72326 [00:00<00:45, 1589.25it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/04_jan21_q16_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [00:43<00:00, 1662.05it/s]
  0%|          | 161/72326 [00:00<00:44, 1608.97it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/04_jan21_q17_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [00:43<00:00, 1668.39it/s]
  0%|          | 165/72326 [00:00<00:43, 1647.49it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/04_jan21_q18_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [00:43<00:00, 1671.87it/s]
  0%|          | 167/72326 [00:00<00:43, 1662.29it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/04_jan21_q19_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [00:45<00:00, 1598.30it/s]
  0%|          | 164/72326 [00:00<00:44, 1633.51it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/04_jan21_q20_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [00:43<00:00, 1655.33it/s]
  0%|          | 168/72326 [00:00<00:42, 1678.94it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/04_jan21_q21_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [00:43<00:00, 1667.42it/s]
  0%|          | 148/72326 [00:00<00:48, 1477.69it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/04_jan21_q22_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [00:43<00:00, 1649.50it/s]
  0%|          | 169/72326 [00:00<00:42, 1685.35it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/04_jan21_q23_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [00:44<00:00, 1643.53it/s]
  0%|          | 167/72326 [00:00<00:43, 1661.84it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/04_jan21_q24_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [00:43<00:00, 1664.99it/s]
  0%|          | 168/72326 [00:00<00:43, 1675.71it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/04_jan21_q25_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [00:43<00:00, 1645.83it/s]
  0%|          | 156/72326 [00:00<00:46, 1559.68it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/04_jan21_q26_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [00:44<00:00, 1621.31it/s]
  0%|          | 159/72326 [00:00<00:45, 1585.42it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/04_jan21_q27_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [00:43<00:00, 1644.39it/s]
  0%|          | 164/72326 [00:00<00:44, 1637.88it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/04_jan21_q28_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 72326/72326 [00:43<00:00, 1643.79it/s]


Complete
Returned top 5 results.
Search results saved to data/processed/04_jan21_q29_search_record.txt


In [10]:
# df_jan21.to_csv('data/processed/04_jan21_full_table.csv', sep='\t')

## Consumer questions

In [46]:
# # load query data and define uid index
# query_consumers = pd.read_csv('data/processed/questions_consumer.csv', sep='\t', index_col=0)
# index = df_jan21['cord_uid'].values
# queries = query_consumers.question.values

In [48]:
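# NOTE: this cell is disabled. The output displayed for it comes from an earlier
# run over a 98,473-document corpus, not the current 72,326-document df_jan21.
# for i in range(len(queries)):
#     query = queries[i]
#     uids = tfidf_search.tfidf_search(query, vectorizer, tdm, index)
#     tfidf_search.write_details(query, uids, df_jan21, f'03_jan21_consumer_q{i}', 'data/processed/')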

  0%|          | 80/98473 [00:00<02:04, 793.18it/s]

Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:11<00:00, 1381.01it/s]
  0%|          | 170/98473 [00:00<00:58, 1691.22it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q0_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:00<00:00, 1637.72it/s]
  0%|          | 134/98473 [00:00<01:13, 1337.26it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q1_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:00<00:00, 1630.62it/s]
  0%|          | 168/98473 [00:00<00:58, 1678.23it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q2_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:02<00:00, 1567.32it/s]
  0%|          | 183/98473 [00:00<00:53, 1827.05it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q3_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [00:55<00:00, 1758.54it/s]
  0%|          | 180/98473 [00:00<00:54, 1798.01it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q4_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:01<00:00, 1593.60it/s]
  0%|          | 170/98473 [00:00<00:57, 1698.29it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q5_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [00:56<00:00, 1754.21it/s]
  0%|          | 182/98473 [00:00<00:54, 1815.10it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q6_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [00:55<00:00, 1781.05it/s]
  0%|          | 187/98473 [00:00<00:52, 1855.36it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q7_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [00:57<00:00, 1720.26it/s]
  0%|          | 183/98473 [00:00<00:53, 1822.48it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q8_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [00:59<00:00, 1655.01it/s]
  0%|          | 154/98473 [00:00<01:03, 1537.64it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q9_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:48<00:00, 911.78it/s] 
  0%|          | 0/98473 [00:00<?, ?it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q10_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:09<00:00, 1423.10it/s]
  0%|          | 154/98473 [00:00<01:03, 1536.30it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q11_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:02<00:00, 1568.74it/s]
  0%|          | 144/98473 [00:00<01:08, 1436.00it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q12_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:04<00:00, 1516.39it/s]
  0%|          | 168/98473 [00:00<00:58, 1679.41it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q13_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [00:59<00:00, 1664.70it/s]
  0%|          | 157/98473 [00:00<01:02, 1562.75it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q14_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [00:59<00:00, 1658.09it/s]
  0%|          | 175/98473 [00:00<00:56, 1746.58it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q15_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:03<00:00, 1540.44it/s]
  0%|          | 181/98473 [00:00<00:54, 1807.95it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q16_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:15<00:00, 1304.61it/s]
  0%|          | 120/98473 [00:00<01:22, 1195.84it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q17_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:07<00:00, 1458.20it/s]
  0%|          | 152/98473 [00:00<01:04, 1518.10it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q18_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:06<00:00, 1491.27it/s]
  0%|          | 111/98473 [00:00<01:28, 1105.77it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q19_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:01<00:00, 1612.29it/s]
  0%|          | 159/98473 [00:00<01:01, 1585.84it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q20_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [00:59<00:00, 1655.57it/s]
  0%|          | 176/98473 [00:00<00:55, 1756.54it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q21_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:01<00:00, 1607.31it/s]
  0%|          | 161/98473 [00:00<01:01, 1606.84it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q22_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:03<00:00, 1543.96it/s]
  0%|          | 156/98473 [00:00<01:03, 1553.08it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q23_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:17<00:00, 1274.97it/s]
  0%|          | 0/98473 [00:00<?, ?it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q24_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:08<00:00, 1435.53it/s]
  0%|          | 120/98473 [00:00<01:22, 1186.95it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q25_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:06<00:00, 1485.11it/s]
  0%|          | 144/98473 [00:00<01:08, 1432.54it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q26_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:05<00:00, 1500.64it/s]
  0%|          | 151/98473 [00:00<01:05, 1502.58it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q27_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:03<00:00, 1553.27it/s]
  0%|          | 150/98473 [00:00<01:05, 1497.36it/s]

Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q28_search_record.txt
Vectorized search query
Computing document similarity...


100%|██████████| 98473/98473 [01:04<00:00, 1517.57it/s]


Complete
Returned top 5 results.
Search results saved to data/processed/03_jan21_consumer_q29_search_record.txt
