# Stage 1: Basic content search with TF-IDF

In [2]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from scipy.sparse import save_npz, load_npz, csr_matrix

from scipy.spatial.distance import cosine

import preprocessing
import my_tfidf

In [3]:
# Column -> dtype mapping handed to pd.read_csv: every listed column is read as str.
dtypes = dict.fromkeys(
    (
        'cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id',
        'license', 'abstract', 'publish_time', 'authors', 'journal', 'mag_id',
        'who_covidence_id', 'arxiv_id', 'pdf_json_files', 'pmc_json_files',
        'url', 's2_id', 'search_text', 'date',
    ),
    str,
)

In [4]:
# Load the cleaned, tab-separated metadata table and keep only papers dated 2021.
path = 'data/processed/metadata_clean.csv.gz'
data = pd.read_csv(path, sep='\t', dtype=dtypes)
data['date'] = pd.to_datetime(data['date'])
# Vectorized year comparison instead of a per-row Python lambda.
# Rows whose date is missing (NaT) compare False and are dropped, exactly as before.
data = data[data['date'].dt.year == 2021]

# Restrict the frame to the columns listed here.
data = data[['cord_uid', 'date', 'title', 'abstract', 'authors', 'doi',
             'url', 'pdf_json_files', 'pmc_json_files', 'search_text']]
# Text that gets vectorized, and the cord_uid array aligned with it row by row.
documents = data['search_text']
index = data['cord_uid'].values

In [25]:
# Optional export of the `data` frame as a gzipped, tab-separated file.
# Left disabled; uncomment the line below to (re)write the file.
# data.to_csv('results/final_models/metadata_2021.csv.gz', index=False, sep='\t', compression='gzip')

### Vectorize

In [5]:
# Output directory for the pickled vectorizer and the saved term-document matrix.
path = 'results/final_models/'

# Build the vectorizer from the documents; the helper pickles it under `path`
# using the "2021" filename prefix (presumably fitting it on `documents` — confirm in my_tfidf).
vectorizer = my_tfidf.make_vectorizer(
    documents,
    pickle_path=path,
    save_files_prefix="2021",
)
# To reuse the pickled vectorizer instead of rebuilding it
# (the run above reported the file as '2021vectorizer.pkl'):
# vectorizer = my_tfidf.load_vectorizer(path + '2021vectorizer.pkl')

# Transform the documents into the term-document matrix and persist it as .npz.
tdm = vectorizer.transform(documents)
save_npz(file=path + '2021_tdm.npz', matrix=tdm)
# To reload the saved matrix instead of recomputing it:
# tdm = load_npz(path + '2021_tdm.npz')

Vectorizer pickled at  results/final_models/2021vectorizer.pkl


### Run search on queries

In [23]:
def search_write_queries(queries, vectorizer, tdm, index, metadata, save_directory, num_top_results=5):
    """Run a tf-idf search for every query and write each result list to a text file.

    For the i-th query (0-based) a file named ``q{i}`` is written inside
    ``save_directory``. The file starts with the query text, followed by one
    numbered entry per result row containing its cord_uid, title and abstract.

    Parameters
    ----------
    queries : sequence of str
        Query strings; the position in the sequence determines the file name.
    vectorizer, tdm, index, metadata
        Passed through unchanged to ``my_tfidf.tfidf_search``.
    save_directory : str
        Directory that receives the result files. It is created if missing.
    num_top_results : int, default 5
        Forwarded to ``my_tfidf.tfidf_search`` as ``num_top_results``.

    Returns
    -------
    None

    Notes
    -----
    The object returned by ``my_tfidf.tfidf_search`` must support ``iterrows()``
    and provide ``cord_uid``, ``title`` and ``abstract`` per row, because those
    are the fields read here.
    """

    def write_results(results_df, query, save_directory, filename):
        """Write ``query`` and every row of ``results_df`` to ``save_directory/filename``."""
        # os.path.join works with or without a trailing separator on the directory.
        path = os.path.join(save_directory, filename)
        # Explicit UTF-8 so non-ASCII titles/abstracts do not depend on the platform default.
        with open(path, 'w', encoding='utf-8') as file:
            file.write(query + '\n\n\n')
            # Iterate the frame that was passed in, not a variable from the enclosing scope.
            for rank, (_, row) in enumerate(results_df.iterrows(), start=1):
                file.write(f'Result {rank}: uid {row["cord_uid"]}\n\n{row["title"]}\n\n{row["abstract"]}\n\n\n')

    # Make sure the target directory exists; an empty string means the current directory.
    if save_directory:
        os.makedirs(save_directory, exist_ok=True)

    for i, query in enumerate(queries):
        # Honour the caller's num_top_results instead of a hard-coded 5.
        results = my_tfidf.tfidf_search(query, vectorizer, tdm, index,
                                        metadata, num_top_results=num_top_results)
        write_results(results, query, save_directory, f'q{i}')

In [24]:
# Read the expert questions (tab-separated, first column is the row index)
# and keep just the `question` column as an array.
queries = (
    pd.read_csv('data/processed/questions_expert.csv', sep='\t', index_col=0)
    ['question']
    .values
)
save_directory = 'results/final_models/query_results/'

# One result file per question is written into save_directory.
search_write_queries(
    queries,
    vectorizer,
    tdm,
    index,
    metadata=data,
    save_directory=save_directory,
)

  0%|          | 121/56655 [00:00<00:46, 1209.28it/s]

Vectorized search query


100%|██████████| 56655/56655 [00:31<00:00, 1801.21it/s]
  0%|          | 176/56655 [00:00<00:32, 1752.06it/s]

Vectorized search query


100%|██████████| 56655/56655 [00:30<00:00, 1839.12it/s]
  0%|          | 198/56655 [00:00<00:28, 1974.30it/s]

Vectorized search query


100%|██████████| 56655/56655 [00:30<00:00, 1886.36it/s]
  0%|          | 194/56655 [00:00<00:29, 1934.95it/s]

Vectorized search query


100%|██████████| 56655/56655 [00:35<00:00, 1589.85it/s]
  0%|          | 178/56655 [00:00<00:31, 1771.46it/s]

Vectorized search query


100%|██████████| 56655/56655 [00:35<00:00, 1608.41it/s]
  0%|          | 129/56655 [00:00<00:43, 1285.45it/s]

Vectorized search query


100%|██████████| 56655/56655 [00:34<00:00, 1637.51it/s]
  1%|          | 327/56655 [00:00<00:34, 1630.81it/s]

Vectorized search query


100%|██████████| 56655/56655 [00:32<00:00, 1729.93it/s]
  0%|          | 166/56655 [00:00<00:34, 1653.28it/s]

Vectorized search query


100%|██████████| 56655/56655 [00:33<00:00, 1712.55it/s]
  0%|          | 161/56655 [00:00<00:35, 1605.32it/s]

Vectorized search query


100%|██████████| 56655/56655 [00:32<00:00, 1733.95it/s]
  0%|          | 158/56655 [00:00<00:36, 1563.97it/s]

Vectorized search query


100%|██████████| 56655/56655 [00:32<00:00, 1737.20it/s]
  0%|          | 169/56655 [00:00<00:33, 1689.08it/s]

Vectorized search query


100%|██████████| 56655/56655 [00:34<00:00, 1638.37it/s]
  0%|          | 172/56655 [00:00<00:32, 1716.38it/s]

Vectorized search query


100%|██████████| 56655/56655 [00:33<00:00, 1699.50it/s]
  0%|          | 162/56655 [00:00<00:34, 1614.32it/s]

Vectorized search query


100%|██████████| 56655/56655 [00:33<00:00, 1712.42it/s]
  0%|          | 163/56655 [00:00<00:34, 1626.32it/s]

Vectorized search query


100%|██████████| 56655/56655 [00:32<00:00, 1718.63it/s]
  0%|          | 176/56655 [00:00<00:32, 1758.15it/s]

Vectorized search query


100%|██████████| 56655/56655 [00:33<00:00, 1689.53it/s]
  0%|          | 162/56655 [00:00<00:35, 1613.62it/s]

Vectorized search query


100%|██████████| 56655/56655 [00:33<00:00, 1711.51it/s]
  0%|          | 161/56655 [00:00<00:35, 1605.95it/s]

Vectorized search query


100%|██████████| 56655/56655 [00:33<00:00, 1708.24it/s]
  0%|          | 158/56655 [00:00<00:35, 1578.91it/s]

Vectorized search query


100%|██████████| 56655/56655 [00:33<00:00, 1713.36it/s]
  1%|          | 342/56655 [00:00<00:32, 1707.65it/s]

Vectorized search query


100%|██████████| 56655/56655 [00:33<00:00, 1704.27it/s]
  0%|          | 152/56655 [00:00<00:37, 1517.74it/s]

Vectorized search query


100%|██████████| 56655/56655 [00:33<00:00, 1704.20it/s]
  0%|          | 165/56655 [00:00<00:34, 1640.12it/s]

Vectorized search query


100%|██████████| 56655/56655 [00:33<00:00, 1712.01it/s]
  0%|          | 165/56655 [00:00<00:34, 1648.93it/s]

Vectorized search query


100%|██████████| 56655/56655 [00:33<00:00, 1692.99it/s]
  0%|          | 163/56655 [00:00<00:34, 1626.95it/s]

Vectorized search query


100%|██████████| 56655/56655 [00:34<00:00, 1663.87it/s]
  0%|          | 172/56655 [00:00<00:32, 1712.80it/s]

Vectorized search query


100%|██████████| 56655/56655 [00:33<00:00, 1704.87it/s]
  1%|          | 323/56655 [00:00<00:34, 1617.25it/s]

Vectorized search query


100%|██████████| 56655/56655 [00:33<00:00, 1715.79it/s]
  0%|          | 168/56655 [00:00<00:33, 1671.60it/s]

Vectorized search query


100%|██████████| 56655/56655 [00:33<00:00, 1690.69it/s]
  1%|          | 344/56655 [00:00<00:32, 1714.31it/s]

Vectorized search query


100%|██████████| 56655/56655 [00:33<00:00, 1711.65it/s]
  0%|          | 164/56655 [00:00<00:34, 1633.64it/s]

Vectorized search query


100%|██████████| 56655/56655 [00:33<00:00, 1693.10it/s]
  0%|          | 159/56655 [00:00<00:35, 1584.82it/s]

Vectorized search query


100%|██████████| 56655/56655 [00:33<00:00, 1685.49it/s]
  0%|          | 166/56655 [00:00<00:34, 1656.92it/s]

Vectorized search query


100%|██████████| 56655/56655 [00:33<00:00, 1701.78it/s]
