# Stage 1: Basic content search by tf-idf

In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from scipy.sparse import save_npz, load_npz, csr_matrix

from scipy.spatial.distance import cosine

import preprocessing
import my_tfidf

In [2]:
# Every metadata column is read as a plain string; the mapping is handed to
# pandas.read_csv as its `dtype` argument when the metadata file is loaded.
dtypes = dict.fromkeys(
    (
        'cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id',
        'license', 'abstract', 'publish_time', 'authors', 'journal', 'mag_id',
        'who_covidence_id', 'arxiv_id', 'pdf_json_files', 'pmc_json_files',
        'url', 's2_id', 'search_text', 'date',
    ),
    str,
)

In [3]:
# Load the cleaned metadata (tab-separated, every column read as str via `dtypes`)
# and keep only the papers dated in 2021.
path = 'data/processed/metadata_clean.csv.gz'
data = pd.read_csv(path, sep='\t', dtype=dtypes)
data['date'] = pd.to_datetime(data['date'])
# Vectorized year test through the .dt accessor instead of a per-row Python lambda.
# Rows with a missing date (NaT) compare unequal to 2021 and are dropped.
data = data[data['date'].dt.year == 2021]

# Narrow the frame to the columns listed here.
data = data[['cord_uid', 'date', 'title', 'abstract', 'authors', 'doi',
             'url', 'pdf_json_files', 'pmc_json_files', 'search_text']]
documents = data['search_text']      # text passed to the vectorizer below
index = data['cord_uid'].values      # paper ids, in the same row order as `documents`

In [4]:
# data.to_csv('results/final_models/metadata_2021.csv.gz', index=False, sep='\t', compression='gzip')

### Vectorize

In [5]:
# Directory that receives the vectorizer pickle and the term-document matrix.
path = 'results/final_models/'

# Create the vectorizer from the 2021 documents via the project helper.
# NOTE(review): in the recorded run the helper prompted for another file prefix and
# reported the pickle at 'results/final_models/2021vectorizer.pkl', so the file name
# in the reload hint below probably needs adjusting - confirm before using it.
vectorizer = my_tfidf.make_vectorizer(
    documents,
    pickle_path=path,
    save_files_prefix="_2021",
)
# Alternative: reuse an already pickled vectorizer instead of building a new one.
# vectorizer = my_tfidf.load_vectorizer(path + 'vectorizer.pkl')

# Transform the documents into the term-document matrix and store it as .npz.
tdm = vectorizer.transform(documents)
tdm_file = path + '2021_tdm.npz'
save_npz(tdm_file, tdm)
# Alternative: load the stored matrix instead of recomputing it.
# tdm = load_npz(path + '2021_tdm.npz')

Files by that name already exist. Enter another prefix... 2021


Vectorizer pickled at  results/final_models/2021vectorizer.pkl


### Run search on queries

In [6]:
def search_write_queries(queries, vectorizer, tdm, index, metadata, save_directory, num_top_results=5):
    """Run a tf-idf search for every query and write each result list to a text file.

    The results of query number ``i`` (0-based) are written to the file ``q{i}``
    inside ``save_directory``. Each file starts with the query text, followed by
    one entry per result giving its rank, ``cord_uid``, title and abstract.

    Parameters
    ----------
    queries : sequence of str
        Search queries, processed in order.
    vectorizer, tdm, index, metadata
        Passed unchanged to ``my_tfidf.tfidf_search``.
    save_directory : str
        Directory for the output files. It is created when missing, and a
        trailing path separator is optional.
    num_top_results : int, default 5
        Forwarded to ``my_tfidf.tfidf_search`` as ``num_top_results``.
    """

    def write_results(results_df, query, save_directory, filename):
        """Write ``query`` and every row of ``results_df`` to one UTF-8 text file."""
        path = os.path.join(save_directory, filename)
        # Explicit encoding: titles and abstracts may hold non-ASCII characters and
        # the platform default encoding is not guaranteed to represent them.
        with open(path, 'w', encoding='utf-8') as file:
            file.write(query + '\n\n\n')
            # Use the frame that was passed in, not a variable of the enclosing scope.
            for rank, (_, row) in enumerate(results_df.iterrows(), start=1):
                file.write(f'Result {rank}: uid {row.cord_uid}\n\n{row.title}\n\n{row.abstract}\n\n\n')

    # Make sure the target directory exists before the first file is opened.
    if save_directory:
        os.makedirs(save_directory, exist_ok=True)

    for i, query in enumerate(queries):
        # Honour the caller's num_top_results instead of a hard-coded 5.
        results = my_tfidf.tfidf_search(query, vectorizer, tdm, index,
                                        metadata, num_top_results=num_top_results)
        write_results(results, query, save_directory, f'q{i}')

In [7]:
# Expert questions come from a tab-separated file whose first column is the index;
# the `question` column holds the query text.
questions_df = pd.read_csv(
    'data/processed/questions_expert.csv',
    sep='\t',
    index_col=0,
)
queries = questions_df['question'].values
save_directory = 'results/final_models/tfidf_results/'

# Search the 2021 corpus for every question; output files go to `save_directory`.
search_write_queries(
    queries,
    vectorizer,
    tdm,
    index,
    data,
    save_directory,
)

  0%|          | 186/53758 [00:00<00:28, 1856.31it/s]

Vectorized search query


100%|██████████| 53758/53758 [00:30<00:00, 1786.51it/s]
  0%|          | 181/53758 [00:00<00:29, 1786.31it/s]

Vectorized search query


100%|██████████| 53758/53758 [00:31<00:00, 1718.84it/s]
  0%|          | 179/53758 [00:00<00:29, 1789.63it/s]

Vectorized search query


100%|██████████| 53758/53758 [00:32<00:00, 1656.91it/s]
  0%|          | 168/53758 [00:00<00:32, 1672.07it/s]

Vectorized search query


100%|██████████| 53758/53758 [00:31<00:00, 1693.73it/s]
  0%|          | 174/53758 [00:00<00:30, 1733.51it/s]

Vectorized search query


100%|██████████| 53758/53758 [00:44<00:00, 1220.28it/s]
  0%|          | 137/53758 [00:00<00:39, 1367.75it/s]

Vectorized search query


100%|██████████| 53758/53758 [00:33<00:00, 1625.73it/s]
  0%|          | 172/53758 [00:00<00:31, 1714.09it/s]

Vectorized search query


100%|██████████| 53758/53758 [00:31<00:00, 1711.92it/s]
  0%|          | 138/53758 [00:00<00:38, 1378.20it/s]

Vectorized search query


100%|██████████| 53758/53758 [00:37<00:00, 1419.15it/s]
  0%|          | 190/53758 [00:00<00:28, 1895.01it/s]

Vectorized search query


100%|██████████| 53758/53758 [00:30<00:00, 1774.46it/s]
  0%|          | 190/53758 [00:00<00:28, 1892.42it/s]

Vectorized search query


100%|██████████| 53758/53758 [00:31<00:00, 1705.11it/s]
  0%|          | 150/53758 [00:00<00:35, 1492.06it/s]

Vectorized search query


100%|██████████| 53758/53758 [00:32<00:00, 1652.87it/s]
  0%|          | 118/53758 [00:00<00:45, 1173.64it/s]

Vectorized search query


100%|██████████| 53758/53758 [00:31<00:00, 1685.23it/s]
  0%|          | 185/53758 [00:00<00:29, 1844.68it/s]

Vectorized search query


100%|██████████| 53758/53758 [00:40<00:00, 1329.47it/s]
  0%|          | 154/53758 [00:00<00:34, 1532.06it/s]

Vectorized search query


100%|██████████| 53758/53758 [00:33<00:00, 1607.48it/s]
  0%|          | 192/53758 [00:00<00:27, 1917.43it/s]

Vectorized search query


100%|██████████| 53758/53758 [00:33<00:00, 1611.24it/s]
  0%|          | 34/53758 [00:00<02:38, 339.05it/s]

Vectorized search query


100%|██████████| 53758/53758 [00:34<00:00, 1577.12it/s]
  0%|          | 186/53758 [00:00<00:28, 1851.13it/s]

Vectorized search query


100%|██████████| 53758/53758 [00:29<00:00, 1797.38it/s]
  0%|          | 180/53758 [00:00<00:29, 1794.08it/s]

Vectorized search query


100%|██████████| 53758/53758 [00:33<00:00, 1611.29it/s]
  0%|          | 147/53758 [00:00<00:36, 1461.52it/s]

Vectorized search query


100%|██████████| 53758/53758 [00:33<00:00, 1597.42it/s]
  0%|          | 155/53758 [00:00<00:34, 1548.40it/s]

Vectorized search query


100%|██████████| 53758/53758 [00:34<00:00, 1546.90it/s]
  0%|          | 188/53758 [00:00<00:28, 1878.25it/s]

Vectorized search query


100%|██████████| 53758/53758 [00:32<00:00, 1676.60it/s]
  0%|          | 167/53758 [00:00<00:32, 1668.98it/s]

Vectorized search query


100%|██████████| 53758/53758 [00:39<00:00, 1358.43it/s]
  0%|          | 170/53758 [00:00<00:31, 1693.27it/s]

Vectorized search query


100%|██████████| 53758/53758 [00:32<00:00, 1658.83it/s]
  0%|          | 167/53758 [00:00<00:32, 1663.41it/s]

Vectorized search query


100%|██████████| 53758/53758 [00:33<00:00, 1612.88it/s]
  0%|          | 164/53758 [00:00<00:32, 1627.66it/s]

Vectorized search query


100%|██████████| 53758/53758 [00:31<00:00, 1689.28it/s]
  0%|          | 166/53758 [00:00<00:32, 1653.56it/s]

Vectorized search query


100%|██████████| 53758/53758 [00:31<00:00, 1719.86it/s]
  0%|          | 176/53758 [00:00<00:30, 1751.73it/s]

Vectorized search query


100%|██████████| 53758/53758 [00:32<00:00, 1648.24it/s]
  0%|          | 170/53758 [00:00<00:31, 1698.18it/s]

Vectorized search query


100%|██████████| 53758/53758 [00:31<00:00, 1709.61it/s]
  0%|          | 170/53758 [00:00<00:31, 1692.07it/s]

Vectorized search query


100%|██████████| 53758/53758 [00:32<00:00, 1633.85it/s]
  0%|          | 168/53758 [00:00<00:32, 1672.66it/s]

Vectorized search query


100%|██████████| 53758/53758 [00:32<00:00, 1641.06it/s]
