# Stage 1: Basic content search by tf-idf

In [8]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from scipy.sparse import save_npz, load_npz

from scipy.spatial.distance import cosine

import preprocessing
import tfidf_search

## Vectorization 

In [11]:
# Read the tab-separated full table; its first column becomes the index.
df = pd.read_csv(
    'data/processed/01_full_table.csv',
    sep='\t',
    index_col=0,
    low_memory=False,
)

In [15]:
# Temporary simplification: keep only the rows that have a search_text value.
df = df[df['search_text'].notna()]

In [17]:
# Preview the first five rows of the filtered table.
df.head(n=5)

Unnamed: 0,cord_uid,source_x,title,doi,pmcid,pubmed_id,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id,search_text
0,ug7v899j,PMC,Clinical features of culture-proven Mycoplasma...,10.1186/1471-2334-1-6,PMC35282,11472636,OBJECTIVE: This retrospective chart review des...,2001-07-04 00:00:00,"Madani, Tariq A; Al-Ghamdi, Aisha A",BMC Infect Dis,none,none,none,document_parses/pdf_json/d1aafb70c066a2068b027...,document_parses/pmc_json/PMC35282.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,none,clinical feature culture prove mycoplasma pneu...
1,02tnwd4m,PMC,Nitric oxide: a pro-inflammatory mediator in l...,10.1186/rr14,PMC59543,11667967,Inflammatory diseases of the respiratory tract...,2000-08-15 00:00:00,"Vliet, Albert van der; Eiserich, Jason P; Cros...",Respir Res,none,none,none,document_parses/pdf_json/6b0567729c2143a66d737...,document_parses/pmc_json/PMC59543.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,none,nitric oxide pro inflammatory mediator lung di...
2,ejv2xln0,PMC,Surfactant protein-D and pulmonary host defense,10.1186/rr19,PMC59549,11667972,Surfactant protein-D (SP-D) participates in th...,2000-08-25 00:00:00,"Crouch, Erika C",Respir Res,none,none,none,document_parses/pdf_json/06ced00a5fc04215949aa...,document_parses/pmc_json/PMC59549.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,none,surfactant protein d pulmonary host defense su...
3,2b73a28n,PMC,Role of endothelin-1 in lung disease,10.1186/rr44,PMC59574,11686871,Endothelin-1 (ET-1) is a 21 amino acid peptide...,2001-02-22 00:00:00,"Fagan, Karen A; McMurtry, Ivan F; Rodman, David M",Respir Res,none,none,none,document_parses/pdf_json/348055649b6b8cf2b9a37...,document_parses/pmc_json/PMC59574.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,none,role endothelin lung disease endothelin et ami...
4,9785vg6d,PMC,Gene expression in epithelial cells in respons...,10.1186/rr61,PMC59580,11686888,Respiratory syncytial virus (RSV) and pneumoni...,2001-05-11 00:00:00,"Domachowske, Joseph B; Bonville, Cynthia A; Ro...",Respir Res,none,none,none,document_parses/pdf_json/5f48792a5fa08bed9f560...,document_parses/pmc_json/PMC59580.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,none,gene expression epithelial cells response pneu...


In [18]:
# Build the tf-idf vectorizer and the term-document matrix from the search text.
# The recorded output shows both artifacts being saved under `path` with the
# "02" file prefix (02_vectorizer.pkl and 02_tdm.npz).
path = 'data/processed/'
documents = df['search_text']
vectorizer, tdm = tfidf_search.tfidf_vectorize(
    documents,
    pickle_path=path,
    save_files_prefix="02",
)

Vectorizer picked at  data/processed/02_vectorizer.pkl
Term-document matrix saved at  data/processed/02_tdm.npz


### Input query

In [19]:
# Expert questions: tab-separated file whose first column is used as the index.
query_df = pd.read_csv(
    'data/processed/questions_expert.csv',
    sep='\t',
    index_col=0,
)
# cord_uid column of the document table; handed to the search call as `index`.
index = df['cord_uid']

# Reload the saved vectorizer and term-document matrix from disk.
# NOTE(review): the .pkl suffix suggests load_vectorizer unpickles the file --
# only load artifacts from a trusted source; confirm in tfidf_search.
vectorizer = tfidf_search.load_vectorizer('data/processed/02_vectorizer.pkl')
tdm = load_npz('data/processed/02_tdm.npz')

In [64]:
# Select one expert question to use as the search query.
# NOTE(review): `[1]` is a label lookup when the index is integer-valued, so
# this is not necessarily the first row -- confirm against the questions file.
query = query_df['question'][1]
print("Input query:", query)

Input query: How do cytokine pathways link sleep and immunity to infection and COVID-19?


In [21]:
# Search the term-document matrix for the documents most similar to the query.
# In the recorded run the progress bar covers 508040 items and takes ~22 min;
# a warning is also printed for `dist = 1.0 - uv / np.sqrt(uu * vv)`, and the
# run reports "Returned top 5 results."
uids = tfidf_search.tfidf_search(
    query,
    vectorizer,
    tdm,
    index,
)

  0%|          | 23/508040 [00:00<37:32, 225.58it/s]

Vectorized search query
Computing document similarity...


  dist = 1.0 - uv / np.sqrt(uu * vv)
100%|██████████| 508040/508040 [22:20<00:00, 378.87it/s]


Complete
Returned top 5 results.


In [22]:
# Show the returned identifiers as a plain Python list.
uids.values.tolist()

['wt4crton', 'vdd88nwl', '1ansnm46', 'wtbu91hz', '9dpe3zq1']

In [65]:
# Save the query and its results; the recorded output reports a file named
# data/processed/02_singleq_search_record.txt.
# NOTE(review): `tfidwrite_details` is not defined or imported in any visible
# cell, so this raises NameError on a fresh kernel (Restart & Run All).
# Presumably a helper from the `tfidf_search` module was intended -- confirm
# the correct name and call it through the module.
tfidwrite_details(query, uids, df, '02_singleq')

Search results saved to data/processed/02_singleq_search_record.txt
