# LDA-based search engine
Requirements: a trained LDA topic model and its gensim dictionary (`id2word`).

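Method:
- Run `my_lda.lda_search` on a query, passing the trained model, the corpus, the dictionary and the metadata table; it returns the metadata rows of the retrieved papers. (TODO: describe how documents are ranked.)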

In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm
import re

import scipy.sparse as sparse
from scipy.spatial.distance import cosine

from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.matutils import cossim

from preprocessing import load_cleaned_metadata, clean_text_lda
from data_access import get_txt
import my_lda

# First run on all questions

In [2]:
# --- Trained artefacts ---------------------------------------------------
# LDA topic model and the gensim dictionary stored next to it.
model = LdaModel.load('results/lda_21_01/21_01')
dictionary = Dictionary.load('results/lda_21_01/21_01.dict')

# --- Documents -----------------------------------------------------------
# Paper metadata; every cord_uid is turned into the path of its cleaned
# body-text file.
metadata = pd.read_csv('data/processed/metadata_21.csv', sep='\t', index_col=0)
uids = metadata['cord_uid'].tolist()
doc_path_list = [
    'data/cord-19/body_text/lda_clean/' + uid + '_clean.txt'
    for uid in uids
]

# Corpus over those files, built with the loaded dictionary.
corpus = my_lda.MyCorpus(doc_path_list, dictionary=dictionary)

# --- Queries -------------------------------------------------------------
# The `question` column of the expert-questions table, as a plain list.
queries = pd.read_csv(
    'data/processed/questions_expert.csv', sep='\t', index_col=0
)['question'].tolist()

In [3]:
# Number of metadata rows (one per document path handed to the corpus).
metadata.shape[0]

79438

In [4]:
# Try the search on a single one-word query.
query = "warfarin"
# Judging by the saved output, lda_search returns a DataFrame of metadata rows
# for the retrieved papers, and its progress bar covers all 79438 documents
# (roughly two minutes for this query).
results_df = my_lda.lda_search(query, model, corpus, dictionary, metadata)
# Bare last expression so the notebook renders the result table.
results_df

100%|██████████| 79438/79438 [01:51<00:00, 712.19it/s] 


Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
16510,338s4y89,6c2b459ff5a60511193f253c4d39d6fb4d860036,PMC,The influence of Transversus Abdominis Muscle ...,10.1007/s10029-021-02395-8,PMC7983096,33751278.0,no-cc,INTRODUCTION: Among many other techniques for ...,2021-03-22,"Oprea, V.; Mardale, S.; Buia, F.; Gheorghescu,...",Hernia,,,,document_parses/pdf_json/6c2b459ff5a60511193f2...,document_parses/pmc_json/PMC7983096.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,
206258,7r01k4o5,,WHO,Power law behaviour in the saturation regime o...,,,,unk,"We apply a versatile growth model, whose growt...",2021-01-01,"Vasconcelos, Giovani L; Macêdo, Antônio M S; D...",Sci Rep,,#1104548,,,,,232055045.0
358902,ovi93m5v,,WHO,Neuro-oncology practice guidelines from a high...,,,,unk,BACKGROUND: During the coronavirus 19 (COVID-1...,2021-01-01,"Luther, Evan; Burks, Joshua; Eichberg, Daniel ...",J Clin Neurosci,,#978342,,,,,229183414.0
438587,twwb3uwy,1a7973686ed9cc63dce4fda025a008f2f4e855af,Medline; PMC,Prevalence and associated factors of the caree...,10.1186/s12875-021-01389-w,PMC7888696,33596820.0,cc-by,BACKGROUND: Primary care providers are pillars...,2021-02-17,"Liu, Di; Yang, Xu; Li, Qinglin; Shi, Lei; Tang...",BMC Fam Pract,,,,document_parses/pdf_json/1a7973686ed9cc63dce4f...,document_parses/pmc_json/PMC7888696.xml.json,https://doi.org/10.1186/s12875-021-01389-w; ht...,231947982.0
448226,y74fqsr6,f364117b26847b55fade163f5353da14d0feb445,Medline; PMC,Mechanism for epeirogenic uplift of the Archea...,10.1038/s41598-021-80965-7,PMC7811008,33452325.0,cc-by,"Plateaus, located far away from the plate boun...",2021-01-15,"Mandal, Biswajit; Vijaya Rao, V.; Karuppannan,...",Sci Rep,,,,document_parses/pdf_json/f364117b26847b55fade1...,document_parses/pmc_json/PMC7811008.xml.json,https://www.ncbi.nlm.nih.gov/pubmed/33452325/;...,231615647.0


In [5]:
# Print the full title of every retrieved paper, one per line
# (the rendered table truncates long titles).
for title in results_df['title'].to_numpy():
    print(title)

The influence of Transversus Abdominis Muscle Release (TAR) for complex incisional hernia repair on the intraabdominal pressure and pulmonary function
Power law behaviour in the saturation regime of fatality curves of the COVID-19 pandemic
Neuro-oncology practice guidelines from a high-volume surgeon at the COVID-19 epicenter
Prevalence and associated factors of the career plateau of primary care providers in Heilongjiang, China: a cross-sectional study
Mechanism for epeirogenic uplift of the Archean Dharwar craton, southern India as evidenced by orthogonal seismic reflection profiles


In [5]:
# NOTE(review): disabled batch export. When active, this ran lda_search for
# every entry in `queries` and wrote each result set to results/lda_21_01/q<i>.
# The progress bars saved below this cell appear to come from an earlier run
# with the code enabled (one full pass over the 79438 documents per query,
# roughly 1.5 minutes each), so they are stale relative to the current cell.
# NOTE(review): inside write_results_to_file the query line is written once per
# result row rather than once per file - confirm that is intended before
# re-enabling.
# def write_results_to_file(query, df, dest_path):
#     with open(dest_path, 'w') as file:
#         for i in range(len(df)):
#             row = df.iloc[i]
#             file.write(query + '\n\n')
#             file.write(f"Result #{i+1}: uid {row.cord_uid} \n")
#             file.write(row.title + '\n')
#             file.write(f"Abstract: \n {row.abstract} \n\n")

# directory = 'results/lda_21_01/'

# for i in range(len(queries)):
#     query = queries[i]
#     results_df = my_lda.lda_search(query, model, corpus, dictionary, metadata)
#     dest_path = directory + f'q{i}'
#     write_results_to_file(query, results_df, dest_path)  


100%|██████████| 79438/79438 [01:57<00:00, 674.79it/s] 
100%|██████████| 79438/79438 [01:21<00:00, 975.24it/s] 
100%|██████████| 79438/79438 [01:21<00:00, 978.63it/s] 
100%|██████████| 79438/79438 [01:27<00:00, 911.05it/s] 
100%|██████████| 79438/79438 [01:24<00:00, 937.83it/s] 
100%|██████████| 79438/79438 [01:16<00:00, 1032.45it/s]
100%|██████████| 79438/79438 [01:15<00:00, 1047.28it/s]
100%|██████████| 79438/79438 [01:18<00:00, 1006.99it/s]
100%|██████████| 79438/79438 [01:17<00:00, 1025.92it/s]
100%|██████████| 79438/79438 [01:19<00:00, 1001.15it/s]
100%|██████████| 79438/79438 [01:19<00:00, 1000.91it/s]
100%|██████████| 79438/79438 [01:17<00:00, 1022.40it/s]
100%|██████████| 79438/79438 [01:18<00:00, 1010.35it/s]
100%|██████████| 79438/79438 [01:27<00:00, 911.19it/s] 
100%|██████████| 79438/79438 [01:31<00:00, 870.93it/s] 
100%|██████████| 79438/79438 [01:23<00:00, 946.89it/s] 
100%|██████████| 79438/79438 [01:34<00:00, 843.93it/s] 
100%|██████████| 79438/79438 [01:16<00:00, 1031.