# LDA-based search engine
Requirements: a trained LDA topic model and its matching id2word dictionary.

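Method:
- Load the trained LDA model, its dictionary, the document metadata and the cleaned corpus.
- Build one query per expert question by concatenating the question with its background text.
- Run `my_lda.lda_search` for each query and write the returned results (uid, title, abstract) to one text file per query.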

In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm
import re

import scipy.sparse as sparse
from scipy.spatial.distance import cosine

from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.matutils import cossim

from preprocessing import load_cleaned_metadata, clean_text_lda
from data_access import get_txt
import my_lda

# First run on all questions

In [2]:
# Path prefix shared by the saved LDA model and its dictionary file
filename = 'results/final_models/lda_2021_60_0-5'

# Load the trained gensim LDA model from disk
model = LdaModel.load(filename)
# Load the dictionary saved next to the model under the same prefix plus '.dict'
dictionary = Dictionary.load(filename+'.dict')

# Load the tab-separated document metadata and derive one cleaned-text path per cord_uid
metadata = pd.read_csv('results/final_models/metadata_2021.csv.gz', sep='\t')
uids = metadata.cord_uid.tolist()
doc_path_list = ['results/final_models/lda_2021_corpus_clean/' + uid + '_clean.txt' for uid in uids]

# Wrap the document paths in the project's corpus class
# NOTE(review): presumably documents are read lazily in metadata row order — confirm in my_lda.MyCorpus
corpus = my_lda.MyCorpus(doc_path_list, dictionary=dictionary)

# Expert questions (tab-separated); the first column is used as the index
questions = pd.read_csv('data/processed/questions_expert.csv', sep='\t', index_col=0)

In [4]:
# concatenate full background to questions
def q_plus_bg(row):
    """Build the full query text for one question row.

    Parameters
    ----------
    row
        Object exposing ``question`` and ``background`` attributes (for
        example a row passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str
        ``question`` and ``background`` joined by a single space, or just
        ``question`` when the background is missing (``None`` / NaN).
    """
    background = row.background
    # Empty CSV fields are read by pandas as NaN (a float); concatenating that
    # to a string would raise TypeError, so fall back to the question alone.
    if pd.isna(background):
        return row.question
    return row.question + ' ' + background
    
# Compute the combined query text for every question row (row-wise apply)
questions['full_query'] = questions.apply(q_plus_bg, axis=1)
# Plain list of query strings, in the same order as the rows of `questions`
queries = questions.full_query.tolist()

In [None]:
# Main question only

In [5]:
def write_results_to_file(query, df, dest_path):
    """Write a query and its search results to a plain-text file.

    The file starts with the query text followed by a blank line. Each row of
    ``df`` then contributes its 1-based position, ``cord_uid``, title and
    abstract.

    Parameters
    ----------
    query : str
        Query text written at the top of the file.
    df : pandas.DataFrame
        Result rows, written in the order given; must provide ``cord_uid``,
        ``title`` and ``abstract`` columns.
    dest_path : str
        Destination file. It is overwritten if it already exists; missing
        parent directories are created.
    """
    import os  # local import so the function does not rely on other cells

    parent_dir = os.path.dirname(dest_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit UTF-8 so non-ASCII titles/abstracts do not depend on the
    # platform's default encoding.
    with open(dest_path, 'w', encoding='utf-8') as file:
        file.write(query + '\n\n')
        for rank, (_, row) in enumerate(df.iterrows(), start=1):
            file.write(f"Result #{rank}: uid {row.cord_uid} \n")
            # Formatting instead of concatenating keeps a missing title (NaN)
            # from raising TypeError; string titles are written unchanged.
            file.write(f"{row.title}\n")
            file.write(f"Abstract: \n {row.abstract} \n\n")

# Output folder; the results of query number i are written to '<directory>q<i>'.
directory = 'results/final_models/lda_2021_60_0-5_results/'

# Run the LDA search once per query and dump each result set to its own file.
for i, query in enumerate(queries):
    results_df = my_lda.lda_search(query, model, corpus, dictionary, metadata)
    dest_path = directory + f'q{i}'
    write_results_to_file(query, results_df, dest_path)


100%|██████████| 53758/53758 [01:08<00:00, 788.27it/s] 
100%|██████████| 53758/53758 [01:09<00:00, 778.01it/s] 
100%|██████████| 53758/53758 [01:08<00:00, 785.83it/s] 
100%|██████████| 53758/53758 [01:05<00:00, 825.81it/s] 
100%|██████████| 53758/53758 [01:04<00:00, 828.75it/s] 
100%|██████████| 53758/53758 [01:04<00:00, 829.47it/s] 
100%|██████████| 53758/53758 [01:04<00:00, 837.00it/s] 
100%|██████████| 53758/53758 [01:04<00:00, 829.14it/s] 
100%|██████████| 53758/53758 [01:05<00:00, 816.60it/s] 
100%|██████████| 53758/53758 [01:07<00:00, 792.18it/s] 
100%|██████████| 53758/53758 [01:04<00:00, 829.56it/s] 
100%|██████████| 53758/53758 [01:04<00:00, 831.96it/s] 
100%|██████████| 53758/53758 [01:04<00:00, 836.71it/s] 
100%|██████████| 53758/53758 [01:01<00:00, 876.96it/s] 
100%|██████████| 53758/53758 [01:04<00:00, 835.34it/s] 
100%|██████████| 53758/53758 [01:04<00:00, 839.16it/s] 
100%|██████████| 53758/53758 [01:03<00:00, 841.73it/s] 
100%|██████████| 53758/53758 [01:03<00:00, 840.6

### Troubleshoot

In [6]:
def word_topics(word_id):
    """Look up the topics associated with a single dictionary term.

    Delegates to ``get_term_topics`` on the notebook-level ``model``, passing
    ``minimum_probability=None``, and returns whatever that call returns.
    """
    term_topics = model.get_term_topics(word_id, minimum_probability=None)
    return term_topics

In [7]:
def print_pred(query, corpus, model):
    """Print a step-by-step trace of how the topic model interprets a query.

    Printed, in order: the raw query, the query after ``clean_text_lda``, its
    bag-of-words entries (token and count), the model's topic predictions
    sorted by descending second element, and for every query token the result
    of ``model.get_term_topics``.

    Parameters
    ----------
    query : str
        Raw query text.
    corpus
        Object exposing ``dictionary`` (used for ``doc2bow``) and ``id2word``
        (token id -> token lookup).
    model
        Topic model used both for the query prediction and for the per-word
        topic lookup.
    """
    print(query, '\n')
    cleaned = clean_text_lda(query)
    print(cleaned, '\n')

    q_vec = corpus.dictionary.doc2bow(cleaned.split())
    print("bow: ")
    for token_id, count in q_vec:
        print(corpus.id2word[token_id], count)

    # Ascending sort followed by a reversal: this keeps the exact ordering
    # (including the order of ties) that this helper has always printed.
    pred = model[q_vec]
    pred.sort(key=lambda x: x[1])
    pred = pred[::-1]
    print("\nTopic predictions: ")
    print(pred, '\n')

    # Query the model that was passed in rather than a notebook-level global,
    # so the per-word topics always come from the same model as the
    # prediction printed above.
    for token_id, _ in q_vec:
        print(corpus.id2word[token_id],
              model.get_term_topics(token_id, minimum_probability=None))

In [10]:
# Inspect a single-word query: its bag-of-words vector, topic predictions and per-word topics
print_pred('drive', corpus, model)

drive 

drive 

bow: 
drive 1

Topic predictions: 
[(52, 0.019755345), (42, 0.018223174), (22, 0.017937066), (38, 0.01768029), (17, 0.017669143), (26, 0.017626299), (31, 0.017428488), (53, 0.017271085), (5, 0.01704785), (45, 0.016961437), (15, 0.016917594), (59, 0.016904617), (56, 0.016887777), (47, 0.01685835), (32, 0.016842607), (49, 0.016782738), (24, 0.016780162), (46, 0.016750552), (14, 0.016734596), (37, 0.016724542), (25, 0.016684154), (8, 0.016679836), (48, 0.016662132), (3, 0.016644744), (1, 0.016639555), (10, 0.016626328), (16, 0.016604705), (20, 0.016589323), (39, 0.016558817), (58, 0.016474724), (35, 0.016469829), (55, 0.016460542), (7, 0.016460022), (28, 0.016449103), (12, 0.016430587), (41, 0.016425142), (57, 0.01638569), (44, 0.016347349), (34, 0.016299497), (30, 0.016286893), (33, 0.016272632), (19, 0.016265301), (2, 0.016254703), (40, 0.016254677), (0, 0.01625255), (6, 0.016246272), (4, 0.016230641), (54, 0.016228804), (36, 0.016216127), (51, 0.016215706), (21, 0.01619

In [9]:
# List 50 terms of topic 45, each printed next to the value the model reports for it.
for term_id, weight in model.get_topic_terms(45, topn=50):
    print(model.id2word[term_id], weight)

cancer 0.07067993
genetic 0.024497882
variants 0.022230443
sequence 0.016090862
mutations 0.014204685
variant 0.013256889
breast 0.011527257
gene 0.009862625
mutation 0.00887791
genomic 0.008049104
chemotherapy 0.0073077707
genes 0.00728924
survival 0.0072188056
frequency 0.006905627
genome 0.0066896803
oncology 0.0058932453
therapy 0.005628531
genomes 0.0055097146
cancers 0.0054290113
tumor 0.0049717235
lineage 0.004912226
selection 0.0048118723
populations 0.004806468
lineages 0.0046470375
allele 0.0044820653
rare 0.0044328873
stage 0.0040391567
snps 0.0035212233
aa 0.0035091995
advance 0.0034786137
susceptibility 0.003421268
phenotypes 0.0033462818
radiotherapy 0.0032536471
supplementary 0.0031986327
diversity 0.003135707
genetics 0.0030174633
genotype 0.0029587464
phenotype 0.002940324
uk 0.0028912742
variation 0.002862136
os 0.0027196992
lung 0.0027124712
colorectal 0.0027110993
polymorphisms 0.002635743
european 0.0026269658
alleles 0.0026216907
prostate 0.0026171284
combination 

In [86]:
# corpus.dictionary.id2token

In [88]:
# model.id2word