# LDA-based search engine
Requirements: a trained LDA topic model and its matching id2word dictionary.

Method:
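- For each expert question, run `my_lda.lda_search` with the trained model, corpus, dictionary and metadata, then write the returned results to a text file.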

In [1]:
import os
import re

import numpy as np
import pandas as pd
import scipy.sparse as sparse
from gensim.corpora.dictionary import Dictionary
from gensim.matutils import cossim
from gensim.models.ldamodel import LdaModel
from scipy.spatial.distance import cosine
from tqdm import tqdm

import my_lda
from data_access import get_txt
from preprocessing import load_cleaned_metadata, clean_text_lda

# First run on all questions

In [2]:
# Saved gensim dictionary and the LDA model stored next to it.
dictionary = Dictionary.load('data/cord-19/body_text/test.dict')
model = LdaModel.load('data/cord-19/body_text/test')

# Metadata table; its cord_uid column decides which cleaned body-text files
# go into the corpus, and in which order.
metadata = pd.read_csv(
    'data/processed/metadata_jan21.csv',
    sep='\t',
    index_col=0,
)
uids = metadata['cord_uid'].tolist()
doc_path_list = [
    'data/cord-19/body_text/lda_clean/' + uid + '_clean.txt'
    for uid in uids
]

# Corpus object built from the per-document file paths and the dictionary.
corpus = my_lda.MyCorpus(doc_path_list, dictionary=dictionary)

# Expert questions used as search queries, as a plain list of strings.
queries = pd.read_csv(
    'data/processed/questions_expert.csv',
    sep='\t',
    index_col=0,
)['question'].tolist()

In [12]:
def write_results_to_file(query, df, dest_path):
    """Write the search results for one query to a plain-text report.

    For every row of ``df``, in order, the file receives the query text, a
    ``Result #k`` line carrying the row's ``cord_uid``, the title, and the
    abstract. An empty ``df`` produces an empty file.

    Args:
        query: The query string the results belong to.
        df: DataFrame of results with ``cord_uid``, ``title`` and
            ``abstract`` columns.
        dest_path: Path of the file to create; an existing file is overwritten.
    """
    # Explicit UTF-8 so non-ASCII characters in titles/abstracts are written
    # the same way regardless of the platform's default encoding.
    with open(dest_path, 'w', encoding='utf-8') as file:
        for rank, (_, row) in enumerate(df.iterrows(), start=1):
            file.write(query + '\n\n')
            file.write(f"Result #{rank}: uid {row.cord_uid} \n")
            # Formatted rather than concatenated: a missing title is a float
            # NaN, and `NaN + '\n'` would raise TypeError. This mirrors how the
            # abstract is already written below.
            file.write(f"{row.title}\n")
            file.write(f"Abstract: \n {row.abstract} \n\n")

# Output folder for this run: one plain-text report per query, named q<index>.
directory = 'results/lda_jan21_01/'
# Create the folder up front so a fresh checkout does not fail with
# FileNotFoundError on the first open() inside write_results_to_file.
os.makedirs(directory, exist_ok=True)

for i, query in enumerate(queries):
    results_df = my_lda.lda_search(query, model, corpus, dictionary, metadata)
    dest_path = directory + f'q{i}'
    write_results_to_file(query, results_df, dest_path)


100%|██████████| 49366/49366 [00:27<00:00, 1767.09it/s]
100%|██████████| 49366/49366 [00:28<00:00, 1714.04it/s]
100%|██████████| 49366/49366 [00:31<00:00, 1545.67it/s]
100%|██████████| 49366/49366 [00:33<00:00, 1466.35it/s]
100%|██████████| 49366/49366 [00:30<00:00, 1645.16it/s]
100%|██████████| 49366/49366 [00:29<00:00, 1650.12it/s]
100%|██████████| 49366/49366 [00:29<00:00, 1675.73it/s]
100%|██████████| 49366/49366 [00:29<00:00, 1671.54it/s]
100%|██████████| 49366/49366 [00:28<00:00, 1730.74it/s]
100%|██████████| 49366/49366 [00:28<00:00, 1721.79it/s]
100%|██████████| 49366/49366 [00:28<00:00, 1733.81it/s]
100%|██████████| 49366/49366 [00:29<00:00, 1662.95it/s]
100%|██████████| 49366/49366 [00:30<00:00, 1598.28it/s]
100%|██████████| 49366/49366 [00:29<00:00, 1659.13it/s]
100%|██████████| 49366/49366 [00:29<00:00, 1685.72it/s]
100%|██████████| 49366/49366 [00:30<00:00, 1633.92it/s]
100%|██████████| 49366/49366 [00:27<00:00, 1794.47it/s]
100%|██████████| 49366/49366 [00:29<00:00, 1648.