In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re
import os
from tqdm import tqdm

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

from preprocessing import load_cleaned_metadata, clean_text_lda, get_clean_write
from data_access import get_txt
from my_lda import MyCorpus
import my_lda

In [None]:
# Load the cleaned metadata table through the project helper; the trailing
# expression displays its length.
# NOTE(review): `df` is reassigned from 'results/final_models/metadata_2021.csv.gz'
# in a later cell, so this load is superseded when that cell runs.
df = load_cleaned_metadata('data/processed/metadata_clean.csv.gz')
len(df)

In [None]:
# # filter documents from 2021 only
# df.date = pd.to_datetime(df.date)
# df = df[df.date.apply(lambda x: x.year == 2021)]
# len(df)

In [None]:
# df.to_csv('results/final_models/metadata_2021.csv.gz', index=False, sep='\t', compression='gzip')

In [None]:
# Read the tab-separated, gzip-compressed metadata_2021 file, convert its
# `date` column to datetimes, and display the number of rows.
df = pd.read_csv(
    'results/final_models/metadata_2021.csv.gz',
    sep='\t',
    compression='gzip',
)
df['date'] = pd.to_datetime(df['date'])
len(df)

In [None]:
# # 2000 onwards
# df.date = pd.to_datetime(df.date)
# df = df[df.date.apply(lambda x: x.year >= 2000)]
# len(df)

# Create cleaned text files for the corpus

In [None]:
# # to save time, clean only files for ids that aren't already in the directory

# dest_directory = 'results/final_models/lda_2021_corpus_clean/'
# existing_files = os.listdir(dest_directory)
# ids = df.cord_uid.tolist()
# missing_ids = [i for i in ids if f'{i}_clean.txt' not in existing_files]
# print(len(ids))
# print(len(existing_files))
# print(len(missing_ids))

In [None]:
# for uid in tqdm(missing_ids):
#     get_clean_write(uid, dest_directory)

In [None]:
# len(os.listdir('results/final_models/lda2000plus_corpus_clean/'))

## Create corpus and dictionary

Required output:
- topic profile of every document
- visualisation of topics

> "[Gensim](https://radimrehurek.com/gensim/auto_examples/core/run_corpora_and_vector_spaces.html#corpus-streaming-tutorial) accepts any object that, when iterated over, successively yields documents."

In [None]:
# Full path of every .txt file found in the cleaned-corpus directory.
# NOTE(review): os.listdir returns entries in arbitrary order, so the order of
# doc_path_list is not guaranteed to be stable between runs.
corpus_directory = 'results/final_models/lda_2021_corpus_clean/'
doc_path_list = [
    f'{corpus_directory}{name}'
    for name in os.listdir(corpus_directory)
    if name.endswith('.txt')
]

# Load the previously saved dictionary and build the corpus object from the
# document paths and that dictionary.
save_directory = 'results/final_models/'
dictionary = Dictionary.load(f'{save_directory}dictionary.dict')
corpus = MyCorpus(doc_path_list, dictionary=dictionary)

In [None]:
# save_directory = 'results/final_models/'
# corpus.make_dictionary(save_directory, 'dictionary')

In [None]:
# # filter extremes
# corpus.filter_extremes(no_below=5, no_above=0.2)

In [None]:
# Show len(corpus) and len(corpus.dictionary), one value per line.
print(len(corpus), len(corpus.dictionary), sep='\n')

### Train model

In [None]:
# Hyperparameters identifying the LDA model. They are embedded in `filename`,
# the base name used for the saved model file and the results directory.
num_topics, alpha, eta = 100, 0.01, 0.01

filename = f'lda_{num_topics}_{alpha}_{eta}'

In [None]:
# model = LdaModel(corpus, num_topics=num_topics, id2word=corpus.id2word, alpha=alpha, eta=eta)

In [None]:
# model.save(save_directory+filename+"_model")

In [None]:
# Load the previously trained model saved as '<save_directory><filename>_model'.
model = LdaModel.load(f'{save_directory}{filename}_model')

### Visualise

In [None]:
# vis = gensimvis.prepare(model, corpus, corpus.dictionary, sort_topics=False)

In [None]:
# pyLDAvis.save_html(vis, save_directory+filename+'_vis.html')

### Run search using model

In [None]:
# Load the expert questions table (tab-separated; the first column becomes the index).
questions_df = pd.read_csv('data/processed/questions_expert.csv', sep='\t', index_col=0)

In [None]:
# # concatenate full background to questions
# def q_plus_bg(row):
#     return row.question + ' ' + row.background
# questions_df['full_query'] = questions_df.apply(q_plus_bg, axis=1)
# queries = questions_df.full_query.tolist()

# Use only the main question text of each row as the search query.
queries = questions_df['question'].tolist()

def write_results_to_file(query, df, dest_path):
    """Write a query and its search results to a plain-text file.

    The file starts with the query followed by a blank line, then one entry
    per row of ``df`` (in row order) giving the 1-based result number, the
    ``cord_uid``, the title and the abstract.

    Args:
        query: Query text written at the top of the file.
        df: DataFrame of results with ``cord_uid``, ``title`` and
            ``abstract`` columns.
        dest_path: Path of the file to write; an existing file is overwritten.
    """
    # Explicit UTF-8 so non-ASCII characters in titles/abstracts do not depend
    # on the platform's default encoding.
    with open(dest_path, 'w', encoding='utf-8') as file:
        file.write(f"{query}\n\n")
        for rank, (_, row) in enumerate(df.iterrows(), start=1):
            file.write(f"Result #{rank}: uid {row.cord_uid} \n")
            # Formatted rather than concatenated: a missing title (NaN) is
            # written as "nan", the same way a missing abstract already is,
            # instead of raising TypeError on `float + str`.
            file.write(f"{row.title}\n")
            file.write(f"Abstract: \n {row.abstract} \n\n")

# Directory collecting one result file per query, named after the model file.
results_dir = save_directory+filename
# exist_ok=True keeps the cell re-runnable: os.mkdir raised FileExistsError
# whenever the directory was left over from a previous run.
os.makedirs(results_dir, exist_ok=True)

# Run every query through the LDA search and write its results to
# '<results_dir>/q<i>', where i is the query's position in `queries`.
for i, query in enumerate(queries):
    results_df = my_lda.lda_search(query, model, corpus, corpus.dictionary, df)
    dest_path = results_dir + f'/q{i}'
    write_results_to_file(query, results_df, dest_path)