In [11]:
import pandas as pd
from tqdm import tqdm

from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
import pickle

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

from my_files import get_text
import my_preprocessing
from my_preprocessing import clean_text
from my_lda import MyCorpus

In [12]:
# Read the article metadata table and report how many records it contains.
df = my_preprocessing.load_metadata()
print(len(df))

# Remove everything that is not a research article (front matter,
# obituaries, etc.) and report how many records remain.
df = my_preprocessing.drop_non_research(df)
print(len(df))

# One path under data/txt/ per remaining metadata row; this list drives the corpus.
filelist = [
    'data/txt/' + name
    for name in df.filename
]

16842
15092


In [13]:
# NOTE: disabled one-time step, kept for reference. It builds the corpus
# dictionary from scratch and then filters it (no_below=5, no_above=0.5 --
# presumably the same meaning as gensim's Dictionary.filter_extremes; confirm
# in my_lda.MyCorpus). The corpus cell further down loads the saved
# 'models/dictionary.dict' instead, so this only needs re-running when the
# file list or the preprocessing changes.
# # create a dictionary for the corpus

# corpus = MyCorpus(filelist, clean_text)
# corpus.make_dictionary(save_directory='models/')
# corpus.filter_extremes(no_below=5, 
#                        no_above=0.5, 
#                        keep_n=len(corpus.dictionary),
#                        keep_tokens=None, 
#                        save_directory='models/', 
#                        file_name="dictionary")

In [4]:
# NOTE: disabled one-time step. Writes the corpus dictionary to
# 'models/dictionary.dict', the same path the corpus cell below loads from.
# corpus.dictionary.save('models/dictionary.dict')

In [10]:
# Build the corpus over the article files, reusing the dictionary that was
# previously saved to disk rather than rebuilding it.
corpus = MyCorpus(
    filelist,
    clean_text,
    dictionary=Dictionary.load('models/dictionary.dict'),
)

## Model training

In [6]:
# LDA hyperparameters. The trailing notes record common heuristics for
# reference; the values below are the ones actually used.
num_topics = 25
alpha = 1.0  # 50/num_topics is standard
eta = 0.002  # 200/(number of words in dictionary) is standard

# The model id encodes the hyperparameters, so each configuration gets its
# own file name under models/.
model_id = f'lda_{num_topics}_{alpha}_{eta}'
model_path = f'models/{model_id}.pkl'

In [6]:
# NOTE: disabled one-time step, kept for reference; the trained model is
# expected to be read back from `model_path` instead of being retrained.
# NOTE(review): no random_state is passed, so re-enabling this cell will not
# reproduce the saved model exactly -- consider adding random_state=<seed>.
# NOTE(review): this uses corpus.id2word while other cells use
# corpus.dictionary; confirm MyCorpus exposes both before re-enabling.
# # train model
# model = LdaModel(corpus, 
#                  num_topics=num_topics, 
#                  id2word = corpus.id2word,
#                  alpha=alpha, 
#                  eta=eta)

In [7]:
# NOTE: disabled one-time step. Pickles the trained model to `model_path`
# (models/<model_id>.pkl), so each hyperparameter setting gets its own file.
# # save trained model
# with open(model_path, 'wb') as file:
#     pickle.dump(model, file)

In [10]:
# Load the previously trained model for the current hyperparameters.
#
# This cell must be active: every other cell that defines `model` is
# commented out, so without it the visualization cell below fails with a
# NameError on a fresh kernel (Restart & Run All).
#
# SECURITY: pickle.load can execute arbitrary code. Only load model files
# that were produced by this notebook, never files from untrusted sources.
try:
    with open(model_path, 'rb') as file:
        model = pickle.load(file)
except FileNotFoundError as err:
    # Turn the bare "no such file" into an actionable message.
    raise FileNotFoundError(
        f"No trained model found at {model_path!r}. Re-enable and run the "
        "training and saving cells above for these hyperparameters first."
    ) from err

## Generate Visualization

In [8]:
# Prepare the pyLDAvis data for the trained model. sort_topics=False is
# passed so topics are not re-ordered by pyLDAvis.
vis = gensimvis.prepare(
    model,
    corpus,
    corpus.dictionary,
    sort_topics=False,
)

In [9]:
# Export the prepared visualization to models/<model_id>.html, next to the
# pickled model that shares the same id.
pyLDAvis.save_html(vis, 'models/' + model_id + '.html')