# Model selection
Selection of LDA model hyperparameters by [topic coherence](https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0)

In [1]:
# import modules
import os

import pandas as pd
from tqdm import tqdm

from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaMulticore
from gensim.models import CoherenceModel
import pickle

from my_files import get_text
import my_preprocessing
from my_preprocessing import clean_text
from my_lda import MyCorpus

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps


In [2]:
# load metadata
df = my_preprocessing.load_metadata()
print(len(df))
# drop documents that aren't research articles (front matter, obituaries, etc.)
df = my_preprocessing.drop_non_research(df)
print(len(df))
# generate list of file paths for corpus
filelist = ['data/txt/' + file for file in df.filename]

16842
15092


In [3]:
# create corpus with existing dictionary

corpus = MyCorpus(filelist, clean_text, dictionary=Dictionary.load('models/dictionary.dict'))

In [4]:
# # train model with default settings

# model = LdaMulticore(corpus=corpus,
#                      id2word=corpus.dictionary.id2token,
#                      num_topics=25, 
#                      random_state=42,
#                      chunksize=100,
#                      passes=10,
#                      per_word_topics=True)

In [5]:
# # # save trained model
# model_path = 'models/lda_25_default.pkl'

# with open(model_path, 'wb') as file:
#     pickle.dump(model, file)

In [6]:
# # load existing model
# model_path = 'models/lda_25_default.pkl'

# with open(model_path, 'rb') as file:
#     model = pickle.load(file)

## Train more models with different hyperparameters

In [7]:
import os

num_topics_list = [25, 30]
alpha_list = [0.0001, 0.001, 0.01, 0.1, 1]
eta_list = [0.0001, 0.001, 0.01, 0.1, 1]

for num_topics in num_topics_list:
    for alpha in alpha_list:
        for eta in eta_list:
            model_path = f"models/lda_{num_topics}_{alpha}_{eta}.pkl"
            if not os.path.exists(model_path):
                model = LdaMulticore(corpus=corpus,
                    id2word=corpus.dictionary.id2token,
                    num_topics=num_topics,
                    alpha=alpha,
                    eta=eta,
                    random_state=42,
                    chunksize=100,
                    passes=5,
                    per_word_topics=True)
                with open(model_path, 'wb') as file:
                    pickle.dump(model, file)
                print(f"Model {model_path} complete.")
            else:
                print(model_path, "already exists")

models/lda_25_0.0001_0.0001.pkl already exists
models/lda_25_0.0001_0.001.pkl already exists
models/lda_25_0.0001_0.01.pkl already exists
models/lda_25_0.0001_0.1.pkl already exists
models/lda_25_0.0001_1.pkl already exists
models/lda_25_0.001_0.0001.pkl already exists
models/lda_25_0.001_0.001.pkl already exists
models/lda_25_0.001_0.01.pkl already exists
models/lda_25_0.001_0.1.pkl already exists
models/lda_25_0.001_1.pkl already exists
models/lda_25_0.01_0.0001.pkl already exists
models/lda_25_0.01_0.001.pkl already exists
models/lda_25_0.01_0.01.pkl already exists
models/lda_25_0.01_0.1.pkl already exists
models/lda_25_0.01_1.pkl already exists
models/lda_25_0.1_0.0001.pkl already exists
models/lda_25_0.1_0.001.pkl already exists
models/lda_25_0.1_0.01.pkl already exists
models/lda_25_0.1_0.1.pkl already exists
models/lda_25_0.1_1.pkl already exists
models/lda_25_1_0.0001.pkl already exists
models/lda_25_1_0.001.pkl already exists
models/lda_25_1_0.01.pkl already exists
models/lda_

## Measure topic [coherence](http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf)

In [14]:
model_list = ['models/' + filename for filename in os.listdir('models') if filename.endswith('.pkl')]

In [15]:
# Compute Coherence Score
path = model_list[0]
with open(path, 'rb') as file:
    model = pickle.load(file)

coherence_model_lda = CoherenceModel(model=model, texts=corpus, dictionary=corpus.dictionary, coherence='c_v')

In [16]:
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

  numerator = (co_occur_count / num_docs) + EPSILON
  denominator = (w_prime_count / num_docs) * (w_star_count / num_docs)
  co_doc_prob = co_occur_count / num_docs



Coherence Score:  nan
