In [1]:
from gensim.models import LdaModel
from gensim.models import CoherenceModel
import pickle as pkl
import numpy as np
from tqdm.notebook import tqdm
from pathlib import Path
import matplotlib.pyplot as plt

# Load Data

In [2]:
# Load the pickled inputs used by the evaluation loop below:
#   data_lemmatized -> passed as `texts` to CoherenceModel
#   corpus          -> passed to LdaModel.log_perplexity
#   id2word         -> passed as `dictionary` to CoherenceModel
with Path('data/data_lemmatized.pkl').open('rb') as lemma_file:
    data_lemmatized = pkl.load(lemma_file)

with Path('data/cleaned_corpus.pkl').open('rb') as corpus_file:
    corpus = pkl.load(corpus_file)

with Path("data/id2word.pkl").open('rb') as dictionary_file:
    id2word = pkl.load(dictionary_file)

In [3]:
# Bounds on the number of topics to evaluate. They are fed to
# range(min_topics, max_topics), so max_topics itself is excluded.
min_topics, max_topics = 2, 40

In [4]:
# # Load perplexity and coherence scores for plotting
# with open("data/perplexity_recalculated.pkl", 'rb') as f:
#     perplexity = pkl.load(f)
# with open("data/coherence_recalculated.pkl", 'rb') as f:
#     coherence = pkl.load(f)         

In [None]:
# Recompute perplexity and coherence for every saved LDA model.
# Both dicts are keyed by the number of topics and are re-pickled inside the
# loop, so the files on disk always hold the scores of all models processed so far.
perplexity = {}
coherence = {}

# NOTE: range() excludes max_topics itself.
for topics in range(min_topics, max_topics):

    # load the model; topic counts without a usable saved model are skipped
    fname = f'trained_models/trained_lda_model_{topics}'
    try:
        lda_model = LdaModel.load(fname)
    except FileNotFoundError:
        print(f'No trained model for {topics} topics')
        continue
    except Exception as err:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still stop the loop, and report
        # non-missing-file failures for what they are.
        print(f'Could not load trained model for {topics} topics: {err!r}')
        continue

    # Compute Perplexity
    # log_perplexity returns gensim's per-word likelihood bound (log scale),
    # not the perplexity value itself.
    perplexity[topics] = lda_model.log_perplexity(corpus)

    # Compute Coherence Score (c_v measure over the lemmatized texts)
    coherence_model_lda = CoherenceModel(
        model=lda_model,
        texts=data_lemmatized,
        dictionary=id2word,
        coherence='c_v',
    )
    coherence[topics] = coherence_model_lda.get_coherence()

    # checkpoint the scores collected so far
    with open("data/perplexity_recalculated.pkl", 'wb') as f:
        pkl.dump(perplexity, f)
    with open("data/coherence_recalculated.pkl", 'wb') as f:
        pkl.dump(coherence, f)

    # screen report
    print(f"Num Topics = {topics}: Perplexity = {perplexity[topics]}, Coherence = {coherence[topics]}")

Num Topics = 2: Perplexity = -7.609894813951374, Coherence = 0.36438118953575294
Num Topics = 3: Perplexity = -7.546708886077306, Coherence = 0.3819799345514476
Num Topics = 4: Perplexity = -7.50943619181426, Coherence = 0.41022892485924534
Num Topics = 5: Perplexity = -7.486081360161937, Coherence = 0.4220901894918227
Num Topics = 6: Perplexity = -7.476420829759688, Coherence = 0.4410712057346193
Num Topics = 7: Perplexity = -7.4749713620413845, Coherence = 0.4837496161680563
Num Topics = 8: Perplexity = -7.503085750994842, Coherence = 0.48865928083578536
Num Topics = 9: Perplexity = -7.539730749325506, Coherence = 0.47519965489049176
Num Topics = 10: Perplexity = -7.5848152260721164, Coherence = 0.47974381799874644
Num Topics = 11: Perplexity = -7.623699217763337, Coherence = 0.49272180287774303
Num Topics = 12: Perplexity = -7.662205717441724, Coherence = 0.49866675574650166


In [None]:
# Read the pickled score dictionaries back from disk so the plot can be
# produced from the saved files.
with Path("data/perplexity_recalculated.pkl").open('rb') as perplexity_file:
    perplexity = pkl.load(perplexity_file)

with Path("data/coherence_recalculated.pkl").open('rb') as coherence_file:
    coherence = pkl.load(coherence_file)

In [None]:
# Plot coherence and perplexity scores against the number of topics.
# Perplexity goes on the left y-axis and coherence on a twinned right y-axis.

# Fail early with a clear message instead of an opaque unpacking error
# from zip(*[]) when there is nothing to plot.
if not perplexity or not coherence:
    raise ValueError("No perplexity/coherence scores available to plot")

# grab colors
prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']

# init figure
fig, ax = plt.subplots(figsize=(12, 6))

# plot perplexity (the stored values come from log_perplexity, i.e. log scale)
lists = sorted(perplexity.items())
x, y = zip(*lists)
pltp = ax.plot(x, y, label='perplexity', linewidth=5)

# plot coherence on a second y-axis sharing the same x-axis
ax2 = ax.twinx()
lists = sorted(coherence.items())
x, y = zip(*lists)
# modulo keeps the index valid even if the active color cycle has fewer than three entries
pltc = ax2.plot(x, y, label='coherence', linewidth=5, color=colors[2 % len(colors)])

# axis labels
ax.set_xlabel('Number of Topics', fontsize=14)
ax.set_ylabel('Perplexity', fontsize=14)
ax2.set_ylabel('Coherence', fontsize=14)
ax.set_title('Finding the Optimal Number of Topics', fontsize=20)

# legend: combine the line handles from both axes into a single legend
ax.legend(pltp + pltc, ['Perplexity', 'Coherence'], fontsize=14, loc='upper left')

# aesthetics
ax.grid()

# Save figure; create the output directory first so savefig does not fail
# when 'figures/' is missing.
figure_path = Path('figures/perplexity_coherence_logscale.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(figure_path)