In [1]:
import pickle
import numpy as np

from gensim import corpora
from gensim.models.ldamodel import LdaModel

In [65]:
tokens = pickle.load(open('../Preprocess/tokens_period.pkl', 'rb'))

### Gensim

Perform a loop that iterates from 2 to 50 topics and reports lower bound for perplexity of each.

In [66]:
dictionary = corpora.Dictionary(tokens)
dictionary.filter_extremes(0.1, 0.9)
dictionary.save('LDA_dictionary.gensim')

Split data into traning and testing.

In [67]:
corpus = [dictionary.doc2bow(text) for text in tokens]
pickle.dump(corpus, open('LDA_corpus.pkl', 'wb'))

train_corpus, test_corpus = np.split(corpus, [int(.7*len(corpus))])
print(len(train_corpus))
print(len(test_corpus))

115
50


In [None]:
for i in range(2, 51):
    ldamodel = LdaModel(train_corpus, num_topics = i, id2word=dictionary, passes=15, random_state=0)
    print("Topic {}:".format(i), ldamodel.log_perplexity(test_corpus))

Topic 2: -8.745019721323834
Topic 3: -8.940966245708825
Topic 4: -9.100257663588323
Topic 5: -9.270489064202733
Topic 6: -9.45019592293517
Topic 7: -9.586046039546957
Topic 8: -9.70801753568684
Topic 9: -9.863599635771822
Topic 10: -9.99292548182046
Topic 11: -10.1380703382823
Topic 12: -10.284586045963827
Topic 13: -10.454587636265869
Topic 14: -10.580754957328343
Topic 15: -10.684929429879695
Topic 16: -10.834258074515594
Topic 17: -10.977352529171716
Topic 18: -11.105535149324732
Topic 19: -11.235559046297373
Topic 20: -11.384488613777991
Topic 21: -11.466305363662864
Topic 22: -11.585290252821803
Topic 23: -11.661493061837199
Topic 24: -11.801682523006576
Topic 25: -11.941011476113408
Topic 26: -12.044187359387836
Topic 27: -12.166697532698265
Topic 28: -12.290921392385698
Topic 29: -12.393243801021558
Topic 30: -12.51230808772648
Topic 31: -12.626654534946397
Topic 32: -12.73455462910962
Topic 33: -12.82202881187144
Topic 34: -12.963479850578192
Topic 35: -13.078634642822038
Topic

### sklearn

Use GridSearch to optimize learning parameters for LDA. The only thing we are optimizing 

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV

In [6]:
def dummy_fun(tokens):
    return tokens

In [7]:
vectorizer = CountVectorizer(analyzer='word',
                            tokenizer=dummy_fun,
                            preprocessor=dummy_fun,
                            min_df=10,
                            max_df=0.9,
                            token_pattern=None) 
X = vectorizer.fit_transform(tokens)

In [8]:
search_params = {'n_components': list(range(1, 51))}
lda = LatentDirichletAllocation()
model = GridSearchCV(lda, param_grid=search_params)
model.fit(X)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LatentDirichletAllocation(batch_size=128,
                                                 doc_topic_prior=None,
                                                 evaluate_every=-1,
                                                 learning_decay=0.7,
                                                 learning_method='batch',
                                                 learning_offset=10.0,
                                                 max_doc_update_iter=100,
                                                 max_iter=10,
                                                 mean_change_tol=0.001,
                                                 n_components=10, n_jobs=None,
                                                 perp_tol=0.1,
                                                 random_state=None,
                                                 topic_word_prior=None,
                                   

In [9]:
best_lda_model = model.best_estimator_
print("Best Model's Params: ", model.best_params_)

Best Model's Params:  {'n_components': 1}


### Gensim with Mallet

https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#17howtofindtheoptimalnumberoftopicsforlda

In [10]:
from gensim.models.wrappers import LdaMallet
from gensim.models import CoherenceModel
import gensim

In [30]:
mallet_path = "../mallet-2.0.8/bin/mallet"

In [54]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.wrappers.LdaMallet(mallet_path, 
                                                 corpus=corpus, 
                                                 num_topics=num_topics, 
                                                 id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [68]:
ldamallet = LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=dictionary)

In [69]:
# Show Topics
for t_num, words in ldamallet.show_topics(formatted=False):
    print("Topic number {}:".format(t_num))
    for word, score in words:
        print(word, score)
    print()

# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet, 
                                           texts=tokens, 
                                           dictionary=dictionary, 
                                           coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)

Topic number 0:
show 0.055226824457593686
process 0.03944773175542406
tool 0.03747534516765286
test 0.03747534516765286
evaluation 0.03550295857988166
disease 0.03353057199211045
diagnostic 0.029585798816568046
step 0.027613412228796843
level 0.023668639053254437
improvement 0.021696252465483234

Topic number 5:
processor 0.04288939051918736
program 0.033860045146726865
element 0.033860045146726865
set 0.029345372460496615
instruction 0.02708803611738149
dataflow 0.02708803611738149
trace 0.02708803611738149
current 0.02708803611738149
processing 0.024830699774266364
technique 0.01805869074492099

Topic number 6:
algorithm 0.10053859964093358
decision 0.07899461400359066
tree 0.06822262118491922
classification 0.04129263913824058
classifier 0.03770197486535009
domain 0.03052064631956912
show 0.026929982046678635
accuracy 0.02333931777378815
measure 0.02154398563734291
bayesian 0.017953321364452424

Topic number 9:
st 0.12939841089670828
episode 0.06583427922814983
segment 0.04199772985

In [70]:
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, 
                                                        corpus=corpus, 
                                                        texts=tokens, 
                                                        start=2, 
                                                        limit=50, 
                                                        step=2)

In [71]:
limit=50; start=2; step=2;
x = range(start, limit, step)
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

Num Topics = 2  has Coherence Value of 0.2938
Num Topics = 4  has Coherence Value of 0.3714
Num Topics = 6  has Coherence Value of 0.3614
Num Topics = 8  has Coherence Value of 0.3835
Num Topics = 10  has Coherence Value of 0.4244
Num Topics = 12  has Coherence Value of 0.3883
Num Topics = 14  has Coherence Value of 0.3891
Num Topics = 16  has Coherence Value of 0.425
Num Topics = 18  has Coherence Value of 0.4245
Num Topics = 20  has Coherence Value of 0.4288
Num Topics = 22  has Coherence Value of 0.4497
Num Topics = 24  has Coherence Value of 0.4629
Num Topics = 26  has Coherence Value of 0.4748
Num Topics = 28  has Coherence Value of 0.4932
Num Topics = 30  has Coherence Value of 0.4997
Num Topics = 32  has Coherence Value of 0.5061
Num Topics = 34  has Coherence Value of 0.5397
Num Topics = 36  has Coherence Value of 0.5265
Num Topics = 38  has Coherence Value of 0.5537
Num Topics = 40  has Coherence Value of 0.5214
Num Topics = 42  has Coherence Value of 0.547
Num Topics = 44  ha

In [72]:
import plotly.graph_objects as go
import plotly.express as px

fig = go.Figure()

fig = go.Figure(data=go.Scatter(x=list(x), y=coherence_values))

fig.update_layout(
    title=go.layout.Title(
        text="Coherence values for number of topics",
    ),
    xaxis=go.layout.XAxis(
        title=go.layout.xaxis.Title(
            text="Number of topics",
        )
    ),
    yaxis=go.layout.YAxis(
        title=go.layout.yaxis.Title(
            text="Coherence value",
        )
    )
)


fig.show()