In [1]:
import pickle
import numpy as np

from gensim import corpora
from gensim.models.ldamodel import LdaModel

In [3]:
tokens = pickle.load(open('../Preprocess/tokens.pkl', 'rb'))

### Gensim

Perform a loop that iterates from 2 to 50 topics and reports lower bound for perplexity of each.

In [4]:
dictionary = corpora.Dictionary(tokens)
dictionary.filter_extremes(0.1, 0.9)
dictionary.save('LDA_dictionary.gensim')

Split data into traning and testing.

In [5]:
corpus = [dictionary.doc2bow(text) for text in tokens]
pickle.dump(corpus, open('LDA_corpus.pkl', 'wb'))

train_corpus, test_corpus = np.split(corpus, [int(.7*len(corpus))])
print(len(train_corpus))
print(len(test_corpus))

1247
535


In [6]:
for i in range(2, 51):
    ldamodel = LdaModel(train_corpus, num_topics = i, id2word=dictionary, passes=15, random_state=0)
    print("Topic {}:".format(i), ldamodel.log_perplexity(test_corpus))

Topic 2: -8.745019721323834
Topic 3: -8.940966245708825


KeyboardInterrupt: 

### sklearn

Use GridSearch to optimize learning parameters for LDA. The only thing we are optimizing 

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV

In [6]:
def dummy_fun(tokens):
    return tokens

In [7]:
vectorizer = CountVectorizer(analyzer='word',
                            tokenizer=dummy_fun,
                            preprocessor=dummy_fun,
                            min_df=10,
                            max_df=0.9,
                            token_pattern=None) 
X = vectorizer.fit_transform(tokens)

In [None]:
search_params = {'n_components': list(range(1, 51))}
lda = LatentDirichletAllocation()
model = GridSearchCV(lda, param_grid=search_params)
model.fit(X)

In [None]:
best_lda_model = model.best_estimator_
print("Best Model's Params: ", model.best_params_)

### Gensim with Mallet

https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#17howtofindtheoptimalnumberoftopicsforlda

In [7]:
from gensim.models.wrappers import LdaMallet
from gensim.models import CoherenceModel
import gensim

In [13]:
mallet_path = "/usr/local/lib/python3.6/dist-packages/mallet-2.0.8/bin/mallet"

In [14]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.wrappers.LdaMallet(mallet_path, 
                                                 corpus=corpus, 
                                                 num_topics=num_topics, 
                                                 id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [15]:
ldamallet = LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=dictionary)

CalledProcessError: Command '/usr/local/lib/python3.6/dist-packages/mallet-2.0.8/bin/mallet import-file --preserve-case --keep-sequence --remove-stopwords --token-regex "\S+" --input /tmp/6daf5d_corpus.txt --output /tmp/6daf5d_corpus.mallet' returned non-zero exit status 127.

In [12]:
# Show Topics
for t_num, words in ldamallet.show_topics(formatted=False):
    print("Topic number {}:".format(t_num))
    for word, score in words:
        print(word, score)
    print()

# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet, 
                                           texts=tokens, 
                                           dictionary=dictionary, 
                                           coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)

NameError: name 'ldamallet' is not defined

In [31]:
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, 
                                                        corpus=corpus, 
                                                        texts=tokens, 
                                                        start=2, 
                                                        limit=50, 
                                                        step=5)

In [32]:
limit=50; start=2; step=5;
x = range(start, limit, step)
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

Num Topics = 2  has Coherence Value of 0.3573
Num Topics = 7  has Coherence Value of 0.3597
Num Topics = 12  has Coherence Value of 0.3587
Num Topics = 17  has Coherence Value of 0.3816
Num Topics = 22  has Coherence Value of 0.3612
Num Topics = 27  has Coherence Value of 0.3489
Num Topics = 32  has Coherence Value of 0.3415
Num Topics = 37  has Coherence Value of 0.346
Num Topics = 42  has Coherence Value of 0.3328
Num Topics = 47  has Coherence Value of 0.3315
