In [1]:
import pandas as pd
import re
import spacy
import gensim
from gensim import corpora

# libraries for visualization
import pyLDAvis
import pyLDAvis.gensim

In [2]:
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])
def lemmatization(texts,allowed_postags=['NOUN', 'ADJ']): 
    output = []
    for sent in texts:
        doc = nlp(str(sent)) 
        output.append([token.lemma_ for token in doc if token.pos_ in allowed_postags ])
    return output

In [3]:
data = pd.read_csv("/opt/splunk/etc/apps/Multimodal/lookups/topic_model.csv")
text_list= data['text'].tolist()

In [5]:
tokens = lemmatization(text_list)
dictionary = corpora.Dictionary(tokens)
doc_term_matrix = [dictionary.doc2bow(rev) for rev in tokens]

In [4]:
corpora.MmCorpus.serialize('corpus.mm', doc_term_matrix)
corpus = corpora.MmCorpus('corpus.mm')

In [38]:
number=3

In [39]:
# Creating the object for LDA model using gensim library
LDA = gensim.models.ldamodel.LdaModel

# Build LDA model
lda_model = LDA(corpus=corpus, id2word=dictionary, num_topics=number, random_state=100, chunksize=1000, passes=50,iterations=100)

In [40]:
import math
def perplexity(ldamodel, testset, dictionary, size_dictionary, num_topics):
    """calculate the perplexity of a lda-model"""
    prep = 0.0
    prob_doc_sum = 0.0
    topic_word_list = [] # store the probablity of topic-word:[(u'business', 0.010020942661849608),(u'family', 0.0088027946271537413)...]
    for topic_id in range(num_topics):
        topic_word = ldamodel.show_topic(topic_id, size_dictionary)
        dic = {}
        for word, probability in topic_word:
            dic[word] = probability
        topic_word_list.append(dic)
    doc_topics_ist = [] #store the doc-topic tuples:[(0, 0.0006211180124223594),(1, 0.0006211180124223594),...]
    for doc in testset:
        doc_topics_ist.append(ldamodel.get_document_topics(doc, minimum_probability=0))
    testset_word_num = 0
    for i in range(len(testset)):
        prob_doc = 0.0 # the probablity of the doc
        doc = testset[i]
        doc_word_num = 0 # the num of words in the doc
        for word_id, num in dict(doc).items():
            prob_word = 0.0 # the probablity of the word 
            doc_word_num += num
            word = dictionary[word_id]
            for topic_id in range(num_topics):
                # cal p(w) : p(w) = sumz(p(z)*p(w|z))
                prob_topic = doc_topics_ist[i][topic_id][1]
                prob_topic_word = topic_word_list[topic_id][word]
                prob_word += prob_topic*prob_topic_word
            prob_doc += math.log(prob_word) # p(d) = sum(log(p(w)))
        prob_doc_sum += prob_doc
        testset_word_num += doc_word_num
    prep = math.exp(-prob_doc_sum/testset_word_num) # perplexity = exp(-sum(p(d)/sum(Nd))
    return prep

In [41]:
testset = []
for i in range(0,corpus.num_docs,max(1,int(corpus.num_docs/number))):
    try:
        testset.append(corpus[i])
    except:
        continue

In [42]:
prep = perplexity(lda_model, testset, dictionary, len(dictionary.keys()), number)
prep

57.36039257407317

In [43]:
from gensim.models.coherencemodel import CoherenceModel
coherence_model_lda = CoherenceModel(model=lda_model, texts=tokens, dictionary=dictionary , coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

In [44]:
import pickle
lda_model.save('lda.model')
pickle.dump(doc_term_matrix,open('doc_term_matrix.pkl','wb'))
pickle.dump(dictionary,open('dictionary.pkl','wb'))

In [45]:
newlda = LDA.load('lda.model')
newdoc_term_matrix=pickle.load(open('doc_term_matrix.pkl', 'rb'))
newdictionary=pickle.load(open('dictionary.pkl', 'rb'))

In [46]:
vis = pyLDAvis.gensim.prepare(newlda, newdoc_term_matrix, newdictionary,mds='mmds')

In [47]:
pyLDAvis.enable_notebook()

In [48]:
pyLDAvis.display(vis)