# Comparing two LDA models & visualizing their difference

In [1]:
import logging
# Root logger: timestamped records, ERROR level and above only.
logging.basicConfig(
    level=logging.ERROR,
    format='%(asctime)s : %(levelname)s : %(message)s',
)
# INFO is below the configured ERROR threshold, so this record is not displayed.
logging.info("check")

## 1. Prepare dataset (tokenize & stem)

In [2]:
from sklearn.datasets import fetch_20newsgroups
# Load the 20 newsgroups corpus using scikit-learn's default arguments;
# the raw document strings are read from newsgroups['data'] further down.
newsgroups = fetch_20newsgroups()

In [3]:
from string import punctuation

from nltk import RegexpTokenizer
from nltk.stem.porter import PorterStemmer

# Split on runs of whitespace (gaps=True: the pattern describes the separators,
# not the tokens). Raw string so that "\s" is not treated as an invalid string
# escape, which newer Python versions warn about.
tokenizer = RegexpTokenizer(r'\s+', gaps=True)
stemmer = PorterStemmer()
# Translation table mapping every ASCII punctuation code point to a space,
# for use with str.translate().
translate_tab = {ord(p): u" " for p in punctuation}

def text2tokens(raw_text):
    """Convert one raw document into a list of stemmed tokens.

    The text is lower-cased, punctuation is replaced by spaces (``translate_tab``),
    the result is split with ``tokenizer`` and each token is stemmed with
    ``stemmer``. Stems of two characters or fewer are discarded.

    :param raw_text: the document text.
    :return: list of stemmed tokens longer than two characters.

    A real list is returned rather than ``filter(...)``: on Python 3 ``filter``
    yields a one-shot iterator, and the tokenized documents are iterated more
    than once in this notebook (building the dictionary, then ``doc2bow``), so
    the second pass would otherwise see empty documents.
    """
    clean_text = raw_text.lower().translate(translate_tab)
    stemmed_tokens = (stemmer.stem(token.strip()) for token in tokenizer.tokenize(clean_text))

    return [token for token in stemmed_tokens if len(token) > 2]  # skip short tokens

# Tokenize and stem every raw document of the corpus.
dataset = list(map(text2tokens, newsgroups['data']))

## 2. Fit dictionary

In [4]:
from gensim.corpora import Dictionary

# Build the token <-> id mapping from the full tokenized corpus (no pruning while scanning).
dictionary = Dictionary(dataset, prune_at=None)
print(dictionary)

# Trim the vocabulary by document frequency (no_below / no_above; see gensim's
# Dictionary.filter_extremes), keep all surviving tokens (keep_n=None), then
# reassign ids so they are contiguous.
dictionary.filter_extremes(no_below=5, no_above=0.3, keep_n=None)
dictionary.compactify()
print(dictionary)

# Bag-of-words representation of every document, using the filtered dictionary.
d2b_dataset = list(map(dictionary.doc2bow, dataset))

Dictionary(105726 unique tokens: [u'fawn', u'3ds2scn', u'circuitri', u'l1tbk', u'woodi']...)
Dictionary(18450 unique tokens: [u'circuitri', u'orthogon', u'woodi', u'osiri', u'yellow']...)


## 3. Fit LDA & dump the model after each epoch

In [5]:
import os

# Directory and file-name template for the per-epoch model snapshots;
# "{}" is filled in with the epoch number via m_path.format(...).
model_dir = "models/"
model_name_pattern = "lda_epoch_{}.model"
m_path = os.path.join(model_dir, model_name_pattern)

# Create the directory (including any missing parents) without a
# check-then-create race: attempt the creation and tolerate only the
# "already exists as a directory" failure. Works on Python 2 and 3.
try:
    os.makedirs(model_dir)
except OSError:
    if not os.path.isdir(model_dir):
        raise

In [6]:
from gensim.models import LdaMulticore

# Untrained 15-topic model; training happens in the loop below.
lda = LdaMulticore(
    num_topics=15,
    id2word=dictionary,
    workers=4,
    eval_every=None,
    passes=5,
    batch=True,
)

# Three successive update rounds over the corpus, saving a snapshot after each.
# Progress is logged at ERROR level because the root logger was configured to
# show ERROR and above only.
for epoch in range(1, 4):
    logging.error("Epoch %d", epoch)
    lda.update(d2b_dataset)
    lda.save(m_path.format(epoch))

2017-05-07 17:12:57,058 : ERROR : Epoch 1
2017-05-07 17:13:55,619 : ERROR : Epoch 2
2017-05-07 17:14:45,761 : ERROR : Epoch 3


## And now a question: "What is the difference between the models from different epochs?"

## 4. Show difference between models

In [7]:
from gensim.model_difference import topic2topic_difference
import plotly.offline as py
import plotly.graph_objs as go
# Enable plotly's offline rendering inside the notebook.
py.init_notebook_mode()

# Snapshots saved after the 2nd and 3rd training rounds.
p1 = m_path.format(2)
p2 = m_path.format(3)

m1 = LdaMulticore.load(p1)
m2 = LdaMulticore.load(p2)

In [9]:
# Topic-to-topic distance matrix between the two snapshots, plus a per-cell
# annotation: each entry unpacks into two token sequences.
# NOTE(review): the key is spelled 'kulback_leibler' (single "l"); confirm this
# matches the name accepted by topic2topic_difference.
mdiff, annotation = topic2topic_difference(m1, m2, distance='kulback_leibler')

# Hover text for every heatmap cell: first token list after "+++", second after "---"
# (presumably shared vs. differing tokens of the topic pair; verify against the API).
text = []
for row in annotation:
    text.append([
        "+++ {}<br>--- {}".format(", ".join(int_tokens), ", ".join(diff_tokens))
        for int_tokens, diff_tokens in row
    ])

data = go.Heatmap(z=mdiff, colorscale='RdBu', text=text)

layout = go.Layout(
    title="Topic difference (kulback_leibler)",
    width=950,
    height=950,
    xaxis={"title": "topic"},
    yaxis={"title": "topic"},
)

py.iplot(go.Figure(data=[data], layout=layout))

In [10]:
# Same comparison as the previous cell, using the Jaccard distance between topics.
mdiff, annotation = topic2topic_difference(m1, m2, distance='jaccard')

# Hover text for every heatmap cell: first token list after "+++", second after "---"
# (presumably shared vs. differing tokens of the topic pair; verify against the API).
text = []
for row in annotation:
    text.append([
        "+++ {}<br>--- {}".format(", ".join(int_tokens), ", ".join(diff_tokens))
        for int_tokens, diff_tokens in row
    ])

data = go.Heatmap(z=mdiff, colorscale='RdBu', text=text)

layout = go.Layout(
    title="Topic difference (jaccard)",
    width=950,
    height=950,
    xaxis={"title": "topic"},
    yaxis={"title": "topic"},
)

py.iplot(go.Figure(data=[data], layout=layout))