In [1]:
import numpy as np
from scipy.spatial.distance import pdist, squareform
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage

import plotly.offline as py
import plotly.graph_objs as go
from plotly.graph_objs import *
import plotly.figure_factory as FF

py.init_notebook_mode()

# Train Model

In [2]:
from gensim.models.ldamodel import LdaModel
from gensim.corpora import Dictionary, MmCorpus
import pandas as pd
import re
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation

Using TensorFlow backend.


In [3]:
# Load the raw dataset from the CSV file in the working directory.
df_fake = pd.read_csv('fake.csv')

# Keep only English rows that actually have body text.
# `.copy()` makes the filtered frame independent of the original, so assigning
# to its 'text' column later does not raise SettingWithCopyWarning.
has_text = pd.notnull(df_fake['text'])
is_english = df_fake['language'] == 'english'
df_fake = df_fake.loc[has_text & is_english].copy()

# Lower-case the text, then drop stop words, then drop punctuation.
def preprocess(row):
    """Return a cleaned copy of one document's text.

    `row` is a single text string (one cell of the 'text' column). The steps
    run in this order: lower-casing, `remove_stopwords`, `strip_punctuation`.

    NOTE(review): because stop words are removed before punctuation, a stop
    word glued to punctuation (e.g. "the,") presumably survives - confirm
    whether that is intended before changing the order.
    """
    lowered = row.lower()
    without_stopwords = remove_stopwords(lowered)
    return strip_punctuation(without_stopwords)
    
# Clean every article body (lower-case, stop words and punctuation removed).
df_fake['text'] = df_fake['text'].apply(preprocess)

# Convert data to the input format required by LDA: one list of word tokens
# per document.
# Only re.UNICODE is passed. Combining it with re.LOCALE is rejected for str
# patterns on Python 3 (ValueError), and LOCALE adds nothing for text input.
texts = [re.findall(r'\w+', line.lower(), flags=re.UNICODE) for line in df_fake.text]

# Create a dictionary representation of the documents.
dictionary = Dictionary(texts)

# Filter out words that occur in fewer than 2 documents, or in more than 40% of the documents.
dictionary.filter_extremes(no_below=2, no_above=0.4)

# Bag-of-words representation of the documents.
corpus_fake = [dictionary.doc2bow(text) for text in texts]

In [19]:
# Train the LDA topic model on the bag-of-words corpus and persist it to the
# 'lda_35' file.
# A fixed random_state makes training repeatable from a fresh kernel; without
# it every run produces a different model.
lda_fake = LdaModel(
    corpus=corpus_fake,
    id2word=dictionary,
    num_topics=35,
    chunksize=1500,
    iterations=200,
    alpha='auto',
    random_state=42,
)
lda_fake.save('lda_35')

In [3]:
# Reload the model from the 'lda_35' file written by the training cell,
# presumably so the cells below can run without retraining.
lda_fake = LdaModel.load('lda_35')

## Basic Dendrogram

In [4]:
from gensim.matutils import jenson_shannon

# Matrix taken from the model state; the dendrograms below treat each row as
# one observation to cluster and label the rows as topics 1..35.
# NOTE(review): presumably these are unnormalised topic-word weights - confirm
# against gensim's LdaState.get_lambda.
topic_dist = lda_fake.state.get_lambda()

def js_dist(X):
    """Condensed pairwise distance vector over the rows of `X`.

    Each pair of rows is compared with `jenson_shannon`; the result is whatever
    `scipy.spatial.distance.pdist` returns for a callable metric.
    """
    # The metric is already a two-argument callable, so it is passed straight in.
    return pdist(X, metric=jenson_shannon)

# Cluster the rows of `topic_dist` with the Jensen-Shannon based `js_dist`.
# Labels are 1-based topic numbers derived from the number of rows, so the
# cell keeps working if the model is retrained with a different topic count.
dendro = FF.create_dendrogram(
    topic_dist,
    distfun=js_dist,
    labels=list(range(1, len(topic_dist) + 1)),
)
dendro['layout'].update({'width': 800, 'height': 500})
py.iplot(dendro)

## Dendrogram with a Heatmap

In [5]:
# Initialize figure by creating upper dendrogram.
# Labels are 1-based topic numbers derived from the number of rows in
# `topic_dist` instead of a hard-coded 1..35 range.
figure = FF.create_dendrogram(
    topic_dist,
    orientation='bottom',
    distfun=js_dist,
    labels=list(range(1, len(topic_dist) + 1)),
)

# Draw every dendrogram trace against the secondary y axis; the primary y axis
# is used by the heat map added later.
for trace in figure['data']:
    trace['yaxis'] = 'y2'

In [6]:
# Distance matrix of the model against itself, plus the per-cell annotations
# returned alongside it.
mdiff, annotation = lda_fake.diff(lda_fake, distance="jenson_shannon", normed=False)

# get reordered topic list: the x-axis tick labels are the 1-based topic
# numbers in dendrogram leaf order. Convert element by element so this works
# whether plotly hands back a numpy array, a list or a tuple (subtracting 1
# from a list/tuple directly raises TypeError).
dendro_leaves = [int(label) - 1 for label in figure['layout']['xaxis']['ticktext']]

# reorder distance matrix: rows and columns both follow the leaf order
heat_data = mdiff[np.ix_(dendro_leaves, dendro_leaves)]

In [8]:
def _hover_text(cell):
    """Format one annotation cell (a pair of token lists) as plotly hover HTML."""
    int_tokens, diff_tokens = cell
    return "+++ {}<br>--- {}".format(", ".join(int_tokens), ", ".join(diff_tokens))


# `heat_data` rows and columns follow the dendrogram leaf order, so the hover
# text has to be gathered in that same order. Iterating `annotation` directly
# would attach each annotation to the wrong heat-map cell.
leaf_order = [int(leaf) for leaf in dendro_leaves]  # 0-based topic indices
annotation_html = [[_hover_text(annotation[row][col]) for col in leaf_order]
                   for row in leaf_order]

# NOTE(review): the colorscale name is kept verbatim; newer plotly releases
# spell this scale 'YlGnBu' - confirm against the installed version.
heatmap = go.Data([
    go.Heatmap(
        z=heat_data,
        colorscale='YIGnBu',
        text=annotation_html
    )
])

# Place the heat-map cells on the same coordinates as the dendrogram leaves.
tick_positions = figure['layout']['xaxis']['tickvals']
heatmap[0]['x'] = tick_positions
heatmap[0]['y'] = tick_positions

# Add Heatmap Data to Figure
figure['data'].extend(heatmap)

# 1-based topic numbers for the tick labels. A separate name is used so that
# `dendro_leaves` stays 0-based and this cell can be re-run without shifting
# the labels again.
leaf_labels = [leaf + 1 for leaf in leaf_order]

# Settings shared by all four axes: no mirror line, grid, axis line, zero line
# or tick marks.
bare_axis = {'mirror': False,
             'showgrid': False,
             'showline': False,
             'zeroline': False,
             'ticks': ""}

# Axes that carry the topic labels at the leaf positions.
labelled_axis = dict(bare_axis,
                     showticklabels=True,
                     tickmode="array",
                     ticktext=leaf_labels,
                     tickvals=tick_positions)

# Edit Layout
figure['layout'].update({'width': 800, 'height': 800,
                         'showlegend': False, 'hovermode': 'closest'})

# Edit xaxis
figure['layout']['xaxis'].update(dict(labelled_axis, domain=[.25, 1]))

# Edit xaxis2
figure['layout'].update({'xaxis2': dict(bare_axis, domain=[0, .15], showticklabels=False)})

# Edit yaxis
figure['layout']['yaxis'].update(dict(labelled_axis, domain=[0, 0.75]))

# Edit yaxis2 (the strip above the heat map that holds the dendrogram)
figure['layout'].update({'yaxis2': dict(bare_axis, domain=[0.75, 1], showticklabels=False)})

py.iplot(figure)

In [10]:
import pyLDAvis as ldavis
import pyLDAvis.gensim
# Switch pyLDAvis to its notebook display mode.
ldavis.enable_notebook()

# Build the pyLDAvis data for the trained model from the same corpus and
# dictionary used for training.
# NOTE(review): sort_topics=False presumably keeps pyLDAvis from renumbering
# the topics, so its numbers line up with the plots above - confirm.
viz = ldavis.gensim.prepare(
    lda_fake,
    corpus_fake,
    dictionary,
    sort_topics=False,
)
viz



.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix



In [9]:
def plot_difference(mdiff, title="", annotation=None):
    """Plot a topic-difference matrix as an interactive plotly heat map.

    Parameters
    ----------
    mdiff : 2-D matrix of values, passed to the heat map as `z`.
    title : figure title.
    annotation : optional nested sequence with one entry per cell; each entry
        is a pair of token lists. When given, every cell gets the hover text
        "+++ <first list><br>--- <second list>".

    Returns None; the figure is displayed through `py.iplot`.
    """
    def _cell_text(cell):
        int_tokens, diff_tokens = cell
        return "+++ {}<br>--- {}".format(", ".join(int_tokens), ", ".join(diff_tokens))

    if annotation is None:
        hover_text = None
    else:
        hover_text = [[_cell_text(cell) for cell in row] for row in annotation]

    heatmap = go.Heatmap(z=mdiff, colorscale='YIGnBu', text=hover_text)
    layout = go.Layout(
        width=950,
        height=950,
        title=title,
        xaxis=dict(title="topic"),
        yaxis=dict(title="topic"),
    )
    py.iplot(dict(data=[heatmap], layout=layout))

In [10]:
# Compare the model with itself, asking `diff` for annotations built with
# num_words=50, then show the result with the helper defined above.
mdiff, annotation = lda_fake.diff(
    lda_fake,
    distance='jenson_shannon',
    num_words=50,
)
plot_difference(
    mdiff,
    title="Topic difference (one model) [jenson_shannon distance]",
    annotation=annotation,
)