In [14]:
import pandas as pd
import numpy as np
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as FF
py.init_notebook_mode()

import gensim
from gensim.models import LdaModel
from gensim.corpora import Dictionary, MmCorpus

# Load the raw dataset from the working directory.
df_fake = pd.read_csv('fake.csv')
# Column preview; not rendered because it is not the last expression of the cell.
df_fake[['title', 'text', 'language']].head()
# Keep only rows that have a text body and are labelled as English.
has_text = df_fake['text'].notnull()
is_english = df_fake['language'] == 'english'
df_fake = df_fake.loc[has_text & is_english]
df_fake.shape

(12357, 20)

In [2]:
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation
import gensim
import re
from gensim.models import LdaModel
from gensim.corpora import Dictionary, MmCorpus

# remove stopwords and punctuations
def preprocess(row):
    """Lower-case a document, then remove stopwords and strip punctuation.

    Args:
        row: the document text; must provide ``.lower()`` (a string).

    Returns:
        The result of ``strip_punctuation`` applied to the stopword-free,
        lower-cased text.
    """
    lowered = row.lower()
    without_stopwords = remove_stopwords(lowered)
    return strip_punctuation(without_stopwords)
    
# Strip stopwords and punctuation from every article body (see `preprocess`).
df_fake['text'] = df_fake['text'].apply(preprocess)

# Convert data to required input format by LDA: one list of word tokens per document.
# NOTE: re.LOCALE is intentionally not passed. On Python 3.6+ combining it with a
# str pattern raises ValueError; re.UNICODE alone gives Unicode-aware \w matching
# (and is still required for that on Python 2).
word_pattern = re.compile(r'\w+', flags=re.UNICODE)
texts = [word_pattern.findall(line.lower()) for line in df_fake.text]

# Build the token <-> id mapping, then prune it with filter_extremes
# (no_below=3 documents, no_above=50% of documents).
vocabulary = Dictionary(texts)
vocabulary.filter_extremes(no_below=3, no_above=0.5)

In [3]:
# Number of distinct tokens remaining in the vocabulary after filter_extremes.
len(vocabulary)

42038

In [4]:
# Stream the bag-of-words vectors to disk in Matrix Market format, one
# document at a time, and persist the vocabulary alongside the corpus.
MmCorpus.serialize(
    './fake_news.mm',
    (vocabulary.doc2bow(tokens) for tokens in texts),
)
vocabulary.save('./fake_news.vocab')

# The tokenised documents are no longer needed once serialised; drop them
# and work from the on-disk corpus instead.
del texts
corpus_fake = MmCorpus('./fake_news.mm')

In [38]:
# LDA training is stochastic: without a fixed seed every kernel restart yields
# a different model, so downstream plots cannot be reproduced.
RANDOM_SEED = 42

# Train a 35-topic LDA model on the serialised corpus.
lda_fake = LdaModel(
    corpus=corpus_fake,
    id2word=vocabulary,
    num_topics=35,
    chunksize=1500,
    iterations=200,
    alpha='auto',
    random_state=RANDOM_SEED,
)

In [7]:
# Optional: uncomment to reload a previously saved model instead of retraining.
# lda_fake = LdaModel.load('fake_news_35.lda')

In [41]:
import pyLDAvis as ldavis
import pyLDAvis.gensim

# Render pyLDAvis output inline in the notebook.
ldavis.enable_notebook()

# Build the visualisation data from the trained model, corpus and vocabulary;
# sort_topics=False is passed so pyLDAvis does not reorder the topics.
# NOTE(review): newer pyLDAvis releases may expose this module under a
# different name (gensim_models) -- confirm against the installed version.
prepared_data = ldavis.gensim.prepare(
    lda_fake,
    corpus_fake,
    vocabulary,
    sort_topics=False,
)
prepared_data



.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix



In [15]:
def plot_difference(mdiff, title="", annotation=None):
    """
    Draw a topic-vs-topic difference matrix as an interactive Plotly heatmap.

    Args:
        mdiff: matrix of values used as the heatmap's ``z`` data.
        title: figure title (empty by default).
        annotation: optional nested sequence with one row per heatmap row;
            each cell is a pair of token sequences. The first is shown after
            "+++" and the second after "---" in the hover text (presumably
            the shared and differing tokens produced by ``LdaModel.diff``).

    Returns:
        None. The figure is displayed via ``py.iplot``.
    """
    if annotation is None:
        hover_text = None
    else:
        hover_text = []
        for row in annotation:
            row_cells = []
            for int_tokens, diff_tokens in row:
                shared = ", ".join(int_tokens)
                differing = ", ".join(diff_tokens)
                row_cells.append("+++ {}<br>--- {}".format(shared, differing))
            hover_text.append(row_cells)

    heatmap = go.Heatmap(z=mdiff, colorscale='RdBu', text=hover_text)
    figure_layout = go.Layout(
        width=950,
        height=950,
        title=title,
        xaxis=dict(title="topic"),
        yaxis=dict(title="topic"),
    )
    py.iplot(dict(data=[heatmap], layout=figure_layout))

In [16]:
# Compare the model against itself: `diff` yields the topic-by-topic distance
# matrix plus per-cell token annotations, which plot_difference renders as a
# heatmap with hover text.
# NOTE(review): the distance name is spelled 'jenson_shannon' here, while
# current gensim documentation lists 'jensen_shannon' -- confirm which spelling
# the installed gensim version accepts.
mdiff, annotation = lda_fake.diff(lda_fake, distance='jenson_shannon', num_words=50)
plot_difference(mdiff, title="Topic difference [jenson_shannon distance]", annotation=annotation)