# Part 3: topic modelling

In [2]:
# -*- coding: utf-8 -*-
# goto https://github.com/n-lo/Tweets_analysis_tryout for other parts
%matplotlib inline

from __future__ import print_function
import pprint, datetime
import pandas as pd
import numpy as np
import gensim, re
import pyLDAvis.gensim
from dateutil import parser
from ggplot import *

import matplotlib.pylab as pylab
pylab.rcParams['figure.figsize'] = 14, 10

import logging
# Show INFO-level (and higher) log records, each prefixed with a timestamp and its level.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# e.g. logging.info("%i tweets found..." % len(df))

# Shared pretty-printer (4-space indent) used to display nested results in later cells.
pp = pprint.PrettyPrinter(indent=4)

### This is part three of data exploration, see part one for quick data look and sentiment analysis, part two for word cloud.

## Load in data

Dataset contains Tweets between 09-08-2016 and 23-08-2016, with sentiment score from part one sentiment analysis.  

Load dataframe from hdf5 file (see db2df.ipynb for loading data from MongoDB to dataframe)

In [3]:
print("Load df from hdf5 file.")
# Open the store read-only inside a context manager: the file handle is
# released even if reading the 'df' key fails, and a missing file raises an
# error instead of being silently created (the default mode is append).
with pd.HDFStore('onp_sentiment.h5', mode='r') as hdf:
    df = hdf['df']
print("%d Tweets loaded." % len(df))

Load df from hdf5 file.
9486 Tweets loaded.


#### Note:
When the data set gets too big, try working on a randomly selected sample instead:  

from sklearn.cross_validation import train_test_split  
import random  
random.seed(1234)  
train, test = train_test_split(df, test_size = 0.5, random_state = 1234)  
del(df)  
df = train  

# Topic modelling

Try out some topic modelling and see what the tweets are about.

### 1. Tokenize tweets text

In [4]:
# Base stop-word collection taken from gensim, copied into a plain set.
sw = set(gensim.parsing.preprocessing.STOPWORDS)
# Extra tokens to exclude on top of the gensim list.
# NOTE(review): split_wd tokenizes with lower=True before the stop-word check,
# so the upper-case 'RT' entry cannot match as written - confirm whether 'rt'
# was intended (tokens of three characters or fewer are dropped there anyway).
my_sw = ['http', 'https', 'RT']

In [5]:
def remove_junk(text):
    """Return ``text`` lower-cased with URLs and '@' characters blanked out.

    Every run of non-whitespace characters starting with "http" and every run
    of one or more '@' characters is replaced by a single space. Whitespace is
    not collapsed afterwards, so the result may contain consecutive spaces.
    """
    lowered = text.lower()
    without_urls = re.sub(r"http\S+", " ", lowered)
    return re.sub(r"@+", " ", without_urls)

In [6]:
def split_wd(text, stopwords=sw):
    """Tokenize ``text`` and drop stop words and short tokens.

    Parameters
    ----------
    text : str
        Text to tokenize.
    stopwords : iterable of str, optional
        Base stop-word collection. Defaults to the module-level ``sw``;
        ``None`` also falls back to ``sw``. The entries of the module-level
        ``my_sw`` list are always excluded in addition.

    Returns
    -------
    list of str
        Lower-cased tokens longer than three characters that are not in the
        combined stop-word set.
    """
    # Honour the caller-supplied collection; previously the argument was
    # ignored and the global ``sw`` was always used.
    base = sw if stopwords is None else stopwords
    excluded = set(base) | set(my_sw)
    return [wd
            for wd in gensim.utils.tokenize(text, lower=True)
            if wd not in excluded and len(wd) > 3]

In [7]:
# Clean and tokenize every tweet: words_onp holds one token list per row of
# df, in row order. Iterating the 'text' column directly avoids constructing
# a Series for every row, which is what iterrows() does.
words_onp = [split_wd(remove_junk(text), sw) for text in df['text']]

### 2. Vectorize the word tokens

#### First assign IDs to the tokens

In [8]:
# Build the token <-> integer id mapping from the tokenized tweets.
dictionary = gensim.corpora.Dictionary(words_onp)

#### Remove tokens that appear in only one tweet, and save the dictionary to disk.

In [9]:
# Collect the ids of tokens whose document frequency is exactly one, i.e.
# tokens that occur in a single tweet. dict.items() exists on both Python 2
# and Python 3, whereas iteritems() exists only on Python 2.
once = [token_id for token_id, docfreq in dictionary.dfs.items() if docfreq == 1]
dictionary.filter_tokens(once)
dictionary.compactify()
dictionary.save('models/onp_tm.dict')

#### Now convert to vectors and save to disk

In [10]:
# One bag-of-words vector per tokenized tweet, in the same order as words_onp.
corpus = list(map(dictionary.doc2bow, words_onp))
gensim.corpora.MmCorpus.serialize('models/onp_tm.mm', corpus)

### 3. Prepare the model

#### Initialize a model, and save to disk

Do not modify the model variable `tfidf` from now on.  
tfidf - Term Frequency * Inverse Document Frequency

In [11]:
# Initialize the TF-IDF model from the bag-of-words corpus and save it to disk.
tfidf = gensim.models.TfidfModel(corpus)
tfidf.save('models/onp_tm.tfidf')

#### Put a model wrapper around the corpus, and save to disk

In [12]:
# Wrap the bag-of-words corpus in the TF-IDF model and serialize the result to disk.
corpus_tfidf = tfidf[corpus]
gensim.corpora.MmCorpus.serialize('models/onp_tm.mm_corpus_tfidf', corpus_tfidf)
# Uncomment to inspect the weighted vectors:
#for doc in corpus_tfidf:
#    print(doc)

#### Set number of topics to model

In [13]:
# Passed as num_topics when the LDA model is constructed below.
n_topics = 5

### 4. Use Latent Dirichlet Allocation (LDA) model

#### Two methods:
#### A) with online LDA

In [14]:
# Online LDA over the TF-IDF corpus: update_every=1 with a chunksize of 10000
# and 5 passes over the corpus.
# random_state fixes the random initialisation so that the topics can be
# reproduced after a kernel restart; without it every run yields different topics.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus_tfidf,
    id2word=dictionary,
    num_topics=n_topics,
    update_every=1,
    chunksize=10000,
    passes=5,
    random_state=1234,
)

#### B) or batch LDA, no online updates   

lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=n_topics, update_every=0, passes=20)

#### Now save the model to disk

In [15]:
# Persist the trained LDA model so it can be reloaded without retraining.
lda_model.save('models/onp_tm.lda')

#### Print out the five highest-weighted words in each of the topics

In [16]:
# For every topic, collect its five top (word, weight) pairs, then pretty-print
# the nested list.
top_words = []
for topic_id in range(lda_model.num_topics):
    top_words.append(list(lda_model.show_topic(topic_id, topn=5)))
pp.pprint(top_words)

[   [   (u'hanson', 0.011922363650115526),
        (u'qanda', 0.011473320802754684),
        (u'pauline', 0.011094896871790499),
        (u'nation', 0.011076265871551041),
        (u'malcolm', 0.0090486029170585795)],
    [   (u'rally', 0.03398625728474846),
        (u'nauru', 0.033271345338744721),
        (u'wilsonsecurity', 0.03071066808112409),
        (u'kon__k', 0.030667158488315614),
        (u'supremacist', 0.030660671536618792)],
    [   (u'squat', 0.018315980112206094),
        (u'toilets', 0.017023791782655385),
        (u'right', 0.013399973432148084),
        (u'offices', 0.012437345439820485),
        (u'melbourne', 0.012077186646590509)],
    [   (u'dressed', 0.033171004538378561),
        (u'storm', 0.032060450942781114),
        (u'muslims', 0.031311538389819524),
        (u'supporters', 0.028187059744568022),
        (u'church', 0.027219690450427467)],
    [   (u'skull', 0.02130379536236169),
        (u'nazi', 0.021189106653404313),
        (u'ross', 0.021095704984894

### Visualise the model with pyLDAvis

To load a previously saved model from disk:  

dictionary = gensim.corpora.Dictionary.load('models/onp_tm.dict')  
corpus_tfidf = gensim.corpora.MmCorpus('models/onp_tm.mm_corpus_tfidf')  
lda_model = gensim.models.ldamodel.LdaModel.load('models/onp_tm.lda')

In [18]:
# Prepare the pyLDAvis data from the trained model, the corpus it was trained
# on and the matching dictionary, then display the visualisation in the notebook.
vis_data = pyLDAvis.gensim.prepare(lda_model, corpus_tfidf, dictionary)
pyLDAvis.display(vis_data)

Each circle on the left represents a topic, while the panel on the right shows the frequent keywords within the highlighted/selected topic.  
The lambda slider adjusts the relevance ranking of terms within a topic, which weighs a term's frequency within the topic against its overall frequency in the data set. One study found 0.6 to be the optimal value for correctly identifying a topic; however, this can be data-dependent.  

See Sievert & Shirley (2014) for details (http://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf).







### Modelling with sentiment taken into account

### (A) Positive sentiment

In [19]:
# Clean and tokenize only the tweets whose 'sentiment_afinn' score is above
# zero. Selecting the rows with a boolean mask keeps the original row order
# and avoids the per-row Series construction of iterrows().
words_onp_pos = [split_wd(remove_junk(text), sw)
                 for text in df.loc[df['sentiment_afinn'] > 0, 'text']]

#### Vectorize the word tokens

In [20]:
# first assign IDs to the tokens
dictionary_pos = gensim.corpora.Dictionary(words_onp_pos)
# remove tokens that appear in only one tweet (document frequency of 1);
# dict.items() exists on both Python 2 and 3, iteritems() only on Python 2
once_pos = [token_id for token_id, docfreq in dictionary_pos.dfs.items() if docfreq == 1]
dictionary_pos.filter_tokens(once_pos)
dictionary_pos.compactify()
dictionary_pos.save('models/onp_pos_tm.dict')

# now convert each tokenized tweet to a bag-of-words vector and save the corpus
corpus_pos = [dictionary_pos.doc2bow(t) for t in words_onp_pos]
gensim.corpora.MmCorpus.serialize('models/onp_pos_tm.mm', corpus_pos)

#### Prepare the model

Initialize a model; do not modify the variable `tfidf_pos` from now on.  
tfidf - Term Frequency * Inverse Document Frequency

In [21]:
# Initialize the TF-IDF model from the positive-tweet corpus and save it to disk.
tfidf_pos = gensim.models.TfidfModel(corpus_pos)
tfidf_pos.save('models/onp_pos_tm.tfidf')

# Put a model wrapper around the corpus and serialize the weighted corpus to disk.
corpus_tfidf_pos = tfidf_pos[corpus_pos]
gensim.corpora.MmCorpus.serialize('models/onp_pos_tm.mm_corpus_tfidf', corpus_tfidf_pos)
# Uncomment to inspect the weighted vectors:
#for doc in corpus_tfidf_pos:
#    print(doc)

In [34]:
# Number of topics to model; passed as num_topics to the positive-sentiment LDA model below.
n_topics = 5

#### Use Latent Dirichlet Allocation (LDA) model

#### 1. with online LDA

In [35]:
# Online LDA over the positive-tweet TF-IDF corpus: update_every=1 with a
# chunksize of 10000 and 5 passes over the corpus.
# random_state fixes the random initialisation so that the topics can be
# reproduced after a kernel restart; without it every run yields different topics.
lda_model_pos = gensim.models.ldamodel.LdaModel(
    corpus=corpus_tfidf_pos,
    id2word=dictionary_pos,
    num_topics=n_topics,
    update_every=1,
    chunksize=10000,
    passes=5,
    random_state=1234,
)

#### 2. or batch LDA, no online updates   

lda_model_pos = gensim.models.ldamodel.LdaModel(corpus=corpus_tfidf_pos, id2word=dictionary_pos, num_topics=n_topics, update_every=0, passes=20)

#### Now save the model

In [36]:
# Persist the trained positive-sentiment LDA model so it can be reloaded without retraining.
lda_model_pos.save('models/onp_pos_tm.lda')

#### Visualise the model with pyLDAvis

To load a previously saved model from disk:  

dictionary_pos = gensim.corpora.Dictionary.load('models/onp_pos_tm.dict')  
corpus_tfidf_pos = gensim.corpora.MmCorpus('models/onp_pos_tm.mm_corpus_tfidf')  
lda_model_pos = gensim.models.ldamodel.LdaModel.load('models/onp_pos_tm.lda')

In [37]:
# Prepare the pyLDAvis data from the positive-sentiment model, its corpus and
# its dictionary, then display the visualisation in the notebook.
vis_data_pos = pyLDAvis.gensim.prepare(lda_model_pos, corpus_tfidf_pos, dictionary_pos)
pyLDAvis.display(vis_data_pos)

### (B) Negative sentiment

In [38]:
# Clean and tokenize only the tweets whose 'sentiment_afinn' score is below
# zero. Selecting the rows with a boolean mask keeps the original row order
# and avoids the per-row Series construction of iterrows().
words_onp_neg = [split_wd(remove_junk(text), sw)
                 for text in df.loc[df['sentiment_afinn'] < 0, 'text']]

#### Vectorize the word tokens

In [39]:
# first assign IDs to the tokens
dictionary_neg = gensim.corpora.Dictionary(words_onp_neg)
# remove tokens that appear in only one tweet (document frequency of 1);
# dict.items() exists on both Python 2 and 3, iteritems() only on Python 2
once_neg = [token_id for token_id, docfreq in dictionary_neg.dfs.items() if docfreq == 1]
dictionary_neg.filter_tokens(once_neg)
dictionary_neg.compactify()
dictionary_neg.save('models/onp_neg_tm.dict')

# now convert each tokenized tweet to a bag-of-words vector and save the corpus
corpus_neg = [dictionary_neg.doc2bow(t) for t in words_onp_neg]
gensim.corpora.MmCorpus.serialize('models/onp_neg_tm.mm', corpus_neg)

#### Prepare the model

Initialize a model; do not modify the variable `tfidf_neg` from now on.  
tfidf - Term Frequency * Inverse Document Frequency

In [40]:
# Initialize the TF-IDF model from the negative-tweet corpus and save it to disk.
tfidf_neg = gensim.models.TfidfModel(corpus_neg)
tfidf_neg.save('models/onp_neg_tm.tfidf')

# Put a model wrapper around the corpus and serialize the weighted corpus to disk.
corpus_tfidf_neg = tfidf_neg[corpus_neg]
gensim.corpora.MmCorpus.serialize('models/onp_neg_tm.mm_corpus_tfidf', corpus_tfidf_neg)
# Uncomment to inspect the weighted vectors:
#for doc in corpus_tfidf_neg:
#    print(doc)

In [41]:
# Number of topics to model; passed as num_topics to the negative-sentiment LDA model below.
n_topics = 5

#### Use Latent Dirichlet Allocation (LDA) model

#### 1. with online LDA

In [42]:
# Online LDA over the negative-tweet TF-IDF corpus: update_every=1 with a
# chunksize of 10000 and 5 passes over the corpus.
# random_state fixes the random initialisation so that the topics can be
# reproduced after a kernel restart; without it every run yields different topics.
lda_model_neg = gensim.models.ldamodel.LdaModel(
    corpus=corpus_tfidf_neg,
    id2word=dictionary_neg,
    num_topics=n_topics,
    update_every=1,
    chunksize=10000,
    passes=5,
    random_state=1234,
)

#### 2. or batch LDA, no online updates   

lda_model_neg = gensim.models.ldamodel.LdaModel(corpus=corpus_tfidf_neg, id2word=dictionary_neg, num_topics=n_topics, update_every=0, passes=20)

#### Now save the model

In [43]:
# Persist the trained negative-sentiment LDA model so it can be reloaded without retraining.
lda_model_neg.save('models/onp_neg_tm.lda')
# Uncomment to print the top five words of every topic:
#top_words = [[word for word in lda_model_neg.show_topic(topicno, topn=5)] 
#             for topicno in range(lda_model_neg.num_topics)]
#print(top_words)

#### Visualise the model with pyLDAvis

To load a previously saved model from disk:  

dictionary_neg = gensim.corpora.Dictionary.load('models/onp_neg_tm.dict')  
corpus_tfidf_neg = gensim.corpora.MmCorpus('models/onp_neg_tm.mm_corpus_tfidf')  
lda_model_neg = gensim.models.ldamodel.LdaModel.load('models/onp_neg_tm.lda')

In [44]:
# Prepare the pyLDAvis data from the negative-sentiment model, its corpus and
# its dictionary, then display the visualisation in the notebook.
vis_data_neg = pyLDAvis.gensim.prepare(lda_model_neg, corpus_tfidf_neg, dictionary_neg)
pyLDAvis.display(vis_data_neg)

## Similar words with word2vec

In [45]:
# Train word2vec on the token lists of all tweets, then show the words most
# similar to "nation".
m = gensim.models.Word2Vec(
    words_onp,
    size=100,
    window=5,
    min_count=3,
    workers=4,
)
pp.pprint(m.most_similar("nation"))

[   (u'senator', 0.9957336783409119),
    (u'malcolm', 0.9939283132553101),
    (u'profbriancox', 0.9926587343215942),
    (u'roberts', 0.9921092391014099),
    (u'elect', 0.9916069507598877),
    (u'takes', 0.9908980131149292),
    (u'readfearn', 0.9842835068702698),
    (u'abctv', 0.9841217398643494),
    (u'denied', 0.9823267459869385),
    (u'scientist', 0.981223464012146)]


In [46]:
# Train word2vec on the token lists of the negative-sentiment tweets only,
# then show the words most similar to "nation".
m = gensim.models.Word2Vec(
    words_onp_neg,
    size=100,
    window=5,
    min_count=3,
    workers=4,
)
pp.pprint(m.most_similar("nation"))

[   (u'malcolm', 0.9970388412475586),
    (u'senator', 0.9967544674873352),
    (u'roberts', 0.9961132407188416),
    (u'denier', 0.9943068027496338),
    (u'icymi', 0.9939748048782349),
    (u'denied', 0.993387758731842),
    (u'scaa', 0.9931299090385437),
    (u'abctv', 0.9930740594863892),
    (u'mocks', 0.9928045868873596),
    (u'discredited', 0.9915701746940613)]


In [47]:
# Train word2vec on the token lists of the positive-sentiment tweets only,
# then show the words most similar to "nation".
m = gensim.models.Word2Vec(
    words_onp_pos,
    size=100,
    window=5,
    min_count=3,
    workers=4,
)
pp.pprint(m.most_similar("nation"))

[   (u'qanda', 0.9998025894165039),
    (u'roberts', 0.9997572898864746),
    (u'malcolm', 0.99972003698349),
    (u'believe', 0.9997132420539856),
    (u'senator', 0.9997057914733887),
    (u'profbriancox', 0.9996393918991089),
    (u'scientist', 0.9996350407600403),
    (u'years', 0.9996035099029541),
    (u'esteemed', 0.9995802640914917),
    (u'world', 0.9995610117912292)]


## Todo:  

In the next part of topic modelling, try grouping frequent consecutive words into phrases.