# Part 3: topic modelling

In [None]:
# -*- coding: utf-8 -*-
%matplotlib inline

from __future__ import print_function
import pprint, datetime
import pandas as pd
import numpy as np
import gensim, re
import pyLDAvis.gensim
from dateutil import parser
from ggplot import *

import matplotlib.pylab as pylab
# Default size for every figure drawn in this notebook.
pylab.rcParams['figure.figsize'] = 14, 10

import logging
# Emit timestamped log records at INFO level and above.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# e.g. logging.info("%i tweets found..." % len(df))

# Shared pretty-printer used to display nested lists (topic words, similar words) below.
pp = pprint.PrettyPrinter(indent=4)

### This is part three of the data exploration; see part one for a quick look at the data and the sentiment analysis, and part two for the word cloud.

## Load in data

Dataset contains Tweets between 09-08-2016 and 22-08-2016, with sentiment score from part one sentiment analysis.  

Load dataframe from hdf5 file (see db2df.ipynb for loading data from MongoDB to dataframe)

In [2]:
print("Load df from hdf5 file.")
# Open the store read-only inside a context manager:
#  - mode='r' fails immediately if the file is missing instead of silently
#    creating an empty store (the default append mode does that), and it can
#    never modify the input data;
#  - the `with` block closes the file handle even if the 'df' key lookup raises.
with pd.HDFStore('onp_sentiment.h5', mode='r') as hdf:
    df = hdf['df']
print("%d Tweets loaded." % len(df))

Load df from hdf5 file.
8963 Tweets loaded.


#### Note:
When the data set gets too big, try working on a random sample of it instead (pass `random_state` so the split is reproducible):  

from sklearn.model_selection import train_test_split  
train, test = train_test_split(df, test_size=0.5, random_state=1234)  
del(df)  
df = train  

# Topic modelling

Try out some topic modelling and see what the tweets are about.

### 1. Tokenize tweets text

In [3]:
# gensim's built-in stopword list, copied into a mutable set.
sw = set(gensim.parsing.preprocessing.STOPWORDS)
# Extra Twitter-specific noise tokens. They must be lowercase: split_wd() compares
# them against lowercased tokens, so the previous uppercase 'RT' could never match.
my_sw = ['http', 'https', 'rt']

In [4]:
def remove_junk(text):
    """Lower-case a tweet and blank out URLs and '@' characters.

    Every run of non-whitespace characters starting with "http" is replaced by
    a single space, then every run of '@' characters is replaced by a single
    space (the user name that follows the '@' is kept).

    Args:
        text: the raw tweet text (a string).

    Returns:
        The cleaned, lower-cased string.
    """
    cleaned = text.lower()
    # Order matters only for readability here: URLs first, then '@' markers.
    for pattern in (r"http\S+", r"@+"):
        cleaned = re.sub(pattern, " ", cleaned)
    return cleaned

In [5]:
def split_wd(text, stopwords=sw):
    """Tokenize ``text`` and drop stopwords and short tokens.

    Args:
        text: the (already cleaned) tweet text.
        stopwords: iterable of words to exclude; defaults to the notebook-wide
            ``sw`` set. The extra words in ``my_sw`` are always excluded too.

    Returns:
        A list of lower-cased tokens that are longer than 3 characters and
        are not in the exclusion set.
    """
    # Honour the caller-supplied stopword list. The previous version ignored the
    # `stopwords` argument and always read the global `sw`.
    excluded = set(stopwords) | set(my_sw)
    return [wd
            for wd in gensim.utils.tokenize(text, lower=True)
            if len(wd) > 3 and wd not in excluded]

In [6]:
# Clean and tokenize every tweet: one token list per tweet, in DataFrame order.
# Iterating the 'text' column directly avoids building a Series per row, which is
# what df.iterrows() does, and only that one column is needed here.
words_onp = [split_wd(remove_junk(text), sw) for text in df['text']]

### 2. Vectorize the word tokens

#### First assign IDs to the tokens

In [7]:
# Build the token -> integer id mapping from the tokenized tweets.
dictionary = gensim.corpora.Dictionary(words_onp)

#### Remove tokens that appear only once, and save the dictionary to disk.

In [8]:
# Ids of tokens whose document frequency is exactly 1.
# `.items()` replaces the Python-2-only `.iteritems()` so the cell runs unchanged
# on both Python 2 and Python 3.
once = [token_id for token_id, docfreq in dictionary.dfs.items() if docfreq == 1]
dictionary.filter_tokens(once)
dictionary.compactify()  # re-number the ids after the removal
dictionary.save('onp_tm.dict')
#print(dictionary)

#### Now convert to vectors and save to disk

In [9]:
# Turn each tweet's token list into a bag-of-words vector, then write the
# resulting corpus to disk.
corpus = list(map(dictionary.doc2bow, words_onp))
gensim.corpora.MmCorpus.serialize('onp_tm.mm', corpus)
#print(corpus)

### 3. Prepare the model

#### Initialize a model, and save to disk

Do not modify the model variable 'tfidf' from now on.  
tfidf - Term Frequency * Inverse Document Frequency

In [10]:
# Fit the TF-IDF model on the bag-of-words corpus and save it to disk.
tfidf = gensim.models.TfidfModel(corpus)
tfidf.save('onp_tm.tfidf')

#### Put a model wrapper around the corpus, and save to disk

In [11]:
# Apply the TF-IDF model to the corpus and write the weighted corpus to disk.
corpus_tfidf = tfidf[corpus]
gensim.corpora.MmCorpus.serialize('onp_tm.mm_corpus_tfidf', corpus_tfidf)
# Uncomment to inspect the weighted documents:
#for doc in corpus_tfidf:
#    print(doc)

#### Set number of topics to model

In [12]:
# Number of topics the LDA model below is asked to find.
n_topics = 10

### 4. Use Latent Dirichlet Allocation (LDA) model

#### Two methods:
#### A) with online LDA

In [13]:
# Online LDA on the TF-IDF-weighted corpus: update_every=1 updates the model after
# each chunk of up to 10000 documents, and the corpus is traversed 5 times.
# random_state pins the random initialisation so that the topics printed and
# visualised below are reproducible after a kernel restart.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus_tfidf, id2word=dictionary,
                                            num_topics=n_topics,
                                            update_every=1, chunksize=10000, passes=5,
                                            random_state=1234)

#### B) or batch LDA, no online updates   

lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=n_topics, update_every=0, passes=20)

#### Now save the model to disk

In [14]:
# Save the trained LDA model so it can be reloaded without retraining.
lda_model.save('onp_tm.lda')

#### Print out most frequent words in each of the topics

In [15]:
# For every topic, collect the five entries returned by show_topic() and
# pretty-print the nested list.
top_words = [list(lda_model.show_topic(topic_id, topn=5))
             for topic_id in range(lda_model.num_topics)]
pp.pprint(top_words)

[   [   (u'supremacist', 0.053304646494726798),
        (u'reshare', 0.053266113776126001),
        (u'kon__k', 0.053243107811923776),
        (u'wilsonsecurity', 0.053129295175593047),
        (u'photo', 0.053013854248906984)],
    [   (u'dressed', 0.057023852908672165),
        (u'storm', 0.056066454307267297),
        (u'muslims', 0.055850832586030082),
        (u'church', 0.052677814361864729),
        (u'right', 0.052248284586305385)],
    [   (u'workmanalice', 0.014671828737342766),
        (u'good', 0.013272936422575389),
        (u'senators', 0.012644151995168215),
        (u'vote', 0.012238208926153666),
        (u'montessori', 0.011952758867763978)],
    [   (u'office', 0.021773373758906502),
        (u'parish', 0.019107986876082263),
        (u'artbylynettag', 0.019009127505596756),
        (u'offices', 0.018818586885287845),
        (u'frbower', 0.01881754713765461)],
    [   (u'aren', 0.028475197481752711),
        (u'representative', 0.028384879954764387),
        (u'mari

### Visualise the model with pyLDAvis

To load the previously saved model from disk:  

dictionary = gensim.corpora.Dictionary.load('onp_tm.dict')  
corpus_tfidf = gensim.corpora.MmCorpus('onp_tm.mm_corpus_tfidf')  
lda_model = gensim.models.ldamodel.LdaModel.load('onp_tm.lda')

In [17]:
# Build the pyLDAvis data from the model, the TF-IDF corpus and the dictionary,
# then render it inline as the cell output.
vis_data = pyLDAvis.gensim.prepare(lda_model, corpus_tfidf, dictionary)
pyLDAvis.display(vis_data)

Each circle on the left represents a topic, while the panel on the right shows the most frequent keywords within the highlighted/selected topic.  
The lambda slider adjusts the relevance ranking of terms within a topic, i.e. the weighting between a term's frequency within the topic and that frequency relative to its overall frequency in the data set. The study found 0.6 to be the optimal value for correctly identifying a topic; however, this can be data-dependent.  

See Sievert & Shirley (2014) for details (http://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf).







### Modelling with sentiment taking into account

### (A) Positive sentiment

In [18]:
# Clean and tokenize only the tweets with a positive AFINN sentiment score.
# A boolean mask on the column replaces the row-by-row df.iterrows() loop; the
# selected tweets keep their original DataFrame order.
positive_texts = df.loc[df['sentiment_afinn'] > 0, 'text']
words_onp_pos = [split_wd(remove_junk(text), sw) for text in positive_texts]

#### Vectorize the word tokens

In [19]:
# first assign IDs to the tokens
dictionary_pos = gensim.corpora.Dictionary(words_onp_pos)
# remove tokens only appear once
once_pos = [token_id for token_id, docfreq in dictionary_pos.dfs.iteritems() if docfreq == 1]
dictionary_pos.filter_tokens(once_pos)
dictionary_pos.compactify()
dictionary_pos.save('onp_pos_tm.dict')
#print(dictionary)

# now convert to vectors
corpus_pos = [dictionary_pos.doc2bow(t) for t in words_onp_pos]
gensim.corpora.MmCorpus.serialize('onp_pos_tm.mm', corpus_pos)
#print(corpus)

#### Prepare the model

Initialize a model; do not modify the variable tfidf_pos from now on.  
tfidf - Term Frequency * Inverse Document Frequency

In [20]:
# Fit the TF-IDF model on the positive-tweet corpus and save it to disk.
tfidf_pos = gensim.models.TfidfModel(corpus_pos)
tfidf_pos.save('onp_pos_tm.tfidf')

# Apply the TF-IDF model to the positive corpus and write the weighted corpus to disk.
corpus_tfidf_pos = tfidf_pos[corpus_pos]
gensim.corpora.MmCorpus.serialize('onp_pos_tm.mm_corpus_tfidf', corpus_tfidf_pos)
# Uncomment to inspect the weighted documents:
#for doc in corpus_tfidf_pos:
#    print(doc)

In [21]:
# number of topics to model
# Number of topics for the positive-sentiment LDA model (re-assigns the notebook-wide n_topics).
n_topics = 10

#### Use Latent Dirichlet Allocation (LDA) model

#### 1. with online LDA

In [22]:
# Online LDA on the positive-tweet TF-IDF corpus: update_every=1 updates the model
# after each chunk of up to 10000 documents, and the corpus is traversed 5 times.
# random_state pins the random initialisation so the resulting topics are
# reproducible after a kernel restart.
lda_model_pos = gensim.models.ldamodel.LdaModel(corpus=corpus_tfidf_pos, id2word=dictionary_pos,
                                                num_topics=n_topics,
                                                update_every=1, chunksize=10000, passes=5,
                                                random_state=1234)

#### 2. or batch LDA, no online updates   

lda_model_pos = gensim.models.ldamodel.LdaModel(corpus=corpus_tfidf_pos, id2word=dictionary_pos, num_topics=n_topics, update_every=0, passes=20)

#### Now save the model

In [23]:
# Save the positive-sentiment LDA model so it can be reloaded without retraining.
lda_model_pos.save('onp_pos_tm.lda')

#### Visualise the model with pyLDAvis

To load the previously saved model from disk:  

dictionary_pos = gensim.corpora.Dictionary.load('onp_pos_tm.dict')  
corpus_tfidf_pos = gensim.corpora.MmCorpus('onp_pos_tm.mm_corpus_tfidf')  
lda_model_pos = gensim.models.ldamodel.LdaModel.load('onp_pos_tm.lda')

In [24]:
# Build the pyLDAvis data for the positive-sentiment model and render it inline.
vis_data_pos = pyLDAvis.gensim.prepare(lda_model_pos, corpus_tfidf_pos, dictionary_pos)
pyLDAvis.display(vis_data_pos)

### (B) Negative sentiment

In [25]:
# Clean and tokenize only the tweets with a negative AFINN sentiment score.
# A boolean mask on the column replaces the row-by-row df.iterrows() loop; the
# selected tweets keep their original DataFrame order.
negative_texts = df.loc[df['sentiment_afinn'] < 0, 'text']
words_onp_neg = [split_wd(remove_junk(text), sw) for text in negative_texts]

#### Vectorize the word tokens

In [26]:
# first assign IDs to the tokens
dictionary_neg = gensim.corpora.Dictionary(words_onp_neg)
# remove tokens only appear once
once_neg = [token_id for token_id, docfreq in dictionary_neg.dfs.iteritems() if docfreq == 1]
dictionary_neg.filter_tokens(once_neg)
dictionary_neg.compactify()
dictionary_neg.save('onp_neg_tm.dict')
#print(dictionary)

# now convert to vectors
corpus_neg = [dictionary_neg.doc2bow(t) for t in words_onp_neg]
gensim.corpora.MmCorpus.serialize('onp_neg_tm.mm', corpus_neg)
#print(corpus)

#### Prepare the model

Initialize a model; do not modify the variable tfidf_neg from now on.  
tfidf - Term Frequency * Inverse Document Frequency

In [27]:
# Fit the TF-IDF model on the negative-tweet corpus and save it to disk.
tfidf_neg = gensim.models.TfidfModel(corpus_neg)
tfidf_neg.save('onp_neg_tm.tfidf')

# Apply the TF-IDF model to the negative corpus and write the weighted corpus to disk.
corpus_tfidf_neg = tfidf_neg[corpus_neg]
gensim.corpora.MmCorpus.serialize('onp_neg_tm.mm_corpus_tfidf', corpus_tfidf_neg)
# Uncomment to inspect the weighted documents:
#for doc in corpus_tfidf_neg:
#    print(doc)

In [28]:
# number of topics to model
# Number of topics for the negative-sentiment LDA model (re-assigns the notebook-wide n_topics).
n_topics = 10

#### Use Latent Dirichlet Allocation (LDA) model

#### 1. with online LDA

In [29]:
# Online LDA on the negative-tweet TF-IDF corpus: update_every=1 updates the model
# after each chunk of up to 10000 documents, and the corpus is traversed 5 times.
# random_state pins the random initialisation so the resulting topics are
# reproducible after a kernel restart.
lda_model_neg = gensim.models.ldamodel.LdaModel(corpus=corpus_tfidf_neg, id2word=dictionary_neg,
                                                num_topics=n_topics,
                                                update_every=1, chunksize=10000, passes=5,
                                                random_state=1234)

#### 2. or batch LDA, no online updates   

lda_model_neg = gensim.models.ldamodel.LdaModel(corpus=corpus_tfidf_neg, id2word=dictionary_neg, num_topics=n_topics, update_every=0, passes=20)

#### Now save the model

In [30]:
# Save the negative-sentiment LDA model so it can be reloaded without retraining.
lda_model_neg.save('onp_neg_tm.lda')
# Uncomment to print the top five words of each negative-sentiment topic:
#top_words = [[word for word in lda_model_neg.show_topic(topicno, topn=5)] 
#             for topicno in range(lda_model_neg.num_topics)]
#print(top_words)

#### Visualise the model with pyLDAvis

To load the previously saved model from disk:  

dictionary_neg = gensim.corpora.Dictionary.load('onp_neg_tm.dict')  
corpus_tfidf_neg = gensim.corpora.MmCorpus('onp_neg_tm.mm_corpus_tfidf')  
lda_model_neg = gensim.models.ldamodel.LdaModel.load('onp_neg_tm.lda')

In [31]:
# Build the pyLDAvis data for the negative-sentiment model and render it inline.
vis_data_neg = pyLDAvis.gensim.prepare(lda_model_neg, corpus_tfidf_neg, dictionary_neg)
pyLDAvis.display(vis_data_neg)

## Similar words with word2vec

In [33]:
# Train a Word2Vec model on the token lists of all tweets (words_onp) and print the
# words it ranks as most similar to "nauru".
# NOTE(review): no seed is passed and workers=4, so the similarity scores will
# presumably differ between runs - confirm whether reproducibility matters here.
# NOTE(review): `size=` and `Word2Vec.most_similar` look like older-gensim spellings;
# verify against the installed gensim version before re-running.
m = gensim.models.Word2Vec(words_onp, size=100, window=5, min_count=3, workers=4)
pp.pprint(m.most_similar("nauru"))

[   (u'supremacist', 0.9810780882835388),
    (u'rally', 0.9760749936103821),
    (u'white', 0.9759616255760193),
    (u'staff', 0.959437370300293),
    (u'wilsonsecurity', 0.9543296098709106),
    (u'photo', 0.9402326345443726),
    (u'reshare', 0.9279546737670898),
    (u'guards', 0.9178486466407776),
    (u'kon__k', 0.9128305912017822),
    (u'suspended', 0.8936606645584106)]


## Todo:  

In the next part of topic modelling, try grouping frequent consecutive words into phrases.