# Part 4: topic modelling with phrases

In [2]:
# -*- coding: utf-8 -*-
# goto https://github.com/n-lo/Tweets_analysis_tryout for other parts
%matplotlib inline

from __future__ import print_function
import pprint, datetime
import pandas as pd
import numpy as np
import gensim, re
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures
import pyLDAvis.gensim
from dateutil import parser
from ggplot import *

# Default size (width, height) for every matplotlib figure in this notebook.
import matplotlib.pylab as pylab
pylab.rcParams['figure.figsize'] = 14, 10

# Configure the root logger: timestamped messages at INFO level and above.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# e.g. logging.info("%i tweets found..." % len(df))

# Shared pretty-printer for displaying nested results.
pp = pprint.PrettyPrinter(indent=4)

### This is part four of the data exploration. See part one for a quick look at the data and sentiment analysis, part two for the word cloud, and part three for topic modelling.

## Load in data

Dataset contains Tweets between 09-08-2016 and 23-08-2016, with sentiment score from part one sentiment analysis.  

Load dataframe from hdf5 file (see db2df.ipynb for loading data from MongoDB to dataframe)

In [3]:
# Read the tweets DataFrame stored under the 'df' key.
# The store is opened read-only so that a wrong path raises instead of
# silently creating an empty file, and the context manager closes the file
# handle even when the key lookup fails.
print("Load df from hdf5 file.")
with pd.HDFStore('onp_sentiment.h5', mode='r') as hdf:
    df = hdf['df']
print("%d Tweets loaded." % len(df))

Load df from hdf5 file.
9486 Tweets loaded.


#### Note:
When the data set gets too big, try working on a random sample instead:  

from sklearn.cross_validation import train_test_split  
import random  
random.seed(1234)  
train, test = train_test_split(df, test_size = 0.5, random_state = 1234)  
del(df)  
df = train  

# Topic modelling

### 1. Finding frequent phrases  

Here we use trigram association measure from NLTK.

#### Tokenize the text first  

In [4]:
# Base stop-word list shipped with gensim, copied into a mutable set.
sw = set(gensim.parsing.preprocessing.STOPWORDS)
# Extra Twitter/URL tokens to drop.  Entries are lower-case because split_wd
# lower-cases every token before the stop-word lookup, so an upper-case entry
# such as 'RT' could never match.
my_sw = ['http', 'https', 'rt']

In [5]:
def split_wd(text, stopwords=sw):
    """Tokenize `text` into lower-cased words, dropping stop words and short tokens.

    Parameters
    ----------
    text : str
        Tweet text to tokenize.
    stopwords : iterable of str, optional
        Words to discard.  Defaults to the module-level `sw` set.  The entries
        of the module-level `my_sw` list are always discarded as well.

    Returns
    -------
    list of str
        Tokens longer than three characters that are not stop words.
    """
    # Build the exclusion set from the argument, so that a caller-supplied
    # stop-word list is actually honoured rather than the global `sw`.
    excluded = set(stopwords) | set(my_sw)
    return [wd
            for wd in gensim.utils.tokenize(text, lower=True)
            if wd not in excluded and len(wd) > 3]

In [6]:
def remove_junk(text):
    """Lower-case `text` and blank out every URL-like run.

    A URL-like run is ``http`` followed by one or more non-whitespace
    characters; each run is replaced by a single space.
    """
    lowered = text.lower()
    return re.sub(r"http\S+", " ", lowered)

In [7]:
# Flatten every tweet into one long token list for collocation finding.
# Extending a single list in place avoids the quadratic cost of repeated
# list concatenation, and the name stays defined (as an empty list) when
# `df` has no rows.
words_onp_tuple = []
for text in df['text']:
    words_onp_tuple.extend(split_wd(remove_junk(text), sw))

#### Find common phrases

In [8]:
def _phrase_pattern(grams):
    """Compile an alternation regex matching any of the space-joined `grams`."""
    if not grams:
        # An empty alternation "()" would match the empty string at every
        # position; use a pattern that can never match instead.
        return re.compile(r'(?!)', re.UNICODE)
    # Escape each phrase so a token is always matched literally.
    return re.compile('(%s)' % '|'.join(re.escape(g) for g in grams), re.UNICODE)


def phrases(wd, top_no = 1000, min_freq = 30):
    """Find frequent collocations in `wd` and return regexes that match them.

    Parameters
    ----------
    wd : list of str
        Flat sequence of tokens to search for collocations.
    top_no : int, optional
        Maximum number of bigrams and of trigrams to keep.
    min_freq : int, optional
        N-grams occurring fewer than this many times are ignored.

    Returns
    -------
    tuple
        ``(pat_gram2, pat_gram3)``: compiled patterns matching any retained
        bigram and any retained trigram respectively, with the words of each
        phrase separated by a single space.
    """
    # Trigrams are ranked by chi-squared association.
    tcf = TrigramCollocationFinder.from_words(wd)
    tcf.apply_freq_filter(min_freq)
    trigrams = [' '.join(w) for w in tcf.nbest(TrigramAssocMeasures.chi_sq, top_no)]
    logging.info("%i trigrams found: %s...", len(trigrams), trigrams[:20])

    # Bigrams come from the same token stream and are ranked by PMI.
    bcf = tcf.bigram_finder()
    bcf.apply_freq_filter(min_freq)
    bigrams = [' '.join(w) for w in bcf.nbest(BigramAssocMeasures.pmi, top_no)]
    logging.info("%i bigrams found: %s...", len(bigrams), bigrams[:20])

    pat_gram2 = _phrase_pattern(bigrams)
    pat_gram3 = _phrase_pattern(trigrams)

    return pat_gram2, pat_gram3

In [9]:
# phrases() returns (bigram pattern, trigram pattern); despite the plural
# names, both variables hold compiled regular expressions, not lists.
bigrams, trigrams = phrases(words_onp_tuple)

### 2. Tokenize

Taking common phrases into account

In [10]:
# Tokenize each tweet separately.  Before tokenizing, the spaces inside every
# known phrase are replaced by underscores (trigrams first, then bigrams) so
# that a phrase is kept together as one token.
underscore_spaces = lambda match: match.group(0).replace(u' ', u'_')

words_onp = []
for raw_text in df['text']:
    cleaned = remove_junk(raw_text)
    for phrase_pattern in (trigrams, bigrams):
        cleaned = phrase_pattern.sub(underscore_spaces, cleaned)
    words_onp.append(split_wd(cleaned, sw))

### 3. Vectorize the word tokens

In [11]:
# first assign IDs to the tokens
dictionary = gensim.corpora.Dictionary(words_onp)
# Remove tokens that occur in only one document.  `.items()` is used instead
# of the Python-2-only `.iteritems()` so the cell runs on either major version.
once = [token_id for token_id, docfreq in dictionary.dfs.items() if docfreq == 1]
dictionary.filter_tokens(once)
dictionary.compactify()
dictionary.save('models/onp_tm_phrases.dict')
#print(dictionary)

# now convert each tokenized tweet to a bag-of-words vector and save the corpus
corpus = [dictionary.doc2bow(t) for t in words_onp]
gensim.corpora.MmCorpus.serialize('models/onp_tm_phrases.mm', corpus)
#print(corpus)

### 4. Prepare the model

Initialize the model. Do not modify the variable `tfidf` from now on.  
tfidf - Term Frequency * Inverse Document Frequency

In [12]:
# Fit the TF-IDF model on the full bag-of-words corpus and save it to disk.
tfidf = gensim.models.TfidfModel(corpus)
tfidf.save('models/onp_tm_phrases.tfidf')

# Apply the fitted model to the corpus and save the re-weighted corpus.
corpus_tfidf = tfidf[corpus]
gensim.corpora.MmCorpus.serialize('models/onp_tm_phrases.mm_corpus_tfidf', corpus_tfidf)
#for doc in corpus_tfidf:
#    print(doc)

In [13]:
# Number of topics requested from the LDA model trained below.
n_topics = 10

#### Use Latent Dirichlet Allocation (LDA) model

#### 1. with online LDA

In [14]:
# Online LDA on the TF-IDF-weighted corpus: the model is updated after each
# chunk of up to 10000 documents, with 5 passes over the corpus.
# A fixed `random_state` makes the fitted topics reproducible across runs.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus_tfidf, id2word=dictionary,
                                            num_topics=n_topics,
                                            update_every=1, chunksize=10000, passes=5,
                                            random_state=1234)

#### 2. or batch LDA, no online updates   

lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=n_topics, update_every=0, passes=20)

#### Now save the model

In [15]:
# Save the fitted full-corpus LDA model to disk.
lda_model.save('models/onp_tm_phrases.lda')
# Optional inspection of the top words per topic (left disabled):
#lda_model.print_topics(10)
#top_words = [[word for word in lda_model.show_topic(topicno, topn=5)] 
#             for topicno in range(lda_model.num_topics)]
#print(top_words)

#### Visualise the model with pyLDAvis

To load a previously saved model from disk:  

dictionary = gensim.corpora.Dictionary.load('models/onp_tm_phrases.dict')  
corpus_tfidf = gensim.corpora.MmCorpus('models/onp_tm_phrases.mm_corpus_tfidf')  
lda_model = gensim.models.ldamodel.LdaModel.load('models/onp_tm_phrases.lda')

In [17]:
# Prepare the pyLDAvis data for the full-corpus model and display it.
vis_data = pyLDAvis.gensim.prepare(lda_model, corpus_tfidf, dictionary)
pyLDAvis.display(vis_data)

### Modelling with sentiment taken into account

### (A) Positive sentiment

Repeat the code above, using only the rows where 'sentiment_afinn' > 0.

In [18]:
# Flatten all positive-sentiment tweets (sentiment_afinn > 0) into one token
# list for collocation finding.  Extending one list in place avoids quadratic
# list concatenation and keeps the name defined when no row qualifies.
words_onp_pos_tuple = []
for text in df.loc[df['sentiment_afinn'] > 0, 'text']:
    words_onp_pos_tuple.extend(split_wd(remove_junk(text), sw))

#### Find common phrases

In [19]:
# Rebinds the notebook-level `bigrams` / `trigrams` patterns to the ones
# derived from the positive-sentiment tokens.
bigrams, trigrams = phrases(words_onp_pos_tuple)

In [20]:
# Tokenize each positive-sentiment tweet separately, first joining the words
# of known phrases with underscores (trigrams, then bigrams).
underscore_spaces = lambda match: match.group(0).replace(u' ', u'_')

words_onp_pos = []
for _, tweet in df.iterrows():
    if not (tweet['sentiment_afinn'] > 0):
        continue
    cleaned = remove_junk(tweet['text'])
    for phrase_pattern in (trigrams, bigrams):
        cleaned = phrase_pattern.sub(underscore_spaces, cleaned)
    words_onp_pos.append(split_wd(cleaned, sw))

In [21]:
# first assign IDs to the tokens
dictionary_pos = gensim.corpora.Dictionary(words_onp_pos)
# Remove tokens that occur in only one document.  `.items()` is used instead
# of the Python-2-only `.iteritems()` so the cell runs on either major version.
once_pos = [token_id for token_id, docfreq in dictionary_pos.dfs.items() if docfreq == 1]
dictionary_pos.filter_tokens(once_pos)
dictionary_pos.compactify()
dictionary_pos.save('models/onp_tm_pos_phrases.dict')
#print(dictionary_pos)

# now convert each tokenized tweet to a bag-of-words vector and save the corpus
corpus_pos = [dictionary_pos.doc2bow(t) for t in words_onp_pos]
gensim.corpora.MmCorpus.serialize('models/onp_tm_pos_phrases.mm', corpus_pos)
#print(corpus_pos)

In [22]:
# Fit a TF-IDF model on the positive-sentiment corpus and save it to disk.
tfidf_pos = gensim.models.TfidfModel(corpus_pos)
tfidf_pos.save('models/onp_tm_pos_phrases.tfidf')

# Transform the positive corpus with the model fitted on it.  The token ids in
# `corpus_pos` come from `dictionary_pos`, so the full-corpus `tfidf` model
# (built on a different dictionary) would apply the wrong idf weights.
corpus_tfidf_pos = tfidf_pos[corpus_pos]
gensim.corpora.MmCorpus.serialize('models/onp_tm_pos_phrases.mm_corpus_tfidf', corpus_tfidf_pos)
#for doc in corpus_tfidf_pos:
#    print(doc)

In [23]:
# Number of topics requested from the positive-sentiment LDA model below.
n_topics = 10

In [24]:
# Online LDA on the positive-sentiment TF-IDF corpus: updated after each chunk
# of up to 10000 documents, with 5 passes over the corpus.
# A fixed `random_state` makes the fitted topics reproducible across runs.
lda_model_pos = gensim.models.ldamodel.LdaModel(corpus=corpus_tfidf_pos, id2word=dictionary_pos,
                                                num_topics=n_topics,
                                                update_every=1, chunksize=10000, passes=5,
                                                random_state=1234)

#### or batch LDA, no online updates   

lda_model_pos = gensim.models.ldamodel.LdaModel(corpus=corpus_tfidf_pos, id2word=dictionary_pos, num_topics=n_topics, update_every=0, passes=20)

In [25]:
# Save the fitted positive-sentiment LDA model to disk.
lda_model_pos.save('models/onp_tm_pos_phrases.lda')
# Optional inspection of the top words per topic (left disabled):
#lda_model_pos.print_topics(10)
#top_words = [[word for word in lda_model_pos.show_topic(topicno, topn=5)] 
#             for topicno in range(lda_model_pos.num_topics)]
#print(top_words)

In [26]:
# Prepare the pyLDAvis data for the positive-sentiment model and display it.
vis_data_pos = pyLDAvis.gensim.prepare(lda_model_pos, corpus_tfidf_pos, dictionary_pos)
pyLDAvis.display(vis_data_pos)

### (B) Negative sentiment

In [27]:
# Flatten all negative-sentiment tweets (sentiment_afinn < 0) into one token
# list for collocation finding.  Extending one list in place avoids quadratic
# list concatenation and keeps the name defined when no row qualifies.
words_onp_neg_tuple = []
for text in df.loc[df['sentiment_afinn'] < 0, 'text']:
    words_onp_neg_tuple.extend(split_wd(remove_junk(text), sw))

#### Find common phrases

In [28]:
# Rebinds the notebook-level `bigrams` / `trigrams` patterns to the ones
# derived from the negative-sentiment tokens.
bigrams, trigrams = phrases(words_onp_neg_tuple)

In [29]:
# Tokenize each negative-sentiment tweet separately, first joining the words
# of known phrases with underscores (trigrams, then bigrams).
underscore_spaces = lambda match: match.group(0).replace(u' ', u'_')

words_onp_neg = []
for _, tweet in df.iterrows():
    if not (tweet['sentiment_afinn'] < 0):
        continue
    cleaned = remove_junk(tweet['text'])
    for phrase_pattern in (trigrams, bigrams):
        cleaned = phrase_pattern.sub(underscore_spaces, cleaned)
    words_onp_neg.append(split_wd(cleaned, sw))

In [30]:
# first assign IDs to the tokens
dictionary_neg = gensim.corpora.Dictionary(words_onp_neg)
# Remove tokens that occur in only one document.  `.items()` is used instead
# of the Python-2-only `.iteritems()` so the cell runs on either major version.
once_neg = [token_id for token_id, docfreq in dictionary_neg.dfs.items() if docfreq == 1]
dictionary_neg.filter_tokens(once_neg)
dictionary_neg.compactify()
dictionary_neg.save('models/onp_tm_neg_phrases.dict')
#print(dictionary_neg)

# now convert each tokenized tweet to a bag-of-words vector and save the corpus
corpus_neg = [dictionary_neg.doc2bow(t) for t in words_onp_neg]
gensim.corpora.MmCorpus.serialize('models/onp_tm_neg_phrases.mm', corpus_neg)
#print(corpus_neg)

In [31]:
# Fit a TF-IDF model on the negative-sentiment corpus and save it to disk.
tfidf_neg = gensim.models.TfidfModel(corpus_neg)
tfidf_neg.save('models/onp_tm_neg_phrases.tfidf')

# Transform the negative corpus with the model fitted on it.  The token ids in
# `corpus_neg` come from `dictionary_neg`, so the full-corpus `tfidf` model
# (built on a different dictionary) would apply the wrong idf weights.
corpus_tfidf_neg = tfidf_neg[corpus_neg]
gensim.corpora.MmCorpus.serialize('models/onp_tm_neg_phrases.mm_corpus_tfidf', corpus_tfidf_neg)
#for doc in corpus_tfidf_neg:
#    print(doc)

In [32]:
# Number of topics requested from the negative-sentiment LDA model below.
n_topics = 10

In [33]:
# Online LDA on the negative-sentiment TF-IDF corpus: updated after each chunk
# of up to 10000 documents, with 5 passes over the corpus.
# A fixed `random_state` makes the fitted topics reproducible across runs.
lda_model_neg = gensim.models.ldamodel.LdaModel(corpus=corpus_tfidf_neg, id2word=dictionary_neg,
                                                num_topics=n_topics,
                                                update_every=1, chunksize=10000, passes=5,
                                                random_state=1234)

#### or batch LDA, no online updates   

lda_model_neg = gensim.models.ldamodel.LdaModel(corpus=corpus_tfidf_neg, id2word=dictionary_neg, num_topics=n_topics, update_every=0, passes=20)

In [34]:
# Save the fitted negative-sentiment LDA model to disk.
lda_model_neg.save('models/onp_tm_neg_phrases.lda')
# Optional inspection of the top words per topic (left disabled):
#lda_model_neg.print_topics(10)
#top_words = [[word for word in lda_model_neg.show_topic(topicno, topn=5)] 
#             for topicno in range(lda_model_neg.num_topics)]
#print(top_words)

In [35]:
# Prepare the pyLDAvis data for the negative-sentiment model and display it.
vis_data_neg = pyLDAvis.gensim.prepare(lda_model_neg, corpus_tfidf_neg, dictionary_neg)
pyLDAvis.display(vis_data_neg)

## Similar words with word2vec

In [36]:
# Train word vectors on the negative-sentiment token lists only, then show
# the words whose vectors are closest to "nauru".
# NOTE(review): the recorded similarities below are all about 0.9998, so the
# vectors look barely differentiated -- presumably too little training data;
# confirm before interpreting the ranking.
m = gensim.models.Word2Vec(words_onp_neg, size=100, window=5, min_count=3, workers=4)
pp.pprint(m.most_similar("nauru"))

[   (u'squat_toilets', 0.9998140335083008),
    (u'children', 0.9997979998588562),
    (u'church', 0.9997894167900085),
    (u'australia', 0.999769926071167),
    (u'time', 0.9997684955596924),
    (u'idiot', 0.9997632503509521),
    (u'people', 0.9997496604919434),
    (u'wrong', 0.9997479915618896),
    (u'ausgrid', 0.9997464418411255),
    (u'need', 0.9997459053993225)]
