# **Topic Modeling**

## **Import Dataset**

In [43]:
import pandas as pd

data = pd.read_csv('Tweets_cleaned_data.csv')

In [44]:
# data = data.sample(frac = 0.005)
# data = data.reset_index()

In [45]:
data.head()

Unnamed: 0,target,id,date,flag,user,text,text2,text3,hashtags,tokens,tokens_remove_stopwords,length_1
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - awww, t...",awww thats a bummer you shoulda got david carr...,thats a bummer you got carr of third day to do...,,"['thats', 'a', 'bummer', 'you', 'got', 'carr',...","['thats', 'bummer', 'got', 'carr', 'third', 'd...",6
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his facebook by ...,is upset that he cant update his facebook by t...,is upset that he cant update his by it and mig...,,"['is', 'upset', 'that', 'he', 'cant', 'update'...","['upset', 'cant', 'update', 'might', 'cry', 'r...",10
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@kenichan i dived many times for the ball. man...,i dived many times for the ball managed to sav...,i many times for the ball to save the rest go ...,,"['i', 'many', 'times', 'for', 'the', 'ball', '...","['many', 'times', 'ball', 'save', 'rest', 'go']",6
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body itchy and like its on fire,,"['my', 'whole', 'body', 'itchy', 'and', 'like'...","['whole', 'body', 'itchy', 'like', 'fire']",5
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",no its not behaving at all im mad why am i her...,no its not at all mad why am i here because i ...,,"['no', 'its', 'not', 'at', 'all', 'mad', 'why'...","['mad', 'cant', 'see']",3


## **LDA (Latent Dirichlet Allocation)**

In [46]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

# parameter setting
n_samples = data.shape[0]
n_features = 3000 # number of feature words
n_components = 0 # number of topic (just declare 0 at first)
n_top_words = 15 # number of top words in each topic

In [47]:
tf_vectorizer = CountVectorizer(max_df=0.99, min_df=2,
                                max_features=n_features,
                                stop_words='english')

tf = tf_vectorizer.fit_transform(data['text3'].values.astype('U'))

In [48]:
tf

<1600000x3000 sparse matrix of type '<class 'numpy.int64'>'
	with 6238720 stored elements in Compressed Sparse Row format>

In [49]:
# Find the best n_components by perplexity to specify number of topic used for training model

for i in range(5,11,1):    
    n_topics = i

    print("Fitting LDA models with tf features, "
          "n_samples=%d, n_features=%d n_components=%d "
          % (n_samples, n_features, n_topics))

    lda = LatentDirichletAllocation(n_components=n_topics, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    lda.fit(tf)

    likelihood = lda.score(tf)
    perplexity = lda.perplexity(tf)

    if n_topics == 5 or (n_topics != 1 and perplexity < best_perplexity):
        best_n_components = i
        best_perplexity = perplexity

    print('sklearn log-likelihood: %.3f' % likelihood)
    print('sklearn perplexity: %.3f' % perplexity)

n_components = best_n_components

Fitting LDA models with tf features, n_samples=1600000, n_features=3000 n_components=5 
sklearn log-likelihood: -45006498.639
sklearn perplexity: 1153.582
Fitting LDA models with tf features, n_samples=1600000, n_features=3000 n_components=6 
sklearn log-likelihood: -45280786.244
sklearn perplexity: 1204.231
Fitting LDA models with tf features, n_samples=1600000, n_features=3000 n_components=7 
sklearn log-likelihood: -45471528.747
sklearn perplexity: 1240.758
Fitting LDA models with tf features, n_samples=1600000, n_features=3000 n_components=8 
sklearn log-likelihood: -45625955.398
sklearn perplexity: 1271.140
Fitting LDA models with tf features, n_samples=1600000, n_features=3000 n_components=9 
sklearn log-likelihood: -45842020.809
sklearn perplexity: 1314.903
Fitting LDA models with tf features, n_samples=1600000, n_features=3000 n_components=10 
sklearn log-likelihood: -45944664.717
sklearn perplexity: 1336.217


In [50]:
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
lda.fit(tf)

LatentDirichletAllocation(learning_method='online', learning_offset=50.0,
                          max_iter=5, n_components=5, random_state=0)

In [51]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % (topic_idx+1)
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [52]:
tf_feature_names = tf_vectorizer.get_feature_names()

In [53]:
print_top_words(lda, tf_feature_names, n_top_words)

Topic #1: good know day got morning really just hope happy dont night thank think thats make
Topic #2: thanks ill watching just hey did yes tonight best doing today like song ya later
Topic #3: going nice want awesome day say week need sure twitter look pretty birthday looking working
Topic #4: just new fun home nan time days amazing glad movie like way check let old
Topic #5: love today work wait come like oh getting day right feel weekend life follow got



In [54]:
lda_interpret = lda.transform(tf)
lda_label = []

for i in range(len(lda_interpret)):
    lda_label.append(np.argmax(lda_interpret[i])+1)

data['topic_LDA'] = lda_label

In [55]:
data['topic_LDA'].value_counts()

topic_LDA
1    456386
5    317759
2    278496
4    276308
3    271051
Name: count, dtype: int64

In [56]:
lda_doc_by_topic = pd.DataFrame(lda_interpret, columns=[("topic_%d"%(i+1)) for i in range(n_components)]) 
lda_doc_by_topic

Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5
0,0.839412,0.040000,0.040175,0.040000,0.040413
1,0.028571,0.028614,0.028571,0.028571,0.885672
2,0.840000,0.040000,0.040000,0.040000,0.040000
3,0.050615,0.797972,0.050004,0.050521,0.050888
4,0.598170,0.100000,0.100000,0.100000,0.101830
...,...,...,...,...,...
1599995,0.033546,0.235421,0.200000,0.331033,0.200000
1599996,0.299210,0.300000,0.050000,0.300000,0.050790
1599997,0.066667,0.400000,0.066667,0.066667,0.400000
1599998,0.639055,0.040171,0.240000,0.040491,0.040283


In [57]:
lda_topic_by_word = pd.DataFrame(lda.components_, columns=tf_feature_names)
lda_topic_by_word

Unnamed: 0,abandoned,ability,able,absolute,absolutely,accent,accept,accepted,access,accident,...,youve,yr,yuck,yucky,yummy,zac,zero,zombie,zone,zoo
0,83.498625,0.201753,0.202986,0.203731,0.202488,0.202365,0.202414,0.202345,0.201912,0.201294,...,3307.861134,0.202571,0.202038,0.203097,0.202444,0.202628,0.202643,0.203497,412.092462,0.202679
1,0.202408,169.692192,0.203134,0.203386,0.202549,0.202703,0.203393,0.202488,0.202664,294.040748,...,0.20323,1185.157795,0.201064,0.202817,3477.31971,0.203605,342.282684,0.20364,0.203897,0.202266
2,0.204182,0.2022,3492.663758,0.203616,0.20263,0.203528,0.203084,0.203436,542.217395,0.201681,...,0.203341,0.203033,0.202043,89.121477,0.203202,0.202706,0.202703,0.203184,0.203971,0.203823
3,0.203797,0.203255,0.202823,276.472816,0.20298,0.202434,359.818404,0.202688,0.203302,0.201951,...,0.203165,0.203756,201.292657,0.203457,0.203415,309.302671,0.203309,0.204757,0.205091,717.931632
4,0.20277,0.201865,0.202493,0.203129,2663.414785,413.234064,0.203362,322.350511,0.202624,0.201581,...,0.203004,0.203114,0.202517,0.202484,0.202732,0.203425,0.201612,273.289137,0.204189,0.203712


## **GSDMM (Gibbs Sampling Dirichlet Multinomial Mixture)**

In [58]:
from tqdm import tqdm
import gsdmm

In [59]:
import ast
docs = []
for i in range(len(data)):
    docs.append(ast.literal_eval(data['tokens_remove_stopwords'].values[i]))

In [60]:
mgp = gsdmm.MovieGroupProcess(K=n_components, alpha=0.1, beta=0.5, n_iters=5)
vocab = set(x for doc in docs for x in doc)
n_terms = len(vocab)
y = mgp.fit(docs, n_terms)

In stage 0: transferred 1271746 clusters with 5 clusters populated
In stage 1: transferred 1236730 clusters with 5 clusters populated
In stage 2: transferred 1192846 clusters with 5 clusters populated
In stage 3: transferred 1074449 clusters with 5 clusters populated
In stage 4: transferred 880702 clusters with 5 clusters populated


In [61]:
def top_words(cluster_word_distribution, top_cluster, values):
    for cluster in top_cluster:
        sort_dicts =sorted(mgp.cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        print('Topic %s: %s'%(cluster+1,sort_dicts))

def topic_allocation(df, docs, mgp):
    topic_allocations = []
    for doc in tqdm(docs):
        topic_label, score = mgp.choose_best_label(doc)
        topic_allocations.append(topic_label+1)
    df['topic_GSDMM'] = topic_allocations

In [62]:
doc_count = np.array(mgp.cluster_doc_count)
topic_indices = np.arange(start=0, stop=len(doc_count), step=1)
top_words(mgp.cluster_word_distribution, topic_indices, n_top_words)

Topic 1: [('like', 21891), ('dont', 17361), ('get', 16207), ('got', 14772), ('one', 12838), ('really', 11180), ('know', 10951), ('think', 10692), ('still', 9989), ('today', 9773), ('want', 9315), ('cant', 9293), ('feel', 9100), ('need', 8929), ('go', 8501)]
Topic 2: [('like', 11444), ('good', 10608), ('get', 9235), ('one', 8537), ('got', 8153), ('love', 7849), ('new', 7460), ('go', 7317), ('time', 7192), ('today', 7020), ('going', 6738), ('cant', 6696), ('day', 6513), ('want', 6360), ('dont', 5811)]
Topic 3: [('day', 50886), ('good', 36357), ('work', 35974), ('today', 34807), ('go', 33372), ('going', 32114), ('back', 26168), ('night', 25916), ('home', 24391), ('morning', 23167), ('time', 22209), ('get', 22086), ('tomorrow', 21109), ('got', 19870), ('last', 18954)]
Topic 4: [('love', 24358), ('cant', 15499), ('like', 15058), ('new', 15042), ('see', 13949), ('u', 13806), ('good', 12635), ('dont', 10859), ('one', 10317), ('go', 10150), ('know', 9941), ('watching', 9485), ('really', 9224),

In [63]:
topic_allocation(data, docs, mgp)

100%|██████████| 1600000/1600000 [02:16<00:00, 11736.55it/s]


In [64]:
data['topic_GSDMM'].value_counts()

topic_GSDMM
3    513577
5    413121
1    257087
4    244021
2    172194
Name: count, dtype: int64

In [65]:
doc_score = []

for doc in tqdm(docs):
    score = mgp.score(doc)
    doc_score.append(score)

doc_score = np.array(doc_score)

100%|██████████| 1600000/1600000 [02:07<00:00, 12560.48it/s]


In [66]:
gsdmm_doc_by_topic = pd.DataFrame(doc_score, columns=[("topic_%d"%(i+1)) for i in range(n_components)]) 
gsdmm_doc_by_topic

Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5
0,0.044519,0.083944,0.100030,0.735720,0.035788
1,0.756307,0.008895,0.145259,0.005365,0.084175
2,0.333088,0.105224,0.316931,0.112683,0.132074
3,0.874997,0.039213,0.075351,0.010362,0.000077
4,0.053134,0.031021,0.094107,0.481101,0.340637
...,...,...,...,...,...
1599995,0.010311,0.002492,0.966059,0.017642,0.003495
1599996,0.066799,0.032209,0.008024,0.281482,0.611486
1599997,0.042322,0.419802,0.294648,0.212386,0.030842
1599998,0.002469,0.008111,0.802928,0.074360,0.112132


In [67]:
def cluster_importance(mgp):
    n_z_w = mgp.cluster_word_distribution
    beta, V, K = mgp.beta, mgp.vocab_size, mgp.K
    phi = [{} for i in range(K)]
    for z in range(K):
        for w in n_z_w[z]:
            phi[z][w] = (n_z_w[z][w]+beta)/(sum(n_z_w[z].values())+V*beta)
    return phi

In [68]:
phi = cluster_importance(mgp)
topic_vocab = []
vocab_list = list(vocab)

for i in range(n_components):
    temp = []
    for j in range(len(vocab_list)):
        try:
            temp.append(phi[i][vocab_list[j]])
        except:
            temp.append(0)
    topic_vocab.append(temp)

In [69]:
gsdmm_topic_by_word = pd.DataFrame(topic_vocab, columns=vocab_list) 
gsdmm_topic_by_word

Unnamed: 0,siss,reaver,autonomic,metatarsal,lagger,agger,fit,toggle,township,clearance,...,thirty,matchbox,unstraightened,china,peel,balancer,saga,calorific,eastbound,philander
0,9.970169e-07,0.0,0.0,0.0,0.0,0.0,0.000416,1.661695e-06,0.0,3e-06,...,2.2e-05,9.970169e-07,2.326373e-06,1.8e-05,2.6e-05,2e-06,3e-06,0.0,2.326373e-06,0.0
1,1.157935e-05,0.0,0.0,0.0,0.0,1e-06,0.000271,0.0,0.0,2.4e-05,...,1.1e-05,3.593592e-06,0.0,0.000134,2e-05,0.0,7e-06,2e-06,1.99644e-06,0.0
2,1.058178e-05,0.0,0.0,0.0,7.054521e-07,0.0,8.9e-05,0.0,0.0,1e-06,...,3.1e-05,2.586658e-06,7.054521e-07,3.9e-05,1.1e-05,0.0,2e-06,0.0,7.054521e-07,0.0
3,1.107802e-05,1e-06,0.0,0.0,0.0,0.0,0.000146,1.786777e-06,1.072066e-06,0.0,...,6e-06,3.216199e-06,0.0,3.6e-05,3e-06,0.0,7.4e-05,0.0,0.0,0.0
4,3.515899e-06,0.0,8.113612e-07,8.113612e-07,0.0,0.0,9.7e-05,8.113612e-07,8.113612e-07,0.0,...,2e-06,0.0,0.0,0.000102,0.0,0.0,0.0,0.0,0.0,8.113612e-07


## Export CSV

In [70]:
# Dataset with topic label (LDA, GSDMM)
data.to_csv('Tweets_topic_modeling.csv', index=False)

In [71]:
# LDA document by topic 
# probability of each document relate to each topic
lda_doc_by_topic.to_csv('LDA_doc_by_topic.csv', index=False)

In [72]:
# LDA topic by word
# importance of the word to each topic
lda_topic_by_word.to_csv('LDA_topic_by_word.csv', index=False)

In [73]:
# GSDMM document by topic 
# probability of each document relate to each topic
gsdmm_doc_by_topic.to_csv('GSDMM_doc_by_topic.csv', index=False)

In [74]:
# GSDMM topic by word
# importance of the word to each topic
gsdmm_topic_by_word.to_csv('GSDMM_topic_by_word.csv', index=False)