In [1]:
import os
import math
import operator
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from textblob import TextBlob
from gsdmm import MovieGroupProcess

In [2]:
# Load the preprocessed posts DataFrame used for the rest of the notebook.
# The commented-out line is an alternative input file; the active line reads
# the "june_july" pickle instead.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling;
# only load pickle files produced by a trusted pipeline.
# data = pd.read_pickle("../../../sampled_processed_extant_posts.pkl")
data = pd.read_pickle("../../../sampled_processed_extant_posts_june_july.pkl")

In [3]:
# Sanity check: report how many rows were loaded.
print(f"rows in data: {len(data)}")

rows in data: 1517757


In [4]:
# Preview the first rows to see which text/token columns are available.
data.head()

Unnamed: 0,orig_selftext,has_long_token,text_clean_space,text_clean_punc_lower,len_clean,tokens,tokens_clean,tokens_lemma,bigrams,trigrams
2490481,Even if they hate Trump they could at least ac...,False,Even if they hate Trump they could at least ac...,even if they hate trump they could at least ac...,139,"[even, if, they, hate, trump, they, could, at,...","[even, hate, trump, could, least, acknowledge,...","[even, hate, trump, could, least, acknowledge,...","[(even, hate), (hate, trump), (trump, could), ...","[(even, hate, trump), (hate, trump, could), (t..."
2490482,Heterosexual reproduction is a mystery to admi...,False,Heterosexual reproduction is a mystery to admi...,heterosexual reproduction is a mystery to admi...,76,"[heterosexual, reproduction, is, a, mystery, t...","[heterosexual, reproduction, mystery, admins, ...","[heterosexual, reproduction, mystery, admins, ...","[(heterosexual, reproduction), (reproduction, ...","[(heterosexual, reproduction, mystery), (repro..."
2490483,Were you expecting the actual pope?,False,Were you expecting the actual pope?,were you expecting the actual pope,34,"[were, you, expecting, the, actual, pope]","[expecting, actual, pope]","[expecting, actual, pope]","[(expecting, actual), (actual, pope)]","[(expecting, actual, pope)]"
2490484,"Be fruitful and multiply, not fruity and blow ...",False,"Be fruitful and multiply, not fruity and blow ...",be fruitful and multiply not fruity and blow a...,50,"[be, fruitful, and, multiply, not, fruity, and...","[fruitful, multiply, fruity, blow, guy]","[fruitful, multiply, fruity, blow, guy]","[(fruitful, multiply), (multiply, fruity), (fr...","[(fruitful, multiply, fruity), (multiply, frui..."
2490485,The term has completely lost all meaning. It's...,False,The term has completely lost all meaning. It's...,the term has completely lost all meaning its j...,104,"[the, term, has, completely, lost, all, meanin...","[term, completely, lost, meaning, generalizati...","[term, completely, lost, meaning, generalizati...","[(term, completely), (completely, lost), (lost...","[(term, completely, lost), (completely, lost, ..."


In [5]:
# Train a new model
import random

# Seed BOTH random number generators. gsdmm draws cluster assignments with
# numpy.random.multinomial, and pandas' .sample() without a random_state also
# uses numpy's global RNG, so seeding only the stdlib `random` module does not
# make the run reproducible.
random.seed(1000)
np.random.seed(1000)
# Init of the Gibbs Sampling Dirichlet Mixture Model algorithm
# K is the maximum number of clusters; alpha/beta are the Dirichlet priors and
# n_iters the number of Gibbs sampling passes.
mgp = MovieGroupProcess(K=20, alpha=0.1, beta=0.8, n_iters=100)

In [6]:
# Build the training sample and its vocabulary statistics.
# vocab_counts: token -> number of occurrences across the sampled documents
# token_length: token -> number of characters in the token
vocab_counts = {}
token_length = {}
# Pin random_state so the same 200,000 documents (and therefore the same
# vocabulary size) are drawn on every run; without it the sample changes each
# time the cell is executed.
sample_data = data['tokens_clean'].sample(200000, random_state=1000)
for token_list in sample_data:
    for token in token_list:
        token_length[token] = len(token)
        vocab_counts[token] = vocab_counts.get(token, 0) + 1

In [7]:
# Re-key the vocabulary so iteration runs from the most to the least frequent token.
sorted_vocab_counts = {
    word: freq
    for word, freq in sorted(
        vocab_counts.items(), key=lambda pair: pair[1], reverse=True
    )
}
# print('Dictionary in descending order by value : ', sorted_vocab_counts)
# like, would, see, get , im, thats, may, got, 



In [8]:
# Re-key the token lengths so iteration runs from the longest to the shortest token.
sorted_vocab_len = {
    word: n_chars
    for word, n_chars in sorted(
        token_length.items(), key=lambda pair: pair[1], reverse=True
    )
}
# print('Dictionary in descending order by value : ', sorted_vocab_len)



In [9]:
# One row per distinct token: [token, character length, occurrence count],
# ordered longest token first (the order of sorted_vocab_len).
vocab_data = [
    [word, n_chars, vocab_counts[word]]
    for word, n_chars in sorted_vocab_len.items()
]
# Show the ten longest tokens.
vocab_data[:10]

[['httpsimgurcoma7fseed', 20, 1],
 ['pashtunwali91073page', 20, 1],
 ['politicallyconnected', 20, 2],
 ['httpsimgurcomomvf95l', 20, 1],
 ['brainwashprogrammers', 20, 1],
 ['minnnuhhhsurrrtaaaaa', 20, 1],
 ['magatrumplicanmerica', 20, 1],
 ['httpsanonhqcom700452', 20, 1],
 ['perspectiveworkingin', 20, 1],
 ['gthttparchivefoqpu2e', 20, 1]]

In [10]:
# import csv
# def write_list_to_csv(lines, file_name):
#     """
#     write list to csv
#     :param lines: list of data to write to csv
#     :param file_name: output file name
#     """
#     print(f"writing file to {file_name}")
#     with open(file_name, 'w') as writeFile:
#         writer = csv.writer(writeFile)
#         writer.writerows(lines)

In [11]:
# write_list_to_csv(vocab_data, "vocab_data_sample.csv")

In [12]:
# vocab_data holds one row per distinct token in the sample, so its length is
# the vocabulary size that is handed to mgp.fit.
vocab_size = len(vocab_data)
print(f"vocab size {vocab_size}")

vocab size 87805


In [13]:
# Fit the GSDMM model on the sampled token lists; one progress line is printed
# per sampling stage (see the output below).
# NOTE(review): fit1 presumably holds the per-document cluster labels returned
# by gsdmm's fit(); confirm against the library before relying on it.
fit1 = mgp.fit(sample_data , vocab_size)

In stage 0: transferred 188952 clusters with 20 clusters populated
In stage 1: transferred 101684 clusters with 20 clusters populated
In stage 2: transferred 15843 clusters with 20 clusters populated
In stage 3: transferred 4560 clusters with 20 clusters populated
In stage 4: transferred 3299 clusters with 20 clusters populated
In stage 5: transferred 3124 clusters with 20 clusters populated
In stage 6: transferred 2942 clusters with 18 clusters populated
In stage 7: transferred 2896 clusters with 15 clusters populated
In stage 8: transferred 2963 clusters with 14 clusters populated
In stage 9: transferred 2983 clusters with 12 clusters populated
In stage 10: transferred 2953 clusters with 10 clusters populated
In stage 11: transferred 2978 clusters with 7 clusters populated
In stage 12: transferred 2883 clusters with 8 clusters populated
In stage 13: transferred 2989 clusters with 8 clusters populated
In stage 14: transferred 3055 clusters with 7 clusters populated
In stage 15: transf

In [14]:
# Number of documents assigned to each cluster, as an array indexed by cluster id.
doc_count = np.array(mgp.cluster_doc_count)
print('Number of documents per topic :', doc_count)

Number of documents per topic : [     0      0      2      0    962      0    310      0      0      0
      5   1320      1      0   2082      0      0      0      0 195318]


In [15]:
# Cluster ids ordered from most to fewest assigned documents.
# The cut-off of 20 equals K used when the model was created, so every
# cluster (including empty ones) appears in the ranking.
top_index = np.argsort(doc_count)[::-1][:20]
print('Most important clusters (by number of docs inside):', top_index)

Most important clusters (by number of docs inside): [19 14 11  4  6 10  2 12  7  1  3  5  9  8 18 13 15 16 17  0]


In [16]:
def top_words(cluster_word_distribution, top_cluster, values):
    """Print the most frequent words of each requested cluster.

    :param cluster_word_distribution: container indexed by cluster id whose
        entries are dicts mapping word -> count
    :param top_cluster: iterable of cluster ids, printed in the given order
    :param values: maximum number of (word, count) pairs to show per cluster
    :return: None; the result is written to stdout
    """
    for cluster in top_cluster:
        # Use the distribution that was passed in rather than the global
        # `mgp`, so the function works for any fitted model.
        word_counts = cluster_word_distribution[cluster].items()
        # Highest counts first; ties keep their original insertion order.
        sort_dicts = sorted(word_counts, key=lambda k: k[1], reverse=True)[:values]
        print('Cluster %s : %s'%(cluster,sort_dicts))
        print(' — — — — — — — — — ')

In [17]:
# Show the 20 highest-count words for every cluster, largest cluster first.
top_words(mgp.cluster_word_distribution, top_index, 20)

Cluster 19 : [('people', 18235), ('one', 12063), ('trump', 11712), ('think', 9230), ('know', 8732), ('even', 8355), ('us', 8268), ('time', 7437), ('right', 7154), ('want', 6887), ('good', 6884), ('go', 6332), ('going', 6249), ('make', 6004), ('never', 5914), ('cant', 5854), ('also', 5782), ('way', 5771), ('shit', 5749), ('well', 5666)]
 — — — — — — — — — 
Cluster 14 : [('brick', 1602), ('centipede', 1214), ('every', 1165), ('coat', 904), ('wall', 843), ('rate', 646), ('patriot', 595), ('sure', 575), ('lets', 571), ('gets', 569), ('bricks', 569), ('make', 567), ('everyone', 565), ('united', 565), ('states', 565), ('handed', 563), ('goal', 562), ('love', 517), ('mph', 483), ('god', 480)]
 — — — — — — — — — 
Cluster 11 : [('automatically', 1396), ('moderators', 1124), ('please', 1010), ('removed', 708), ('action', 702), ('concerns', 702), ('bot', 701), ('questions', 701), ('performed', 700), ('contact', 700), ('linked', 643), ('comment', 563), ('rule', 434), ('reddit', 430), ('want', 427)