In [1]:
import pickle
import pandas as pd
import numpy as np
from collections import Counter

from tqdm import tqdm

from gensim import models
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS

In [2]:
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook
from sklearn.manifold import TSNE

# Render Bokeh figures inline in the notebook output cells.
output_notebook()

# One shared 2-D t-SNE projector; later cells call fit_transform() on it for
# both the per-tweet and the per-author embeddings. t-SNE is stochastic, so a
# fixed seed is set to make the scatter plots reproducible across
# Restart Kernel -> Run All.
tsne_model = TSNE(n_components=2, random_state=0)

In [3]:
import warnings
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings from the libraries used
# below; consider narrowing the filter once the noisy source is identified.
warnings.filterwarnings('ignore')

In [4]:
# Twenty hex colours. The plotting cells index this array with integer
# cluster labels, so position i is the colour of cluster i.
_PALETTE_HEX = (
    "#1f77b4 #aec7e8 #ff7f0e #ffbb78 #2ca02c "
    "#98df8a #d62728 #ff9896 #9467bd #c5b0d5 "
    "#8c564b #c49c94 #e377c2 #f7b6d2 #7f7f7f "
    "#c7c7c7 #bcbd22 #dbdb8d #17becf #9edae5"
)
colormap = np.array(_PALETTE_HEX.split())

In [5]:
# Load the pre-selected tweets as a DataFrame. Later cells read its `text`
# column (each value is joined with spaces, so presumably a list of tokens —
# TODO confirm) and group rows by its `user` column.
# NOTE(review): read_pickle unpickles the file; only use it on trusted data.
tweets = pd.read_pickle('./data/data_tweets_selected.pkl')

## Without author aggregation

In [6]:
# Doc2Vec model for the "without author aggregation" variant.
doc2vec_sep_model = models.Doc2Vec.load('./models/doc2vec_septune_model.model')

# Clustering object saved next to it (presumably a fitted sklearn KMeans,
# given the import above and the `labels_` attribute read later).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('./models/doc2vec_septune_model.kmeans', 'rb') as pickle_file:
    doc2vec_sep_kmeans = pickle.load(pickle_file)

# Run all document vectors through the clusterer's transform(), keep the
# first 10000 rows of the result, and project those to 2-D with t-SNE.
sep_cluster_space = doc2vec_sep_kmeans.transform(doc2vec_sep_model.docvecs)
tsne_kmeans_sep = tsne_model.fit_transform(sep_cluster_space[:10000])

### Topic word list

In [7]:
# The stock English stop-word list from scikit-learn plus the extra token
# 'rt' (presumably the retweet prefix — confirm against the tokenizer).
MY_ENGLISH_STOP_WORDS = set(ENGLISH_STOP_WORDS) | {'rt'}

In [8]:
# Fit TF-IDF with one document per tweet. Each `text` value is joined with
# spaces first; mapping over the column gives the same strings as the former
# row-wise DataFrame.apply without building a Series object per row.
sep_vectorizer = TfidfVectorizer(min_df=3, lowercase=True, stop_words=MY_ENGLISH_STOP_WORDS)
sep_tfidf = sep_vectorizer.fit_transform(tweets['text'].map(' '.join))

sep_feature_array = np.array(sep_vectorizer.get_feature_names())
sep_labels = np.asarray(doc2vec_sep_kmeans.labels_)

# For every cluster, add up the TF-IDF rows of its members and print the
# 20 terms with the largest summed weight.
# NOTE(review): assumes labels_[i] belongs to row i of `tweets` — confirm the
# Doc2Vec training order matches the DataFrame order.
for topic in range(doc2vec_sep_kmeans.n_clusters):
    member_rows = np.flatnonzero(sep_labels == topic)
    if member_rows.size == 0:
        # Nothing to rank. The previous builtin-sum() version produced the
        # int 0 here and then failed on `.toarray()`.
        print('Topic %d: %s' % (topic, ''))
        continue
    # Sparse column sum in one call instead of Python-level row additions.
    tf_row = np.asarray(sep_tfidf[member_rows].sum(axis=0)).ravel()
    tfidf_sorting = np.argsort(tf_row)[::-1]
    print('Topic %d: %s' % (topic, ' '.join(sep_feature_array[tfidf_sorting][:20])))

Topic 0: follow free join updates euro2016 tweet luck good shirt offer win want work email final check bio hello card matriculant
Topic 1: july 2016 pm えどがわイケメン tos null mdbjss xxx afe 01 29 17 08 09 ca 26 24 12 05 21
Topic 2: trump clinton hillary donald convention bernie dnc obama sanders vote speech president news republican gop campaign democratic says america election
Topic 3: gaga lady mtvhottest just love resolved huuugs post new need want mtvstars utc cest beautiful home 3ndback tweet like salute
Topic 4: marketing tech business news content howto social internet new media socialmedia growthhacking app gamedev google seo smm online tips indiedev
Topic 5: photos woman girl wife years later mom sex girlfriend daughter man photo dad looks guy shocking video gets caught pregnant
Topic 6: shoes size nike air jordan black retro mens sz white new adidas 11 running sneakers red grey 12 10 blue
Topic 7: police news attack turkey killed says man shooting dallas dead china coup death new 

### Viz

In [9]:
# Interactive scatter of the per-tweet t-SNE embedding, coloured by cluster.
plot_kmeans_sep = bp.figure(plot_width=900, plot_height=700,
                            x_axis_type=None, y_axis_type=None, min_border=1,
                            tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave")

# Only the rows that were embedded can be drawn. Take the count from the
# embedding itself rather than repeating the literal 10000, so labels, tweet
# texts and coordinates cannot drift out of sync if the sample size changes.
sep_n_points = tsne_kmeans_sep.shape[0]
sep_point_labels = doc2vec_sep_kmeans.labels_[:sep_n_points]

plot_kmeans_sep.scatter(x=tsne_kmeans_sep[:,0], y=tsne_kmeans_sep[:,1],
                        color=colormap[sep_point_labels],
                        source=bp.ColumnDataSource({
                            "tweet": [' '.join(t) for t in tweets[:sep_n_points].text],
                            "cluster": sep_point_labels
                        }))

# Hover shows the tweet text together with its cluster id.
hover = plot_kmeans_sep.select(dict(type=HoverTool))
hover.tooltips={"tweet": "@tweet - cluster: @cluster"}
show(plot_kmeans_sep)

## With author aggregation

In [10]:
# Doc2Vec model for the author-aggregated variant.
doc2vec_grp_model = models.Doc2Vec.load('./models/doc2vec_grptune_model.model')

# Clustering object saved next to it (presumably a fitted sklearn KMeans,
# given the import above and the `labels_` attribute read later).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('./models/doc2vec_grptune_model.kmeans', 'rb') as pickle_file:
    doc2vec_grp_kmeans = pickle.load(pickle_file)

# Run all document vectors through the clusterer's transform() and project
# the full result to 2-D with t-SNE (no row limit in this variant).
grp_cluster_space = doc2vec_grp_kmeans.transform(doc2vec_grp_model.docvecs)
tsne_kmeans_grp = tsne_model.fit_transform(grp_cluster_space)

### Topic word list

In [11]:
# Build one document per user: 'sum' over the `text` column combines each
# user's values (list concatenation if they are token lists — TODO confirm).
grp_tweets = tweets.groupby('user').agg({'text': 'sum'})
grp_vectorizer = TfidfVectorizer(min_df=3, lowercase=True, stop_words=MY_ENGLISH_STOP_WORDS)
# Mapping over the column gives the same strings as the former row-wise
# DataFrame.apply without building a Series object per row.
grp_tfidf = grp_vectorizer.fit_transform(grp_tweets['text'].map(' '.join))

grp_feature_array = np.array(grp_vectorizer.get_feature_names())
grp_labels = np.asarray(doc2vec_grp_kmeans.labels_)

# For every cluster, add up the TF-IDF rows of its members and print the
# 20 terms with the largest summed weight.
# NOTE(review): assumes labels_[i] belongs to row i of `grp_tweets`, i.e. that
# the Doc2Vec documents were ordered like this groupby result — confirm.
for topic in range(doc2vec_grp_kmeans.n_clusters):
    member_rows = np.flatnonzero(grp_labels == topic)
    if member_rows.size == 0:
        # Nothing to rank. The previous builtin-sum() version produced the
        # int 0 here and then failed on `.toarray()`.
        print('Topic %d: %s' % (topic, ''))
        continue
    # Sparse column sum in one call instead of Python-level row additions.
    tf_row = np.asarray(grp_tfidf[member_rows].sum(axis=0)).ravel()
    tfidf_sorting = np.argsort(tf_row)[::-1]
    print('Topic %d: %s' % (topic, ' '.join(grp_feature_array[tfidf_sorting][:20])))

Topic 0: code use like shop available lit wear 15 urbanattires https 10 order shipping sale discount dope caps free promo checkout
Topic 1: gbp mtvhottest follow gaga utc tweet free hi mgwv lady post spotted classroom following resolved humidity hpa love followtrick mtvstars
Topic 2: jobs corp rating zacks communityscene plc assistant published investment hiring job llc research shares manager news earnings management engineer opportunity
Topic 3: people sex things girls shocking girl mom porn just make know like woman photos life daughter happens confessions pictures later
Topic 4: radiohead prince app unabridged hamilton iphone whatsapp ipad mac tvseries paulmccartney utilities rend games metgala drama 2016 nonfiction new comedy
Topic 5: like want need love just life people ur girl got know shit beautiful time drake bae bio wanna make look
Topic 6: bid size shoes black nike new apple mens 16gb air leather sz jordan white unlocked iphone retro deals silver ebay
Topic 7: sex later woma

### Viz

In [14]:
# Interactive scatter of the per-author t-SNE embedding, coloured by cluster.
plot_kmeans_grp = bp.figure(
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    plot_width=900,
    plot_height=700,
    x_axis_type=None,
    y_axis_type=None,
    min_border=1,
)

# The data source carries only the cluster id, which the hover tooltip reads.
grp_source = bp.ColumnDataSource({"cluster": doc2vec_grp_kmeans.labels_})

plot_kmeans_grp.scatter(
    x=tsne_kmeans_grp[:, 0],
    y=tsne_kmeans_grp[:, 1],
    color=colormap[doc2vec_grp_kmeans.labels_],
    source=grp_source,
)

# Point the figure's hover tool at the `cluster` column.
grp_hover = plot_kmeans_grp.select({"type": HoverTool})
grp_hover.tooltips = {"cluster": "@cluster"}
show(plot_kmeans_grp)