In [1]:
import pickle
import pandas as pd
import numpy as np
from collections import Counter

from tqdm import tqdm

from gensim import models
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS

In [2]:
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook
from sklearn.manifold import TSNE

# Route Bokeh figures to inline notebook output.
output_notebook()

# Shared 2-D t-SNE projector; later cells call fit_transform on it once per
# clustering. t-SNE is stochastic, so the seed is pinned to make the scatter
# plots reproducible across kernel restarts.
tsne_model = TSNE(n_components=2, random_state=42)

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Twenty-entry categorical palette, looked up by integer cluster label in the
# plotting cells (so labels must stay within 0..19).
colormap = np.asarray((
    "#1f77b4", "#aec7e8",
    "#ff7f0e", "#ffbb78",
    "#2ca02c", "#98df8a",
    "#d62728", "#ff9896",
    "#9467bd", "#c5b0d5",
    "#8c564b", "#c49c94",
    "#e377c2", "#f7b6d2",
    "#7f7f7f", "#c7c7c7",
    "#bcbd22", "#dbdb8d",
    "#17becf", "#9edae5",
))

In [5]:
# Load the pre-selected tweets table. Later cells join each row's `text` with
# spaces (so `text` presumably holds a list of tokens — TODO confirm) and group
# rows by the `user` column.
# NOTE(review): read_pickle unpickles arbitrary objects; only use trusted files.
tweets = pd.read_pickle('./data/data_tweets_selected.pkl')

## Without author aggregation

In [6]:
# Restore the K-Means model saved for the non-aggregated ("sep") doc2vec vectors.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted artifacts.
with open('./models/doc2vec_sep_model.kmeans', 'rb') as kmeans_fh:
    doc2vec_sep_kmeans = pickle.load(kmeans_fh)

# Restore the Doc2Vec model that goes with it.
doc2vec_sep_model = models.Doc2Vec.load('./models/doc2vec_sep_model.model')

# KMeans.transform re-expresses every document vector in cluster-distance
# space; only the first 10000 transformed rows are embedded with t-SNE.
# NOTE(review): the same 10000 cut-off is repeated in the plotting cell.
sep_cluster_space = doc2vec_sep_kmeans.transform(doc2vec_sep_model.docvecs)
tsne_kmeans_sep = tsne_model.fit_transform(sep_cluster_space[:10000])

### Topic word list

In [7]:
# Stop-word set used by the TF-IDF vectorizers: scikit-learn's English list
# plus 'rt' (presumably the retweet marker — confirm against the corpus).
MY_ENGLISH_STOP_WORDS = {'rt'}.union(ENGLISH_STOP_WORDS)

In [8]:
# Fit TF-IDF over the individual tweets (each row's token sequence is joined
# back into one space-separated string first).
# stop_words is passed as a list because recent scikit-learn releases validate
# the parameter and reject set/frozenset values; a list works on old and new versions.
sep_vectorizer = TfidfVectorizer(min_df=3, lowercase=True,
                                 stop_words=list(MY_ENGLISH_STOP_WORDS))
sep_tfidf = sep_vectorizer.fit_transform(tweets['text'].map(' '.join))

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back for older installations.
if hasattr(sep_vectorizer, 'get_feature_names_out'):
    sep_feature_array = np.asarray(sep_vectorizer.get_feature_names_out())
else:
    sep_feature_array = np.asarray(sep_vectorizer.get_feature_names())

# For every cluster, add up the TF-IDF rows of its members and print the 20
# highest-weighted terms.
# NOTE(review): assumes labels_ is in the same row order as `tweets` — confirm
# against the training notebook.
sep_labels = np.asarray(doc2vec_sep_kmeans.labels_)
for topic in range(doc2vec_sep_kmeans.n_clusters):
    member_rows = np.flatnonzero(sep_labels == topic)
    # Sparse column-sum: a single vectorised reduction, and an empty cluster
    # yields an all-zero row instead of failing on builtin sum()'s int 0.
    topic_weights = np.asarray(sep_tfidf[member_rows].sum(axis=0)).ravel()
    tfidf_sorting = np.argsort(topic_weights)[::-1]
    print('Topic %d: %s' % (topic, ' '.join(sep_feature_array[tfidf_sorting][:20])))

Topic 0: like love code new people just use day know make mind life news time want best things follow 10 need
Topic 1: july 2016 えどがわイケメン pm tos null mdbjss gaga lady xxx afe 08 29 mtvhottest 09 07 15 05 17 16
Topic 2: new https video just news need trump says like watch time life home love man help people set red great
Topic 3: new news 2016 july pm black free business says shoes size jobs air make like best tech video trump watch
Topic 4: 2016 july pm just free new utc follow code available want like resolved black 10 tweet cest join post lit
Topic 5: love just like 2016 july people life new want know time girl follow man day good pm things tos tweet
Topic 6: just like https love best ur want need trump girl make new 2016 time people today july day know got
Topic 7: new july just like news 2016 free love shoes https follow 10 size iphone wear black win time sex make
Topic 8: huuugs like new code love home free just use time black want 10 need life people tweet check follow news
Topic

### Viz

In [9]:
# Scatter plot of the t-SNE embedding for the non-aggregated clustering, one
# point per embedded tweet, coloured by K-Means label.

# Every column is trimmed to the number of rows t-SNE actually embedded so the
# data source columns are guaranteed to have equal length.
sep_n_points = len(tsne_kmeans_sep)
sep_labels_shown = np.asarray(doc2vec_sep_kmeans.labels_)[:sep_n_points]

# NOTE(review): plot_width/plot_height and the "previewsave" tool name come
# from older Bokeh releases — confirm they are accepted by the installed version.
plot_kmeans_sep = bp.figure(plot_width=900, plot_height=700,
                            x_axis_type=None, y_axis_type=None, min_border=1,
                            tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave")

# All glyph data lives in the ColumnDataSource and is referenced by column
# name: Bokeh raises a RuntimeError when literal arrays are passed to a glyph
# method together with a user-supplied `source`.
sep_source = bp.ColumnDataSource({
    "x": tsne_kmeans_sep[:, 0],
    "y": tsne_kmeans_sep[:, 1],
    "color": colormap[sep_labels_shown],
    "tweet": [' '.join(t) for t in tweets['text'].iloc[:sep_n_points]],
    "cluster": sep_labels_shown,
})
plot_kmeans_sep.scatter(x="x", y="y", color="color", source=sep_source)

# Hover tooltip shows the tweet text and its cluster id.
hover = plot_kmeans_sep.select(dict(type=HoverTool))
hover.tooltips={"tweet": "@tweet - cluster: @cluster"}
show(plot_kmeans_sep)

## With author aggregation

In [10]:
# Restore the K-Means model saved for the author-aggregated ("grp") doc2vec vectors.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted artifacts.
with open('./models/doc2vec_grp_model.kmeans', 'rb') as kmeans_fh:
    doc2vec_grp_kmeans = pickle.load(kmeans_fh)

# Restore the Doc2Vec model that goes with it.
doc2vec_grp_model = models.Doc2Vec.load('./models/doc2vec_grp_model.model')

# KMeans.transform re-expresses every document vector in cluster-distance
# space; here all rows (no cut-off) are embedded with t-SNE.
grp_cluster_space = doc2vec_grp_kmeans.transform(doc2vec_grp_model.docvecs)
tsne_kmeans_grp = tsne_model.fit_transform(grp_cluster_space)

### Topic word list

In [11]:
# One row per user: 'sum' over the `text` column concatenates that user's
# per-tweet values into a single sequence.
# NOTE(review): groupby orders rows by user key; this assumes the grouped
# K-Means labels_ follow the same order — confirm against the training notebook.
grp_tweets = tweets.groupby('user').agg({'text': 'sum'})

# stop_words is passed as a list because recent scikit-learn releases validate
# the parameter and reject set/frozenset values; a list works on old and new versions.
grp_vectorizer = TfidfVectorizer(min_df=3, lowercase=True,
                                 stop_words=list(MY_ENGLISH_STOP_WORDS))
grp_tfidf = grp_vectorizer.fit_transform(grp_tweets['text'].map(' '.join))

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back for older installations.
if hasattr(grp_vectorizer, 'get_feature_names_out'):
    grp_feature_array = np.asarray(grp_vectorizer.get_feature_names_out())
else:
    grp_feature_array = np.asarray(grp_vectorizer.get_feature_names())

# For every cluster, add up the TF-IDF rows of its members and print the 20
# highest-weighted terms.
grp_labels = np.asarray(doc2vec_grp_kmeans.labels_)
for topic in range(doc2vec_grp_kmeans.n_clusters):
    member_rows = np.flatnonzero(grp_labels == topic)
    # Sparse column-sum: a single vectorised reduction, and an empty cluster
    # yields an all-zero row instead of failing on builtin sum()'s int 0.
    topic_weights = np.asarray(grp_tfidf[member_rows].sum(axis=0)).ravel()
    tfidf_sorting = np.argsort(topic_weights)[::-1]
    print('Topic %d: %s' % (topic, ' '.join(grp_feature_array[tfidf_sorting][:20])))

Topic 0: radiohead prince app unabridged hamilton iphone whatsapp ipad mac tvseries paulmccartney utilities rend games metgala drama 2016 nonfiction new comedy
Topic 1: trump new police clinton says news brexit attack hillary convention 2016 man china donald 39 dallas watch france shooting killed
Topic 2: july えどがわイケメン 2016 tos pm null pushawardslizquens mdbjss mtvhottest afe health gaga just love s0t people xxx like deals new
Topic 3: twain shakespeare law mark william man like people good mencken time einstein does things life just albert know think make
Topic 4: code use like shop available lit wear urbanattires 15 https 10 order shipping sale discount caps dope free promo checkout
Topic 5: like love just want people need life ur know girl shit time isolated got wanna vocals beautiful make drake person
Topic 6: nfl size deals shoes nba ebay new football dvd buzz sale tickets cd black weight basketball fashion 8x10 nike ncaa
Topic 7: photos apply alert job sex nigeria nigerian buhari

### Viz

In [None]:
# Scatter plot of the t-SNE embedding for the author-aggregated clustering,
# coloured by K-Means label.

# NOTE(review): plot_width/plot_height and the "previewsave" tool name come
# from older Bokeh releases — confirm they are accepted by the installed version.
plot_kmeans_grp = bp.figure(plot_width=900, plot_height=700,
                            x_axis_type=None, y_axis_type=None, min_border=1,
                            tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave")

# All glyph data lives in the ColumnDataSource and is referenced by column
# name: Bokeh raises a RuntimeError when literal arrays are passed to a glyph
# method together with a user-supplied `source`.
grp_labels_shown = np.asarray(doc2vec_grp_kmeans.labels_)
grp_source = bp.ColumnDataSource({
    "x": tsne_kmeans_grp[:, 0],
    "y": tsne_kmeans_grp[:, 1],
    "color": colormap[grp_labels_shown],
    "cluster": grp_labels_shown,
})
plot_kmeans_grp.scatter(x="x", y="y", color="color", source=grp_source)

# Hover tooltip shows the cluster id only.
grp_hover = plot_kmeans_grp.select(dict(type=HoverTool))
grp_hover.tooltips={"cluster": "@cluster"}
show(plot_kmeans_grp)