In [24]:
from gensim import models
from gensim.corpora import Dictionary, MmCorpus

import numpy as np
from scipy.sparse import csr_matrix

import pyLDAvis.gensim as gensimvis
import pyLDAvis

In [25]:
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook
from sklearn.manifold import TSNE

# Route Bokeh output to the notebook so `show(...)` renders figures inline.
output_notebook()

# Shared 2-D t-SNE projector used by both visualisation sections below.
# t-SNE is stochastic: without a fixed seed the scatter plots differ on every
# "Restart & Run All". Pinning random_state makes the embeddings reproducible.
tsne_model = TSNE(n_components=2, random_state=0)

In [23]:
import warnings
warnings.filterwarnings('ignore')

In [43]:
# Twenty hex colour codes used to colour scatter points by topic index.
# Kept as a NumPy array so it can be fancy-indexed with a sequence of topic ids
# (e.g. colormap[[0, 3, 3, 1]]) to obtain one colour per document.
_PALETTE_HEX = (
    "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78",
    "#2ca02c", "#98df8a", "#d62728", "#ff9896",
    "#9467bd", "#c5b0d5", "#8c564b", "#c49c94",
    "#e377c2", "#f7b6d2", "#7f7f7f", "#c7c7c7",
    "#bcbd22", "#dbdb8d", "#17becf", "#9edae5",
)
colormap = np.array(_PALETTE_HEX)

## Traditional LDA (without aggregation)

In [2]:
# Load the persisted artefacts of the LDA run without aggregation:
# the Matrix Market corpus, its token dictionary and the trained model.
sep_corpus = MmCorpus('./models/lda_sep_corpus.mm')
sep_dictionary = Dictionary.load('./models/lda_sep_dict.dict')
# `models.LdaMulticore` is the package-level alias of
# `models.ldamulticore.LdaMulticore`.
sep_model = models.LdaMulticore.load('./models/lda_sep_model.model')

### Topic word list

In [11]:
# Print the 15 highest-weighted words of every topic (num_topics=-1 -> all).
# With formatted=False each entry is (topic_id, [(word, weight), ...]).
sep_topics = sep_model.show_topics(num_topics=-1, num_words=15, formatted=False)
for topic_id, word_weights in sep_topics:
    top_words = ' '.join(word for word, _weight in word_weights)
    print('Topic %d: %s' % (topic_id, top_words))

Topic 0: 05 lady gaga link check #mtvstars days utc #mtvhottest bio spotted boom best stock convention
Topic 1: game job art apply believe alert photo guy baby watch caught beat home return hits
Topic 2: back want people tweet new video great twitter like see ever feel boys day full
Topic 3: 09 22 18 ut going #ascendant listen 08 luck ac looking keep #mediumcoeli mc download
Topic 4: new #deals black end iphone apple case date buy app post 2 caps games #iphone
Topic 5: trump clinton get hillary special donald free email list tips access join #home vote updates
Topic 6: 31 shirt sz time #gamedev new started late second #final 5 boost #indiedev yeezy shocking
Topic 7: 2016 july pm #えどがわイケメン tos null mdbjss xxx 11 ca afe 20 s0t 12345 gun
Topic 8: go got year playing let thing high friends 19 one ever would mom life like
Topic 9: code 10 use get like 15 20 25 free available sale wear order https shipping
Topic 10: 3ndback #jobs media via social marketing business service read turn morning 

### Vis (t-SNE projection of document-topic distributions)

In [None]:
# Collect the document-topic weights of the first 10000 documents as COO-style
# triplets: tmp_data[k] is the weight of topic tmp_col[k] in document tmp_row[k].
# The three lists are consumed by the csr_matrix(...) call in the next cell.
tmp_data, tmp_col, tmp_row = [], [], []
for doc_idx, bow in enumerate(sep_corpus[:10000]):
    # sep_model[bow] yields (topic_id, weight) pairs for this document.
    for topic_id, weight in sep_model[bow]:
        tmp_row.append(doc_idx)
        tmp_col.append(topic_id)
        tmp_data.append(weight)

In [44]:
# Build the sparse document-topic matrix from the (value, (row, col)) triplets
# in tmp_data / tmp_row / tmp_col. The column count is pinned to the model's
# topic count: csr_matrix otherwise infers the width from the largest topic id
# that happens to occur, so the matrix shape would depend on the sample.
# NOTE(review): the triplets must come from the `sep_*` cell above; the same
# tmp_* names are reused by the aggregated section further down.
sep_doc_top = csr_matrix((tmp_data, (tmp_row, tmp_col)),
                         shape=(max(tmp_row) + 1, sep_model.num_topics))

# Densify once and reuse the array for both the embedding and the colouring.
sep_dense = sep_doc_top.toarray()

# Project every document's topic distribution to 2-D.
# (Despite "kmeans" in the variable names, no clustering is run in this cell.)
tsne_kmeans_sep = tsne_model.fit_transform(sep_dense)

# Dominant topic per document in a single vectorised pass, replacing the
# per-row `row.toarray().argmax()` Python loop (same result, first max wins).
sep_top_topic = sep_dense.argmax(axis=1)

plot_kmeans_sep = bp.figure(plot_width=900, plot_height=700,
                            title='t-SNE of document-topic distributions '
                                  '(LDA without aggregation)',
                            x_axis_type=None, y_axis_type=None, min_border=1)

# Wrap the topic id around the palette so models with more topics than colours
# reuse colours instead of raising IndexError.
plot_kmeans_sep.scatter(x=tsne_kmeans_sep[:, 0], y=tsne_kmeans_sep[:, 1],
                        color=colormap[sep_top_topic % len(colormap)])

show(plot_kmeans_sep)

### LDA Vis

In [5]:
# Visualize result
sep_vis_data = gensimvis.prepare(sep_model, sep_corpus, sep_dictionary)
pyLDAvis.display(sep_vis_data)

## Author-topic LDA (with aggregation)

In [6]:
# Load the persisted artefacts of the aggregated ("grp") LDA run:
# the Matrix Market corpus, its token dictionary and the trained model.
grp_corpus = MmCorpus('./models/lda_grp_corpus.mm')
grp_dictionary = Dictionary.load('./models/lda_grp_dict.dict')
# `models.LdaMulticore` is the package-level alias of
# `models.ldamulticore.LdaMulticore`.
grp_model = models.LdaMulticore.load('./models/lda_grp_model.model')

### Topic word list

In [12]:
# Print the 15 highest-weighted words of every topic (num_topics=-1 -> all).
# With formatted=False each entry is (topic_id, [(word, weight), ...]).
grp_topics = grp_model.show_topics(num_topics=-1, num_words=15, formatted=False)
for topic_id, word_weights in grp_topics:
    top_words = ' '.join(word for word, _weight in word_weights)
    print('Topic %d: %s' % (topic_id, top_words))

Topic 0: code shop pinned tweet available wear sale https order #tech another share lit gold #fashion
Topic 1: currently mind heart pm thank double following #nowplaying fairy quote #jobs code johnson part #nfl
Topic 2: resolved join play #pushawardslizquens gbp streaming invite part code page 01 bet someone https #bigdata
Topic 3: #news photos via ut size sex nike null jordan air #mediumcoeli post source tumblr #ascendant
Topic 4: via boys news town ebay vintage r deal size bid #adult share w https set
Topic 5: buy pm beautiful mind following #えどがわイケメン vol #deals direction nothing places others blessed humble fashion
Topic 6: lady gaga check days offer stock #health win apply personalized via deals #deals business #home
Topic 7: pm tos #えどがわイケメン null afe 11 16 17 14 26 21 29 13 18 25
Topic 8: #えどがわイケメン pm tos mdbjss follow #gamedev ca tweet #indiedev 25 app 16 win gbp 24
Topic 9: course #mtvhottest follow gaga lady design institute #jobs #mgwv trump engineering via certified qa qc
Top

### Vis (t-SNE projection of document-topic distributions)

In [45]:
# Collect the document-topic weights of the first 10000 aggregated documents as
# COO-style triplets: tmp_data[k] is the weight of topic tmp_col[k] in document
# tmp_row[k]. The number of documents iterated is tracked explicitly so the
# matrix below gets one row per document even if a trailing document yields no
# topic entries.
tmp_data, tmp_col, tmp_row = [], [], []
n_grp_docs = 0
for doc_idx, bow in enumerate(grp_corpus[:10000]):
    n_grp_docs = doc_idx + 1
    # grp_model[bow] yields (topic_id, weight) pairs for this document.
    for topic_id, weight in grp_model[bow]:
        tmp_row.append(doc_idx)
        tmp_col.append(topic_id)
        tmp_data.append(weight)

# Explicit shape: rows = documents seen, columns = topics in the model.
# Without it csr_matrix infers the shape from the largest indices present.
grp_doc_top = csr_matrix((tmp_data, (tmp_row, tmp_col)),
                         shape=(n_grp_docs, grp_model.num_topics))

# Densify once and reuse the array for both the embedding and the colouring.
grp_dense = grp_doc_top.toarray()

# Project every document's topic distribution to 2-D.
# (Despite "kmeans" in the variable names, no clustering is run in this cell.)
tsne_kmeans_grp = tsne_model.fit_transform(grp_dense)

# Dominant topic per document in a single vectorised pass, replacing the
# per-row `row.toarray().argmax()` Python loop (same result, first max wins).
grp_top_topic = grp_dense.argmax(axis=1)

plot_kmeans_grp = bp.figure(plot_width=900, plot_height=700,
                            title='t-SNE of document-topic distributions '
                                  '(LDA with aggregation)',
                            x_axis_type=None, y_axis_type=None, min_border=1)

# Wrap the topic id around the palette so models with more topics than colours
# reuse colours instead of raising IndexError.
plot_kmeans_grp.scatter(x=tsne_kmeans_grp[:, 0], y=tsne_kmeans_grp[:, 1],
                        color=colormap[grp_top_topic % len(colormap)])

show(plot_kmeans_grp)

### LDA Vis

In [9]:
# Visualize result
grp_vis_data = gensimvis.prepare(grp_model, grp_corpus, grp_dictionary)
pyLDAvis.display(grp_vis_data)