In [16]:
import pandas as pd
import numpy as np
import lda
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import bokeh.plotting as bp
from bokeh.plotting import save
from bokeh.models import HoverTool
from bokeh.plotting import output_file, save, show

In [17]:
hs=pd.read_excel('./data/headspace.xlsx')

In [18]:
hs.date=hs.date.str.replace('\'', '')
hs.date=hs.date.str.replace('[', '')
hs.date=hs.date.str.replace(']', '')
hs.date=pd.to_datetime(hs['date'],format='%Y, %m, %d')

In [19]:
hs['texts']=hs['title']+hs['text']

In [20]:
hs.fillna('0',inplace=True)

In [21]:
corpus=hs['texts'].tolist()
corpusStr=' '.join(corpus)

In [15]:
corpus=[x.lower() for x in corpus]

In [17]:
n_topics = 10 # number of topics
n_iter = 500 # number of iterations

# vectorizer: ignore English stopwords & words that occur less than 5 times
cvectorizer = CountVectorizer(min_df=3, stop_words='english')
cvz = cvectorizer.fit_transform(corpus)

# train an LDA model
lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter)
X_topics = lda_model.fit_transform(cvz)

INFO:lda:n_documents: 949
INFO:lda:vocab_size: 9641
INFO:lda:n_words: 302818
INFO:lda:n_topics: 10
INFO:lda:n_iter: 500
INFO:lda:<0> log likelihood: -3341452
INFO:lda:<10> log likelihood: -2779184
INFO:lda:<20> log likelihood: -2666687
INFO:lda:<30> log likelihood: -2630591
INFO:lda:<40> log likelihood: -2612660
INFO:lda:<50> log likelihood: -2602992
INFO:lda:<60> log likelihood: -2597170
INFO:lda:<70> log likelihood: -2590768
INFO:lda:<80> log likelihood: -2585638
INFO:lda:<90> log likelihood: -2582414
INFO:lda:<100> log likelihood: -2580316
INFO:lda:<110> log likelihood: -2576830
INFO:lda:<120> log likelihood: -2574729
INFO:lda:<130> log likelihood: -2573421
INFO:lda:<140> log likelihood: -2571472
INFO:lda:<150> log likelihood: -2570447
INFO:lda:<160> log likelihood: -2569686
INFO:lda:<170> log likelihood: -2568292
INFO:lda:<180> log likelihood: -2565466
INFO:lda:<190> log likelihood: -2564753
INFO:lda:<200> log likelihood: -2563987
INFO:lda:<210> log likelihood: -2563831
INFO:lda:<2

In [19]:
threshold = 0.5
_idx = np.amax(X_topics, axis=1) > threshold  # idx of doc that above the threshold
X_topics = X_topics[_idx]

In [21]:
# a t-SNE model
# angle value close to 1 means sacrificing accuracy for speed
# pca initializtion usually leads to better results 
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')

# 20-D -> 2-D
tsne_lda = tsne_model.fit_transform(X_topics)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 150 samples in 0.001s...
[t-SNE] Computed neighbors for 150 samples in 0.005s...
[t-SNE] Computed conditional probabilities for sample 150 / 150
[t-SNE] Mean sigma: 0.204104
[t-SNE] KL divergence after 250 iterations with early exaggeration: 50.818001
[t-SNE] Error after 700 iterations: 0.130505


In [41]:
n_top_words = 5 # number of keywords we show

# 20 colors
colormap = np.array([
    "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c",
    "#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5",
    "#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f",
    "#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5"
])

In [42]:
_lda_keys = []
for i in range(X_topics.shape[0]):
    _lda_keys += X_topics[i].argmax(),
    topic_summaries = []
    topic_word = lda_model.topic_word_  # all topic words
    vocab = cvectorizer.get_feature_names()
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1] # get!
    topic_summaries.append(' '.join(topic_words)) # append!

In [56]:
# title = "Headspace LDA viz".format(
#     X_topics.shape[0], n_topics, threshold, n_iter, num_example, n_top_words)

title="HS LDA viz"
plot_lda = bp.figure(plot_width=700, plot_height=500,
                       title=title,
                       tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                       x_axis_type=None, y_axis_type=None, min_border=1)

plot_lda.scatter(x=tsne_lda[:, 0], y=tsne_lda[:, 1],
                   color=colormap[_lda_keys][:num_example]),
#                    source=bp.ColumnDataSource({
#                      "content": news[:num_example],
#                      "topic_key": _lda_keys[:num_example]
#                      }))

  # randomly choose a news (in a topic) coordinate as the crucial words coordinate
topic_coord = np.empty((X_topics.shape[1], 2)) * np.nan
topic_coord = np.nan_to_num(topic_coord)
for topic_num in _lda_keys:
    if not np.isnan(topic_coord).any():
        break
    topic_coord[topic_num] = tsne_lda[_lda_keys.index(topic_num)]

  # plot crucial words
for i in range(X_topics.shape[1]):
    plot_lda.text(topic_coord[i, 0], topic_coord[i, 1], [topic_summaries[i]])

  # hover tools
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = {"content": "@content - topic: @topic_key"}

# save(plot_lda, '20_news_tsne_lda_viz_{}_{}_{}_{}_{}_{}.html'.format(
#     X_topics.shape[0], n_topics, threshold, n_iter, num_example, n_top_words))
save(plot_lda)

'/Volumes/GoogleDrive/Mój dysk/mindfulness/hs.html'

In [22]:
from __future__ import print_function
import pandas as pd
import numpy as np
from glob import glob
import re
import string
import funcy as fp
from gensim import models
from gensim.utils import tokenize
# from gensim.corpora import Dictionary, MmCorpus
from gensim import corpora
import nltk
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [23]:
tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                stop_words = 'english',
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.5, 
                                min_df = 10)
dtm_tf = tf_vectorizer.fit_transform(corpus)
print(dtm_tf.shape)

(949, 3881)


In [24]:
tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf = tfidf_vectorizer.fit_transform(corpus)
print(dtm_tfidf.shape)

(949, 3881)


In [32]:
# for TF DTM
lda_tf = LatentDirichletAllocation(n_components=8, random_state=0)
lda_tf.fit(dtm_tf)
# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_components=8, random_state=0)
lda_tfidf.fit(dtm_tfidf)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=8, n_jobs=1,
             n_topics=None, perp_tol=0.1, random_state=0,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [33]:
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer, mds='tsne')