In [9]:
import pandas as pd
import numpy as np
import gensim

In [18]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

from gensim.models import CoherenceModel
import gensim.corpora as corpora

import joblib

from sklearn.cluster import KMeans

In [33]:
data = joblib.load('../data/cleaned_data')

In [3]:
#pre-trained word2vec model
w2v_model = gensim.models.KeyedVectors.load_word2vec_format('../../misc/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [15]:
stop_words = stopwords.words('english')

In [21]:
data.head()

Unnamed: 0,id,datetime,text,new_text
0,548662191340421120,Sat Dec 27 02:10:34 +0000 2014,risks in using social media to spot signs of m...,"['risk', 'using', 'social', 'medium', 'spot', ..."
1,548579831169163265,Fri Dec 26 20:43:18 +0000 2014,paulaspan the most effective nationwide diabet...,"['paulaspan', 'effective', 'nationwide', 'diab..."
2,548579045269852161,Fri Dec 26 20:40:11 +0000 2014,the new old age blog diabetes prevention that ...,"['new', 'old', 'age', 'blog', 'diabetes', 'pre..."
3,548444679529041920,Fri Dec 26 11:46:15 +0000 2014,well comfort casseroles for winter dinners,"['well', 'comfort', 'casserole', 'winter', 'di..."
4,548311901227474944,Fri Dec 26 02:58:39 +0000 2014,highlevel knowledge before veterans affairs sc...,"['highlevel', 'knowledge', 'veteran', 'affair'..."


##### Main topics

In [51]:
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.decomposition import LatentDirichletAllocation as LDA

In [55]:
count_vec = CountVectorizer(stop_words='english', lowercase=False)

In [62]:
tweets = data['new_text'].apply(lambda x: ' '.join(x))

In [64]:
count_data = count_vec.fit_transform(tweets)

In [65]:
lda = LDA(n_components=12, n_jobs=-1)
lda.fit(count_data)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=12, n_jobs=-1,
                          perp_tol=0.1, random_state=None,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=0)

In [66]:
count_data

<62817x32377 sparse matrix of type '<class 'numpy.int64'>'
	with 447437 stored elements in Compressed Sparse Row format>

In [102]:
words = count_vec.get_feature_names()

In [72]:
def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

In [73]:
print_topics(lda, count_vec, 10)


Topic #0:
drug heart fda disease new treatment study cancer test question

Topic #1:
weight way help loss tip lose brain workout make exercise

Topic #2:
food healthy recipe today nh try way make calorie day

Topic #3:
flu new virus research safety bird outbreak rule deadly scientist

Topic #4:
ebola leone sierra year trial case say tell rise help

Topic #5:
ebola health care hospital say patient new mental worker africa

Topic #6:
day school say work baby wont video sugar make expert

Topic #7:
study risk kid child say teen linked brain help parent

Topic #8:
patient healthtalk doctor help ebola diabetes dy win hospital everydayhealth

Topic #9:
cancer woman new age breast old say study blog court

Topic #10:
health nh insurance obamacare law care report state exchange new

Topic #11:
goodhealth cynthiasass know amp like food change thing health look


Let's pick topic 5 in above result and try to get sub topics

In [93]:
topic5 = lda.components_[5].copy()

In [94]:
topic5.sort()

Top 700 words explain more than 75% of the contribution to the topic

In [95]:
topic5[-700:].sum()*100/topic5.sum()

75.58973225600249

In [96]:
lda.components_[5]

array([0.08333379, 0.08333333, 0.08333425, ..., 0.08333333, 0.08333333,
       0.08333333])

##### Words relevant to main topic

In [105]:
topic_words = [words[i] for i in lda.components_[5].argsort()[-700:]]

###### get Word2Vec representation for each words

In [110]:
mat = []
words = []
for i in topic_words:
    vec = np.zeros(300)
    try:
        vec = w2v_model[i]
        words.append(i)
        mat.append(vec)
    except:
        mat.append(vec)
    

###### Fit KMeans clustering on the words

In [113]:
km = KMeans(n_clusters=3)

In [114]:
km.fit(mat)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [115]:
pred = km.predict(mat)

In [121]:
score = km.transform(mat)

##### Sub topics

###### Sub Topic 1 in Topic 5

In [139]:
np.array(topic_words)[score[:,0].argsort()[:10]]

array(['princeton', 'bellevue', 'christie', 'alberta', 'duncan', 'emory',
       'er', 'hhs', 'cbc', 'thats'], dtype='<U14')

###### Sub Topic 2 in Topic 5

In [140]:
np.array(topic_words)[score[:,1].argsort()[:10]]

array(['100', 'cuomo', 'burwell', 'jennyagold', 'adriearsenault',
       'vinson', '2014', 'sebelius', 'ptsd', 'gilead'], dtype='<U14')

###### Sub Topic 3 in Topic 5

In [146]:
np.array(topic_words)[score[:,2].argsort()[:10]]

array(['100', 'labour', 'cuomo', 'bcsolomon', 'didnt', '14',
       'reuterslive', 'jrovner', 'alvinhtran', 'endoflife'], dtype='<U14')

***Note*** : Obviously the sub topics are not so intuitive. May be one needs to deep dive into this approach to understand pros and cons

**Approach walkthrough**:

1. For a specific Main topic, pick top N words

2. Get word embedding for these top N words

3. Using Kmeans cluster the words : It will give similar words in a group

4. To get the top words: get the words which are closer to the centroid

**Next steps**:
    1. While getting the top words in sub-topic, try to incorporate the weight obtained in LDA