In [1]:
import pandas as pd
import gc
import os
import gensim

# Print the version string of the Python interpreter running this kernel.
from platform import python_version
print(python_version())

In [None]:
# `joblib` is not imported by any cell shown above, so this cell raised
# NameError on a fresh kernel; import it here so the load is self-contained.
import joblib

# Load the pre-cleaned dataset; later cells read its `new_text` attribute.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load artifacts that come from a trusted source.
data = joblib.load('../data/cleaned_data')

#### Topic Modeling

In [27]:
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.decomposition import LatentDirichletAllocation as LDA

from gensim.models import CoherenceModel
import gensim.corpora as corpora

In [45]:
# Build the gensim dictionary (vocabulary) from the documents in `data.new_text`.
# presumably each entry of `new_text` is a list of tokens — TODO confirm.
id2word = corpora.Dictionary(documents=data.new_text)

In [46]:
# Convert every document to its bag-of-words form using the shared dictionary.
corpus = list(map(id2word.doc2bow, data.new_text))

In [47]:
# Fit a baseline 10-topic LDA model on the bag-of-words corpus.
# `random_state` is fixed so that re-running the cell uses the same seed.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    random_state=21,
    chunksize=100,
    passes=2,
    alpha='auto',
)

In [48]:
from pprint import pprint

In [49]:
# Apply the fitted model to the corpus and keep the result for later use.
doc_lda = lda_model[corpus]

# Pretty-print each topic as a weighted list of its highest-scoring terms.
pprint(lda_model.print_topics())

[(0,
  '0.083*"dont" + 0.070*"miss" + 0.067*"week" + '
  '0.067*"viewsfromthenhsfrontline" + 0.064*"staff" + 0.047*"read" + '
  '0.025*"member" + 0.023*"might" + 0.018*"mention" + 0.018*"enough"'),
 (1,
  '0.128*"mental" + 0.087*"social" + 0.057*"community" + 0.048*"sign" + '
  '0.047*"problem" + 0.037*"linked" + 0.027*"surgery" + 0.021*"tackle" + '
  '0.021*"tell" + 0.019*"alcohol"'),
 (2,
  '0.048*"say" + 0.046*"today" + 0.026*"aampe" + 0.025*"case" + '
  '0.024*"network" + 0.018*"crisis" + 0.017*"go" + 0.016*"public" + '
  '0.015*"kid" + 0.014*"ebola"'),
 (3,
  '0.088*"patient" + 0.039*"doctor" + 0.036*"get" + 0.036*"like" + 0.034*"–" + '
  '0.025*"make" + 0.018*"see" + 0.018*"christmas" + 0.017*"…" + 0.016*"love"'),
 (4,
  '0.102*"new" + 0.072*"study" + 0.043*"life" + 0.035*"woman" + 0.033*"job" + '
  '0.031*"find" + 0.028*"report" + 0.027*"nurse" + 0.025*"could" + '
  '0.024*"mentalhealth"'),
 (5,
  '0.057*"’" + 0.049*"via" + 0.029*"working" + 0.028*"one" + 0.027*"good" + '
  '0.0

In [55]:
# Wrap the fitted model in a coherence evaluator that scores its topics with
# the 'c_v' measure against the texts in `data.new_text`.
coh_model_lda = CoherenceModel(
    model=lda_model,
    texts=data.new_text,
    dictionary=id2word,
    coherence='c_v',
)

In [56]:
# Compute the coherence score of the baseline model; as the last expression
# in the cell, the value is displayed as the cell output.
coh_model_lda.get_coherence()

0.3074013410004034

##### Grid search over the number of topics to find the optimal value

In [59]:
# Candidate topic counts to evaluate: 6 through 14 inclusive.
num_topics = range(6, 15)

In [65]:
# Grid search over the candidate topic counts: fit one LDA model per value of
# `n`, score it with c_v coherence, and remember the best-scoring configuration.
# Previously only the best *score* was kept, so the winning topic count and
# model were lost and `lda_model` was left holding the last model fitted.
best_coh_score = float('-inf')  # any real score beats this, so a winner is always recorded
best_num_topics = None
best_lda_model = None
for n in num_topics:
    print('Topic Modeling using {} Topics'.format(n))
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=n,
                                           random_state=21,
                                           chunksize=100,
                                           passes=2,
                                           alpha='auto')
    # Name the coherence measure explicitly so it matches the baseline
    # evaluation instead of silently relying on the library default.
    coh_model_lda = CoherenceModel(model=lda_model,
                                   texts=data.new_text,
                                   dictionary=id2word,
                                   coherence='c_v')
    coh_score = coh_model_lda.get_coherence()
    print('coherence score is {}'.format(coh_score))

    # Keep the score together with the topic count and model that produced it,
    # so the best configuration can be reused without refitting.
    if coh_score > best_coh_score:
        best_coh_score = coh_score
        best_num_topics = n
        best_lda_model = lda_model

Topic Modeling using 6 Topics
coherence score is 0.26277051702441523
Topic Modeling using 7 Topics
coherence score is 0.28527480721341975
Topic Modeling using 8 Topics
coherence score is 0.3048739050662722
Topic Modeling using 9 Topics
coherence score is 0.31658622737923936
Topic Modeling using 10 Topics
coherence score is 0.3074013410004034
Topic Modeling using 11 Topics
coherence score is 0.3249915413161666
Topic Modeling using 12 Topics
coherence score is 0.35607307751179595
Topic Modeling using 13 Topics
coherence score is 0.33309490784515156
Topic Modeling using 14 Topics
coherence score is 0.33502345938000166


***Based on the coherence scores above, 12 topics scored highest (≈0.356) among the 6–14 candidates tried.***

##### Some of the example topics

In [62]:
# NOTE(review): `print_topics`, `lda` and `count_vec` are not defined in any
# cell visible above — presumably they come from cells that were removed or
# executed out of order; confirm before relying on a fresh-kernel run.
print_topics(lda, count_vec, 10)


Topic #0:
new age smoke old year today blog suicid case cartoon

Topic #1:
ebola outbreak say vaccin case flu death africa cdc liberia

Topic #2:
drug diabet surgeri pay studi new help approv win fda

Topic #3:
thi healthi tri goodhealth recip day cynthiasass eat make amp

Topic #4:
nh today join healthtalk doctor patient miss thi work amp

Topic #5:
studi say kid risk doctor use drug teen parent cancer

Topic #6:
health insur care law report obamacar plan mental state rate

Topic #7:
weight diet food way make whi drink lose studi help

Topic #8:
cancer women studi heart babi risk breast new men nh

Topic #9:
hospit ebola patient new stori medic cell wa help citi


In [38]:
# NOTE(review): same call as an earlier cell, but this one carries a lower
# execution count and shows different topics, so `lda` / `count_vec` were
# presumably refit in between. None of `print_topics`, `lda` or `count_vec`
# is defined in the visible cells — confirm where they come from.
print_topics(lda, count_vec, 10)


Topic #0:
cancer study risk heart disease women help kids breast says

Topic #1:
goodhealth work fitness workout weight ways like cynthiasass food exercise

Topic #2:
sierra new leone free diabetes nursing 0000 ebola measles california

Topic #3:
ebola says flu patient outbreak cdc virus hospital vaccine new

Topic #4:
health nhs care insurance law obamacare mental new healthcare todays

Topic #5:
healthy day recipes amp try pharmalot good morning recipe pharma

Topic #6:
doctors medical patients medicare care nhs hospitals stefaniei health marijuana

Topic #7:
healthtalk diet know make join today things does dont weight

Topic #8:
drug study fda new finds blood risk cancer use drugs

Topic #9:
new age old cynthiasass goodhealth medicaid blog amp people food
