# Topic Modeling

## LDA (Latent Dirichlet Allocation) 

In [8]:
import pandas as pd
import matplotlib.pyplot as plt

In [9]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
headlines_df=pd.read_csv("abcnews-date-text.csv")

In [11]:
NUM_SAMPLES=10000

In [12]:
headlines_df=headlines_df.sample(NUM_SAMPLES,replace=False).reset_index(drop=True)

In [13]:
headlines_df.head()

Unnamed: 0,publish_date,headline_text
0,20140729,political experts respond to promise tracker
1,20040624,still hope for alma pool
2,20040629,chaffey dam to undergo safety work
3,20080819,elderly man stable after being run over by tra...
4,20140717,jail psych gone


In [14]:
x=headlines_df["headline_text"]

In [15]:
count_vectorizer=CountVectorizer(ngram_range=(2,2))

In [16]:
transformed_vector=count_vectorizer.fit_transform(x)

In [17]:
transformed_vector.shape

(10000, 44347)

In [18]:
Num_Topics=20
lda_model=LatentDirichletAllocation(n_components=Num_Topics,max_iter=20)
lda=lda_model.fit_transform(transformed_vector)

In [22]:
feature_names=count_vectorizer.get_feature_names()

In [23]:
def print_topic(identifier,top_words=10):
    for topic_id,topic_word_probs in enumerate(lda_model.components_):
        if topic_id==identifier:
            print("Topic: ",topic_id)
            top_feature_names=[feature_names[i] for i in topic_word_probs.argsort()[:-top_words-1:-1]]

            print("".join(top_feature_names))

In [24]:
for i in range(14):
    print(print_topic(i,top_words=3))

Topic:  0
to beunder firefire over
None
Topic:  1
the drumdrum tuesdaydrum friday
None
Topic:  2
accused ofto bemental health
None
Topic:  3
killed inout ofto be
None
Topic:  4
to beof theabc weather
None
Topic:  5
urged tocalls forin the
None
Topic:  6
jailed forof thecourt over
None
Topic:  7
man chargedcharged overto be
None
Topic:  8
urged toto thecourt over
None
Topic:  9
to beclimate changeto face
None
Topic:  10
found inabc sportin hospital
None
Topic:  11
out ofto beon the
None
Topic:  12
accused ofto becalls for
None
Topic:  13
urged todonald trumpto be
None


In [25]:
topics=[]
for i in range(NUM_SAMPLES):
    topics.append(lda[i].argmax())

In [26]:
headlines_df["Topics"]=topics

In [27]:
headlines_df.head()

Unnamed: 0,publish_date,headline_text,Topics
0,20140729,political experts respond to promise tracker,15
1,20040624,still hope for alma pool,15
2,20040629,chaffey dam to undergo safety work,1
3,20080819,elderly man stable after being run over by tra...,2
4,20140717,jail psych gone,1


In [35]:
import pyLDAvis.sklearn

In [36]:
pyLDAvis.enable_notebook()

In [37]:
panel=pyLDAvis.sklearn.prepare(lda_model,transformed_vector,count_vectorizer,mds='tsne')
panel