# LDA

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("articles1.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


In [4]:
# Keep only the article text. Selecting the column by name (rather than the
# positional slice 9:10) does not depend on the CSV's column order and gives the
# same result if this cell is re-run on the already-reduced frame. .copy() makes
# the result an independent frame, so adding the topic column to it later does
# not risk a SettingWithCopyWarning.
df = df[["content"]].copy()

In [5]:
df.head()

Unnamed: 0,content
0,WASHINGTON — Congressional Republicans have...
1,"After the bullet shells get counted, the blood..."
2,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,"Death may be the great equalizer, but it isn’t..."
4,"SEOUL, South Korea — North Korea’s leader, ..."


In [6]:
len(df)

50000

In [8]:
#df["content"][0]

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
# Bag-of-words term counts used as input to LDA.
#   min_df = 2     -> ignore terms that occur in fewer than 2 documents
#   max_df = 0.09  -> ignore terms that occur in more than 9% of the documents
#   stop_words     -> drop scikit-learn's built-in English stop-word list
# NOTE(review): the NMF section of this notebook uses max_df = 0.95; confirm that
# 0.09 is a deliberate aggressive filter here and not a typo for 0.9.
cv = CountVectorizer(max_df = 0.09, min_df = 2, stop_words = "english")

In [13]:
dtm = cv.fit_transform(df["content"])

In [14]:
dtm

<50000x91032 sparse matrix of type '<class 'numpy.int64'>'
	with 8090562 stored elements in Compressed Sparse Row format>

In [15]:
from sklearn.decomposition import LatentDirichletAllocation

In [17]:
LDA = LatentDirichletAllocation(n_components = 12, random_state = 42)

In [18]:
LDA.fit(dtm)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=12, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [19]:
#Grab the vocabulary of words

len(cv.get_feature_names())

91032

In [20]:
type(cv.get_feature_names())

list

In [21]:
#grab the topics

In [22]:
len(LDA.components_)

12

In [23]:
type(LDA.components_)

numpy.ndarray

In [24]:
LDA.components_.shape

(12, 91032)

In [25]:
LDA.components_

array([[8.33352292e-02, 8.33365092e-02, 8.33333333e-02, ...,
        2.08326091e+00, 8.33333333e-02, 8.33388054e-02],
       [6.88516371e+00, 8.33334880e-02, 8.33333333e-02, ...,
        8.33333333e-02, 8.33333333e-02, 8.33336602e-02],
       [9.53066637e+00, 8.33338358e-02, 8.33333333e-02, ...,
        8.33382884e-02, 8.33349385e-02, 8.33339988e-02],
       ...,
       [2.97399913e+02, 8.33333333e-02, 8.33333333e-02, ...,
        8.33437705e-02, 8.33333333e-02, 8.33333333e-02],
       [3.76640986e+00, 8.33336125e-02, 8.33333333e-02, ...,
        8.33377640e-02, 8.33336380e-02, 8.33339339e-02],
       [1.23927448e+03, 8.33336893e-02, 8.34886218e-02, ...,
        8.33333333e-02, 8.33333333e-02, 8.33333333e-02]])

In [26]:
single_topic = LDA.components_[0]

In [27]:
# argsort() does not sort the weights themselves: it returns the vocabulary
# index positions ordered from the lowest to the highest weight in this topic.
single_topic.argsort()

array([40103, 43230, 54888, ..., 31159, 24928, 87647], dtype=int64)

In [28]:
#example
import numpy as np

In [30]:
arr = np.array([100, 1, 200])

In [31]:
arr

array([100,   1, 200])

In [32]:
arr.argsort()

array([1, 0, 2], dtype=int64)

In [33]:
# argsort() returns index positions ordered from the least to the greatest value,
# so the 10 greatest weights are the last 10 entries of its result.
# Slicing with [-10:] therefore gives the vocabulary indices of the top 10 words,
# still in ascending order of weight (the strongest word comes last).
single_topic.argsort()[-10:]

array([13727, 59976, 90536, 23749, 25211, 77915, 51230, 31159, 24928,
       87647], dtype=int64)

In [34]:
top_ten_words = single_topic.argsort()[-10:]

In [36]:
# Resolve the vocabulary once instead of rebuilding the full feature list on
# every loop iteration. get_feature_names() was removed in scikit-learn 1.2, so
# use get_feature_names_out() when it exists and fall back on older releases.
try:
    feature_names = cv.get_feature_names_out()
except AttributeError:  # scikit-learn < 1.0 only provides get_feature_names()
    feature_names = cv.get_feature_names()

# top_ten_words holds vocabulary indices in ascending order of topic weight.
for index in top_ten_words:
    print(feature_names[index])

cancer
patients
zika
disease
drug
study
medical
food
dr
water


In [39]:
top_twenty_words = single_topic.argsort()[-20:]

In [40]:
# Resolve the vocabulary once instead of rebuilding the full feature list on
# every loop iteration. get_feature_names() was removed in scikit-learn 1.2, so
# use get_feature_names_out() when it exists and fall back on older releases.
try:
    feature_names = cv.get_feature_names_out()
except AttributeError:  # scikit-learn < 1.0 only provides get_feature_names()
    feature_names = cv.get_feature_names()

# top_twenty_words holds vocabulary indices in ascending order of topic weight.
for index in top_twenty_words:
    print(feature_names[index])

storm
brain
risk
virus
california
doctors
hospital
residents
research
cases
cancer
patients
zika
disease
drug
study
medical
food
dr
water


In [41]:
#grab the highest-probability words for each topic

In [42]:
# Resolve the vocabulary once, outside the loop: calling the vectorizer inside
# the comprehension rebuilt the whole feature list for every single word lookup
# (15 lookups for each of the 12 topics). get_feature_names() was removed in
# scikit-learn 1.2, so use get_feature_names_out() when it is available.
try:
    feature_names = cv.get_feature_names_out()
except AttributeError:  # scikit-learn < 1.0 only provides get_feature_names()
    feature_names = cv.get_feature_names()

for i, topic in enumerate(LDA.components_):
    # argsort() is ascending, so the last 15 indices carry the 15 largest weights.
    top_indices = topic.argsort()[-15:]
    print(f"The top 15 words for the topic #{i}")
    # str() keeps the printed list as plain quoted words even when the
    # vocabulary comes back as a NumPy array of strings.
    print([str(feature_names[index]) for index in top_indices])
    print("\n")
    print("\n")

The top 15 words for the topic #0
['doctors', 'hospital', 'residents', 'research', 'cases', 'cancer', 'patients', 'zika', 'disease', 'drug', 'study', 'medical', 'food', 'dr', 'water']




The top 15 words for the topic #1
['billion', 'online', 'ceo', 'users', 'tech', 'internet', 'intelligence', 'technology', 'data', 'google', 'companies', 'apple', 'putin', 'russian', 'russia']




The top 15 words for the topic #2
['culture', 'girls', 'sex', 'parents', 'yiannopoulos', 'transgender', 'gender', 'campus', 'gay', 'schools', 'milo', 'student', 'education', 'college', 'students']




The top 15 words for the topic #3
['turkey', 'britain', 'israel', 'terrorist', 'europe', 'syrian', 'iraq', 'forces', 'iran', 'european', 'minister', 'muslim', 'syria', 'islamic', 'isis']




The top 15 words for the topic #4
['room', 'parents', 'girl', 'sex', 'wife', 'friends', 'child', 'son', 'husband', 'fox', 'daughter', 'sexual', 'mother', 'father', 'ms']




The top 15 words for the topic #5
['actress', 'sho

In [43]:
dtm

<50000x91032 sparse matrix of type '<class 'numpy.int64'>'
	with 8090562 stored elements in Compressed Sparse Row format>

In [45]:
topic_results = LDA.transform(dtm)

In [46]:
topic_results[0]

array([3.04144966e-04, 3.04153309e-04, 3.04148514e-04, 3.04144667e-04,
       3.04149195e-04, 3.04144937e-04, 5.64841380e-01, 3.93262017e-01,
       3.04145716e-04, 3.04147408e-04, 3.04144536e-04, 3.91592798e-02])

In [47]:
topic_results[0].round(2)

array([0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.56, 0.39, 0.  , 0.  , 0.  ,
       0.04])

In [51]:
topic_results[0].argmax()

6

In [52]:
#finding the index number of the topic with the maximum probability for each document

In [53]:
df["topic"] = topic_results.argmax(axis=1)

In [54]:
# Preview the first rows with their assigned topic instead of rendering the
# whole 50,000-row frame.
df.head(10)

Unnamed: 0,content,topic
0,WASHINGTON — Congressional Republicans have...,6
1,"After the bullet shells get counted, the blood...",8
2,"When Walt Disney’s “Bambi” opened in 1942, cri...",5
3,"Death may be the great equalizer, but it isn’t...",5
4,"SEOUL, South Korea — North Korea’s leader, ...",6
5,"LONDON — Queen Elizabeth II, who has been b...",9
6,BEIJING — President Tsai of Taiwan sharpl...,6
7,"Danny Cahill stood, slightly dazed, in a blizz...",0
8,"Just how is Hillary Kerr, the founder of ...",5
9,Angels are everywhere in the Muñiz family’s ap...,4


# Non-Negative Matrix Factorization

In [9]:
import pandas as pd
import numpy as np

In [10]:
df = pd.read_csv("articles1.csv")

In [11]:
# Keep only the article text. Selecting the column by name (rather than the
# positional slice 9:10) does not depend on the CSV's column order and gives the
# same result if this cell is re-run on the already-reduced frame. .copy() makes
# the result an independent frame, so adding the Topic column to it later does
# not risk a SettingWithCopyWarning.
df = df[["content"]].copy()

In [12]:
df.head()

Unnamed: 0,content
0,WASHINGTON — Congressional Republicans have...
1,"After the bullet shells get counted, the blood..."
2,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,"Death may be the great equalizer, but it isn’t..."
4,"SEOUL, South Korea — North Korea’s leader, ..."


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
tfidf = TfidfVectorizer(max_df = 0.95, min_df = 2, stop_words = "english")

In [15]:
dtm = tfidf.fit_transform(df["content"])

In [16]:
dtm

<50000x91380 sparse matrix of type '<class 'numpy.float64'>'
	with 10807637 stored elements in Compressed Sparse Row format>

In [18]:
from sklearn.decomposition import NMF

In [20]:
nmf_model = NMF(n_components = 7, random_state = 42)

In [21]:
nmf_model.fit(dtm)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=7, random_state=42, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [24]:
tfidf.get_feature_names()[300]

'122nd'

In [26]:
# Resolve the TF-IDF vocabulary once, outside the loop: calling the vectorizer
# inside the comprehension rebuilt the whole feature list for every single word
# lookup. get_feature_names() was removed in scikit-learn 1.2, so use
# get_feature_names_out() when it is available.
try:
    tfidf_feature_names = tfidf.get_feature_names_out()
except AttributeError:  # scikit-learn < 1.0 only provides get_feature_names()
    tfidf_feature_names = tfidf.get_feature_names()

for index, topic in enumerate(nmf_model.components_):
    # argsort() is ascending, so the last 15 indices carry the 15 largest weights.
    top_indices = topic.argsort()[-15:]
    print(f"Top 15 words for the topic # {index}")
    # str() keeps the printed list as plain quoted words even when the
    # vocabulary comes back as a NumPy array of strings.
    print([str(tfidf_feature_names[i]) for i in top_indices])
    print("\n")

Top 15 words for the topic # 0
['voters', 'going', 'election', 'party', 'nominee', 'rubio', 'presidential', 'gop', 'said', 'campaign', 'president', 'republican', 'cruz', 'donald', 'trump']


Top 15 words for the topic # 1
['know', 'time', 'health', 'year', 'years', 'company', 'going', 'don', 'think', 'women', 'just', 'new', 'like', 'people', 'said']


Top 15 words for the topic # 2
['did', 'obama', '_____', 'officials', 'states', 'senator', 'campaign', 'party', 'new', 'united', 'trump', 'mrs', 'ms', 'said', 'mr']


Top 15 words for the topic # 3
['poll', 'bernie', 'email', 'emails', 'presidential', 'percent', 'foundation', 'mrs', 'voters', 'state', 'democratic', 'campaign', 'sanders', 'hillary', 'clinton']


Top 15 words for the topic # 4
['president', 'state', 'states', 'islamic', 'iran', 'syrian', 'obama', 'said', 'china', 'united', 'military', 'north', 'isis', 'syria', 'korea']


Top 15 words for the topic # 5
['enforcement', 'killed', 'shot', 'black', 'attack', 'city', 'shooting', 

In [27]:
topic_results = nmf_model.transform(dtm)

In [28]:
topic_results.argmax(axis = 1)

array([6, 5, 2, ..., 0, 1, 1], dtype=int64)

In [29]:
# Label every article with the index of its highest-scoring NMF component
# (argmax taken across each row of topic_results).
# NOTE(review): the LDA section stores the equivalent label in a lowercase
# "topic" column; consider using one spelling in both sections.
df["Topic"] = topic_results.argmax(axis = 1)

In [30]:
df.head()

Unnamed: 0,content,Topic
0,WASHINGTON — Congressional Republicans have...,6
1,"After the bullet shells get counted, the blood...",5
2,"When Walt Disney’s “Bambi” opened in 1942, cri...",2
3,"Death may be the great equalizer, but it isn’t...",1
4,"SEOUL, South Korea — North Korea’s leader, ...",4
