# Topic clustering with NMF  
reference : https://predictivehacks.com/topic-modelling-with-nmf-in-python/

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [3]:
doc = pd.read_csv('news-data.csv')

In [6]:
doc.info(); doc.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1103663 entries, 0 to 1103662
Data columns (total 2 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   publish_date   1103663 non-null  int64 
 1   headline_text  1103663 non-null  object
dtypes: int64(1), object(1)
memory usage: 16.8+ MB


Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


## Create TF-IDF

In [8]:
tfidf = TfidfVectorizer(max_df = 50, stop_words='english',)

In [9]:
X = tfidf.fit_transform(doc['headline_text'])

In [10]:
X.shape

(1103663, 85287)

In [12]:
nmf = NMF(n_components=10, random_state=1)
nmf.fit(X)



NMF(n_components=10, random_state=1)

In [23]:
nmf_featues = nmf.transform(X)
nmf_components = nmf.components_

In [26]:
print(X.shape); print(nmf_featues.shape); print(nmf_components.shape)

(1103663, 85287)
(1103663, 10)
(10, 85287)


In [27]:
components_df = pd.DataFrame(data = nmf.components_, columns=tfidf.get_feature_names())

In [28]:
components_df.head()

Unnamed: 0,002,005,006,007,0101,010115,010213,010215,010216,010312,...,zydelig,zygar,zygiefs,zygier,zyl,zylvester,zynga,zyngier,zz,zzz
0,0.0,0.0,7.984606e-08,2.159463e-09,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.681604e-09,7.863469e-09,1.96437e-07,0.0,2.582639e-08,0.0,0.0,0.0,0.0
1,0.0,0.0,6.583072e-11,9.281664e-09,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.412834e-09,3.891805e-09,2.584864e-09,0.0,4.844994e-11,0.0,0.0,0.0,0.0
2,0.0,0.0,5.22146e-10,3.452457e-09,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.402462e-10,2.059843e-08,6.267952e-08,0.0,9.348107e-09,0.0,0.0,0.0,0.0
3,0.0,0.0,4.480969e-09,4.75989e-10,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.877383e-10,2.739323e-09,4.759464e-09,0.0,2.066289e-10,0.0,0.0,0.0,0.0
4,0.0,0.0,7.078448e-09,4.953638e-09,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,9.103785e-10,1.463888e-08,2.316926e-08,0.0,2.408973e-09,0.0,0.0,0.0,0.0


Highest topic name

In [35]:
for i in range(components_df.shape[0]):
    print(i)
    print(components_df.iloc[i,:].nlargest(10))

0
scuba         4.986055
ryles         0.418904
19yo          0.256134
looters       0.212070
instructor    0.167226
shallow       0.147304
dawesville    0.063867
1968          0.059482
grapple       0.015564
haunted       0.015526
Name: 0, dtype: float64
1
greenough      2.618933
fels           0.259088
hamlet         0.032273
repo           0.031917
maleys         0.028882
gasfield       0.028882
archaeology    0.021585
unearthing     0.021576
ingrid         0.017856
payback        0.014068
Name: 1, dtype: float64
2
demerit        2.854901
ract           0.096860
fishermens     0.079257
justify        0.075841
watford        0.039635
corrects       0.039065
tunnels        0.007648
bluefin        0.007377
cooperative    0.006645
commitments    0.005139
Name: 2, dtype: float64
3
meekatharra     2.714847
assisting       0.137146
laneway         0.105685
23m             0.067462
brawling        0.052112
maroochydore    0.038883
solvent         0.037988
doray           0.036630
inflaming 

In [37]:
nmf_featues.shape

(1103663, 10)

In [73]:
nmf_featues[:,0].argsort()[::-1]

array([328955, 926730, 781591, ..., 669747, 669745,      0])

In [74]:
doc.iloc[nmf_featues[:,0].argsort()[::-1]]

Unnamed: 0,publish_date,headline_text
328955,20070906,distant space collision meant doom for dinosaurs
926730,20141230,reptile park on edge as keepers stage successf...
781591,20130424,indigenous pow gunner percy to be remembered i...
87375,20040428,bets ruling restarts cane toad races
1096681,20171028,spain strips catalonia of autonomy after indep...
...,...,...
669753,20120202,ch alpaca shearing
669748,20120202,candidates vie for calder ward spot
669747,20120202,call to bring elephants to australia
669745,20120202,building approvals fall


In [78]:
components_df.iloc[0].nlargest(10)

scuba         4.986055
ryles         0.418904
19yo          0.256134
looters       0.212070
instructor    0.167226
shallow       0.147304
dawesville    0.063867
1968          0.059482
grapple       0.015564
haunted       0.015526
Name: 0, dtype: float64