In [1]:
# Importing Necessary packages

import numpy as np
 
from sklearn.datasets import fetch_20newsgroups
 
from sklearn.feature_extraction.text import TfidfVectorizer
 
from sklearn.decomposition import NMF

In [6]:
# Load the 20 Newsgroups posts (default subset), asking scikit-learn to strip
# headers, footers and quoted replies so only the message bodies remain.
newsgroups = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))
text_data = newsgroups.data

# Peek at the first three raw documents.
text_data[:3]

['I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.',
 "A fair number of brave souls who upgraded their SI clock oscillator have\nshared their experiences for this poll. Please send a brief message detailing\nyour experiences with the procedure. Top speed attained, CPU rated speed,\nadd on cards and adapters, heat sinks, hour of usage per day, floppy disk\nfunctionality with 800 and 1.4 m floppies are especially requested.\n\nI will be summarizing in the next two days, so please add to the network\nknowledge base if you have done the clock upgrade and haven't an

In [12]:
# Convert the raw posts into a TF-IDF document-term matrix.
# The vocabulary is capped at 1500 terms (max_features), terms that occur in
# fewer than 10 documents are dropped (min_df), and English stop words are removed.
vectorizer = TfidfVectorizer(max_features=1500, min_df=10, stop_words='english')

# X is a sparse matrix: one row per document, one column per vocabulary term.
X = vectorizer.fit_transform(text_data)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
_get_feature_names = getattr(vectorizer, "get_feature_names_out", None)
if _get_feature_names is None:
    _get_feature_names = vectorizer.get_feature_names

# Vocabulary as an array so it can be fancy-indexed by column position of X.
words = np.asarray(_get_feature_names())
print(X)
# Label corrected: this line prints the vocabulary, not X.
print("words=", words)

  (0, 1472)	0.18550765645757622
  (0, 278)	0.6305581416061171
  (0, 1191)	0.17201525862610717
  (0, 411)	0.1424921558904033
  (0, 469)	0.20099797303395192
  (0, 808)	0.183033665833931
  (0, 767)	0.18711856186440218
  (0, 484)	0.1714763727922697
  (0, 273)	0.14279390121865665
  (0, 1118)	0.12154002727766958
  (0, 1256)	0.15350324219124503
  (0, 128)	0.190572546028195
  (0, 1218)	0.19781957502373115
  (0, 1158)	0.16511514318854434
  (0, 247)	0.17513150125349705
  (0, 757)	0.09424560560725694
  (0, 887)	0.176487811904008
  (0, 506)	0.1941399556509409
  (0, 1495)	0.1274990882101728
  (0, 672)	0.169271507288906
  (0, 707)	0.16068505607893965
  (0, 809)	0.1439640091285723
  (0, 829)	0.1359651513113477
  (1, 411)	0.14622796373696134
  (1, 546)	0.20534935893537723
  :	:
  (11312, 1486)	0.183845539553728
  (11312, 1409)	0.2006451645457405
  (11312, 926)	0.2458009890045144
  (11312, 1100)	0.1839292570975713
  (11312, 1276)	0.39611960235510485
  (11312, 1302)	0.2391477981479836
  (11312, 647)	0.2

In [13]:
# Apply Non-negative Matrix Factorization: approximate X by the product W @ H
# with 10 components (topics), using the multiplicative-update solver.
# random_state is pinned so the initialisation, and therefore the topic order
# and weights, are reproducible across kernel restarts.
nmf = NMF(n_components=10, solver="mu", random_state=42)

# W: document-topic weights, one row per document and one column per topic.
W = nmf.fit_transform(X)

# H: topic-term weights, one row per topic and one column per vocabulary term.
H = nmf.components_

In [14]:
# Print the 10 highest-weighted terms of every topic (one row of H per topic).
# The original indexed `idx_to_word`, which is never defined in this notebook;
# the vocabulary array built from the vectorizer is called `words`.
for i, topic in enumerate(H):
    # argsort() is ascending, so the last 10 positions are the strongest terms,
    # listed from weakest to strongest.
    top_terms = words[topic.argsort()[-10:]]
    print("Topic {}: {}".format(i + 1, ",".join(str(term) for term in top_terms)))

Topic 1: really,people,ve,time,good,know,think,like,just,don
Topic 2: info,help,looking,card,hi,know,advance,mail,does,thanks
Topic 3: church,does,christians,christian,faith,believe,christ,bible,jesus,god
Topic 4: league,win,hockey,play,players,season,year,games,team,game
Topic 5: bus,floppy,card,controller,ide,hard,drives,disk,scsi,drive
Topic 6: 20,price,condition,shipping,offer,space,10,sale,new,00
Topic 7: problem,running,using,use,program,files,window,dos,file,windows
Topic 8: law,use,algorithm,escrow,government,keys,clipper,encryption,chip,key
Topic 9: state,war,turkish,armenians,government,armenian,jews,israeli,israel,people
Topic 10: email,internet,pub,article,ftp,com,university,cs,soon,edu


In [19]:
# Preview the top-left corner of W: at most the first 10 rows and 10 columns.
w_preview = W[:10, :10]
print(w_preview)

[[3.14912746e-02 2.94542038e-02 0.00000000e+00 3.33333245e-03
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [6.20557576e-03 2.95497861e-02 1.07989433e-08 5.19817369e-04
  3.18118742e-02 8.04393768e-03 0.00000000e+00 4.99785893e-03
  2.82899920e-08 2.95957405e-04]
 [6.57082024e-02 6.11330960e-02 0.00000000e+00 8.18622592e-03
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [3.51420347e-03 2.70163687e-02 0.00000000e+00 0.00000000e+00
  0.00000000e+00 2.25431949e-02 0.00000000e+00 8.78948967e-02
  0.00000000e+00 4.75400023e-17]
 [3.43312512e-02 6.34924081e-04 3.12610965e-03 0.00000000e+00
  0.00000000e+00 2.41521383e-02 1.04304968e-02 0.00000000e+00
  0.00000000e+00 1.10050280e-02]
 [1.54660994e-02 0.00000000e+00 3.72488017e-03 0.00000000e+00
  2.73645855e-10 3.59298123e-03 8.25479272e-03 0.00000000e+00
  1.79357458e-02 3.97412464e-03]
 [7.64105742e-03 6.41034640e-02 3.08040695e-04 2.52852526e

In [20]:
# Preview the top-left corner of H: at most the first 10 rows and 10 columns.
h_preview = H[:10, :10]
print(h_preview)

[[1.81147375e-17 1.26182249e-02 2.93518811e-05 1.08240436e-02
  6.18732299e-07 1.27435805e-05 9.91130274e-09 1.12246344e-05
  4.51400032e-69 3.01041384e-54]
 [2.21534787e-12 0.00000000e+00 1.33321050e-09 2.96731084e-12
  2.65374551e-03 3.91087884e-04 2.98944644e-04 6.24554050e-10
  9.53864192e-31 2.71257642e-38]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [6.31863318e-11 4.40713132e-02 1.77561863e-03 2.19458585e-03
  2.15120339e-03 2.61656616e-06 2.14906622e-03 2.30356588e-04
  3.83769479e-08 1.28390795e-07]
 [3.98775665e-13 4.07296556e-03 0.00000000e+00 9.13681465e-03
  0.00000000e+00 0.00000000e+00 4.33946044e-03 0.00000000e+00
  1.28457487e-09 2.25454495e-11]
 [1.00421506e+00 2.39129457e-01 8.01133515e-02 5.32229171e-02
  3.68883911e-02 7.27891875e-02 4.50046335e-02 4.26041069e-02
  4.65075342e-03 2.51480151e-03]
 [0.00000000e+00 0.00000000e+00 2.17982651e-02 0.00000000e