In [1]:
import os
import pandas as pd

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

In [32]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted feature names for every topic of a model.

    Parameters
    ----------
    model : object exposing ``components_``
        Each entry of ``components_`` is treated as one topic and must
        support ``argsort()`` (for example a NumPy row of weights).
    feature_names : sequence
        Indexed with the positions returned by ``argsort()`` to look up the
        label of each feature.
    n_top_words : int
        How many of the largest-weight features to list per topic.

    Writes to stdout: for each topic a ``Topic #<index>:`` header followed by
    a ``|``-separated line of feature names, then a single blank line after
    the last topic.
    """
    for number, weights in enumerate(model.components_):
        print("Topic #{:d}:".format(number))
        # argsort is ascending, so reverse it and keep the leading entries.
        strongest = weights.argsort()[::-1][:n_top_words]
        labels = [feature_names[position] for position in strongest]
        print("|".join(labels))
    # One trailing blank line after all topics have been listed.
    print()

In [4]:
# Fetch the 20 Newsgroups corpus. Shuffling is seeded (random_state=1) and the
# `remove` tuple asks the loader to strip headers, footers and quoted replies
# from each post.
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)
# Disabled subsampling, kept for reference:
#   data_samples = dataset.data[:n_samples]

Downloading dataset from http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz (14 MB)


In [36]:
# Tunable settings for the topic-model cells.
# NOTE(review): no active cell visible here reads n_samples (its only use is a
# commented-out slice of the dataset) — confirm before relying on or removing it.
n_samples = 3000
# Passed as `max_features` to the TF-IDF vectorizer.
n_features = 1000
# Passed as `n_components` to NMF, i.e. the number of topics to extract.
n_topics = 10
# Number of feature names print_top_words lists for each topic.
n_top_words = 20

In [42]:
# Work with every document returned by the loader (no subsampling applied).
data_samples = dataset.data

In [43]:
# Sanity check: wrap the documents in a DataFrame only to display its shape
# as (rows, columns).
pd.DataFrame(data_samples).shape

(11314, 1)

In [44]:
# Peek at one raw document (index 1) to see what the text looks like.
data_samples[1]

"\n\n\n\n\n\n\nYeah, do you expect people to read the FAQ, etc. and actually accept hard\natheism?  No, you need a little leap of faith, Jimmy.  Your logic runs out\nof steam!\n\n\n\n\n\n\n\nJim,\n\nSorry I can't pity you, Jim.  And I'm sorry that you have these feelings of\ndenial about the faith you need to get by.  Oh well, just pretend that it will\nall end happily ever after anyway.  Maybe if you start a new newsgroup,\nalt.atheist.hard, you won't be bummin' so much?\n\n\n\n\n\n\nBye-Bye, Big Jim.  Don't forget your Flintstone's Chewables!  :) \n--\nBake Timmons, III"

In [11]:
# Configure the TF-IDF feature extractor (parameter meanings per the
# scikit-learn TfidfVectorizer documentation):
#   max_df=0.95   -> ignore terms found in more than 95% of the documents
#   min_df=2      -> ignore terms found in fewer than 2 documents
#   max_features  -> cap the vocabulary size at n_features
#   stop_words    -> use the built-in English stop-word list
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=n_features,
    stop_words='english',
)

In [12]:
# Fit the vectorizer on the documents and transform them in one step; the
# resulting document-term matrix is kept in `tfidf`.
tfidf = tfidf_vectorizer.fit_transform(data_samples)

In [25]:
# Factorize the TF-IDF matrix into n_topics components; random_state is fixed
# so the run is repeatable.
# NOTE(review): the `alpha` keyword belongs to the older scikit-learn NMF API
# (deprecated in 1.0 in favour of alpha_W / alpha_H, which are scaled
# differently) — confirm the installed version before re-running.
nmf = NMF(
    n_components=n_topics,
    random_state=1,
    alpha=.1,
    l1_ratio=.5,
).fit(tfidf)

In [33]:
# Look up the vocabulary terms in column order so topic weights can be mapped
# back to words. `get_feature_names()` was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer `get_feature_names_out()` when the installed version
# provides it and fall back to the legacy method otherwise.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
# Show the n_top_words strongest terms for each NMF topic.
print_top_words(nmf, tfidf_feature_names, n_top_words)

Topic #0:
just|people|don|like|think|know|good|time|make|use|way|really|right|say|ve|want|government|did|ll|going
Topic #1:
windows|file|dos|files|program|using|use|os|problem|help|running|drivers|pc|ftp|ms|version|available|screen|software|work
Topic #2:
god|jesus|bible|faith|christian|christ|christians|does|heaven|sin|believe|lord|life|church|mary|atheism|belief|human|love|religion
Topic #3:
edu|soon|com|send|university|internet|mit|ftp|mail|cc|article|pub|information|hope|mac|email|blood|home|contact|program
Topic #4:
thanks|know|does|mail|advance|hi|info|interested|email|anybody|card|looking|help|like|appreciated|information|video|send|list|need
Topic #5:
drive|drives|hard|disk|floppy|software|mac|scsi|computer|controller|power|apple|mb|rom|pc|problem|card|internal|problems|cable
Topic #6:
window|manager|application|motif|problem|display|graphics|use|standard|time|possible|try|using|screen|tried|doesn|faq|sun|certain|area
Topic #7:
game|team|games|year|win|play|season|players|nhl|r

In [31]:
# Inspect the fitted factor matrix; print_top_words iterates its rows, treating
# each row as one topic's weights over the features.
nmf.components_

array([[ 0.        ,  0.06652185,  0.14421642, ...,  0.27390615,
         0.16496368,  0.05528831],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.03183867,  0.        ],
       ..., 
       [ 0.        ,  0.01208093,  0.04691671, ...,  0.03698489,
         0.        ,  0.        ],
       [ 0.        ,  0.1188873 ,  0.01927582, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.78157279,  0.01706168,  0.27060781, ...,  0.        ,
         0.        ,  0.        ]])

In [13]:
# Check which container type fit_transform returned for the TF-IDF matrix.
type(tfidf)

scipy.sparse.csr.csr_matrix