In [2]:
# Author: Olivier Grisel <olivier.grisel@ensta.org>
#         Lars Buitinck <L.J.Buitinck@uva.nl>
#http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf.html
# License: BSD 3 clause

from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.datasets import fetch_20newsgroups

# Sizing knobs for the topic-extraction demo.
n_samples, n_features = 1000, 600  # posts vectorized / vocabulary size cap
n_topics, n_top_words = 10, 20     # NMF components / terms shown per topic

# Fetch the 20 newsgroups corpus and turn its first `n_samples` posts into a
# TF-IDF matrix. Noise is trimmed up front: headers, footers and quoted
# replies are stripped from every post, common English words are dropped,
# and so are terms that occur in fewer than two documents or in more than
# 95% of them. The vocabulary is capped at `n_features` terms.

t0 = time()
print("Loading dataset and extracting TF-IDF features...")
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)

vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=n_features,
    min_df=2,
    max_df=0.95,
)
tfidf = vectorizer.fit_transform(dataset.data[:n_samples])
print("done in %0.3fs." % (time() - t0))

# Fit the NMF model on the TF-IDF matrix, extracting `n_topics` components.
print("Fitting the NMF model with n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
# Restart the timer so the duration printed below covers only the fit,
# not the dataset loading and vectorization that ran before it.
t0 = time()
nmf = NMF(n_components=n_topics, random_state=1).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

# Vocabulary terms, indexed by TF-IDF column. scikit-learn 1.0 introduced
# get_feature_names_out() and 1.2 removed get_feature_names(), so prefer the
# new accessor and fall back to the old one on older releases.
_get_names = getattr(vectorizer, "get_feature_names_out", None)
if _get_names is None:
    _get_names = vectorizer.get_feature_names
feature_names = _get_names()

# For each topic (one row of nmf.components_), print the `n_top_words` terms
# carrying the largest weights, heaviest first.
for topic_idx, topic in enumerate(nmf.components_):
    print("Topic #%d:" % topic_idx)
    # argsort() is ascending; reverse it and keep the leading entries.
    top_indices = topic.argsort()[::-1][:n_top_words]
    print(" ".join(feature_names[i] for i in top_indices))
    print()

Loading dataset and extracting TF-IDF features...
done in 1.967s.
Fitting the NMF model with n_samples=1000 and n_features=600...
done in 448.793s.
Topic #0:
think don pretty list switch win easy going opinion yes early general com appears unless large left add hp case

Topic #1:
edu soon internet send com home mit good university cs robert address need info mail post years reply ftp sun

Topic #2:
car good year cars bike power new insurance buy got great small used light time ve stop started like years

Topic #3:
thanks know mail advance interested hi email does list com want send help anybody post info like net tell reply

Topic #4:
windows problem software using drive use file card help window monitor dos work pc application files version available drivers disk

Topic #5:
just ll heard sure new does thought got like thing want mean way read bit doesn maybe bad say right

Topic #6:
key chip clipper government encryption keys use phone public law standard doesn court yes legal going d

