# 20 newsgroups dataset shows NMF parts-based decomposition
http://scikit-learn.org/stable/datasets/twenty_newsgroups.html

In [1]:
from sklearn.datasets import fetch_20newsgroups
categories = ['alt.atheism', 'talk.religion.misc', 'talk.politics.guns', 'sci.space', 'talk.politics.misc']
newsgroups_train = fetch_20newsgroups(subset='train', 
                                      remove=('headers', 'footers', 'quotes'),
                                      categories=categories)

from pprint import pprint
pprint(list(newsgroups_train.target_names)) 

['alt.atheism',
 'sci.space',
 'talk.politics.guns',
 'talk.politics.misc',
 'talk.religion.misc']


In [2]:
newsgroups_train.filenames.shape

(2461,)

In [3]:
newsgroups_train.target.shape

(2461,)

In [4]:
newsgroups_train.target[:10] #  The target attribute is the integer index of the category

array([2, 3, 3, 1, 4, 0, 2, 2, 1, 2])

## Prepare vectors from train set using CountVectorizer
See http://scikit-learn.org/stable/modules/feature_extraction.html#common-vectorizer-usage

In [169]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words='english', max_df=0.9, min_df=0.01)
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=None, min_df=0.01,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [170]:
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors

<2461x1465 sparse matrix of type '<class 'numpy.int64'>'
	with 93788 stored elements in Compressed Sparse Row format>

In [171]:
analyze = vectorizer.build_analyzer()
analyze("This is a text document to analyze.") == (['text', 'document', 'analyze'])

True

In [172]:
len(vectorizer.get_feature_names())

1465

In [173]:
vectorizer.get_feature_names()[500:510]

['father',
 'fault',
 'fbi',
 'fear',
 'federal',
 'feds',
 'feel',
 'feet',
 'field',
 'fight']

In [174]:
vectorizer.vocabulary_.get('attitude')

144

In [202]:
feature_names = np.array(vectorizer.get_feature_names())
sparse_array_of_features = vectorizer.transform(['Something completely new.'])
#feature_names[sparse_array_of_features.toarray().nonzero()[1]]
feature_names[sparse_array_of_features.tocoo().col]

array(['completely', 'new'], 
      dtype='<U15')

In [176]:
vectors.nnz / float(vectors.shape[0])

38.10971149939049

## Extract components using NMF from sklearn

In [177]:
from sklearn.decomposition import NMF

model = NMF(n_components=10)
model.fit(vectors)
vectors_transformed = model.transform(vectors)

In [181]:
print(model.components_.shape)

(10, 1465)


In [182]:
feature_names = np.array(vectorizer.get_feature_names())

for component in model.components_:
    indices = np.argpartition(component, -10)[-8:] # take 10 words with biggest value    
    print(feature_names[indices])    

['united' 'firearms' 'states' 'congress' 'mr' 'file' 'gun' 'control']
['did' 'said' 'think' 'mr' 'know' 'going' 'don' 'president']
['earth' 'shuttle' 'information' 'available' 'edu' 'nasa' 'space' 'lunar']
['believe' 'religious' 'god' 'atheism' 'true' 'does' 'people' 'atheists']
['program' 'american' 'government' 'think' 'official' 'russian' 'president'
 'administration']
['plan' 'nuclear' 'time' 'british' 'war' 'south' 'military' 'new']
['course' 'david' 'said' 'king' 'lord' 'people' 'jesus' 'matthew']
['secretary' 'school' 'want' 'summer' 'young' 'people' 'jobs' 'work']
['market' 'services' 'year' 'satellites' 'commercial' 'satellite' 'launch'
 'space']
['person' 'license' 'shall' 'dangerous' 'military' 'firearm' 'section'
 'weapon']


In [183]:
model.transform(vectors[0])

array([[ 0.00170674,  0.03223595,  0.        ,  0.07593286,  0.        ,
         0.00189783,  0.0280003 ,  0.12034418,  0.        ,  0.07486981]])

## Prepare vectors from train set using CountVectorizer with bi-grams of words

In [203]:
bigram_vectorizer = CountVectorizer(ngram_range=(1,5), 
                                    stop_words='english', max_df=0.95, min_df=0.001)
analyze = bigram_vectorizer.build_analyzer()

In [204]:
analyze('Bi-grams are cool!') == (['bi grams', 'grams cool', 'bi grams cool'])

False

In [186]:
bigram_vectors = bigram_vectorizer.fit_transform(newsgroups_train.data)
bigram_vectors

<2461x5503 sparse matrix of type '<class 'numpy.int64'>'
	with 25106 stored elements in Compressed Sparse Row format>

In [187]:
len(bigram_vectorizer.get_feature_names())

5503

In [188]:
bigram_vectorizer.get_feature_names()[500:510]

['basis values',
 'batf fbi',
 'batf needed',
 'batf proper',
 'bauer arndt',
 'bay area',
 'bbs 513',
 'bbs accessed',
 'bd did',
 'bear arms']

In [189]:
bigram_vectors.nnz / float(bigram_vectors.shape[0])

10.2015440877692

In [191]:
from sklearn.decomposition import NMF

bigram_model = NMF(n_components=10)
bigram_model.fit(bigram_vectors)
bigram_vectors_transformed = bigram_model.transform(bigram_vectors)

In [192]:
print(bigram_model.components_.shape)

(10, 5503)


In [194]:
feature_names = np.array(bigram_vectorizer.get_feature_names())

for component in bigram_model.components_:
    indices = np.argpartition(component, -10)[-10:] # take 10 words with biggest value    
    print(feature_names[indices])    

['stimulus package' 'senator dole' 'working groups' 'white house'
 'task force' 'going continue' 'stephanopoulos think' 'mr stephanopoulos'
 'don know' 'stephanopoulos don']
['journal medicine' 'england journal' 'new england' 'waiting period'
 'title 18' '18 united' 'states code' 'gun control' 'united states'
 'second amendment']
['mr teel' '1993 15' 'don like' 'uchicago edu' 'ted frank' 'apr 1993'
 'dendrite cs' 'ajteel dendrite' 'colorado edu' 'cs colorado']
['thu apr' 'apr 14' 'arc nasa' 'pub space' 'nasa gov' 'directory pub'
 'available anonymous' 'anonymous ftp' 'sci space' 'space shuttle']
['work force' 'high school' 'business community' 'young people'
 'summer jobs' 'jobs program' 'president think' 'private sector'
 'mr president' 'secretary reich']
['don think' 'continue work' 'task force' 'russian aid' 'working groups'
 'dee dee' 'stimulus package' 'don know' 'jobs package' 'health care']
['think important' 'soviet union' 'president clinton' 'southern hemisphere'
 'private sec