# 20 newsgroups dataset shows NMF parts-based decomposition
http://scikit-learn.org/stable/datasets/twenty_newsgroups.html

In [256]:
from sklearn.datasets import fetch_20newsgroups
categories = ['alt.atheism', 'talk.religion.misc', 'talk.politics.guns', 'sci.space', 'talk.politics.misc']
newsgroups_train = fetch_20newsgroups(subset='train', 
                                      remove=('headers', 'footers', 'quotes'),
                                      categories=categories)

from pprint import pprint
pprint(list(newsgroups_train.target_names)) 

['alt.atheism',
 'sci.space',
 'talk.politics.guns',
 'talk.politics.misc',
 'talk.religion.misc']


In [257]:
newsgroups_train.filenames.shape

(2461,)

In [258]:
newsgroups_train.target.shape

(2461,)

In [259]:
newsgroups_train.target[:10] #  The target attribute is the integer index of the category

array([2, 3, 3, 1, 4, 0, 2, 2, 1, 2])

## Prepare vectors from train set using CountVectorizer
See http://scikit-learn.org/stable/modules/feature_extraction.html#common-vectorizer-usage

In [260]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words='english', max_df=0.9, min_df=0.01)
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=None, min_df=0.01,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [261]:
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors

<2461x1465 sparse matrix of type '<class 'numpy.int64'>'
	with 93788 stored elements in Compressed Sparse Row format>

In [262]:
analyze = vectorizer.build_analyzer()
analyze("This is a text document to analyze.") == (['text', 'document', 'analyze'])

True

In [263]:
len(vectorizer.get_feature_names())

1465

In [264]:
vectorizer.get_feature_names()[500:510]

['father',
 'fault',
 'fbi',
 'fear',
 'federal',
 'feds',
 'feel',
 'feet',
 'field',
 'fight']

In [265]:
vectorizer.vocabulary_.get('attitude')

144

In [266]:
feature_names = np.array(vectorizer.get_feature_names())
sparse_array_of_features = vectorizer.transform(['Something completely new.'])
#feature_names[sparse_array_of_features.toarray().nonzero()[1]]
feature_names[sparse_array_of_features.tocoo().col]

array(['completely', 'new'], 
      dtype='<U15')

In [267]:
vectors.nnz / float(vectors.shape[0])

38.10971149939049

## Extract components using NMF from sklearn

In [268]:
from sklearn.decomposition import NMF

model = NMF(n_components=10)
model.fit(vectors)
vectors_transformed = model.transform(vectors)

In [269]:
print(model.components_.shape)

(10, 1465)


In [271]:
def reconsruction_error(X, reconstructed_X):
    return np.sqrt(np.power((X - reconstructed_X), 2).sum())
    
reconsruction_error(
    X,
    reconstructed_X=vectors_transformed.dot(model.components_))

557.53214316718129

In [273]:
feature_names = np.array(vectorizer.get_feature_names())

for component in model.components_:
    indices = np.argpartition(component, -10)[-8:] # take 10 words with biggest value    
    print(feature_names[indices])    

['united' 'firearms' 'states' 'congress' 'mr' 'file' 'gun' 'control']
['did' 'said' 'think' 'mr' 'know' 'going' 'don' 'president']
['earth' 'shuttle' 'information' 'available' 'edu' 'nasa' 'space' 'lunar']
['believe' 'religious' 'god' 'atheism' 'true' 'does' 'people' 'atheists']
['program' 'american' 'government' 'think' 'official' 'russian' 'president'
 'administration']
['plan' 'nuclear' 'time' 'british' 'war' 'south' 'military' 'new']
['course' 'david' 'said' 'king' 'lord' 'people' 'jesus' 'matthew']
['secretary' 'school' 'want' 'summer' 'young' 'people' 'jobs' 'work']
['market' 'services' 'year' 'satellites' 'commercial' 'satellite' 'launch'
 'space']
['person' 'license' 'shall' 'dangerous' 'military' 'firearm' 'section'
 'weapon']


In [274]:
model.transform(vectors[0])

array([[ 0.00170674,  0.03223594,  0.        ,  0.07593279,  0.        ,
         0.00189783,  0.0280003 ,  0.12034452,  0.        ,  0.07487013]])

## Prepare vectors from train set using CountVectorizer with bi-grams of words

In [275]:
bigram_vectorizer = CountVectorizer(ngram_range=(1,5), 
                                    stop_words='english', max_df=0.95, min_df=0.001)
analyze = bigram_vectorizer.build_analyzer()

In [276]:
analyze('Bi-grams are cool!') == (['bi grams', 'grams cool', 'bi grams cool'])

False

In [277]:
bigram_vectors = bigram_vectorizer.fit_transform(newsgroups_train.data)
bigram_vectors

<2461x18895 sparse matrix of type '<class 'numpy.int64'>'
	with 196207 stored elements in Compressed Sparse Row format>

In [278]:
len(bigram_vectorizer.get_feature_names())

18895

In [279]:
bigram_vectorizer.get_feature_names()[500:510]

['415',
 '415 864',
 '415 864 0952',
 '415 864 0952 fax',
 '415 864 0952 fax 415',
 '415 864 7506',
 '415 864 7506 71034',
 '415 864 7506 71034 2711',
 '416',
 '42']

In [280]:
bigram_vectors.nnz / float(bigram_vectors.shape[0])

79.72653392929703

In [281]:
from sklearn.decomposition import NMF

bigram_model = NMF(n_components=10)
bigram_model.fit(bigram_vectors)
bigram_vectors_transformed = bigram_model.transform(bigram_vectors)

In [282]:
print(bigram_model.components_.shape)

(10, 18895)


In [283]:
feature_names = np.array(bigram_vectorizer.get_feature_names())

for component in bigram_model.components_:
    indices = np.argpartition(component, -10)[-10:] # take 10 words with biggest value    
    print(feature_names[indices])    

['don know' 'said' 'think' 'going' 'know' 'president' 'mr stephanopoulos'
 'mr' 'stephanopoulos' 'don']
['rkba' 'control' 'gun' 'gun control' 'mr' 'file' 'congress' 'firearms'
 'united' 'states']
['people' 'believe' 'true' 'religion' 'does' 'argument' 'religious' 'god'
 'atheism' 'atheists']
['launches' 'data' 'year' 'services' 'satellites' 'satellite' 'market'
 'commercial' 'space' 'launch']
['american' 'president' 'government' 'administration' 'official' 'russian'
 'program' 'senior' 'russia' 'think']
['ships' 'naval' 'georgia' 'secret' 'island' 'nuclear' 'military' 'new'
 'south' 'war']
['king' 'lord' 'david' 'isaiah' 'messiah' 'jesus' 'prophecy' 'matthew'
 'people' 'said']
['mars' 'earth' 'shuttle' 'data' 'lunar' 'information' 'available' 'edu'
 'nasa' 'space']
['said' 'don' 'll' 'think' 'dee' 'going' 'options' 'package' 'president'
 'ms']
['secretary' 'want' 'think' 'school' 'young people' 'summer' 'jobs'
 'people' 'young' 'work']


## Using Keras for NNMF

In [284]:
import tensorflow as tf

import keras
from keras.layers import Embedding, Reshape, Dense, Activation
from keras.models import Sequential
from keras.constraints import nonneg

from keras import backend as K

sess = tf.InteractiveSession()
K.set_session(sess)

print(tf.__version__)
print(keras.__version__)

X = vectors
n_texts, n_words = X.shape
embedding_size = 10

0.12.1
1.2.1


In [285]:
model = Sequential()
weights_contraint = nonneg()
text_to_embedding = Embedding(n_texts, embedding_size, input_length=1, W_constraint=weights_contraint)
model.add(text_to_embedding)
model.add(Reshape((embedding_size,)))
model.add(Dense(output_dim=n_words, input_dim=embedding_size, bias=False, W_constraint=weights_contraint))

In [286]:
print(X[1])
model.predict(np.array([1]))

  (0, 1004)	1
  (0, 1291)	1
  (0, 731)	1
  (0, 811)	1
  (0, 106)	1


array([[-0.00046321, -0.00295585, -0.00397265, ..., -0.00422178,
         0.00104016,  0.00144252]], dtype=float32)

In [287]:
model.compile(optimizer='Adam',
              loss='mean_squared_error') # the way you compare the ground truth and result

In [288]:
X_text_ids = np.arange(n_texts)
X_text_ids

array([   0,    1,    2, ..., 2458, 2459, 2460])

In [289]:
model.fit(x=X_text_ids, y=X.toarray(), batch_size=256, verbose=0, nb_epoch=500)

<keras.callbacks.History at 0x13a653780>

In [292]:
reconsruction_error(
    X,
    reconstructed_X=model.predict(X_text_ids))

760.7298891632098

In [293]:
feature_names = np.array(vectorizer.get_feature_names())

for component in model.layers[2].weights[0].eval():
    indices = np.argpartition(component, -10)[-8:]
    print(feature_names[indices])

['god' 'believe' 'know' 'don' 'evidence' 'mr' 'president' 'does']
['president' 'know' 'does' 'jesus' 'bible' 'christian' 'mr' 'god']
['faq' 'send' 'president' 'space' 'information' 'edu' 'nasa' 'mr']
['know' 'mr' 'good' 'like' 'think' 'people' 'just' 'don']
['going' 'work' 'year' 'mr' 'nasa' 'president' 'space' 'program']
['don' 'people' 'think' 'know' 'going' 'did' 'mr' 'president']
['work' 'know' 'going' 'think' 'mr' 'people' 'president' 'jobs']
['study' '000' 'new' 'gun' 'guns' 'mr' 'firearms' 'weapons']
['know' 'don' 'think' 'states' 'mr' 'does' 'state' 'president']
['orbit' 'mission' 'satellite' 'data' 'nasa' 'shuttle' 'earth' 'space']
