# 20 newsgroups dataset shows NMF parts-based decomposition
http://scikit-learn.org/stable/datasets/twenty_newsgroups.html

In [1]:
from sklearn.datasets import fetch_20newsgroups
categories = ['alt.atheism', 'talk.religion.misc', 'talk.politics.guns', 'sci.space', 'talk.politics.misc']
newsgroups_train = fetch_20newsgroups(subset='train', 
                                      remove=('headers', 'footers', 'quotes'),
                                      categories=categories)

from pprint import pprint
pprint(list(newsgroups_train.target_names))

Downloading dataset from http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz (14 MB)


['alt.atheism',
 'sci.space',
 'talk.politics.guns',
 'talk.politics.misc',
 'talk.religion.misc']


In [2]:
newsgroups_train.filenames.shape

(2461,)

In [3]:
newsgroups_train.target.shape

(2461,)

In [4]:
newsgroups_train.target[:10] #  The target attribute is the integer index of the category

array([2, 3, 3, 1, 4, 0, 2, 2, 1, 2])

## Prepare vectors from train set using CountVectorizer
See http://scikit-learn.org/stable/modules/feature_extraction.html#common-vectorizer-usage

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words='english', max_df=0.9, min_df=0.01)
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=None, min_df=0.01,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [10]:
X = vectorizer.fit_transform(newsgroups_train.data)

## NMF from sklearn

In [72]:
from sklearn.decomposition import NMF

model = NMF(n_components=10)
documented_to_topics_weights = model.fit_transform(X)

print(model.reconstruction_err_)

def reconsruction_error(X, reconstructed_X):
    return np.sqrt(np.power((X - reconstructed_X), 2).sum())
    
reconsruction_error(
    X,
    reconstructed_X=documented_to_topics_weights.dot(model.components_))

557.5321444659795


557.53214446598042

## Extract components using NMF from keras

In [85]:
import tensorflow as tf

import numpy as np
import keras
from keras.layers import Embedding, Reshape, Dense
from keras.models import Sequential
from keras.constraints import nonneg

from keras import backend as K

sess = tf.InteractiveSession()
K.set_session(sess)

print(tf.__version__)
print(keras.__version__)


n_docs, n_words = X.shape
K = embedding_size = 10

X_doc_ids = np.arange(n_docs)


0.12.0-rc0
1.1.0


In [86]:
weights_contraint = nonneg()
docs_to_embeddings = Embedding(input_dim=n_docs, output_dim=embedding_size, input_length=1, 
                                init='uniform', W_constraint=weights_contraint)

embeddings_to_docs = Dense(output_dim=n_words, bias=False, W_constraint=weights_contraint)

model = Sequential([
    docs_to_embeddings,
    Reshape((embedding_size,)), # getting rid of the superfluous dimension of input_length=1
    embeddings_to_docs])

In [None]:
from keras.optimizers import Adam, SGD

model.compile(loss='mean_squared_error', optimizer=Adam())

model.fit(
    x=X_doc_ids, 
    y=X.toarray(),
    batch_size=32, verbose=0, nb_epoch=500)

reconsruction_error(
    X,
    reconstructed_X=model.predict(X_doc_ids))

In [84]:
feature_names = np.array(vectorizer.get_feature_names())

for component in model.layers[2].weights[0].eval():
    indices = np.argpartition(component, -10)[-8:]
    print(feature_names[indices])

['file' 'example' 'argument' 'does' 'used' 'new' 'true' 'god']
['law' 'crime' 'firearms' 'guns' 'gun' 'president' '000' 'mr']
['going' 'president' 'said' 'know' 'll' 'states' 'mr' 'did']
['nasa' 'launch' 'earth' 'data' 'satellite' '10' 'orbit' 'space']
['list' 'information' 'launch' 'space' 'nasa' 'file' 'edu' 'mr']
['mr' 'president' 'bible' 'god' 'don' 'know' 'does' 'jesus']
['people' 'like' 'president' 'just' 'don' 'know' 'mr' 'think']
