# Comparison between sklearn's and gensim's implementations of NMF

In [4]:
%load_ext line_profiler
from gensim.models.nmf import Nmf as GensimNmf
from gensim.parsing.preprocessing import preprocess_documents
from gensim import matutils
from sklearn.decomposition.nmf import NMF as SklearnNmf
import sklearn.decomposition.nmf
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from matplotlib import pyplot as plt

import logging
# Emit INFO-level records with a timestamp and level prefix, so progress
# messages logged by the libraries show up in the cell outputs.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


# 20newsgroups

In [5]:
# `preprocess_documents` is already imported in the first cell, so it is not
# re-imported here.
# Preprocess the first 1000 posts of the 20 newsgroups dataset into token lists.
documents = preprocess_documents(fetch_20newsgroups().data[:1000])

Downloading 20news dataset. This may take a few minutes.
2018-06-05 13:03:04,253 : INFO : Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)
2018-06-05 13:03:04,255 : INFO : Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [6]:
from gensim.corpora import Dictionary

# Build the token <-> id mapping from the preprocessed documents.
dictionary = Dictionary(documents)

# Drop very rare and very common tokens. The thresholds are the library
# defaults, written out explicitly: keep tokens that occur in at least
# 5 documents and in at most 50% of all documents.
dictionary.filter_extremes(no_below=5, no_above=0.5)

2018-06-05 13:03:26,753 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-06-05 13:03:26,886 : INFO : built Dictionary(17622 unique tokens: ['addit', 'bodi', 'bricklin', 'brought', 'bumper']...) from 1000 documents (total 136081 corpus positions)
2018-06-05 13:03:26,906 : INFO : discarding 14411 tokens: [('bricklin', 2), ('bumper', 4), ('edu', 661), ('funki', 4), ('lerxst', 2), ('line', 989), ('organ', 952), ('rac', 1), ('subject', 1000), ('tellm', 2)]...
2018-06-05 13:03:26,907 : INFO : keeping 3211 tokens which were in no less than 5 and no more than 500 (=50.0%) documents
2018-06-05 13:03:26,912 : INFO : resulting dictionary: Dictionary(3211 unique tokens: ['addit', 'bodi', 'brought', 'call', 'car']...)


In [7]:
# Encode every preprocessed document as a bag-of-words vector.
corpus = [dictionary.doc2bow(document) for document in documents]

# Dense matrix view of the same corpus, used as input for sklearn's NMF and
# for measuring reconstruction error.
bow_matrix = matutils.corpus2dense(
    corpus,
    num_terms=len(dictionary),
    num_docs=len(corpus),
)

## Sklearn NMF

In [8]:
%%time
# %%prun

sklearn_nmf = SklearnNmf(n_components=5, tol=1e-5, max_iter=int(1e9), random_state=42)

W = sklearn_nmf.fit_transform(bow_matrix)
H = sklearn_nmf.components_

CPU times: user 875 ms, sys: 385 ms, total: 1.26 s
Wall time: 711 ms


In [9]:
# %lprun -f sklearn.decomposition.nmf._fit_coordinate_descent sklearn_nmf.fit_transform(bow_matrix)

In [10]:
# Frobenius norm of the residual between the data and its reconstruction W·H.
np.linalg.norm(bow_matrix - np.dot(W, H), ord='fro')

482.0895496423899

## Gensim NMF

In [45]:
%%time
# %%prun

PASSES = 3

np.random.seed(42)

gensim_nmf = GensimNmf(
    corpus,
    chunksize=len(corpus),
    num_topics=5,
    id2word=dictionary,
    lambda_=1000,
    kappa=1.,
    passes=PASSES,
    normalize=False
)

2018-06-05 13:08:47,801 : INFO : Loss (no outliers): 593.2289895825425	Loss (with outliers): 593.2289895825425
2018-06-05 13:08:48,289 : INFO : Loss (no outliers): 502.92121729493823	Loss (with outliers): 502.92121729493823
2018-06-05 13:08:48,805 : INFO : Loss (no outliers): 486.88611940507246	Loss (with outliers): 486.88611940507246


CPU times: user 2.47 s, sys: 1.82 s, total: 4.29 s
Wall time: 1.49 s


In [46]:
# %lprun -f GensimNmf._solve_w GensimNmf(corpus, chunksize=len(corpus), num_topics=5, id2word=dictionary, lambda_=1., kappa=1.)

In [47]:
# Topic matrix from the trained model, transposed so that it can be
# multiplied with H below.
W = gensim_nmf.get_topics().T
# Transform every document with the model and stack the results side by side.
# np.hstack needs a real sequence: passing a bare generator is deprecated
# since NumPy 1.16 and rejected by newer releases, so build a list first.
H = np.hstack([gensim_nmf[bow] for bow in corpus])

In [48]:
# Frobenius norm of the residual between the dense corpus and the gensim
# reconstruction W·H (same metric as used for sklearn above).
np.linalg.norm(
    matutils.corpus2dense(corpus, num_terms=len(dictionary), num_docs=len(documents))
    - np.dot(W, H),
    ord='fro',
)

484.3090246375028

In [49]:
# Display each learned topic as a weighted list of its top terms.
gensim_nmf.show_topics()

[(0,
  '0.117*"jesu" + 0.065*"matthew" + 0.035*"peopl" + 0.033*"christian" + 0.031*"prophet" + 0.029*"dai" + 0.028*"said" + 0.027*"messiah" + 0.024*"come" + 0.023*"king"'),
 (1,
  '0.049*"armenian" + 0.030*"peopl" + 0.018*"turkish" + 0.016*"post" + 0.015*"russian" + 0.014*"genocid" + 0.013*"time" + 0.013*"year" + 0.012*"com" + 0.012*"articl"'),
 (2,
  '0.359*"max" + 0.007*"umd" + 0.005*"hst" + 0.002*"gee" + 0.001*"distribut" + 0.001*"univers" + 0.001*"repli" + 0.001*"usa" + 0.001*"keyword" + 0.001*"net"'),
 (3,
  '0.083*"health" + 0.060*"us" + 0.041*"year" + 0.038*"report" + 0.035*"state" + 0.033*"diseas" + 0.032*"case" + 0.031*"public" + 0.030*"ag" + 0.030*"person"'),
 (4,
  '0.105*"argument" + 0.064*"conclus" + 0.056*"exampl" + 0.052*"premis" + 0.051*"true" + 0.034*"occur" + 0.032*"logic" + 0.031*"fals" + 0.029*"form" + 0.028*"assert"')]

From personal experience, a higher number of passes and shuffling of the training set significantly improve performance.

Then, of course, you should perform hyperparameter tuning.

# Image of stars
### (To visualize performance on a sparse training set)

In [17]:
# Requires the PIL package to be installed in the kernel's environment.
from PIL import Image
# Open the image file and convert it to mode 'L' (single 8-bit channel).
img = Image.open('stars_scaled.jpg').convert('L')
# Last expression of the cell: render the image inline.
img

ModuleNotFoundError: No module named 'PIL'

In [None]:
# Flatten the pixel data into uint8 values and fold it back into a
# (height, width) matrix; PIL reports size as (width, height).
img_matrix = np.uint8(img.getdata()).reshape(img.height, img.width)
img_matrix.shape

## Sklearn NMF

In [None]:
%%time

sklearn_nmf = SklearnNmf(n_components=10, tol=1e-5, max_iter=int(1e9))

W = sklearn_nmf.fit_transform(img_matrix)
H = sklearn_nmf.components_

In [None]:
# Reconstruct the image from the factors. The product is floating point and
# can exceed 255, so clip to the valid 8-bit range before casting; a bare
# uint8 cast would wrap out-of-range values around and corrupt bright pixels.
Image.fromarray(np.clip(W.dot(H), 0, 255).astype(np.uint8), 'L')

## Gensim NMF

In [None]:
np.random.seed(42)

# Draw every row index exactly once, in random order, to shuffle the image rows.
shuffled_rows = np.random.choice(img_matrix.shape[0], size=img_matrix.shape[0], replace=False)

# Wrap the shuffled, transposed matrix as a gensim corpus.
img_corpus = matutils.Dense2Corpus(img_matrix[shuffled_rows].T)

In [None]:
%%time

import itertools

gensim_nmf = GensimNmf(
    img_corpus,
    chunksize=len(corpus),
    num_topics=10,
    passes=2,
    id2word={k: k for k in range(img_matrix.shape[1])},
    lambda_=1000,
    kappa=1,
    normalize=False
)

In [None]:
# Topic matrix from the trained model, transposed so that it can be
# multiplied with H below.
W = gensim_nmf.get_topics().T
# Transform every image row (in original, unshuffled order) with the model and
# stack the results side by side. np.hstack needs a real sequence: passing a
# bare generator is deprecated since NumPy 1.16 and rejected by newer
# releases, so build a list first.
H = np.hstack([gensim_nmf[bow] for bow in matutils.Dense2Corpus(img_matrix.T)])

### Reconstructed matrix:

In [None]:
# Reconstruct the image from the factors and transpose it back to the
# original orientation. The product is floating point and can exceed 255, so
# clip to the valid 8-bit range before casting; a bare uint8 cast would wrap
# out-of-range values around and corrupt bright pixels.
Image.fromarray(np.clip(W.dot(H).T, 0, 255).astype(np.uint8), 'L')