# Comparison between sklearn's and gensim's implementations of NMF

In [1]:
from gensim.models.nmf import NMF as GensimNmf
from gensim.parsing.preprocessing import preprocess_documents
from sklearn.decomposition.nmf import NMF as SklearnNmf
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from matplotlib import pyplot as plt

In [2]:
# Bag-of-words vectorizer built with default settings; it is fitted on the
# corpus in the next cell.
vectorizer = CountVectorizer()

In [3]:
# Fit the vectorizer on the 20 Newsgroups documents (fetched with default
# arguments) and get the sparse document-term count matrix.
bow_matrix = vectorizer.fit_transform(fetch_20newsgroups().data)

# Keep only the first 100 documents, slicing the sparse matrix *before*
# densifying. Calling todense() on the full matrix first would allocate a
# dense row for every document only to throw all but 100 of them away.
# The vocabulary (columns) is still the one fitted on the whole corpus.
bow_matrix = bow_matrix[:100].todense()

## Sklearn NMF

In [4]:
%%time

sklearn_nmf = SklearnNmf(n_components=5, tol=1e-5, max_iter=int(1e9))

W = sklearn_nmf.fit_transform(bow_matrix)
H = sklearn_nmf.components_

CPU times: user 54.4 s, sys: 38 s, total: 1min 32s
Wall time: 1min 29s


In [5]:
# Frobenius norm of the reconstruction residual (input minus W·H).
# W and H are whatever the most recently executed fit cell assigned; when the
# notebook is run top to bottom these are the scikit-learn factors.
np.linalg.norm(bow_matrix - W.dot(H), ord='fro')

184.40183405328017

## Gensim NMF

In [6]:
%%time

gensim_nmf = GensimNmf(n_components=5)

n_samples = np.array(bow_matrix).shape[0]

gensim_nmf.fit(np.array(bow_matrix))
W, H = gensim_nmf.get_factor_matrices()

CPU times: user 2min 7s, sys: 8.22 s, total: 2min 15s
Wall time: 2min 5s


In [7]:
# Frobenius norm of the reconstruction residual (input minus W·H).
# W and H are whatever the most recently executed fit cell assigned; when the
# notebook is run top to bottom these are the gensim factors.
np.linalg.norm(bow_matrix - W.dot(H), ord='fro')

353.4495647218574