# Wikipedia training

### In this tutorial we will:
 - Learn how to train the NMF topic model on the English Wikipedia corpus
 - Compare it with LDA and Sklearn NMF
 - Evaluate results

In [1]:
import itertools
import json
import logging
import numpy as np
import pandas as pd
import scipy.sparse
from smart_open import smart_open
import time
import os
import psutil
from contextlib import contextmanager
from multiprocessing import Process
from tqdm import tqdm, tqdm_notebook
import joblib

import gensim.downloader as api
from gensim import matutils
from gensim.corpora import MmCorpus, Dictionary
from gensim.models import LdaModel, CoherenceModel
from gensim.models.nmf import Nmf as GensimNmf
from sklearn.decomposition.nmf import NMF as SklearnNmf
from gensim.parsing.preprocessing import preprocess_string

tqdm.pandas()

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Preprocessing

### Load wikipedia dump
Let's use `gensim.downloader` (imported above as `api`) for that.

In [2]:
# Load the "wiki-english-20171001" dataset through gensim's downloader and
# take its first article for inspection.
data = api.load("wiki-english-20171001")
article = next(iter(data))

# Print every section title of that article together with the first 100
# characters of the matching section text.
for section_title, section_text in zip(
    article['section_titles'],
    article['section_texts']
):
    print("Section title: %s" % section_title)
    print("Section text: %s" % section_text[:100])

Section title: Introduction
Section text: 




'''Anarchism''' is a political philosophy that advocates self-governed societies based on volun
Section title: Etymology and terminology
Section text: 

The word ''anarchism'' is composed from the word ''anarchy'' and the suffix ''-ism'', themselves d
Section title: History
Section text: 

===Origins===
Woodcut from a Diggers document by William Everard

The earliest anarchist themes ca
Section title: Anarchist schools of thought
Section text: 
Portrait of philosopher Pierre-Joseph Proudhon (1809–1865) by Gustave Courbet. Proudhon was the pri
Section title: Internal issues and debates
Section text: 
consistent with anarchist values is a controversial subject among anarchists.

Anarchism is a philo
Section title: Topics of interest
Section text: Intersecting and overlapping between various schools of thought, certain topics of interest and inte
Section title: Criticisms
Section text: 
Criticisms of anarchism include moral criticisms and pra

### Preprocess and save articles

In [3]:
def save_preprocessed_articles(filename, articles):
    """Preprocess every article and store it in `filename` as JSON lines.

    For each article, every section title is joined with its section text,
    all sections are joined into one space-separated string, and that string
    is passed through `preprocess_string`. The result is written as one JSON
    document per line.
    """
    with smart_open(filename, 'w+', encoding="utf8") as writer:
        for article in tqdm_notebook(articles):
            title_text_pairs = zip(
                article['section_titles'],
                article['section_texts']
            )
            sections = [" ".join(pair) for pair in title_text_pairs]
            tokens = preprocess_string(" ".join(sections))

            writer.write(json.dumps(tokens) + '\n')


def get_preprocessed_articles(filename):
    """Lazily yield the JSON-decoded content of each line of `filename`."""
    with smart_open(filename, 'r', encoding="utf8") as reader:
        yield from map(json.loads, tqdm_notebook(reader))

In [4]:
# Set to True to (re)generate 'wiki_articles.jsonlines' from the loaded dump.
SAVE_ARTICLES = False

if SAVE_ARTICLES:
    save_preprocessed_articles('wiki_articles.jsonlines', data)

### Create and save dictionary

In [5]:
# Set to True to rebuild the dictionary from the preprocessed articles and
# store it in 'wiki.dict'.
SAVE_DICTIONARY = False

if SAVE_DICTIONARY:
    dictionary = Dictionary(get_preprocessed_articles('wiki_articles.jsonlines'))
    dictionary.save('wiki.dict')

### Load and filter dictionary

In [6]:
# Load the saved dictionary, prune it with filter_extremes() using its
# default arguments, then compactify it.
dictionary = Dictionary.load('wiki.dict')
dictionary.filter_extremes()
dictionary.compactify()

2019-02-06 22:27:02,588 : INFO : loading Dictionary object from wiki.dict
2019-02-06 22:27:03,510 : INFO : loaded wiki.dict
2019-02-06 22:27:09,054 : INFO : discarding 1910146 tokens: [('abdelrahim', 49), ('abstention', 120), ('anarcha', 101), ('anarchica', 40), ('anarchosyndicalist', 20), ('antimilitar', 68), ('arbet', 194), ('archo', 100), ('arkhē', 5), ('autonomedia', 118)]...
2019-02-06 22:27:09,055 : INFO : keeping 100000 tokens which were in no less than 5 and no more than 2462447 (=50.0%) documents
2019-02-06 22:27:09,956 : INFO : resulting dictionary: Dictionary(100000 unique tokens: ['tago', 'süß', 'akrita', 'divert', 'construccion']...)


### MmCorpus wrapper
This way we'll:

- Make sure that documents are shuffled
- Be able to train-test split corpus without rewriting it

In [7]:
class RandomCorpus(MmCorpus):
    """MmCorpus that iterates over a seeded random train or test subset.

    The document indices are permuted with `random_seed`. The first
    `testsize` indices of the permutation form the test set and the rest
    form the train set. `num_docs` and `num_nnz` are adjusted to describe
    the selected subset.
    """

    def __init__(self, random_seed=42, testset=False, testsize=1000, *args,
                 **kwargs):
        super().__init__(*args, **kwargs)

        shuffled = np.random.RandomState(random_seed).permutation(
            range(self.num_docs)
        )
        test_indices = shuffled[:testsize]
        train_indices = shuffled[testsize:]

        # Number of non-zero entries held by the test documents.
        test_nnz = sum(len(self[doc_idx]) for doc_idx in test_indices)

        if not testset:
            self.indices = train_indices
            self.num_docs -= testsize
            self.num_nnz -= test_nnz
            return

        self.indices = test_indices
        self.num_docs = testsize
        self.num_nnz = test_nnz

    def __iter__(self):
        """Yield the documents of the selected subset in permuted order."""
        for position in self.indices:
            document = self[position]
            yield document

### Create and save corpus

In [8]:
# Set to True to rebuild the bag-of-words corpus and write it to 'wiki.mm'.
SAVE_CORPUS = False

if SAVE_CORPUS:
    # Generator: each preprocessed article is converted to bag-of-words
    # lazily, so the articles are streamed from the file one at a time.
    corpus = (
        dictionary.doc2bow(article)
        for article
        in get_preprocessed_articles('wiki_articles.jsonlines')
    )
    
    # `serialize` is not defined on RandomCorpus in this file; presumably it
    # is inherited from MmCorpus.
    RandomCorpus.serialize('wiki.mm', corpus)

### Load train and test corpus
Using `RandomCorpus` wrapper

In [9]:
# Both corpora use the same seed and test size, so they draw the same
# permutation of 'wiki.mm' and take complementary slices of it:
# the first 2000 shuffled documents are the test set, the rest the train set.
train_corpus = RandomCorpus(
    random_seed=42, testset=False, testsize=2000, fname='wiki.mm'
)
test_corpus = RandomCorpus(
    random_seed=42, testset=True, testsize=2000, fname='wiki.mm'
)

2019-02-06 22:27:10,847 : INFO : loaded corpus index from wiki.mm.index
2019-02-06 22:27:10,847 : INFO : initializing cython corpus reader from wiki.mm
2019-02-06 22:27:10,848 : INFO : accepted corpus with 4924894 documents, 100000 features, 683326444 non-zero entries
2019-02-06 22:27:17,213 : INFO : loaded corpus index from wiki.mm.index
2019-02-06 22:27:17,215 : INFO : initializing cython corpus reader from wiki.mm
2019-02-06 22:27:17,218 : INFO : accepted corpus with 4924894 documents, 100000 features, 683326444 non-zero entries


### Convert corpora to csc and save

This is necessary in order to train Sklearn NMF. These matrices take **a lot** of RAM (about 8GB), even though they're sparse.

In [10]:
# Set to True to convert both corpora to sparse matrices and store them in
# 'train_csc.npz' and 'test_csc.npz'.
SAVE_CSC = False

if SAVE_CSC:
    # The second argument is the number of terms (the dictionary size).
    train_csc = matutils.corpus2csc(train_corpus, len(dictionary))
    scipy.sparse.save_npz('train_csc.npz', train_csc)
    
    test_csc = matutils.corpus2csc(test_corpus, len(dictionary))
    scipy.sparse.save_npz('test_csc.npz', test_csc)

## Metrics

- `train_time` - time to train a model
- `mean_ram` - mean RAM consumption during the training
- `max_ram` - maximum RAM consumption during the training
- `perplexity` - perplexity score, another common topic-modeling metric
- `coherence` - coherence score (not defined for sklearn NMF), a classic metric for topic models
- `l2_norm` - l2 norm of `v - Wh`

In [11]:
@contextmanager
def measure_ram(output, tick=2):
    """Record the RSS growth of the current process while the `with` body runs.

    A child process samples the resident set size of the calling process
    every `tick` seconds and writes `rss - rss_at_start` (in bytes) as one
    line per sample to the file `output`.

    Parameters
    ----------
    output : str
        Path of the file the samples are written to (overwritten).
    tick : float
        Seconds to sleep between two samples.
    """
    def _measure_ram(pid, output, start_memory, tick=5):
        py = psutil.Process(pid)
        with open(output, 'w') as outfile:
            while True:
                memory = py.memory_info().rss - start_memory
                outfile.write("{}\n".format(memory))
                # Flush every sample: the sampler is killed, never closed.
                outfile.flush()
                time.sleep(tick)

    pid = os.getpid()
    start_memory = psutil.Process(pid).memory_info().rss
    # Forward the caller's `tick` instead of a hard-coded interval.
    p = Process(target=_measure_ram, args=(pid, output, start_memory, tick))
    p.start()
    try:
        yield
    finally:
        # Stop the sampler even when the measured body raises, and reap it.
        p.terminate()
        p.join()

def get_train_time_and_ram(func, name):
    """Run `func` and report its wall time and RAM consumption.

    RAM samples are recorded by `measure_ram` into '<name>.memprof' while
    `func` runs and are read back afterwards.

    Parameters
    ----------
    func : callable
        Called without arguments; its return value is passed through.
    name : str
        Prefix of the memory-profile file.

    Returns
    -------
    tuple
        `(elapsed_time, mean_ram, max_ram, result)`, where `elapsed_time` is
        a pandas Timedelta rounded to seconds, `mean_ram` and `max_ram` are
        strings of the form '<int> MB', and `result` is what `func` returned.
    """
    memprof_filename = "{}.memprof".format(name)

    start = time.time()

    with measure_ram(memprof_filename, 5):
        result = func()

    elapsed_time = pd.to_timedelta(time.time() - start, unit='s').round('s')

    # The profile has no header line: header=None keeps the first sample as
    # data instead of turning it into a column name.
    memprof = pd.read_csv(memprof_filename, header=None).iloc[:, 0]

    mean_ram = "{} MB".format(int(memprof.mean() // 2**20))
    max_ram = "{} MB".format(int(memprof.max() // 2**20))

    return elapsed_time, mean_ram, max_ram, result


def get_tm_metrics(model, test_corpus):
    """Compute perplexity, coherence, l2 norm and top topics for `model`.

    Parameters
    ----------
    model : topic model exposing `get_topics`, `num_topics`,
        `get_document_topics` and `show_topics`.
    test_corpus : sized iterable of bag-of-words documents.

    Returns
    -------
    dict
        Keys 'perplexity', 'coherence', 'l2_norm' (rounded to 4 digits)
        and 'topics' (the result of `model.show_topics(5)`).
    """
    # W: topic matrix transposed; H: one column of topic weights per document.
    W = model.get_topics().T
    H = np.zeros((model.num_topics, len(test_corpus)))
    for bow_id, bow in enumerate(test_corpus):
        for topic_id, word_count in model.get_document_topics(bow):
            H[topic_id, bow_id] = word_count

    pred_factors = W.dot(H)

    dense_corpus = matutils.corpus2dense(test_corpus, pred_factors.shape[0])

    # The l2 norm is taken on the raw reconstruction, before normalization.
    l2_norm = get_tm_l2_norm(pred_factors, dense_corpus)

    # Scale each document column to sum to 1 before computing perplexity.
    pred_factors /= pred_factors.sum(axis=0)

    perplexity = get_tm_perplexity(pred_factors, dense_corpus)

    # Coherence and topics are computed with `normalize` switched on. The
    # previous state is restored afterwards (also on error) instead of being
    # forced to False, so the caller's model is left as it was.
    missing = object()
    previous_normalize = getattr(model, 'normalize', missing)
    model.normalize = True
    try:
        coherence = CoherenceModel(
            model=model,
            corpus=test_corpus,
            coherence='u_mass'
        ).get_coherence()

        topics = model.show_topics(5)
    finally:
        if previous_normalize is missing:
            del model.normalize
        else:
            model.normalize = previous_normalize

    return dict(
        perplexity=round(perplexity, 4),
        coherence=round(coherence, 4),
        l2_norm=round(l2_norm, 4),
        topics=topics,
    )


def get_tm_perplexity(pred_factors, dense_corpus):
    """Return the perplexity of `dense_corpus` under `pred_factors`.

    Computed as `exp(-sum(log(pred_factors) * dense_corpus) / sum(dense_corpus))`.
    Entries of `pred_factors` that are not strictly positive contribute
    nothing to the sum.

    Parameters
    ----------
    pred_factors : array-like of predicted probabilities.
    dense_corpus : numpy array of counts with the same shape.
    """
    pred_factors = np.asarray(pred_factors, dtype=float)

    # `where=` alone leaves the skipped entries uninitialized; writing into a
    # zero-filled buffer makes them exactly 0.
    log_probas = np.zeros_like(pred_factors)
    np.log(pred_factors, out=log_probas, where=pred_factors > 0)

    return np.exp(-(log_probas * dense_corpus).sum() / dense_corpus.sum())


def get_tm_l2_norm(pred_factors, dense_corpus):
    """Return the norm of the residual `dense_corpus - pred_factors`."""
    residual = dense_corpus - pred_factors
    return np.linalg.norm(residual)


def get_sklearn_topics(model, id2word, top_n=5, num_words=10):
    """Format the `top_n` sparsest topics of a fitted sklearn NMF model.

    Parameters
    ----------
    model : object exposing `components_`, indexed as (topic, token).
    id2word : mapping from token index to token string.
    top_n : int
        Number of topics to return, sparsest first.
    num_words : int
        Number of highest-weighted tokens shown per topic.

    Returns
    -------
    list of (int, str)
        Pairs of the topic's index in `model.components_` and a string of
        the form '0.75*"token" + ...'.
    """
    # Columns are topics, rows are tokens; each topic is scaled to sum to 1.
    topic_probas = model.components_.T
    topic_probas = topic_probas / topic_probas.sum(axis=0)

    # Fraction of exactly-zero token weights per topic.
    sparsity = (topic_probas == 0).mean(axis=0)

    # Keep the original topic indices so the returned ids identify the
    # topics in the model rather than their rank in this listing.
    topic_ids = sparsity.argsort()[::-1][:top_n]
    selected = topic_probas[:, topic_ids]

    # Highest-weighted tokens first, per topic column.
    token_indices = selected.argsort(axis=0)[::-1][:num_words]
    top_probas = np.sort(selected, axis=0)[::-1][:num_words]

    topics = []

    for column, topic_id in enumerate(topic_ids):
        terms = (
            '{}*"{}"'.format(round(proba, 3), id2word[token_idx])
            for proba, token_idx
            in zip(top_probas[:, column], token_indices[:, column])
        )
        topics.append((int(topic_id), " + ".join(terms)))

    return topics


def get_sklearn_metrics(model, test_corpus, dictionary):
    """Compute perplexity, l2 norm and top topics for a sklearn NMF model.

    Parameters
    ----------
    model : fitted estimator exposing `components_` and `transform`.
    test_corpus : dense array with one column per document (it is
        transposed before being passed to `model.transform`).
    dictionary : mapping from token index to token string.

    Returns
    -------
    dict
        Keys 'perplexity', 'l2_norm' and 'topics'.
    """
    W = model.components_.T
    H = model.transform((test_corpus).T).T
    pred_factors = W.dot(H)

    # The l2 norm is taken on the raw reconstruction, before normalization.
    l2_norm = np.linalg.norm(test_corpus - pred_factors)

    # Scale each document column to sum to 1 before computing perplexity.
    pred_factors /= pred_factors.sum(axis=0)

    # `where=` alone leaves the skipped entries uninitialized; writing into a
    # zero-filled buffer makes non-positive predictions contribute exactly 0.
    log_probas = np.zeros_like(pred_factors)
    np.log(pred_factors, out=log_probas, where=pred_factors > 0)

    perplexity = np.exp(
        -(log_probas * test_corpus).sum()
        / test_corpus.sum()
    )

    topics = get_sklearn_topics(model, dictionary, top_n=5)

    return dict(
        perplexity=perplexity,
        l2_norm=l2_norm,
        topics=topics,
    )

Define the dataframe in which we'll store metrics

In [12]:
# One row per model; rows are appended after each model is evaluated.
tm_metrics = pd.DataFrame(columns=[
    'model', 'train_time', 'mean_ram', 'max_ram', 'perplexity', 'coherence', 'l2_norm', 'topics'
])

### Define common params for the models

In [13]:
# Keyword arguments passed unchanged to both the Gensim NMF and the LDA
# constructors, so the two models are trained under the same settings.
params = dict(
    corpus=train_corpus,
    chunksize=2000,
    num_topics=50,
    id2word=dictionary,
    passes=1,
    eval_every=10,
    minimum_probability=0,
    random_state=42,
)

## Training

### Train Gensim NMF and save it
Normalization is turned off to compute metrics correctly

In [14]:
# Start a fresh metrics row for Gensim NMF.
row = dict()
row['model'] = 'gensim_nmf'
# Training happens inside the lambda so that its time and RAM are measured;
# the trained model comes back as the last element of the returned tuple.
row['train_time'], row['mean_ram'], row['max_ram'], nmf = get_train_time_and_ram(
    lambda: GensimNmf(
        normalize=False,
        **params
    ),
    'gensim_nmf',
)

nmf.save('gensim_nmf.model')

2019-02-06 22:28:18,836 : INFO : Loss: 0.04683116478141604
2019-02-06 22:28:47,687 : INFO : Loss: 0.03585093916474252
2019-02-06 22:29:03,704 : INFO : Loss: 0.016218137567651437
==TRUNCATED==
2019-02-06 22:52:21,184 : INFO : Loss: 0.00019501346145086494
2019-02-06 22:52:26,080 : INFO : Loss: 0.00010551328334260371
2019-02-06 22:52:30,927 : INFO : Loss: 0.0001089520591937033
2019-02-06 22:52:35,753 : INFO : Loss: 0.0
2019-02-06 22:52:36,577 : INFO : Loss: 0.00030404947061429366
2019-02-06 22:52:36,589 : INFO : saving Nmf object under gensim_nmf.model, separately None
2019-02-06 22:52:36,966 : INFO : saved gensim_nmf.model


### Load Gensim NMF and store metrics

In [15]:
# Reload the saved model, add its test-set metrics to the current row and
# append the row to the results table.
nmf = GensimNmf.load('gensim_nmf.model')
row.update(get_tm_metrics(nmf, test_corpus))
tm_metrics = tm_metrics.append(pd.Series(row), ignore_index=True)

2019-02-06 22:52:36,983 : INFO : loading Nmf object from gensim_nmf.model
2019-02-06 22:52:37,292 : INFO : loading id2word recursively from gensim_nmf.model.id2word.* with mmap=None
2019-02-06 22:52:37,293 : INFO : loaded gensim_nmf.model
  result = np.column_stack(sparse2full(doc, num_terms) for doc in corpus)
2019-02-06 22:53:31,168 : INFO : CorpusAccumulator accumulated stats from 1000 documents
2019-02-06 22:53:31,278 : INFO : CorpusAccumulator accumulated stats from 2000 documents


### Train LDA and save it
LDA is a common model for topic modeling.

In [16]:
# Start a fresh metrics row for LDA.
row = dict()
row['model'] = 'lda'
# Training happens inside the lambda so that its time and RAM are measured.
row['train_time'], row['mean_ram'], row['max_ram'], lda = get_train_time_and_ram(
    lambda: LdaModel(**params),
    'lda',
)
lda.save('lda.model')

2019-02-06 22:53:31,779 : INFO : using symmetric alpha at 0.02
2019-02-06 22:53:31,780 : INFO : using symmetric eta at 0.02
2019-02-06 22:53:31,798 : INFO : using serial LDA version on this node
2019-02-06 22:53:32,320 : INFO : running online (single-pass) LDA training, 50 topics, 1 passes over the supplied corpus of 4922894 documents, updating model once every 2000 documents, evaluating perplexity every 20000 documents, iterating 50x with a convergence threshold of 0.001000
2019-02-06 22:53:32,473 : INFO : PROGRESS: pass 0, at document #2000/4922894
2019-02-06 22:53:34,391 : INFO : merging changes from 2000 documents into a model of 4922894 documents
2019-02-06 22:53:34,709 : INFO : topic #36 (0.020): 0.006*"new" + 0.005*"record" + 0.004*"servic" + 0.004*"includ" + 0.003*"year" + 0.003*"state" + 0.003*"design" + 0.003*"work" + 0.003*"time" + 0.003*"american"
2019-02-06 22:53:34,711 : INFO : topic #19 (0.020): 0.005*"team" + 0.005*"new" + 0.004*"nation" + 0.004*"state" + 0.004*"includ"

### Load LDA and store metrics

In [17]:
# Reload the saved model, add its test-set metrics to the current row and
# append the row to the results table.
lda = LdaModel.load('lda.model')
row.update(get_tm_metrics(lda, test_corpus))
tm_metrics = tm_metrics.append(pd.Series(row), ignore_index=True)

2019-02-07 00:19:19,731 : INFO : loading LdaModel object from lda.model
2019-02-07 00:19:19,734 : INFO : loading expElogbeta from lda.model.expElogbeta.npy with mmap=None
2019-02-07 00:19:19,740 : INFO : setting ignored attribute id2word to None
2019-02-07 00:19:19,741 : INFO : setting ignored attribute dispatcher to None
2019-02-07 00:19:19,742 : INFO : setting ignored attribute state to None
2019-02-07 00:19:19,743 : INFO : loaded lda.model
2019-02-07 00:19:19,743 : INFO : loading LdaState object from lda.model.state
2019-02-07 00:19:19,837 : INFO : loaded lda.model.state
  result = np.column_stack(sparse2full(doc, num_terms) for doc in corpus)
2019-02-07 00:19:41,681 : INFO : CorpusAccumulator accumulated stats from 1000 documents
2019-02-07 00:19:41,790 : INFO : CorpusAccumulator accumulated stats from 2000 documents


### Train Sklearn NMF and store metrics

In [18]:
# Start a fresh metrics row for Sklearn NMF.
row = dict()
row['model'] = 'sklearn_nmf'
sklearn_nmf = SklearnNmf(n_components=50, tol=1e-2, random_state=42)
# The training matrix is loaded inside the lambda, so loading it counts
# towards the measured time and RAM. The matrix is transposed before fitting,
# and the value returned by fit() is bound back to `sklearn_nmf`.
row['train_time'], row['mean_ram'], row['max_ram'], sklearn_nmf = get_train_time_and_ram(
    lambda: sklearn_nmf.fit(scipy.sparse.load_npz('train_csc.npz').T),
    'sklearn_nmf',
)

joblib.dump(sklearn_nmf, 'sklearn_nmf.joblib')

['sklearn_nmf.joblib']

### Load Sklearn NMF and store metrics

In [19]:
# Reload the saved model and evaluate it on the densified test matrix.
# No 'coherence' key is produced for this model, so that cell stays empty
# in the results table.
sklearn_nmf = joblib.load('sklearn_nmf.joblib')
row.update(get_sklearn_metrics(
    sklearn_nmf, scipy.sparse.load_npz('test_csc.npz').toarray(), dictionary
))
tm_metrics = tm_metrics.append(pd.Series(row), ignore_index=True)

In [20]:
# Show missing metric values as '-' in the results table.
tm_metrics.replace(np.nan, '-', inplace=True)

## Results

In [21]:
# Display the results without the 'topics' column (drop returns a copy here).
tm_metrics.drop('topics', axis=1)

Unnamed: 0,model,train_time,mean_ram,max_ram,perplexity,coherence,l2_norm
0,gensim_nmf,00:25:16,130 MB,165 MB,3741.8606,-2.9857,1983.3787
1,lda,01:25:48,140 MB,140 MB,4701.976,-2.5286,2273.643
2,sklearn_nmf,00:49:29,10716 MB,15961 MB,3943.036299,-,1987.280856


### Insights

Gensim NMF is better than Sklearn NMF in every aspect:

- **2x** faster


- Uses **80x-100x** less memory.

    About **8GB** of RAM comes from the input corpus sparse matrices, which, in contrast to Gensim NMF, can't be passed iteratively. But even if we forget about tremendous corpus size, Sklearn NMF still uses about **2-8 GB** of RAM, which is much larger than that of Gensim NMF and LDA.


- Still achieves better l2 norm and perplexity

Compared to LDA, Gensim NMF is also better in almost everything:

- **3.5x** faster
- Achieves much better l2 norm and perplexity

Coherence is lower than LDA's, though.

### Topics

In [22]:
def compare_topics(tm_metrics):
    """Print each model's name followed by its stored topics, one per line."""
    for _, metrics_row in tm_metrics.iterrows():
        header = '\n{}:'.format(metrics_row.model)
        print(header)

        topic_lines = [str(topic) for topic in metrics_row.topics]
        print("\n".join(topic_lines))
        
# Print the stored topics of every model in the results table.
compare_topics(tm_metrics)


gensim_nmf:
(23, '0.007*"seri" + 0.006*"episod" + 0.006*"time" + 0.006*"appear" + 0.005*"later" + 0.005*"charact" + 0.005*"kill" + 0.005*"man" + 0.004*"work" + 0.004*"book"')
(19, '0.018*"king" + 0.012*"centuri" + 0.009*"church" + 0.007*"son" + 0.005*"princ" + 0.005*"french" + 0.005*"franc" + 0.005*"england" + 0.005*"kingdom" + 0.005*"year"')
(49, '0.079*"royal" + 0.038*"corp" + 0.034*"armi" + 0.028*"regiment" + 0.027*"capt" + 0.025*"townhous" + 0.020*"maj" + 0.020*"artilleri" + 0.017*"servic" + 0.017*"col"')
(38, '0.132*"mount" + 0.130*"iatrogen" + 0.128*"peak" + 0.126*"knightsbridg" + 0.126*"somedai" + 0.065*"survei" + 0.027*"octob" + 0.023*"campo" + 0.023*"css" + 0.018*"septemb"')
(43, '0.197*"linear" + 0.196*"secundaria" + 0.039*"newman" + 0.037*"septemb" + 0.034*"parallax" + 0.027*"octob" + 0.023*"knightsbridg" + 0.023*"anderson" + 0.023*"montana" + 0.023*"lanc"')

lda:
(35, '0.054*"russian" + 0.039*"soviet" + 0.032*"russia" + 0.027*"polish" + 0.026*"republ" + 0.025*"philippin" +

It seems that all models have successfully learned the topic representation of the corpus.