# Wikipedia training

In this tutorial we will:
 - Learn how to train the NMF topic model on English Wikipedia corpus
 - Compare it with LDA model
 - Evaluate results

In [143]:
import itertools
import json
import logging
import numpy as np
import pandas as pd
import scipy.sparse
from tqdm import tqdm, tqdm_notebook

import gensim.downloader as api
from gensim import matutils
from gensim.corpora import MmCorpus, Dictionary
from gensim.models import LdaModel
from gensim.models.nmf import Nmf
from gensim.parsing.preprocessing import preprocess_string

In [144]:
%load_ext autoreload
%autoreload 2

tqdm.pandas()

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [145]:
data = api.load("wiki-english-20171001")
for article in data:
    for section_title, section_text in zip(article['section_titles'],
                                           article['section_texts']):
        print("Section title: %s" % section_title)
        print("Section text: %s" % section_text)
    break

Section title: Introduction
Section text: 




'''Anarchism''' is a political philosophy that advocates self-governed societies based on voluntary institutions. These are often described as stateless societies, although several authors have defined them more specifically as institutions based on non-hierarchical free associations. Anarchism holds the state to be undesirable, unnecessary and harmful.

While anti-statism is central, anarchism specifically entails opposing authority or hierarchical organisation in the conduct of all human relations, including—but not limited to—the state system. Anarchism is usually considered a far-left ideology and much of anarchist economics and anarchist legal philosophy reflects anti-authoritarian interpretations of communism, collectivism, syndicalism, mutualism or participatory economics.

Anarchism does not offer a fixed body of doctrine from a single particular world view, instead fluxing and flowing as a philosophy. Many types and traditions of 

In [146]:
def wiki_articles_iterator():
    for article in tqdm_notebook(data):
        yield (
            preprocess_string(
                " ".join(
                    " ".join(section)
                    for section
                    in zip(article['section_titles'], article['section_texts'])
                )
            )
        )

In [147]:
def save_preprocessed_articles(filename, articles):
    with open(filename, 'w+') as writer:
        for article in tqdm_notebook(articles):
            writer.write(
                json.dumps(
                    preprocess_string(
                        " ".join(
                            " ".join(section)
                            for section
                            in zip(article['section_titles'],
                                   article['section_texts'])
                        )
                    )
                ) + '\n'
            )


def get_preprocessed_articles(filename):
    with open(filename, 'r') as reader:
        for line in tqdm_notebook(reader):
            yield json.loads(
                line
            )

In [148]:
# save_preprocessed_articles('wiki_articles.jsonlines', data)

In [149]:
# dictionary = Dictionary(get_preprocessed_articles('wiki_articles.jsonlines'))

# dictionary.save('wiki.dict')

In [150]:
dictionary = Dictionary.load('wiki.dict')
dictionary.filter_extremes(keep_n=20000)
dictionary.compactify()

2019-01-11 15:12:35,858 : INFO : loading Dictionary object from wiki.dict
2019-01-11 15:12:36,690 : INFO : loaded wiki.dict
2019-01-11 15:12:38,849 : INFO : discarding 1990258 tokens: [('abdelrahim', 49), ('abstention', 120), ('ammon', 1736), ('amoureus', 359), ('amoureux', 566), ('amparo', 1178), ('anarcha', 101), ('anarchica', 40), ('anarcho', 1433), ('anarchosyndicalist', 20)]...
2019-01-11 15:12:38,850 : INFO : keeping 20000 tokens which were in no less than 5 and no more than 2462447 (=50.0%) documents
2019-01-11 15:12:39,101 : INFO : resulting dictionary: Dictionary(20000 unique tokens: ['abandon', 'abil', 'abl', 'abolit', 'abstent']...)


In [151]:
class RandomCorpus(MmCorpus):
    def __init__(self, random_seed=42, testset=False, testsize=1000, *args,
                 **kwargs):
        super().__init__(*args, **kwargs)

        random_state = np.random.RandomState(random_seed)
        # TODO: Don't forget to remove that before push
        self.indices = random_state.permutation(range(self.num_docs))[:4000]
        if testset:
            self.indices = self.indices[:testsize]
        else:
            self.indices = self.indices[testsize:]

    def __iter__(self):
        for doc_id in self.indices:
            yield self[doc_id]

In [152]:
# corpus = (
#     dictionary.doc2bow(article)
#     for article
#     in get_preprocessed_articles('wiki_articles.jsonlines')
# )

# RandomCorpus.serialize('wiki.mm', corpus)

In [153]:
train_corpus = RandomCorpus(
    random_seed=42, testset=False, testsize=1, fname='wiki.mm'
)
test_corpus = RandomCorpus(
    random_seed=42, testset=True, testsize=1, fname='wiki.mm'
)

2019-01-11 15:12:39,759 : INFO : loaded corpus index from wiki.mm.index
2019-01-11 15:12:39,760 : INFO : initializing cython corpus reader from wiki.mm
2019-01-11 15:12:39,761 : INFO : accepted corpus with 4924894 documents, 20000 features, 629448427 non-zero entries
2019-01-11 15:12:41,013 : INFO : loaded corpus index from wiki.mm.index
2019-01-11 15:12:41,014 : INFO : initializing cython corpus reader from wiki.mm
2019-01-11 15:12:41,015 : INFO : accepted corpus with 4924894 documents, 20000 features, 629448427 non-zero entries


In [154]:
def get_execution_time(func):
    start = time.time()
    result = func()

    return (time.time() - start), result


def get_tm_perplexity(W, H, dense_corpus):
    pred_factors = W.dot(H)

    return np.exp(-(np.log(pred_factors,
                           where=pred_factors > 0) * dense_corpus).sum() / dense_corpus.sum())


def get_tm_l2_norm(model, corpus):
    l2_norm = 0
    for bow_id, bow in enumerate(test_corpus):
        doc = matutils.corpus2csc([bow], len(model.id2word))
        doc /= doc.sum()

        pred_factors = scipy.sparse.csc_matrix([
            proba for idx, proba in model[bow]
        ])

        l2_norm += scipy.sparse.linalg.norm(doc - pred_factors)**2

    return np.sqrt(l2_norm)


def get_tm_metrics(model, test_corpus):
    W = model.get_topics().T
    H = np.zeros((model.num_topics, len(test_corpus)))
    for bow_id, bow in enumerate(test_corpus):
        for topic_id, proba in model[bow]:
            H[topic_id, bow_id] = proba

    perplexity = get_tm_perplexity(W, H, test_corpus)

    coherence = CoherenceModel(
        model=model,
        corpus=test_corpus,
        coherence='u_mass'
    ).get_coherence()

    l2_norm = get_tm_l2_norm(W, H, test_corpus)

    topics = model.show_topics()

    return dict(
        perplexity=perplexity,
        coherence=coherence,
        topics=topics,
        l2_norm=l2_norm,
    )

In [155]:
tm_metrics = pd.DataFrame()

In [156]:
params = dict(
    corpus=train_corpus,
    chunksize=2000,
    num_topics=50,
    id2word=dictionary,
    passes=1,
    eval_every=10,
    minimum_probability=0,
    random_state=42,
)

In [157]:
row = dict()
row['model'] = 'nmf'
row['train_time'], nmf = get_execution_time(
    lambda: Nmf(use_r=False, **params)
)
nmf.save('nmf.model')

2019-01-11 15:12:52,360 : INFO : Loss (no outliers): 2060.2597369446694	Loss (with outliers): 2060.2597369446694
2019-01-11 15:12:52,371 : INFO : saving Nmf object under nmf.model, separately None
2019-01-11 15:12:52,464 : INFO : saved nmf.model


In [158]:
get_tm_l2_norm(nmf, test_corpus)

ValueError: inconsistent shapes

In [90]:
nmf = Nmf.load('nmf.model')
row.update(get_tm_metrics(nmf, test_corpus))
tm_metrics = tm_metrics.append(pd.Series(row), ignore_index=True)

nmf.show_topics(50)

2019-01-11 14:44:55,275 : INFO : loading Nmf object from nmf.model
2019-01-11 14:44:55,316 : INFO : loading id2word recursively from nmf.model.id2word.* with mmap=None
2019-01-11 14:44:55,317 : INFO : loaded nmf.model


MemoryError: 

In [89]:
row = dict()
row['model'] = 'nmf_with_r'
row['train_time'], nmf_with_r = get_execution_time(
    lambda: Nmf(
        use_r=True,
        lambda_=200,
        **params
    )
)
nmf_with_r.save('nmf_with_r.model')

KeyboardInterrupt: 

In [None]:
nmf_with_r = Nmf.load('nmf_with_r.model')
row.update(get_tm_metrics(nmf_with_r, test_corpus))
tm_metrics = tm_metrics.append(pd.Series(row), ignore_index=True)

nmf_with_r.show_topics(50)

In [None]:
row = dict()
row['model'] = 'lda'
row['train_time'], lda = get_execution_time(
    lambda: LdaModel(**params)
)
lda.save('lda.model')

In [None]:
lda = LdaModel.load('lda.model')
row.update(get_tm_metrics(lda, test_corpus))
tm_metrics = tm_metrics.append(pd.Series(row), ignore_index=True)

lda.show_topics(50)