In [27]:
from gensim.corpora import MmCorpus
from gensim.models.nmf import Nmf
from gensim.models import LdaModel
import gensim.downloader as api
from gensim.parsing.preprocessing import preprocess_string
from tqdm import tqdm, tqdm_notebook

# Enable tqdm's pandas hook.
# NOTE(review): no pandas usage is visible in this notebook — confirm this call is still needed.
tqdm.pandas()

import logging
# Route INFO-level (and above) log records to the cell output, prefixed with a
# timestamp and level, so library progress messages are visible while cells run.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [22]:
data = api.load("wiki-english-20171001")

# Preview a single record: print every (title, text) section pair of the first
# article, then `break` so iteration stops after that one article.
for article in data:
    section_pairs = zip(article['section_titles'], article['section_texts'])
    for section_title, section_text in section_pairs:
        print(
            "Section title: %s" % section_title,
            "Section text: %s" % section_text,
            sep="\n",
        )
    break

Section title: Introduction
Section text: 




'''Anarchism''' is a political philosophy that advocates self-governed societies based on voluntary institutions. These are often described as stateless societies, although several authors have defined them more specifically as institutions based on non-hierarchical free associations. Anarchism holds the state to be undesirable, unnecessary and harmful.

While anti-statism is central, anarchism specifically entails opposing authority or hierarchical organisation in the conduct of all human relations, including—but not limited to—the state system. Anarchism is usually considered a far-left ideology and much of anarchist economics and anarchist legal philosophy reflects anti-authoritarian interpretations of communism, collectivism, syndicalism, mutualism or participatory economics.

Anarchism does not offer a fixed body of doctrine from a single particular world view, instead fluxing and flowing as a philosophy. Many types and traditions of 

In [51]:
import itertools

def wiki_articles_iterator():
    """Yield the preprocessed form of every article in the global `data`.

    For each article, every section title is joined to its section text with a
    single space, all sections are then joined with single spaces, and the
    resulting string is passed through `preprocess_string`. Iteration is
    wrapped in `tqdm_notebook` so a progress bar is displayed.

    Yields:
        The return value of `preprocess_string` for one article (consumed
        downstream as a token sequence by `Dictionary` and `doc2bow`).
    """
    for article in tqdm_notebook(data):
        # One "title text" string per section, paired positionally.
        section_strings = [
            " ".join((title, text))
            for title, text
            in zip(article['section_titles'], article['section_texts'])
        ]
        full_text = " ".join(section_strings)
        yield preprocess_string(full_text)

In [None]:
from gensim.corpora import Dictionary

# Build the vocabulary from the stream of preprocessed articles.
dictionary = Dictionary(documents=wiki_articles_iterator())

# Prune the vocabulary using filter_extremes()' default thresholds.
dictionary.filter_extremes()

# Persist the pruned dictionary so later cells can reload it from disk
# instead of rebuilding it.
dictionary.save('wiki.dict')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

2018-08-30 13:05:09,089 : INFO : adding document #0 to Dictionary(0 unique tokens: [])


In [None]:
# Reload the dictionary previously written to 'wiki.dict', so the expensive
# vocabulary-building cell does not have to be re-run in a new session.
dictionary = Dictionary.load('wiki.dict')

In [None]:
# Lazily convert each preprocessed article to a bag-of-words vector; the
# generator is consumed once by `MmCorpus.serialize` below.
# Fix: this previously iterated over `wiki_articles`, a name that is not defined
# in any visible cell (NameError on a fresh kernel). The article stream is
# produced by `wiki_articles_iterator()`, the same source used to build
# `dictionary`.
corpus = (
    dictionary.doc2bow(article)
    for article
    in wiki_articles_iterator()
)

# Write the bag-of-words corpus to 'wiki.mm' so it can be reopened with
# `MmCorpus('wiki.mm')`.
MmCorpus.serialize('wiki.mm', corpus)

In [None]:
# Reopen the serialized corpus from 'wiki.mm'; this rebinds `corpus` from the
# one-shot generator to an MmCorpus object backed by the file.
corpus = MmCorpus('wiki.mm')

In [None]:
# Keyword arguments forwarded to the NMF model via `Nmf(**training_params)`.
training_params = {
    'corpus': corpus,
    'chunksize': 2000,
    'passes': 5,
    'num_topics': 20,
    'id2word': dictionary,
    'normalize': True,
}

In [None]:
%%time

# Train the NMF topic model with the shared `training_params`; the `%%time`
# cell magic above reports how long the fit takes.
gensim_nmf = Nmf(**training_params)

In [48]:
# Display up to 20 topics of the trained NMF model (the model is trained with
# num_topics=20, so this lists all of them).
gensim_nmf.show_topics(20)

[(0,
  '0.028*"british" + 0.019*"athen" + 0.014*"arab" + 0.012*"armi" + 0.012*"war" + 0.008*"north" + 0.008*"troop" + 0.008*"french" + 0.007*"defeat" + 0.007*"britain"'),
 (1,
  '0.076*"lincoln" + 0.012*"republican" + 0.011*"parti" + 0.011*"presid" + 0.010*"court" + 0.008*"elect" + 0.008*"illinoi" + 0.007*"slaveri" + 0.007*"democrat" + 0.006*"polit"'),
 (2,
  '0.035*"armenian" + 0.028*"countri" + 0.024*"alaska" + 0.018*"diplomat" + 0.018*"establish" + 0.009*"recogn" + 0.009*"foreign" + 0.009*"russia" + 0.008*"republ" + 0.008*"honorari"'),
 (3,
  '0.058*"film" + 0.016*"director" + 0.009*"award" + 0.009*"bell" + 0.007*"critic" + 0.006*"best" + 0.006*"japanes" + 0.005*"plai" + 0.005*"stori" + 0.005*"academi"'),
 (4,
  '0.069*"art" + 0.040*"angl" + 0.017*"bell" + 0.014*"artist" + 0.008*"paint" + 0.008*"athen" + 0.008*"measur" + 0.007*"turn" + 0.007*"aesthet" + 0.006*"cultur"'),
 (5,
  '0.014*"intellig" + 0.013*"human" + 0.012*"aristotl" + 0.010*"machin" + 0.009*"research" + 0.009*"problem"

In [49]:
%%time
# %%prun

lda = LdaModel(
    corpus,
    chunksize=2000,
    passes=5,
    num_topics=20,
    id2word=dictionary,
)

2018-08-30 13:04:37,060 : INFO : using symmetric alpha at 0.05
2018-08-30 13:04:37,063 : INFO : using symmetric eta at 0.05
2018-08-30 13:04:37,066 : INFO : using serial LDA version on this node
2018-08-30 13:04:37,077 : INFO : running online (multi-pass) LDA training, 20 topics, 5 passes over the supplied corpus of 402 documents, updating model once every 402 documents, evaluating perplexity every 402 documents, iterating 50x with a convergence threshold of 0.001000
2018-08-30 13:04:38,949 : INFO : -9.126 per-word bound, 558.6 perplexity estimate based on a held-out corpus of 402 documents with 500981 words
2018-08-30 13:04:38,951 : INFO : PROGRESS: pass 0, at document #402/402
2018-08-30 13:04:39,692 : INFO : topic #18 (0.050): 0.004*"angola" + 0.004*"film" + 0.003*"war" + 0.003*"countri" + 0.003*"court" + 0.003*"star" + 0.003*"popul" + 0.003*"greek" + 0.003*"govern" + 0.003*"death"
2018-08-30 13:04:39,695 : INFO : topic #4 (0.050): 0.003*"english" + 0.003*"film" + 0.003*"atom" + 0.0

CPU times: user 25.6 s, sys: 26.5 s, total: 52.1 s
Wall time: 14.5 s


In [50]:
# Display up to 20 topics of the trained LDA model (the model is trained with
# num_topics=20, so this lists all of them).
lda.show_topics(20)

[(0,
  '0.018*"jew" + 0.010*"jewish" + 0.010*"forc" + 0.008*"anti" + 0.008*"man" + 0.008*"publish" + 0.007*"militari" + 0.006*"stori" + 0.005*"issu" + 0.005*"countri"'),
 (1,
  '0.019*"acid" + 0.014*"metal" + 0.007*"element" + 0.007*"water" + 0.007*"compound" + 0.006*"valu" + 0.006*"atom" + 0.006*"carbon" + 0.006*"reaction" + 0.005*"oxid"'),
 (2,
  '0.024*"engin" + 0.021*"choic" + 0.013*"mathemat" + 0.011*"comput" + 0.011*"function" + 0.010*"machin" + 0.010*"theori" + 0.010*"analyt" + 0.009*"program" + 0.008*"model"'),
 (3,
  '0.017*"angl" + 0.014*"armenian" + 0.014*"countri" + 0.007*"oil" + 0.007*"establish" + 0.007*"diplomat" + 0.005*"europ" + 0.004*"republ" + 0.004*"foreign" + 0.004*"type"'),
 (4,
  '0.013*"court" + 0.008*"open" + 0.008*"seri" + 0.008*"australia" + 0.008*"england" + 0.007*"appeal" + 0.007*"test" + 0.006*"amphibian" + 0.006*"final" + 0.006*"plai"'),
 (5,
  '0.023*"abort" + 0.017*"april" + 0.012*"anxieti" + 0.009*"einstein" + 0.007*"signal" + 0.006*"john" + 0.005*"geo