In [62]:
from gensim.corpora import MmCorpus
from gensim.models.nmf import Nmf
from gensim.models import LdaModel
import gensim.downloader as api
from gensim.parsing.preprocessing import preprocess_string
from tqdm import tqdm, tqdm_notebook
import json

tqdm.pandas()

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [22]:
data = api.load("wiki-english-20171001")
for article in data:
    for section_title, section_text in zip(article['section_titles'], article['section_texts']):
        print("Section title: %s" % section_title)
        print("Section text: %s" % section_text)
    break

Section title: Introduction
Section text: 




'''Anarchism''' is a political philosophy that advocates self-governed societies based on voluntary institutions. These are often described as stateless societies, although several authors have defined them more specifically as institutions based on non-hierarchical free associations. Anarchism holds the state to be undesirable, unnecessary and harmful.

While anti-statism is central, anarchism specifically entails opposing authority or hierarchical organisation in the conduct of all human relations, including—but not limited to—the state system. Anarchism is usually considered a far-left ideology and much of anarchist economics and anarchist legal philosophy reflects anti-authoritarian interpretations of communism, collectivism, syndicalism, mutualism or participatory economics.

Anarchism does not offer a fixed body of doctrine from a single particular world view, instead fluxing and flowing as a philosophy. Many types and traditions of 

In [51]:
import itertools

def wiki_articles_iterator():
    for article in tqdm_notebook(data):
        yield (
            preprocess_string(
                " ".join(
                    " ".join(section)
                    for section
                    in zip(article['section_titles'], article['section_texts'])
                )
            )
        )

In [94]:
def save_preprocessed_articles(filename, articles):
    with open(filename, 'w+') as writer:
        for article in tqdm_notebook(articles):
            writer.write(
                json.dumps(
                    preprocess_string(
                        " ".join(
                            " ".join(section)
                            for section
                            in zip(article['section_titles'], article['section_texts'])
                        )
                    )
                ) + '\n'
            )

def get_preprocessed_articles(filename):
    with open(filename, 'r') as reader:
        for line in tqdm_notebook(reader):
            yield json.loads(
                line
            )

In [95]:
save_preprocessed_articles('wiki_articles.jsonlines', data)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [None]:
from gensim.corpora import Dictionary

dictionary = Dictionary(get_preprocessed_articles('wiki_articles.jsonlines'))

dictionary.save('wiki.dict')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

2018-08-31 00:24:22,338 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-08-31 00:24:34,891 : INFO : adding document #10000 to Dictionary(399748 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:24:46,430 : INFO : adding document #20000 to Dictionary(591699 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:24:55,712 : INFO : adding document #30000 to Dictionary(731105 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:25:04,177 : INFO : adding document #40000 to Dictionary(851685 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:25:10,188 : INFO : adding document #50000 to Dictionary(931675 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:25:13,951 : INFO : adding document #60000 to Dictionary(952350 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:25:16,822 : INFO 

2018-08-31 00:27:53,376 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:27:53,415 : INFO : adding document #300000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:27:59,163 : INFO : discarding 43531 tokens: [('chalcophaea', 1), ('chamdoensi', 1), ('charchirensi', 1), ('chaudoiri', 1), ('chodjaii', 1), ('cholashanensi', 1), ('chormaensi', 1), ('coiffaiti', 1), ('colasi', 1), ('collivaga', 1)]...
2018-08-31 00:27:59,164 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 310000 (=100.0%) documents
2018-08-31 00:28:01,285 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:28:01,326 : INFO : adding document #310000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:28:07,145 : INFO : dis

2018-08-31 00:29:31,061 : INFO : discarding 41810 tokens: [('deathclaw', 1), ('scatari', 1), ('swoad', 1), ('dragonella', 1), ('isthistomorrow', 1), ('stuporman', 1), ('chepkoya', 1), ('chewanjel', 1), ('kibuk', 1), ('paulmann', 1)]...
2018-08-31 00:29:31,062 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 430000 (=100.0%) documents
2018-08-31 00:29:33,208 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:29:33,251 : INFO : adding document #430000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:29:38,502 : INFO : discarding 35200 tokens: [('subuktigin', 1), ('arkemedia', 1), ('har”', 1), ('jcdc', 1), ('malachismith', 1), ('malachi’', 1), ('reggaeconcept', 1), ('reggaesoca', 1), ('“wiseman”', 1), ('dibromophenol', 1)]...
2018-08-31 00:29:38,503 : INFO : keeping 2000000 tokens which were in no less than 0 and no more t

2018-08-31 00:31:00,013 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 550000 (=100.0%) documents
2018-08-31 00:31:02,150 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:31:02,194 : INFO : adding document #550000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:31:07,222 : INFO : discarding 38870 tokens: [('ågå', 1), ('åinakåsz', 1), ('åinasim', 1), ('åtzån', 1), ('funeral—', 1), ('wilef', 1), ('iskuryhmä', 1), ('legorgu', 1), ('phélip', 1), ('pomlt', 1)]...
2018-08-31 00:31:07,223 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 560000 (=100.0%) documents
2018-08-31 00:31:09,271 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:31:09,312 : INFO : adding document #560000 to Dictionary(2000000 unique toke

2018-08-31 00:32:26,676 : INFO : adding document #670000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:32:31,331 : INFO : discarding 31718 tokens: [('econbrows', 1), ('mieczkowski', 1), ('pouzèr', 1), ('fodem', 1), ('ouangolé', 1), ('kittrdg', 1), ('zarudnaya', 1), ('hüther', 1), ('iwkoeln', 1), ('ss”', 1)]...
2018-08-31 00:32:31,331 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 680000 (=100.0%) documents
2018-08-31 00:32:33,386 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:32:33,429 : INFO : adding document #680000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:32:38,107 : INFO : discarding 35660 tokens: [('ostravan', 1), ('vigantic', 1), ('swstk', 1), ('xdustmd', 1), ('rastenn', 1), ('lp×lp', 1), ('ssshhhhhh', 1), ('xdustcdx', 1), ('assets—church', 

2018-08-31 00:33:53,280 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 800000 (=100.0%) documents
2018-08-31 00:33:55,361 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:33:55,404 : INFO : adding document #800000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:34:00,058 : INFO : discarding 34021 tokens: [('mednarodnega', 1), ('miklič', 1), ('szdl', 1), ('vitoslav', 1), ('archeologists’', 1), ('emircanov', 1), ('hacibayov', 1), ('kamanca', 1), ('migachevir', 1), ('niftaliyev', 1)]...
2018-08-31 00:34:00,058 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 810000 (=100.0%) documents
2018-08-31 00:34:02,206 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:34:02,251 : INFO : adding document #810000 to Dict

2018-08-31 00:35:19,973 : INFO : discarding 33158 tokens: [('squeezeflowmod', 1), ('yucudaa', 1), ('clariscan', 1), ('cliavist', 1), ('combidex', 1), ('endorem', 1), ('ferrotec', 1), ('feruglos', 1), ('lumirem', 1), ('resovist', 1)]...
2018-08-31 00:35:19,974 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 930000 (=100.0%) documents
2018-08-31 00:35:22,131 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:35:22,179 : INFO : adding document #930000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:35:26,527 : INFO : discarding 29712 tokens: [('bandicoot—dansu', 1), ('dinestein', 1), ('hakik', 1), ('spiralmouth', 1), ('theplay', 1), ('callisen’', 1), ('lotheissen‘', 1), ('mcevedy’', 1), ('narath’', 1), ('retrovascular', 1)]...
2018-08-31 00:35:26,527 : INFO : keeping 2000000 tokens which were in no less than 0 and no mor

2018-08-31 00:36:41,160 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:36:41,208 : INFO : adding document #1050000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:36:45,452 : INFO : discarding 34086 tokens: [('tangle—wer', 1), ('korsakoff´', 1), ('wernicke´', 1), ('fedorov‘', 1), ('sisyphan', 1), ('smartshroud', 1), ('wormcam', 1), ('modérateur', 1), ('sugrivapithecu', 1), ('yastwant', 1)]...
2018-08-31 00:36:45,453 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 1060000 (=100.0%) documents
2018-08-31 00:36:47,533 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:36:47,578 : INFO : adding document #1060000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:36:51,879 : INFO : di

2018-08-31 00:38:01,419 : INFO : discarding 27739 tokens: [('usvba', 1), ('abhiraja', 1), ('ce—or', 1), ('ce—th', 1), ('dhammavilasa', 1), ('elements—art', 1), ('generations—wa', 1), ('guards—', 1), ('hlaung', 1), ('hpyat', 1)]...
2018-08-31 00:38:01,420 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 1180000 (=100.0%) documents
2018-08-31 00:38:03,482 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:38:03,529 : INFO : adding document #1180000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:38:07,579 : INFO : discarding 29505 tokens: [('clift’', 1), ('deer’’', 1), ('kazan’', 1), ('lodan', 1), ('loden”', 1), ('music—everyth', 1), ('pictures…', 1), ('wanda”', 1), ('“kazan', 1), ('“maggie”', 1)]...
2018-08-31 00:38:07,579 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 1190000 (=100.0%) do

2018-08-31 00:39:18,999 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:39:19,047 : INFO : adding document #1300000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:39:23,319 : INFO : discarding 29336 tokens: [('hankari', 1), ('hubaira', 1), ('isma‘il', 1), ('jallaluddin', 1), ('jamiyattablighulislam', 1), ('jamālullah', 1), ('mawā', 1), ('muhiyuddin', 1), ('nausha', 1), ('naushahi', 1)]...
2018-08-31 00:39:23,320 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 1310000 (=100.0%) documents
2018-08-31 00:39:25,498 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:39:25,547 : INFO : adding document #1310000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:39:29,609 : INFO : discar

2018-08-31 00:40:39,304 : INFO : discarding 27312 tokens: [('damascus–istr', 1), ('editzioni', 1), ('grafasdiv', 1), ('jalo–giarabub', 1), ('jozza', 1), ('legionaira', 1), ('leproni', 1), ('nauagia', 1), ('scaramanzia', 1), ('thermopila', 1)]...
2018-08-31 00:40:39,304 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 1430000 (=100.0%) documents
2018-08-31 00:40:41,472 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:40:41,522 : INFO : adding document #1430000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:40:45,578 : INFO : discarding 28821 tokens: [('ardinari', 1), ('aysle—a', 1), ('below—not', 1), ('comaghaz', 1), ('cyberpapaci', 1), ('cyberpapacy—cov', 1), ('drakacanu', 1), ('ebenuscrux', 1), ('edeino', 1), ('eideno', 1)]...
2018-08-31 00:40:45,578 : INFO : keeping 2000000 tokens which were in no less than 0 and 

2018-08-31 00:41:56,987 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:41:57,037 : INFO : adding document #1550000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:42:01,166 : INFO : discarding 27786 tokens: [('malcadet', 1), ('vicalet', 1), ('agnervil', 1), ('aignervillai', 1), ('aignervillais', 1), ('berigot', 1), ('airannai', 1), ('airannais', 1), ('borgarelli', 1), ('deuzet', 1)]...
2018-08-31 00:42:01,167 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 1560000 (=100.0%) documents
2018-08-31 00:42:03,250 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:42:03,298 : INFO : adding document #1560000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:42:07,456 : INFO : discardin

2018-08-31 00:43:14,032 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 1680000 (=100.0%) documents
2018-08-31 00:43:16,121 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:43:16,169 : INFO : adding document #1680000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:43:19,338 : INFO : discarding 17143 tokens: [('platychilu', 1), ('pleurostriatu', 1), ('haddadu', 1), ('chapin—whom', 1), ('with—to', 1), ('pluvicanoru', 1), ('duffal', 1), ('strawbsweb', 1), ('coqui”', 1), ('bergwaldoffens', 1)]...
2018-08-31 00:43:19,339 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 1690000 (=100.0%) documents
2018-08-31 00:43:21,507 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:43:21,558 : INFO : adding document #16900

2018-08-31 00:44:25,173 : INFO : adding document #1800000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:44:29,124 : INFO : discarding 27925 tokens: [('bhajia', 1), ('celantro', 1), ('upvaa', 1), ('hildegun', 1), ('hungn', 1), ('ingjerd', 1), ('voktor', 1), ('øigarden', 1), ('nanoscientist', 1), ('adifor', 1)]...
2018-08-31 00:44:29,125 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 1810000 (=100.0%) documents
2018-08-31 00:44:31,298 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:44:31,348 : INFO : adding document #1810000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:44:35,098 : INFO : discarding 24224 tokens: [('pecsovszki', 1), ('peći', 1), ('sigaudi', 1), ('trémintin', 1), ('nzog', 1), ('branchiura', 1), ('allwörden', 1), ('kumminin', 1), ('brudders

2018-08-31 00:45:41,093 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 1930000 (=100.0%) documents
2018-08-31 00:45:43,286 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:45:43,337 : INFO : adding document #1930000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:45:47,114 : INFO : discarding 30737 tokens: [('thomton', 1), ('kargów', 1), ('nieciesławic', 1), ('rzędów', 1), ('sieczków', 1), ('hydroclem', 1), ('massachset', 1), ('metereau', 1), ('chotel', 1), ('gluzi', 1)]...
2018-08-31 00:45:47,115 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 1940000 (=100.0%) documents
2018-08-31 00:45:49,208 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:45:49,257 : INFO : adding document #1940000 to Dictionary(2

2018-08-31 00:46:58,502 : INFO : discarding 28306 tokens: [('askimenokonson', 1), ('naseongo', 1), ('nassanongo', 1), ('nassiongo', 1), ('nassiungo', 1), ('engen’', 1), ('missoula’', 1), ('canadiantheatr', 1), ('carmela’', 1), ('cibpa', 1)]...
2018-08-31 00:46:58,503 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 2060000 (=100.0%) documents
2018-08-31 00:47:00,600 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:47:00,649 : INFO : adding document #2060000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:47:04,558 : INFO : discarding 26218 tokens: [('faguo', 1), ('jialü', 1), ('许明龙', 1), ('黃嘉略与早期法囯汉学', 1), ('cotyttia', 1), ('cotytto', 1), ('hoshal', 1), ('mulelland', 1), ('arsv', 1), ('babáková', 1)]...
2018-08-31 00:47:04,558 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 2070000 (=100

2018-08-31 00:48:10,292 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 2180000 (=100.0%) documents
2018-08-31 00:48:12,414 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:48:12,462 : INFO : adding document #2180000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:48:16,174 : INFO : discarding 25352 tokens: [('miętki', 1), ('modryniu', 1), ('prehorył', 1), ('rulikówka', 1), ('szychowic', 1), ('korytyna', 1), ('mołodiatycz', 1), ('nieledew', 1), ('aurelin', 1), ('drohiczani', 1)]...
2018-08-31 00:48:16,174 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 2190000 (=100.0%) documents
2018-08-31 00:48:18,357 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:48:18,409 : INFO : adding document #2190000 to Dicti

2018-08-31 00:49:22,923 : INFO : adding document #2300000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:49:26,112 : INFO : discarding 18320 tokens: [('合欢皮', 1), ('大血藤', 1), ('天竺黄', 1), ('宽根藤', 1), ('山葡萄', 1), ('山麻黄', 1), ('川木通', 1), ('川贝母', 1), ('常春藤', 1), ('平贝母', 1)]...
2018-08-31 00:49:26,113 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 2310000 (=100.0%) documents
2018-08-31 00:49:28,295 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:49:28,347 : INFO : adding document #2310000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:49:31,867 : INFO : discarding 23407 tokens: [('jhanghuà', 1), ('jhúběi', 1), ('kung¹', 1), ('lan²', 1), ('lien²', 1), ('lin²', 1), ('liu⁴', 1), ('li⁴', 1), ('ma³', 1), ('miao²', 1)]...
2018-08-31 00:49:31,868 : INFO : keeping 2000

2018-08-31 00:50:37,407 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 2430000 (=100.0%) documents
2018-08-31 00:50:39,593 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:50:39,646 : INFO : adding document #2430000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:50:43,436 : INFO : discarding 26802 tokens: [('bayesloop', 1), ('cgarch', 1), ('cowpertwait', 1), ('figarch', 1), ('mann–kendal', 1), ('timeviz', 1), ('北条時頼', 1), ('yāˈqub', 1), ('إِبرَٰهِم', 1), ('إِبْنُ', 1)]...
2018-08-31 00:50:43,437 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 2440000 (=100.0%) documents
2018-08-31 00:50:45,568 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:50:45,618 : INFO : adding document #2440000 to Dictionary(20

2018-08-31 00:51:49,408 : INFO : adding document #2550000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:51:53,215 : INFO : discarding 24108 tokens: [('bonisson', 1), ('cibb', 1), ('piuri', 1), ('tzanak', 1), ('ogorek', 1), ('estirao', 1), ('dammitt', 1), ('dalimil´', 1), ('candlelightrecord', 1), ('entrancemperium', 1)]...
2018-08-31 00:51:53,216 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 2560000 (=100.0%) documents
2018-08-31 00:51:55,345 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:51:55,399 : INFO : adding document #2560000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:51:59,183 : INFO : discarding 23226 tokens: [('macilwraith', 1), ('maclink', 1), ('loueckhot', 1), ('wthout', 1), ('managlor', 1), ('pointwith', 1), ('bobmark', 1), ('mtdx', 1),

2018-08-31 00:53:04,972 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 2680000 (=100.0%) documents
2018-08-31 00:53:07,107 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:53:07,159 : INFO : adding document #2680000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:53:11,209 : INFO : discarding 28087 tokens: [('geil’', 1), ('akweathercam', 1), ('acquaverd', 1), ('doria—wher', 1), ('fassolo', 1), ('genoa–milan', 1), ('genoa–rom', 1), ('genoa–turin', 1), ('mazzucchetti', 1), ('montegalletto', 1)]...
2018-08-31 00:53:11,210 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 2690000 (=100.0%) documents
2018-08-31 00:53:13,396 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:53:13,449 : INFO : adding document #2

2018-08-31 00:54:20,207 : INFO : adding document #2800000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:54:24,168 : INFO : discarding 26926 tokens: [('ludhaina', 1), ('freeplaneport', 1), ('butzii', 1), ('island′', 1), ('wreck′', 1), ('rdasc', 1), ('wmfe', 1), ('eclaro', 1), ('eclaro’', 1), ('mbeic', 1)]...
2018-08-31 00:54:24,168 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 2810000 (=100.0%) documents
2018-08-31 00:54:26,364 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:54:26,417 : INFO : adding document #2810000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:54:30,174 : INFO : discarding 30399 tokens: [('kumbárová', 1), ('lazarchuk', 1), ('liachovičiūtė', 1), ('mossiakova', 1), ('raclavská', 1), ('sydorska', 1), ('uzhylovska', 1), ('volodymyrivna',

2018-08-31 00:55:36,844 : INFO : discarding 26899 tokens: [('gamepag', 1), ('giftabl', 1), ('sportsfriend', 1), ('tanysphyra', 1), ('surnâm', 1), ('tuhaf', 1), ('yukh', 1), ('fencholen', 1), ('mediapartn', 1), ('stuckenholtz', 1)]...
2018-08-31 00:55:36,844 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 2930000 (=100.0%) documents
2018-08-31 00:55:39,040 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:55:39,093 : INFO : adding document #2930000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:55:43,095 : INFO : discarding 35965 tokens: [('jarusombat', 1), ('phinij', 1), ('raktapongpisak', 1), ('parents—di', 1), ('scrappit', 1), ('jūliè', 1), ('unfight', 1), ('国史概要', 1), ('秦第一', 1), ('allowances—', 1)]...
2018-08-31 00:55:43,096 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 2940000 (=

2018-08-31 00:56:49,794 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 3050000 (=100.0%) documents
2018-08-31 00:56:51,998 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:56:52,052 : INFO : adding document #3050000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:56:55,792 : INFO : discarding 27065 tokens: [('lappifolia', 1), ('刘积斌', 1), ('nougat’', 1), ('nowga', 1), ('nucatu', 1), ('nucatum', 1), ('لوکا', 1), ('نوقا', 1), ('tschehr', 1), ('王亚平', 1)]...
2018-08-31 00:56:55,792 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 3060000 (=100.0%) documents
2018-08-31 00:56:57,949 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:56:58,003 : INFO : adding document #3060000 to Dictionary(2000000 unique tokens:

2018-08-31 00:58:03,576 : INFO : adding document #3170000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:58:08,376 : INFO : discarding 37593 tokens: [('cinecrack', 1), ('cinedict', 1), ('deceault', 1), ('filmspot', 1), ('kempenaar', 1), ('larsenonfilm', 1), ('belonog', 1), ('faaea', 1), ('goncharuk', 1), ('haborák', 1)]...
2018-08-31 00:58:08,377 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 3180000 (=100.0%) documents
2018-08-31 00:58:10,535 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:58:10,587 : INFO : adding document #3180000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:58:14,494 : INFO : discarding 27004 tokens: [('akbaşlı', 1), ('akçevr', 1), ('güveneroğlu', 1), ('hoşfikir', 1), ('olgun', 1), ('taviş', 1), ('ulucan', 1), ('uğurludoğan', 1), ('

2018-08-31 00:59:22,743 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 3300000 (=100.0%) documents
2018-08-31 00:59:24,918 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:59:24,972 : INFO : adding document #3300000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:59:28,775 : INFO : discarding 28970 tokens: [('d’humili', 1), ('entlarg', 1), ('l’émotion', 1), ('ablagh', 1), ('ablaghiat', 1), ('khabriyat', 1), ('rujhanaat', 1), ('zabir', 1), ('aiyekooto', 1), ('ehinlanwo', 1)]...
2018-08-31 00:59:28,775 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 3310000 (=100.0%) documents
2018-08-31 00:59:30,984 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 00:59:31,039 : INFO : adding document #3310000 to Dictionar

2018-08-31 01:00:37,428 : INFO : adding document #3420000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:00:41,245 : INFO : discarding 26003 tokens: [('rezāābād', 1), ('institute–commiss', 1), ('ebergéni', 1), ('ebergényi', 1), ('bardvāl', 1), ('bardwāl', 1), ('bārd', 1), ('oldest—perman', 1), ('defensism”', 1), ('afirmo', 1)]...
2018-08-31 01:00:41,246 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 3430000 (=100.0%) documents
2018-08-31 01:00:43,452 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:00:43,508 : INFO : adding document #3430000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:00:47,314 : INFO : discarding 26749 tokens: [('altrokradio', 1), ('djjd', 1), ('metallicav', 1), ('nuclearrockradio', 1), ('thepenguinrock', 1), ('computer—thu', 1), ('nec

2018-08-31 01:01:53,226 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 3550000 (=100.0%) documents
2018-08-31 01:01:55,440 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:01:55,495 : INFO : adding document #3550000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:01:59,073 : INFO : discarding 23115 tokens: [('gananita', 1), ('kabushia', 1), ('shereyk', 1), ('ethosc', 1), ('akedami', 1), ('asifia', 1), ('charkaman', 1), ('deccanwood', 1), ('hitex', 1), ('kutubkhana', 1)]...
2018-08-31 01:01:59,074 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 3560000 (=100.0%) documents
2018-08-31 01:02:01,248 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:02:01,301 : INFO : adding document #3560000 to Dictionary(20

2018-08-31 01:03:09,401 : INFO : discarding 26999 tokens: [('crowdcube’', 1), ('lovespac', 1), ('annaházi', 1), ('belorv', 1), ('bántalmak', 1), ('coronariaspazmu', 1), ('czapf', 1), ('diagnózi', 1), ('döntően', 1), ('esophagocardiac', 1)]...
2018-08-31 01:03:09,401 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 3680000 (=100.0%) documents
2018-08-31 01:03:11,588 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:03:11,641 : INFO : adding document #3680000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:03:15,426 : INFO : discarding 29512 tokens: [('rupayan', 1), ('sohrawardi', 1), ('absec', 1), ('boodjari', 1), ('munyarryun', 1), ('hairoddin', 1), ('cudi—', 1), ('baupolizeilich', 1), ('stadterweiterungen', 1), ('roengpithya', 1)]...
2018-08-31 01:03:15,426 : INFO : keeping 2000000 tokens which were in no less than 0

2018-08-31 01:04:23,828 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:04:23,881 : INFO : adding document #3800000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:04:27,657 : INFO : discarding 26588 tokens: [('pinellas”', 1), ('metapsíquico', 1), ('malhstedt', 1), ('cutston', 1), ('astodia', 1), ('hargovanda', 1), ('lakhmichand', 1), ('nathiba', 1), ('nhlmmc', 1), ('saraspur', 1)]...
2018-08-31 01:04:27,658 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 3810000 (=100.0%) documents
2018-08-31 01:04:29,894 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:04:29,949 : INFO : adding document #3810000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:04:33,823 : INFO : discarding 

2018-08-31 01:05:39,034 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 3930000 (=100.0%) documents
2018-08-31 01:05:41,269 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:05:41,324 : INFO : adding document #3930000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:05:44,831 : INFO : discarding 23651 tokens: [('premur', 1), ('prevajalec', 1), ('prevodoma', 1), ('produkciji', 1), ('protitok', 1), ('provincialn', 1), ('raziskovalno', 1), ('različni', 1), ('razsvetljenska', 1), ('reakcionarni', 1)]...
2018-08-31 01:05:44,832 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 3940000 (=100.0%) documents
2018-08-31 01:05:47,020 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:05:47,073 : INFO : adding document #

2018-08-31 01:06:51,173 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:06:51,228 : INFO : adding document #4050000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:06:54,963 : INFO : discarding 25795 tokens: [('hodzyur', 1), ('krasnozhan', 1), ('utsiev', 1), ('affelai', 1), ('choutesioti', 1), ('masuaku', 1), ('strezo', 1), ('pangratio', 1), ('alebrini', 1), ('thadeyn', 1)]...
2018-08-31 01:06:54,964 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 4060000 (=100.0%) documents
2018-08-31 01:06:57,160 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:06:57,214 : INFO : adding document #4060000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:07:00,990 : INFO : discarding 24945 to

2018-08-31 01:08:07,053 : INFO : discarding 32837 tokens: [('busiess', 1), ('duhoki', 1), ('mehoderet', 1), ('zilberstien', 1), ('“barghouti', 1), ('grosswig', 1), ('beninensi', 1), ('beninensis”', 1), ('pflanzenfam', 1), ('pflanzenw', 1)]...
2018-08-31 01:08:07,053 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 4180000 (=100.0%) documents
2018-08-31 01:08:09,254 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:08:09,310 : INFO : adding document #4180000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:08:13,016 : INFO : discarding 26977 tokens: [('adefesio', 1), ('bellesa', 1), ('bitó', 1), ('cdgc', 1), ('coratg', 1), ('dispersión', 1), ('dramàtic', 1), ('poliorama', 1), ('reposición', 1), ('ridículo', 1)]...
2018-08-31 01:08:13,017 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 41900

2018-08-31 01:09:21,533 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:09:21,587 : INFO : adding document #4300000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:09:25,439 : INFO : discarding 26271 tokens: [('viestintä', 1), ('shinchireem', 1), ('polyromant', 1), ('alhathloul', 1), ('देओराई', 1), ('abbandon', 1), ('cicarimanah', 1), ('cilopang', 1), ('pamulihan', 1), ('ruhatma', 1)]...
2018-08-31 01:09:25,440 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 4310000 (=100.0%) documents
2018-08-31 01:09:27,664 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:09:27,720 : INFO : adding document #4310000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:09:31,502 : INFO : discardi

2018-08-31 01:10:37,410 : INFO : discarding 25416 tokens: [('udadh', 1), ('umaisi', 1), ('withelector', 1), ('yadlaf', 1), ('yahzin', 1), ('yalhan', 1), ('keïta’', 1), ('millenniumsprei', 1), ('“schicksal', 1), ('biotinctur', 1)]...
2018-08-31 01:10:37,410 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 4430000 (=100.0%) documents
2018-08-31 01:10:39,643 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:10:39,699 : INFO : adding document #4430000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:10:43,517 : INFO : discarding 25712 tokens: [('babies—no', 1), ('രണ്ടാമൂഴം', 1), ('mudhakkirâtî', 1), ('haunt–', 1), ('羊をめぐる冒険', 1), ('hiáni', 1), ('年のピンボール', 1), ('bouzaidi', 1), ('fishbook', 1), ('prusakowski', 1)]...
2018-08-31 01:10:43,518 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 4440000

2018-08-31 01:11:52,033 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:11:52,089 : INFO : adding document #4550000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:11:55,843 : INFO : discarding 27946 tokens: [('aristri', 1), ('bitú', 1), ('contratado', 1), ('criou', 1), ('indivisível', 1), ('nosgenti', 1), ('storiei', 1), ('himmeltårnet', 1), ('mannfolk', 1), ('synker', 1)]...
2018-08-31 01:11:55,843 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 4560000 (=100.0%) documents
2018-08-31 01:11:58,057 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:11:58,114 : INFO : adding document #4560000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:12:01,790 : INFO : discarding 25430 to

2018-08-31 01:13:06,836 : INFO : discarding 30694 tokens: [('karisalkalampatti', 1), ('karisalkallampatti', 1), ('maikandan', 1), ('prednli', 1), ('rayapalayam', 1), ('sengapadai', 1), ('sivarakkottai', 1), ('sivarakottai', 1), ('sundrav', 1), ('surayi', 1)]...
2018-08-31 01:13:06,837 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 4680000 (=100.0%) documents
2018-08-31 01:13:09,061 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:13:09,116 : INFO : adding document #4680000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:13:12,581 : INFO : discarding 22690 tokens: [('dunnamona', 1), ('owenacharra', 1), ('forsgrini', 1), ('kyranak', 1), ('chemicæ', 1), ('purgantibu', 1), ('as“…an', 1), ('chemicals”', 1), ('cradle”', 1), ('smm’', 1)]...
2018-08-31 01:13:12,582 : INFO : keeping 2000000 tokens which were in no less than

2018-08-31 01:14:17,564 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 4800000 (=100.0%) documents
2018-08-31 01:14:19,802 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:14:19,856 : INFO : adding document #4800000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:14:23,379 : INFO : discarding 27394 tokens: [('orders—list', 1), ('biis', 1), ('boncouer', 1), ('conhabit', 1), ('strips–', 1), ('aaron—top', 1), ('brafasco', 1), ('thughliph', 1), ('alsohold', 1), ('hospitals—credit', 1)]...
2018-08-31 01:14:23,380 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 4810000 (=100.0%) documents
2018-08-31 01:14:25,637 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:14:25,693 : INFO : adding document #4810000 to D

2018-08-31 01:15:29,490 : INFO : adding document #4920000 to Dictionary(2000000 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...)
2018-08-31 01:15:30,267 : INFO : built Dictionary(2010258 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...) from 4924894 documents (total 1456741401 corpus positions)
2018-08-31 01:15:30,268 : INFO : saving Dictionary object under wiki.dict, separately None
2018-08-31 01:15:31,235 : INFO : saved wiki.dict


In [None]:
dictionary = Dictionary.load('wiki.dict')
dictionary.filter_extremes()
dictionary.compactify()

2018-08-31 01:15:31,248 : INFO : loading Dictionary object from wiki.dict
2018-08-31 01:15:32,160 : INFO : loaded wiki.dict
2018-08-31 01:15:34,487 : INFO : discarding 1910258 tokens: [('abdelrahim', 49), ('abstention', 120), ('anarcha', 101), ('anarchica', 40), ('anarchosyndicalist', 20), ('antimilitar', 68), ('arbet', 194), ('archo', 100), ('arkhē', 5), ('autonomedia', 118)]...
2018-08-31 01:15:34,488 : INFO : keeping 100000 tokens which were in no less than 5 and no more than 2462447 (=50.0%) documents
2018-08-31 01:15:34,850 : INFO : resulting dictionary: Dictionary(100000 unique tokens: ['abandon', 'abil', 'abl', 'abolit', 'abstent']...)


In [202]:
import random

class RandomCorpus(MmCorpus):
    def __init__(self, *args, random_state, **kwargs):
        super().__init__(*args, **kwargs)

    def __iter__(self):
        random.seed(42)
        
        shuffled_indices = list(range(self.num_docs))
        random.shuffle(shuffled_indices)
        
        for doc_id in shuffled_indices:
            yield self[doc_id]

In [None]:
corpus = (
    dictionary.doc2bow(article)
    for article
    in get_preprocessed_articles('wiki_articles.jsonlines')
)

RandomCorpus.serialize('wiki.mm')

2018-08-31 02:09:12,249 : INFO : storing corpus in Matrix Market format to wiki.mm
2018-08-31 02:09:12,424 : INFO : saving sparse matrix to wiki.mm


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

2018-08-31 02:09:12,447 : INFO : PROGRESS: saving document #0
2018-08-31 02:09:14,303 : INFO : PROGRESS: saving document #1000
2018-08-31 02:09:15,963 : INFO : PROGRESS: saving document #2000
2018-08-31 02:09:17,822 : INFO : PROGRESS: saving document #3000
2018-08-31 02:09:19,552 : INFO : PROGRESS: saving document #4000
2018-08-31 02:09:21,256 : INFO : PROGRESS: saving document #5000
2018-08-31 02:09:23,119 : INFO : PROGRESS: saving document #6000
2018-08-31 02:09:25,152 : INFO : PROGRESS: saving document #7000
2018-08-31 02:09:27,166 : INFO : PROGRESS: saving document #8000
2018-08-31 02:09:28,840 : INFO : PROGRESS: saving document #9000
2018-08-31 02:09:30,784 : INFO : PROGRESS: saving document #10000
2018-08-31 02:09:32,637 : INFO : PROGRESS: saving document #11000
2018-08-31 02:09:34,379 : INFO : PROGRESS: saving document #12000
2018-08-31 02:09:36,092 : INFO : PROGRESS: saving document #13000
2018-08-31 02:09:37,861 : INFO : PROGRESS: saving document #14000
2018-08-31 02:09:39,705

2018-08-31 02:11:22,868 : INFO : PROGRESS: saving document #124000
2018-08-31 02:11:23,814 : INFO : PROGRESS: saving document #125000
2018-08-31 02:11:24,801 : INFO : PROGRESS: saving document #126000
2018-08-31 02:11:25,360 : INFO : PROGRESS: saving document #127000
2018-08-31 02:11:26,211 : INFO : PROGRESS: saving document #128000
2018-08-31 02:11:27,220 : INFO : PROGRESS: saving document #129000
2018-08-31 02:11:28,276 : INFO : PROGRESS: saving document #130000
2018-08-31 02:11:29,222 : INFO : PROGRESS: saving document #131000
2018-08-31 02:11:30,264 : INFO : PROGRESS: saving document #132000
2018-08-31 02:11:31,561 : INFO : PROGRESS: saving document #133000
2018-08-31 02:11:32,560 : INFO : PROGRESS: saving document #134000
2018-08-31 02:11:33,806 : INFO : PROGRESS: saving document #135000
2018-08-31 02:11:35,091 : INFO : PROGRESS: saving document #136000
2018-08-31 02:11:36,016 : INFO : PROGRESS: saving document #137000
2018-08-31 02:11:37,006 : INFO : PROGRESS: saving document #13

In [206]:
corpus = RandomCorpus('wiki.mm', random_state=42)

2018-08-31 17:42:54,211 : INFO : loaded corpus index from wiki.mm.index
2018-08-31 17:42:54,211 : INFO : initializing cython corpus reader from wiki.mm
2018-08-31 17:42:54,212 : INFO : accepted corpus with 4924894 documents, 100000 features, 683375728 non-zero entries


In [207]:
PASSES = 2

training_params = dict(
    chunksize=2000,
    num_topics=100,
    id2word=dictionary
)

In [187]:
%%time

gensim_nmf = Nmf(**training_params)

for pass_ in range(PASSES):
    gensim_nmf.update(corpus_iter())
    gensim_nmf.save('nmf_%s.model' % pass_)

2018-08-31 17:17:30,351 : INFO : Loss (no outliers): 2367.6889968919445	Loss (with outliers): 2367.6889968919445
2018-08-31 17:18:59,163 : INFO : Loss (no outliers): 1854.0984036987372	Loss (with outliers): 1854.0984036987372
2018-08-31 17:20:26,149 : INFO : Loss (no outliers): 2345.1193574171775	Loss (with outliers): 2345.1193574171775


KeyboardInterrupt: 

In [188]:
gensim_nmf = Nmf.load('nmf_0.model')

2018-08-31 17:21:18,926 : INFO : loading Nmf object from nmf_0.model
2018-08-31 17:21:19,238 : INFO : loading id2word recursively from nmf_0.model.id2word.* with mmap=None
2018-08-31 17:21:19,240 : INFO : loading _r from nmf_0.model._r.npy with mmap=None
2018-08-31 17:21:19,625 : INFO : loaded nmf_0.model


In [189]:
gensim_nmf.show_topics(20)

[(0,
  '0.023*"film" + 0.017*"documentari" + 0.011*"best" + 0.010*"product" + 0.009*"award" + 0.008*"nomin" + 0.007*"colombian" + 0.006*"time" + 0.006*"director" + 0.005*"american"'),
 (1,
  '0.039*"game" + 0.023*"team" + 0.022*"vike" + 0.020*"season" + 0.012*"win" + 0.012*"playoff" + 0.010*"goal" + 0.010*"plai" + 0.009*"year" + 0.008*"score"'),
 (2,
  '0.033*"air" + 0.022*"squadron" + 0.021*"march" + 0.015*"wing" + 0.012*"unit" + 0.011*"servic" + 0.011*"base" + 0.011*"forc" + 0.011*"train" + 0.010*"command"'),
 (3,
  '0.018*"seri" + 0.016*"game" + 0.012*"point" + 0.011*"race" + 0.009*"group" + 0.008*"win" + 0.007*"final" + 0.007*"goal" + 0.007*"car" + 0.007*"score"'),
 (4,
  '0.012*"group" + 0.011*"team" + 0.010*"men" + 0.009*"women" + 0.008*"rank" + 0.008*"german" + 0.008*"resist" + 0.007*"final" + 0.007*"point" + 0.007*"event"'),
 (5,
  '0.016*"colorado" + 0.016*"histori" + 0.015*"type" + 0.010*"function" + 0.008*"class" + 0.008*"std" + 0.007*"int" + 0.007*"car" + 0.007*"constructor

In [190]:
%%time

gensim_lda = LdaModel(**training_params)

for pass_ in range(PASSES):
    gensim_lda.update(corpus_iter())
    gensim_lda.save('lda_%s.model' % pass_)

2018-08-31 17:21:19,680 : INFO : using symmetric alpha at 0.05
2018-08-31 17:21:19,681 : INFO : using symmetric eta at 0.05
2018-08-31 17:21:19,694 : INFO : using serial LDA version on this node


KeyboardInterrupt: 

In [191]:
lda.show_topics(20)

[(0,
  '0.040*"anim" + 0.039*"scienc" + 0.033*"actual" + 0.029*"sens" + 0.025*"write" + 0.020*"translat" + 0.020*"matter" + 0.019*"observ" + 0.018*"formal" + 0.017*"activ"'),
 (1,
  '0.348*"letter" + 0.064*"capit" + 0.055*"write" + 0.036*"sign" + 0.028*"distinguish" + 0.027*"size" + 0.027*"commonli" + 0.027*"round" + 0.027*"semi" + 0.019*"earliest"'),
 (2,
  '0.016*"actual" + 0.013*"scienc" + 0.013*"anim" + 0.010*"activ" + 0.010*"sens" + 0.009*"earth" + 0.009*"write" + 0.009*"translat" + 0.009*"observ" + 0.008*"formal"'),
 (3,
  '0.016*"court" + 0.011*"pass" + 0.011*"presid" + 0.010*"anim" + 0.010*"support" + 0.010*"congress" + 0.009*"write" + 0.009*"armi" + 0.009*"movement" + 0.009*"held"'),
 (4,
  '0.082*"death" + 0.074*"island" + 0.052*"kill" + 0.034*"figur" + 0.030*"sea" + 0.028*"king" + 0.023*"charact" + 0.023*"daughter" + 0.019*"stori" + 0.017*"give"'),
 (5,
  '0.058*"court" + 0.026*"cultur" + 0.026*"rate" + 0.024*"pass" + 0.023*"histor" + 0.022*"oper" + 0.022*"increas" + 0.017*"