In [33]:
from gensim.utils import smart_open, simple_preprocess
from gensim.corpora.wikicorpus import _extract_pages, filter_wiki
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models import LdaModel

import logging
import itertools

import numpy as np
import gensim
import json

In [28]:
# Input: Wikipedia XML dump (bz2-compressed, judging by the file extension)
WIKI_CORPUS_PATH = './data/simplewiki-20171020-pages-articles-multistream.xml.bz2'
# Outputs: text dump of the id2word dictionary and the bag-of-words corpus file
WIKI_DICT_PATH = 'lda/id2word_wiki.txt'
WIKI_BOW_FILE = 'lda/wiki_bow.mm'
NO_BELOW = 20 # drop words that appear in fewer than this many documents
NO_ABOVE_PCT = 0.1 # drop words that appear in more than this *fraction* of documents (0.1 == 10%, despite the "PCT" name)

# Path the trained LDA model is saved under
LDA_SAVE_FILE = 'lda/lda_model'

In [4]:
# Show gensim's progress messages (INFO and above) in the notebook output.
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
# ipython sometimes messes up the logging setup (basicConfig does nothing when the
# root logger already has handlers); restore the level through the public API so
# that any cached level checks on the logger are reset as well
logging.root.setLevel(logging.INFO)

def head(stream, n=10):
    """Return a plain list holding at most the first `n` items of `stream`.

    `stream` may be any iterable; it is consumed only as far as needed.
    """
    leading_items = itertools.islice(stream, n)
    return list(leading_items)


def tokenize(text):
    """Split `text` with gensim's `simple_preprocess` and drop every token found in `STOPWORDS`.

    Returns the surviving tokens as a list, in their original order.
    """
    kept = []
    for word in simple_preprocess(text):
        if word in STOPWORDS:
            continue
        kept.append(word)
    return kept

def iter_wiki(dump_file):
    """Yield each article from the Wikipedia dump, as a `(title, tokens)` 2-tuple.

    Parameters:
        dump_file: path of the Wikipedia dump; opened with `smart_open`.

    Yields:
        `(title, tokens)` where `tokens` is the result of `tokenize` applied to
        the `filter_wiki`-cleaned article text.

    Pages whose title starts with one of the meta namespaces below, and
    articles with fewer than 50 tokens, are skipped.
    """
    ignore_prefixes = tuple(
        ns + ':'
        for ns in 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
    )
    # The `with` block closes the dump file once the generator is exhausted,
    # closed, or garbage-collected, instead of leaking the handle.
    with smart_open(dump_file) as dump:
        for title, text, pageid in _extract_pages(dump):  # pageid is not used
            # Cheap title check first, so meta-articles never reach the
            # (much more expensive) markup filtering and tokenization.
            if title.startswith(ignore_prefixes):
                continue
            tokens = tokenize(filter_wiki(text))
            if len(tokens) < 50:
                continue  # ignore short articles
            yield title, tokens

In [6]:
# Print the title and the first ten tokens of the first 8 articles as an example
stream = iter_wiki(WIKI_CORPUS_PATH)
# Iterate over `stream` itself rather than opening a second, redundant generator
for title, tokens in itertools.islice(stream, 8):
    # Formatted through print(...) so the line reads the same on Python 2 and 3
    print('%s %s' % (title, tokens[:10]))

April [u'april', u'th', u'month', u'year', u'comes', u'march', u'months', u'days', u'april', u'begins']
August [u'august', u'aug', u'th', u'month', u'year', u'gregorian', u'calendar', u'coming', u'july', u'september']
Art [u'painting', u'renoir', u'work', u'art', u'art', u'creative', u'activity', u'people', u'people', u'called']
A [u'page', u'letter', u'alphabet', u'indefinite', u'article', u'article', u'grammar', u'uses', u'disambiguation', u'thumb']
Air [u'air', u'fan', u'air', u'air', u'earth', u'atmosphere', u'air', u'mixture', u'gases', u'dust']
Autonomous communities of Spain [u'spain', u'divided', u'parts', u'called', u'autonomous', u'communities', u'autonomous', u'means', u'autonomous', u'communities']
Alan Turing [u'statue', u'alan', u'turing', u'rebuild', u'machine', u'alan', u'turing', u'alan', u'mathison', u'turing']
Alanis Morissette [u'alanis', u'nadine', u'morissette', u'born', u'june', u'grammy', u'award', u'winning', u'canadian', u'american']


### Make dictionary id2word using wikicorpus

In [8]:
# Lazily stream only the token lists (titles are dropped); this is a one-shot generator
doc_stream = (article_tokens for _title, article_tokens in iter_wiki(WIKI_CORPUS_PATH))

In [9]:
# Build the id2word dictionary from the streamed token lists.
# NOTE: `doc_stream` is a generator expression, so this call consumes it; re-create
# `doc_stream` before re-running this cell (an exhausted stream would presumably
# produce an empty dictionary).
%time id2word_wiki = gensim.corpora.Dictionary(doc_stream)
print(id2word_wiki)

INFO : adding document #0 to Dictionary(0 unique tokens: [])
INFO : adding document #10000 to Dictionary(156775 unique tokens: [u'fawn', u'\u03c9\u0431\u0440\u0430\u0434\u043e\u0432\u0430\u043d\u043d\u0430\u0467', u'vang', u'yollar\u0131', u'idaira']...)
INFO : adding document #20000 to Dictionary(232594 unique tokens: [u'biennials', u'sowela', u'tsukino', u'clottes', u'refreshable']...)
INFO : adding document #30000 to Dictionary(292328 unique tokens: [u'biennials', u'sowela', u'tsukino', u'clottes', u'klatki']...)
INFO : adding document #40000 to Dictionary(368454 unique tokens: [u'biennials', u'sowela', u'biysk', u'sermersheim', u'wooda']...)
INFO : adding document #50000 to Dictionary(416045 unique tokens: [u'biennials', u'sowela', u'biysk', u'sermersheim', u'wooda']...)
INFO : adding document #60000 to Dictionary(454336 unique tokens: [u'biennials', u'sowela', u'biysk', u'sermersheim', u'wooda']...)
INFO : built Dictionary(461803 unique tokens: [u'biennials', u'sowela', u'biysk', 

CPU times: user 4min 18s, sys: 2.13 s, total: 4min 20s
Wall time: 4min 20s
Dictionary(461803 unique tokens: [u'biennials', u'sowela', u'biysk', u'sermersheim', u'wooda']...)


In [11]:
# Drop words that appear in fewer than NO_BELOW documents or in more than a
# NO_ABOVE_PCT fraction of all documents.
# NOTE: this modifies `id2word_wiki` in place, so the unfiltered dictionary is no
# longer available under that name afterwards.
id2word_wiki.filter_extremes(no_below=NO_BELOW, no_above=NO_ABOVE_PCT)
print(id2word_wiki)

INFO : discarding 431266 tokens: [(u'th', 10590), (u'alvares', 3), (u'large', 6446), (u'second', 9320), (u'new', 16522), (u'landmine', 8), (u'use', 7731), (u'peary', 14), (u'mswati', 7), (u'known', 16816)]...
INFO : keeping 30537 tokens which were in no less than 20 and no more than 6141 (=10.0%) documents
INFO : resulting dictionary: Dictionary(30537 unique tokens: [u'fawn', u'schlegel', u'sonja', u'woods', u'spiders']...)


Dictionary(30537 unique tokens: [u'fawn', u'schlegel', u'sonja', u'woods', u'spiders']...)


In [17]:
# Persist the (filtered) id2word dictionary as a text file at WIKI_DICT_PATH
id2word_wiki.save_as_text(WIKI_DICT_PATH)

# To reload it in a later session:
# from gensim.corpora import Dictionary
# loaded_dct = Dictionary.load_from_text(WIKI_DICT_PATH)

INFO : saving dictionary mapping to data/id2word_wiki.txt


### Vectorize the data

In [21]:
class WikiCorpus(object):
    """Stream a Wikipedia dump as bag-of-words vectors.

    Iterating over an instance parses `dump_file` with `iter_wiki` and yields
    `dictionary.doc2bow(tokens)` for each article in turn. The titles of the
    articles seen during the most recent iteration are collected in
    `self.titles`, in the same order as the yielded vectors.
    """

    def __init__(self, dump_file, dictionary, clip_docs=None):
        """
        Parameters:
            dump_file: path of the Wikipedia dump, passed on to `iter_wiki`.
            dictionary: object whose `doc2bow(tokens)` converts a token list
                into a bag-of-words vector.
            clip_docs: use only the first `clip_docs` articles; `None` (the
                default) means no limit.
        """
        self.dump_file = dump_file
        self.dictionary = dictionary
        self.clip_docs = clip_docs
        # Defined up front so the attribute exists before the first iteration.
        self.titles = []
        # Cached document count for the unclipped case; filled in lazily.
        self._num_docs = None

    def __iter__(self):
        """Yield one bag-of-words vector per article, refreshing `self.titles`."""
        # Bind a local list so the final count belongs to this pass even if
        # another iteration is started on the same instance in the meantime.
        titles = self.titles = []
        for title, tokens in itertools.islice(iter_wiki(self.dump_file), self.clip_docs):
            titles.append(title)
            yield self.dictionary.doc2bow(tokens)
        # Only reached when the stream was consumed to the end.
        if self.clip_docs is None:
            self._num_docs = len(titles)

    def __len__(self):
        """Return the number of documents in the corpus.

        With `clip_docs` set, that value is returned unchanged. Otherwise the
        documents are counted (one full pass over the dump, unless a completed
        iteration already recorded the count) and the result is cached, so
        `len()` no longer fails with a TypeError on an unclipped corpus.
        """
        if self.clip_docs is not None:
            return self.clip_docs
        if self._num_docs is None:
            self._num_docs = sum(1 for _ in iter_wiki(self.dump_file))
        return self._num_docs

# create a stream of bag-of-words vectors (clip_docs is left at its default of None, i.e. no document limit)
wiki_corpus = WikiCorpus(WIKI_CORPUS_PATH, id2word_wiki)
# Smoke test: pull just the first vector out of the stream
vector = next(iter(wiki_corpus))
# print(vector)  # print the first vector in the stream

In [23]:
# Store the bag-of-words corpus in WIKI_BOW_FILE (Matrix Market format, per the
# log output) so that later cells can read it from disk instead of re-parsing the dump
%time gensim.corpora.MmCorpus.serialize(WIKI_BOW_FILE, wiki_corpus)

INFO : storing corpus in Matrix Market format to data/wiki_bow.mm
INFO : saving sparse matrix to data/wiki_bow.mm
INFO : PROGRESS: saving document #0
INFO : PROGRESS: saving document #1000
INFO : PROGRESS: saving document #2000
INFO : PROGRESS: saving document #3000
INFO : PROGRESS: saving document #4000
INFO : PROGRESS: saving document #5000
INFO : PROGRESS: saving document #6000
INFO : PROGRESS: saving document #7000
INFO : PROGRESS: saving document #8000
INFO : PROGRESS: saving document #9000
INFO : PROGRESS: saving document #10000
INFO : PROGRESS: saving document #11000
INFO : PROGRESS: saving document #12000
INFO : PROGRESS: saving document #13000
INFO : PROGRESS: saving document #14000
INFO : PROGRESS: saving document #15000
INFO : PROGRESS: saving document #16000
INFO : PROGRESS: saving document #17000
INFO : PROGRESS: saving document #18000
INFO : PROGRESS: saving document #19000
INFO : PROGRESS: saving document #20000
INFO : PROGRESS: saving document #21000
INFO : PROGRESS: sa

CPU times: user 4min 40s, sys: 2.13 s, total: 4min 43s
Wall time: 4min 43s


In [29]:
# Load the serialized bag-of-words corpus back from WIKI_BOW_FILE
mm_corpus = gensim.corpora.MmCorpus(WIKI_BOW_FILE)
print(mm_corpus)

INFO : loaded corpus index from lda/wiki_bow.mm.index
INFO : initializing corpus reader from lda/wiki_bow.mm
INFO : accepted corpus with 61418 documents, 30537 features, 5967192 non-zero entries


MmCorpus(61418 documents, 30537 features, 5967192 non-zero entries)


### Training LDA

In [30]:
clipped_corpus = gensim.utils.ClippedCorpus(mm_corpus, 4000)  # use fewer documents during training, LDA is slow
%time lda_model = gensim.models.LdaModel(clipped_corpus, num_topics=100, id2word=id2word_wiki, passes=4)

INFO : using symmetric alpha at 0.01
INFO : using symmetric eta at 3.27471591839e-05
INFO : using serial LDA version on this node
INFO : running online (multi-pass) LDA training, 100 topics, 4 passes over the supplied corpus of 4000 documents, updating model once every 2000 documents, evaluating perplexity every 4000 documents, iterating 50x with a convergence threshold of 0.001000
INFO : PROGRESS: pass 0, at document #2000/4000
INFO : merging changes from 2000 documents into a model of 4000 documents
INFO : topic #25 (0.010): 0.007*"february" + 0.004*"hex" + 0.004*"mass" + 0.003*"rgb" + 0.003*"color" + 0.003*"november" + 0.003*"music" + 0.003*"indigo" + 0.003*"france" + 0.003*"december"
INFO : topic #38 (0.010): 0.004*"jpg" + 0.004*"image" + 0.004*"league" + 0.004*"cell" + 0.004*"water" + 0.003*"cells" + 0.003*"aluminium" + 0.003*"food" + 0.003*"countries" + 0.003*"premier"
INFO : topic #17 (0.010): 0.005*"moon" + 0.004*"serbia" + 0.004*"copper" + 0.003*"ii" + 0.003*"degree" + 0.003*"

CPU times: user 11min 13s, sys: 13.7 s, total: 11min 26s
Wall time: 3min 39s


In [31]:
# Save the trained LDA model under LDA_SAVE_FILE
# (per the log output, companion files such as `.state` and `.expElogbeta.npy` are written next to it)
lda_model.save(LDA_SAVE_FILE)

INFO : saving LdaState object under lda/lda_model.state, separately None
INFO : saved lda/lda_model.state
INFO : saving LdaModel object under lda/lda_model, separately ['expElogbeta', 'sstats']
INFO : not storing attribute id2word
INFO : storing np array 'expElogbeta' to lda/lda_model.expElogbeta.npy
INFO : not storing attribute state
INFO : not storing attribute dispatcher
INFO : saved lda/lda_model


In [37]:
# Optional: load the saved LDA model from LDA_SAVE_FILE (left disabled here)
# loaded_lda_model = LdaModel.load(LDA_SAVE_FILE)

### Transforming new documents

In [38]:
# Example sentence
text = "A blood cell, also called a hematocyte, is a cell produced by hematopoiesis and normally found in blood."

# transform text into the bag-of-words space
bow_vector = id2word_wiki.doc2bow(tokenize(text))
# Show each (word, count) pair. The loop variable is `token_id` rather than `id`:
# on Python 2 a list-comprehension variable leaks into the enclosing scope and
# would shadow the builtin `id` for the rest of the session.
print([(id2word_wiki[token_id], count) for token_id, count in bow_vector])

[(u'blood', 2), (u'normally', 1), (u'produced', 1), (u'cell', 2)]


In [39]:
# Project the bag-of-words vector into LDA topic space
lda_vector = lda_model[bow_vector]
print(lda_vector)
# Pick the (topic id, weight) pair with the largest weight and print that topic
top_topic_id, top_topic_weight = max(lda_vector, key=lambda pair: pair[1])
print(lda_model.print_topic(top_topic_id))

[(68, 0.85857142857131141)]
0.016*"disease" + 0.016*"person" + 0.014*"cells" + 0.011*"body" + 0.009*"cause" + 0.008*"cell" + 0.008*"blood" + 0.007*"diseases" + 0.007*"symptoms" + 0.006*"bacteria"
