# Gensim example on Simple Wikipedia

Based on [this example](https://radimrehurek.com/topic_modeling_tutorial/2%20-%20Topic%20Modeling.html).

In [1]:
import os
import sys
import logging
import warnings
import copy
import re
import json
import tarfile
import itertools
import numpy as np

import gensim
# Uncomment to print Gensim log messages
# logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
# logging.root.level = logging.INFO

warnings.filterwarnings('ignore')
np.random.seed(42)

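Load Wikipedia data. The Simple Wikipedia corpus can be [downloaded here](https://dumps.wikimedia.org/simplewiki/latest/); save `simplewiki-latest-pages-articles.xml.bz2` in the `./data/` directory so that the path used below resolves.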

In [2]:
# Load data
from gensim.utils import smart_open
from gensim.corpora.wikicorpus import _extract_pages, filter_wiki

DATA_PATH = './data/simplewiki-latest-pages-articles.xml.bz2'

def iter_wiki(path=None):
    """Yield ``(title, text)`` for every page found in the Wikipedia dump.

    Each page's text is passed through ``filter_wiki`` before it is yielded;
    the page id returned by ``_extract_pages`` is discarded.

    Parameters
    ----------
    path : str, optional
        Location of the dump file. Defaults to ``DATA_PATH``.

    Yields
    ------
    tuple
        ``(title, filtered_text)`` for each page.
    """
    dump_path = DATA_PATH if path is None else path
    # Open the dump inside a context manager so the handle is released when
    # iteration finishes or the generator is closed, rather than being leaked.
    with smart_open(dump_path) as dump_file:
        for title, text, pageid in _extract_pages(dump_file):
            yield title, filter_wiki(text)

# Keep the first three articles around as small examples and preview each one
test_examples = list(itertools.islice(iter_wiki(), 3))
for title, text in test_examples:
    preview = repr(text[:50])
    print('Title: ', title, ', text: len:', len(text), ', sample: ', preview)

Title:  April , text: len: 17304 , sample:  "\n'''April''' is the 4th month of the year, and com"
Title:  August , text: len: 9896 , sample:  "\n'''August''' (Aug.) is the 8th month of the year "
Title:  Art , text: len: 5289 , sample:  "A painting by Renoir which is a work of art.\n\n'''A"


Tokenize and create a Bag-Of-Words dictionary from the tokens.

In [3]:
# Tokenize
from gensim.parsing.preprocessing import STOPWORDS

def tokenize(text):
    """Split ``text`` into tokens and drop stop words.

    Tokens come from ``gensim.utils.tokenize`` called with ``lowercase=True``
    and ``deacc=True``; any token contained in ``STOPWORDS`` is removed.
    Returns the remaining tokens as a list, in their original order.
    """
    raw_tokens = gensim.utils.tokenize(text, lowercase=True, deacc=True)
    return list(filter(lambda tok: tok not in STOPWORDS, raw_tokens))

# Show the first five tokens produced for each sample article
for _title, article_text in test_examples:
    leading_tokens = tokenize(article_text)[:5]
    print('Tokens sample: ', leading_tokens)

Tokens sample:  ['april', 'th', 'month', 'year', 'comes']
Tokens sample:  ['august', 'aug', 'th', 'month', 'year']
Tokens sample:  ['painting', 'renoir', 'work', 'art', 'art']


In [4]:
# Create a dictionary (This wil take a couple of minutes)
doc_stream = (tokenize(text) for title, text in iter_wiki())
%time id2word_wiki = gensim.corpora.Dictionary(doc_stream)
print('id2word_wiki: ', id2word_wiki)

CPU times: user 3min 42s, sys: 570 ms, total: 3min 42s
Wall time: 3min 42s
id2word_wiki:  Dictionary(547548 unique tokens: ['april', 'th', 'month', 'year', 'comes']...)


In [5]:
# Ignore words that appear in fewer than 20 documents or in more than 10% of all documents
MIN_DOC_COUNT, MAX_DOC_FRACTION = 20, 0.1
id2word_wiki.filter_extremes(no_above=MAX_DOC_FRACTION, no_below=MIN_DOC_COUNT)
print('id2word_wiki: ', id2word_wiki)

id2word_wiki:  Dictionary(36692 unique tokens: ['april', 'th', 'month', 'year', 'comes']...)


In [6]:
# Vectorize the sample articles: show the first three (id, count) pairs and the words behind those ids
for title, text in test_examples:
    bow_head = id2word_wiki.doc2bow(tokenize(text))[:3]
    head_words = [id2word_wiki[token_id] for token_id, _count in bow_head]
    print(title, bow_head, head_words)

April [(0, 226), (1, 2), (2, 9)] ['april', 'th', 'month']
August [(1, 2), (2, 14), (3, 14)] ['th', 'month', 'year']
Art [(29, 4), (70, 3), (77, 1)] ['making', 'new', 'including']


In [7]:
# Find the most frequent in-vocabulary word of each sample article
for title, text in test_examples:
    bow_test = id2word_wiki.doc2bow(tokenize(text))
    # Each entry is (token id, count); pick the entry with the largest count
    top_entry = max(bow_test, key=lambda entry: entry[1])
    print(id2word_wiki[top_entry[0]], top_entry[1])

april 226
august 140
art 41


In [8]:
# Serialise the corpus (takes a couple of minutes)
wiki_corpus_gen = (id2word_wiki.doc2bow(tokenize(text)) for title, text in iter_wiki())
%time gensim.corpora.MmCorpus.serialize('./data/wiki_bow.mm', wiki_corpus_gen)

CPU times: user 4min, sys: 1.09 s, total: 4min 1s
Wall time: 4min 2s


In [9]:
# Open the serialised corpus from disk and peek at the start of its first document
mm_corpus = gensim.corpora.MmCorpus('./data/wiki_bow.mm')
print('mm_corpus: ', mm_corpus)
first_doc = mm_corpus[0]
print('mm_corpus[0]: ', first_doc[:7])

mm_corpus:  MmCorpus(242253 documents, 36692 features, 9144383 non-zero entries)
mm_corpus[0]:  [(0, 226.0), (1, 2.0), (2, 9.0), (3, 24.0), (4, 4.0), (5, 14.0), (6, 5.0)]


## Topic modelling

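We fit two topic models: LDA, trained on a subset of the bag-of-words corpus, and LSI, trained on the TF-IDF-transformed full corpus.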

In [10]:
# LDA topic-modelling on a subset of documents
mm_corpus_subset = gensim.utils.ClippedCorpus(mm_corpus, 5000)
%time lda_model = gensim.models.LdaModel(mm_corpus_subset, num_topics=10, id2word=id2word_wiki, passes=4)
# Serialise the LDA model
# lda_model.save('./data/lda_wiki.model')
# Serialise corpus transformed to LDA space
# %time gensim.corpora.MmCorpus.serialize('./data/wiki_lda.mm', lda_model[mm_corpus])

CPU times: user 1min 7s, sys: 1.18 s, total: 1min 8s
Wall time: 1min 8s


In [11]:
# Print the seven most important words of every LDA topic
for topic_id in range(lda_model.num_topics):
    print('Topic {}: '.format(topic_id), lda_model.print_topic(topic_id, topn=7))

Topic 0:  0.011*"hex" + 0.011*"rgb" + 0.009*"person" + 0.008*"deleted" + 0.008*"color" + 0.008*"utc" + 0.008*"english"
Topic 1:  0.016*"january" + 0.015*"march" + 0.013*"february" + 0.013*"april" + 0.012*"day" + 0.011*"july" + 0.011*"december"
Topic 2:  0.044*"b" + 0.044*"d" + 0.040*"american" + 0.011*"actor" + 0.010*"politician" + 0.010*"english" + 0.009*"actress"
Topic 3:  0.008*"water" + 0.008*"jpg" + 0.007*"called" + 0.006*"like" + 0.005*"image" + 0.005*"usually" + 0.004*"different"
Topic 4:  0.015*"tower" + 0.013*"u" + 0.012*"c" + 0.010*"transmission" + 0.010*"mast" + 0.008*"g" + 0.008*"m"
Topic 5:  0.013*"city" + 0.009*"country" + 0.008*"south" + 0.007*"world" + 0.007*"united" + 0.007*"north" + 0.007*"state"
Topic 6:  0.010*"city" + 0.009*"rural" + 0.007*"war" + 0.005*"germany" + 0.005*"called" + 0.005*"th" + 0.005*"world"
Topic 7:  0.017*"music" + 0.009*"album" + 0.007*"band" + 0.006*"love" + 0.005*"song" + 0.005*"songs" + 0.005*"rock"
Topic 8:  0.008*"called" + 0.006*"number" + 0.006*"use" + 0.005*"light" + 0.005*"different" + 0.005*"time" + 0.005*"example"

In [12]:
# Topic distribution the LDA model assigns to each sample article
for title, text in test_examples:
    sample_tokens = tokenize(text)
    print(title, lda_model[id2word_wiki.doc2bow(sample_tokens)])

April [(1, 0.96864864118566418), (5, 0.030853415799662484)]
August [(1, 0.92873253424234059), (2, 0.032780138361494147), (5, 0.029970805995041965)]
Art [(0, 0.097266361244568042), (3, 0.22744177535490454), (6, 0.12031737546691786), (7, 0.24843586564719863), (8, 0.27067622290004123), (9, 0.034907546686844718)]


In [13]:
# TF-IDF transformed LSI topic modelling
# TF-IDF will take a couple of seconds on the full corpus
%time tfidf_model = gensim.models.TfidfModel(mm_corpus, id2word=id2word_wiki)
# Building LSI model on top of tf-idf will take a couple of minutes
%time lsi_model = gensim.models.LsiModel(tfidf_model[mm_corpus], id2word=id2word_wiki, num_topics=200)

CPU times: user 19.6 s, sys: 98.4 ms, total: 19.7 s
Wall time: 19.8 s
CPU times: user 3min 9s, sys: 8.92 s, total: 3min 18s
Wall time: 1min 29s


In [14]:
# Show the first five TF-IDF weights of each sample article
for title, text in test_examples:
    tfidf_test = tfidf_model[id2word_wiki.doc2bow(tokenize(text))]
    print(title, tfidf_test[:5])

April [(0, 0.8347284958624097), (1, 0.0066557863597582825), (2, 0.04867220330799707), (3, 0.07974327438792456), (4, 0.018764337390088714)]
August [(1, 0.010087312917778705), (2, 0.1147473468245084), (3, 0.07049965283991218), (5, 0.027609990322833237), (6, 0.015332692432403861)]
Art [(29, 0.06500274952987092), (70, 0.030321423028594887), (77, 0.013980524606924807), (92, 0.07351729658992928), (98, 0.02186757327473417)]


In [15]:
print('Number of topics in LSI model: ', lsi_model.num_topics)
# Print the seven most important words of the first five LSI topics
for topic_id in range(5):
    print('Topic {}: '.format(topic_id), lsi_model.print_topic(topic_id, topn=7))

Number of topics in LSI model:  200
Topic 0:  0.455*"commune" + 0.400*"department" + 0.398*"france" + 0.308*"region" + 0.141*"calais" + 0.140*"north" + 0.140*"aisne"
Topic 1:  0.424*"utc" + 0.406*"discussion" + 0.346*"talk" + 0.235*"page" + 0.231*"delete" + -0.158*"commune" + -0.136*"department"
Topic 2:  0.319*"league" + 0.293*"football" + 0.237*"city" + 0.229*"united" + 0.217*"states" + 0.192*"statistics" + 0.186*"j"
Topic 3:  -0.339*"city" + -0.326*"states" + 0.319*"league" + -0.306*"united" + 0.287*"football" + -0.253*"county" + 0.197*"j"
Topic 4:  -0.940*"template" + -0.140*"infobox" + -0.126*"data" + -0.097*"country" + 0.073*"discussion" + 0.067*"utc" + 0.066*"city"


In [16]:
# Show the first five LSI components of each sample article (BOW -> TF-IDF -> LSI)
for title, text in test_examples:
    tfidf_test = tfidf_model[id2word_wiki.doc2bow(tokenize(text))]
    lsi_test = lsi_model[tfidf_test]
    print(title, lsi_test[:5])

April [(0, 0.057208848408777505), (1, 0.069549132386538121), (2, 0.11163427860070518), (3, -0.046112950249986018), (4, 0.00066600030485395689)]
August [(0, 0.040953372002199791), (1, 0.064658783492308636), (2, 0.083755512095660636), (3, -0.020348795777710454), (4, -0.0013647585516808462)]
Art [(0, 0.020952291947944124), (1, 0.032866134424242216), (2, 0.038000092366583214), (3, -0.019156842554201348), (4, -0.008658660220001578)]


In [17]:
# Test on unseen text
text = 'Physics is the study of energy, forces, mechanics, waves, and the structure of atoms and the physical universe.'

# Transform to BOW (tokens missing from the dictionary are dropped by doc2bow)
bow_vector = id2word_wiki.doc2bow(tokenize(text))
print('bow_vector: ', [(id2word_wiki[token_id], count) for token_id, count in bow_vector])
print('')

# Transform into LDA space
lda_vector = lda_model[bow_vector]
print('lda_vector: ', lda_vector)
# max() raises ValueError on an empty sequence; default=None keeps the cell
# runnable when the transformed vector has no entries.
best_lda = max(lda_vector, key=lambda item: item[1], default=None)
print('Most important LDA topic: ', lda_model.print_topic(best_lda[0]) if best_lda is not None else None)
print('')

# Transform into the LSI space
lsi_vector = lsi_model[tfidf_model[bow_vector]]
# LSI weights can be negative, so rank topics by absolute weight
best_lsi = max(lsi_vector, key=lambda item: abs(item[1]), default=None)
print('Most important LSI topic: ', lsi_model.print_topic(best_lsi[0]) if best_lsi is not None else None)



bow_vector:  [('forces', 1), ('physical', 1), ('structure', 1), ('mechanics', 1), ('study', 1), ('physics', 1), ('energy', 1), ('atoms', 1), ('universe', 1), ('waves', 1)]

lda_vector:  [(8, 0.9181732909213518)]
Most important LDA topic:  0.008*"called" + 0.006*"number" + 0.006*"use" + 0.005*"light" + 0.005*"different" + 0.005*"time" + 0.005*"example" + 0.005*"energy" + 0.005*"like" + 0.004*"things"

Most important LSI topic:  0.227*"mario" + 0.159*"zurich" + 0.149*"bundesliga" + -0.139*"republic" + -0.137*"ret" + 0.137*"super" + 0.135*"characters" + -0.131*"stadium" + 0.130*"e" + -0.123*"tropical"
