# Topic Modeling

We start with importing `gensim`

**IMPORTANT**: You cannot run this example entirely from within the notebook. You must first download the data on the command line.

In [None]:
import gensim
from gensim import corpora, models, matutils

Now the usual imports:

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from os import path


# The AP corpus is not fetched by this notebook; when the expected file is
# absent, tell the reader how to obtain it (execution is not interrupted).
if not path.exists('./data/ap/ap.dat'):
    print('Error: Expected data to be present at data/ap/',
          'Please cd into ./data & run ./download_ap.sh',
          sep='\n')



We will generate 100 topics as in the book, but you can change this setting here:

In [None]:
# Number of topics requested from the LDA models fitted on the AP corpus below.
NUM_TOPICS = 100

Load the data

In [None]:
# Open the AP documents (ap.dat) together with their vocabulary file
# (vocab.txt) through gensim's BleiCorpus reader.
corpus = corpora.BleiCorpus('./data/ap/ap.dat', './data/ap/vocab.txt')

Build the LDA model

In [None]:
# Fit LDA on the AP corpus. alpha is passed as None, i.e. no explicit value
# is chosen here for the document-topic prior.
model = models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=corpus.id2word,
    num_topics=NUM_TOPICS,
    alpha=None,
)

In [None]:
# For every document, count how many topics the model reports for it, then
# plot the distribution of those counts.
num_topics_used = [len(model[bow]) for bow in corpus]

fig, ax = plt.subplots()
ax.hist(num_topics_used, np.arange(42))
ax.set_xlabel('Nr of topics')
ax.set_ylabel('Nr of documents')
fig.tight_layout()
fig.savefig('Figure_04_01.png')
fig

We can do the same after changing the $\alpha$ value: 

In [None]:
# Refit with an explicit alpha so both settings can be shown side by side.
ALPHA = 1.0

model1 = models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=corpus.id2word,
    num_topics=NUM_TOPICS,
    alpha=ALPHA,
)
num_topics_used1 = [len(model1[bow]) for bow in corpus]

fig, ax = plt.subplots()
ax.hist([num_topics_used, num_topics_used1], np.arange(42))
ax.set_xlabel('Nr of topics')
ax.set_ylabel('Nr of documents')

# Hand-tuned label positions; they may need adjusting if the histogram changes.
ax.text(9, 223, r'default alpha')
ax.text(26, 156, 'alpha=1.0')
fig.tight_layout()
fig.savefig('Figure_04_02.png')
fig

### Exploring the topic model

We can explore the mathematical structure of the topics:


In [None]:
# Fetch the document stored at offset 0 of the corpus file and print the
# topics the model assigns to it.
doc = corpus.docbyoffset(0)
topics = model[doc]
print(topics)

This is not very informative, however. Another way to explore is to identify the most discussed topic, i.e., the one with the highest total weight:

In [None]:
# Turn the per-document topic weights of the whole corpus into a dense array.
# NOTE(review): the sum over axis 1 below is treated as a per-topic total, so
# rows are presumably topics and columns documents — confirm against
# matutils.corpus2dense.
topics = matutils.corpus2dense(model[corpus], num_terms=model.num_topics)
weight = topics.sum(1)
# Index of the entry with the largest summed weight.
max_topic = weight.argmax()

Get the top 64 words for this topic.
Without the argument, show_topic would return only 10 words

In [None]:
# Request 64 entries for the topic selected above.
words = model.show_topic(max_topic, 64)

One way to visualize the results is to build a _word cloud_. For this we use the `wordcloud` module:

In [None]:
from wordcloud import WordCloud

# Build a 600x600 cloud on a white background (at most 30 words drawn) from
# the word/weight pairs in `words`, then show it on a matplotlib axis.
cloud_options = dict(
    background_color='white', max_words=30, width=600, height=600)
wc = WordCloud(**cloud_options).generate_from_frequencies(dict(words))

fig, ax = plt.subplots()
ax.imshow(wc, interpolation="bilinear")
fig

# NEWS DATA

Now, repeat the same exercise using alpha=1.0.

You can edit the constant below to play around with this parameter

In [None]:
import nltk.stem

# Fetch NLTK's 'stopwords' resource; the stopword set built further down
# reads it through nltk.corpus.stopwords.
nltk.download('stopwords')

In [None]:
# English Snowball stemmer used when normalising tokens.
english_stemmer = nltk.stem.SnowballStemmer('english')

# NLTK's English stopwords plus a few header/quoting tokens.
stopwords = set(nltk.corpus.stopwords.words('english')) | {
    'from:', 'subject:', 'writes:', 'writes'}

We need to add a little adaptor class:

In [None]:
class DirectText(corpora.textcorpus.TextCorpus):
    """Adaptor that serves already-tokenised texts to gensim.

    Both methods simply delegate to ``self.input``; nothing is read from
    disk or tokenised here.
    """

    def __len__(self):
        """Return the number of texts held in ``self.input``."""
        return len(self.input)

    def get_texts(self):
        """Return ``self.input`` unchanged."""
        return self.input


Load the data

In [None]:
import sklearn.datasets
# Load the "train" split of the "20news-18828" dataset from ./data.
# NOTE(review): `load_mlcomp` appears to be absent from recent scikit-learn
# releases — confirm that the installed version still provides it.
dataset = sklearn.datasets.load_mlcomp("20news-18828", "train",
                                       mlcomp_root='./data')


We preprocess the data to split the data into words and remove stopwords:

In [None]:
# Keep the untouched documents around so they can be displayed later.
otexts = dataset.data

# A token containing any of these characters is discarded: punctuation, '@'
# and every digit. (The digit 7 used to be missing from this set, so tokens
# such as '7777' slipped through.)
_REJECT_CHARS = set("+-.?!()>@0123456789")


def _tokenize(raw):
    """Turn one raw document into a list of stemmed tokens.

    Steps, in order: decode (only if ``raw`` is bytes), split on whitespace,
    lowercase, drop tokens containing a rejected character, drop tokens of
    three characters or fewer and stopwords, then stem what is left.

    Returns a real list, so the result can be iterated more than once.
    """
    text = raw.decode('utf-8', 'ignore') if isinstance(raw, bytes) else raw
    tokens = (w.lower() for w in text.split())
    tokens = (w for w in tokens if not _REJECT_CHARS.intersection(w))
    tokens = (w for w in tokens if len(w) > 3 and w not in stopwords)
    return [english_stemmer.stem(w) for w in tokens]


texts = [_tokenize(t) for t in dataset.data]

We also remove words that are _too common_:

In [None]:
from collections import defaultdict
# Document frequency: in how many texts does each word occur at least once?
usage = defaultdict(int)
for word in (w for tokens in texts for w in set(tokens)):
    usage[word] += 1

# Words present in more than a tenth of the texts are dropped everywhere.
limit = len(texts) / 10
too_common = {w for w, n_texts in usage.items() if n_texts > limit}
texts = [[w for w in tokens if w not in too_common] for tokens in texts]

In [None]:
# Wrap the token lists as a gensim corpus and take the dictionary built for it.
corpus = DirectText(texts)
dictionary = corpus.dictionary

# Pass the Dictionary itself as the id -> word mapping. This replaces the old
# `dictionary['computer']` lookup wrapped in a bare `except`, which was only
# there to force `dictionary.id2token` to be populated and silently swallowed
# every error while doing so.
model = models.ldamodel.LdaModel(
    corpus, num_topics=100, id2word=dictionary)

# thetas[i, k] accumulates the weight of topic k in document i; topics the
# model does not report for a document stay at 0.
thetas = np.zeros((len(texts), model.num_topics))
for i, c in enumerate(corpus):
    for ti, v in model[c]:
        thetas[i, ti] += v

We compare all documents to each other **by the topics they contain**:

In [None]:
from scipy.spatial import distance
# Square matrix of pairwise distances between the documents' topic vectors.
distances = distance.squareform(distance.pdist(thetas))

# Overwrite the diagonal with a value above every real distance so that a
# document is never selected as its own closest match.
large = distances.max() + 1
np.fill_diagonal(distances, large)

# Show document 1 followed by the document closest to it.
closest_to_first = distances[1].argmin()
print(otexts[1])
print()
print()
print()
print(otexts[closest_to_first])

# Modeling Wikipedia

Load the data

Note that you **must have run the `wikitopics_create.py` script** first. Running that script will take a few hours.

In [None]:
import gensim
# Warn on stderr when the saved model file is missing. Nothing stops the cell
# at this point, so the load calls below are still attempted.
if not path.exists('wiki_lda.pkl'):
    import sys
    sys.stderr.write('''\
This script must be run after wikitopics_create.py!

That script creates and saves the LDA model (this must onlly be done once).
This script is responsible for the analysis.''')
    
# Load the preprocessed Wikipedia corpus (id2word and mm)
id2word = gensim.corpora.Dictionary.load_from_text(
    'data/wiki_en_output_wordids.txt.bz2')
mm = gensim.corpora.MmCorpus('data/wiki_en_output_tfidf.mm')

# Load the precomputed model
model = gensim.models.ldamodel.LdaModel.load('wiki_lda.pkl')

# Open the saved topic array read-only as a memory map instead of reading it
# into memory in full.
topics = np.load('topics.npy', mmap_mode='r')

Compute the number of topics mentioned in each document


In [None]:
# For every row of `topics` (one row per document), count the entries with a
# non-zero weight, i.e. the number of topics that document mentions.
lens = (topics > 0).sum(axis=1)
print('Mean number of topics mentioned: {0:.3}'.format(np.mean(lens)))
# Strict comparison so that the printed share matches its "less than 10"
# label (the previous `<= 10` also counted documents with exactly 10 topics).
print('Percentage of articles mentioning less than 10 topics: {0:.1%}'.format(np.mean(lens < 10)))

# Weights will be the total weight of each topic, summed over all documents
weights = topics.sum(0)



Retrieve the most heavily used topic and plot it as a word cloud:


In [None]:
# 64 entries for the topic with the largest total weight.
heaviest_topic = weights.argmax()
words = model.show_topic(heaviest_topic, 64)

# Same cloud settings as before: white background, 600x600, 30 words at most.
cloud_options = dict(
    background_color='white', max_words=30, width=600, height=600)
wc = WordCloud(**cloud_options).generate_from_frequencies(dict(words))

fig, ax = plt.subplots()
ax.imshow(wc, interpolation="bilinear")
fig

In [None]:
# Per-document weights of the topic with the largest total weight.
heaviest_topic_column = topics[:, weights.argmax()]

# Share of documents in which that topic has a non-zero weight.
fraction_mention = np.mean(heaviest_topic_column > 0)
print("The most mentioned topics is mentioned in {:.1%} of documents.".format(fraction_mention))

# Average weight of that topic across all documents.
total_weight = np.mean(heaviest_topic_column)
print("It represents {:.1%} of the total number of words.".format(total_weight))


Retrieve the **least** heavily used topic and plot it as a word cloud:

In [None]:
# 64 entries for the topic with the smallest total weight.
lightest_topic = weights.argmin()
words = model.show_topic(lightest_topic, 64)

# Same cloud settings as before: white background, 600x600, 30 words at most.
cloud_options = dict(
    background_color='white', max_words=30, width=600, height=600)
wc = WordCloud(**cloud_options).generate_from_frequencies(dict(words))

fig, ax = plt.subplots()
ax.imshow(wc, interpolation="bilinear")
fig

Again, we can measure how often this topic is used:

In [None]:
# Per-document weights of the topic with the smallest total weight.
lightest_topic_column = topics[:, weights.argmin()]

# Share of documents in which that topic has a non-zero weight.
fraction_mention = np.mean(lightest_topic_column > 0)
print("The least mentioned topics is mentioned in {:.1%} of documents.".format(fraction_mention))

# Average weight of that topic across all documents.
total_weight = np.mean(lightest_topic_column)
print("It represents {:.1%} of the total number of words.".format(total_weight))