# Topic modelling with gensim: an example

(Based on the MIPT & Yandex Machine Learning course on Coursera)

![comic1](http://imgs.xkcd.com/comics/seashell.png)

In [9]:
from gensim import corpora, models

In [10]:
# Load the corpus stored in UCI bag-of-words format (docword file + vocab file)
data = corpora.UciCorpus(
    fname="docword.xkcd.txt",
    fname_vocab="vocab.xkcd.txt",
)
# Build the dictionary from the corpus; it is handed to LdaModel as `id2word`
dictionary = data.create_dictionary()

In [13]:
# fit the model
%time ldamodel = models.ldamodel.LdaModel(data, id2word=dictionary, num_topics=5, passes=20, alpha=1.25, eta=1.25)

Wall time: 4min 35s


In [14]:
# Persist the fitted model under the file name "ldamodel_xkcd"
ldamodel.save("ldamodel_xkcd")

In [15]:
# Read the saved model back from "ldamodel_xkcd", rebinding `ldamodel`
ldamodel = models.LdaModel.load("ldamodel_xkcd")

In [16]:
# Print the 10 highest-weighted words of every topic.
# Request exactly as many topics as the model holds instead of a hard-coded
# count that does not match the number of fitted topics.
for t, top_words in ldamodel.print_topics(num_topics=ldamodel.num_topics, num_words=10):
    # A single parenthesised argument prints the same text on Python 2 and 3
    print("Topic %s : %s" % (t, top_words))

Topic 0 : 0.002*"wait" + 0.002*"peter" + 0.001*"map" + 0.001*"sagal" + 0.001*"wave" + 0.001*"boy" + 0.001*"dont" + 0.001*"reference" + 0.001*"text" + 0.001*"red"
Topic 1 : 0.002*"exhibit" + 0.001*"paul" + 0.001*"ron" + 0.001*"label" + 0.001*"han" + 0.001*"planet" + 0.001*"dot" + 0.001*"text" + 0.001*"graph" + 0.001*"labeled"
Topic 2 : 0.023*"man" + 0.012*"text" + 0.011*"person" + 0.010*"title" + 0.009*"woman" + 0.008*"guy" + 0.007*"one" + 0.006*"girl" + 0.005*"just" + 0.005*"two"
Topic 3 : 0.002*"scientist" + 0.001*"beef" + 0.001*"base" + 0.001*"shark" + 0.001*"reporter" + 0.001*"space" + 0.001*"flu" + 0.001*"pizza" + 0.001*"cast" + 0.001*"tank"
Topic 4 : 0.002*"goggles" + 0.001*"link" + 0.001*"degree" + 0.001*"jelly" + 0.001*"found" + 0.001*"bean" + 0.001*"acne" + 0.001*"egg" + 0.001*"005" + 0.001*"wikipedia"


In [17]:
# Evaluate the model on the training corpus.
# NOTE: despite its name, `perplexity` holds the value returned by
# log_perplexity (a log-scale bound); 2**(-value) converts it to the
# perplexity figure in its commonly reported form.
perplexity = ldamodel.log_perplexity(list(data))
# Parenthesised single argument: identical output on Python 2 and Python 3
print(2**(-perplexity))

350.369996937


In [18]:
# Cross-check of the perplexity above: take the bound over the whole corpus
# and normalise it by the total number of word occurrences.
perp = ldamodel.bound(data)
# Count the tokens from the corpus itself instead of hard-coding the total.
# Assumes each document iterates as (word_id, count) pairs — TODO confirm.
corpus_words = sum(cnt for doc in data for _, cnt in doc)
2**(-perp/float(corpus_words))

350.36999666623427

In [None]:
# Adding new documents to the current model (10 passes over the new data).
# NOTE(review): `data2` is not defined in any cell visible here and this cell
# has no execution count; unless it is defined elsewhere, running this raises
# NameError. Provide a corpus of new documents as `data2` first.
ldamodel.update(data2, passes=10)

In [None]:
# Topic distribution for one specific document (the first in the corpus).
# Pull only the first document from the corpus iterator rather than
# materialising the entire corpus in a list just to index element 0.
doc = next(iter(data))
ldamodel.get_document_topics(doc)

These people don't know about topic models:

![comic2](http://imgs.xkcd.com/comics/the_problem_with_wikipedia.png) | ![comic3](http://imgs.xkcd.com/comics/mystery_news.png)