# Using LdaSeqModel for DTM

In [1]:
import numpy
from gensim import matutils
from gensim.corpora import Dictionary, bleicorpus
from gensim.models import CoherenceModel
from gensim.models import ldaseqmodel

All you need to start using DTM is an iterable gensim corpus, an `id2word` mapping (a gensim `Dictionary`), and a list giving the number of documents in each of your time slices, in the same order as the documents appear in the corpus.

In [2]:
# Load the saved dictionary and the corpus (read through gensim's BleiCorpus reader).
dictionary = Dictionary.load('Corpus/news_dictionary')
corpus = bleicorpus.BleiCorpus('Corpus/news_corpus')
# The corpus consists of news reports spanning 3 months: 438 articles in the
# first month, 430 in the second and 456 in the last.
# The documents must be stored in the same order as the time slices, because
# `time_slice` only gives the number of documents in each consecutive slice.

time_slice = [438, 430, 456]

In [3]:
# Set up the model: 5 topics over the three time slices defined in `time_slice`.
# Constructing the model trains it and prints the EM progress shown in the output.
# NOTE(review): `passes` presumably controls the initial LDA fit - confirm in the gensim docs.

ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus, id2word=dictionary, time_slice=time_slice, num_topics=5, passes=20)

 EM iter  0
E Step
M Step
Fitting topic number 0
Computing bound, all times
initial sslm bound is  2795842.25993
Computing bound, all times
1  iteration lda seq bound is  2812881.60423  convergence is 0.00609452991812
Computing bound, all times
2  iteration lda seq bound is  2816896.73671  convergence is 0.00142740898702
Fitting topic number 1
Computing bound, all times
initial sslm bound is  2930495.62431
Computing bound, all times
1  iteration lda seq bound is  2943686.33857  convergence is 0.00450118886052
Computing bound, all times
2  iteration lda seq bound is  2946530.5326  convergence is 0.00096620145735
Fitting topic number 2
Computing bound, all times
initial sslm bound is  2988475.36794
Computing bound, all times
1  iteration lda seq bound is  2999832.07399  convergence is 0.00380016719362
Computing bound, all times
2  iteration lda seq bound is  3001904.64337  convergence is 0.000690895135268
Fitting topic number 3
Computing bound, all times
initial sslm bound is  3194060.29

  convergence = numpy.fabs((bound - old_bound) / old_bound)


0 iteration lda seq bound is 12380633.093 , convergence is  inf
 EM iter  1
E Step
M Step
Fitting topic number 0
Computing bound, all times
initial sslm bound is  2829286.67058
Computing bound, all times
1  iteration lda seq bound is  2830997.57809  convergence is 0.000604713379885
Computing bound, all times
2  iteration lda seq bound is  2831478.31217  convergence is 0.000169810843792
Fitting topic number 1
Computing bound, all times
initial sslm bound is  2927605.52964
Computing bound, all times
1  iteration lda seq bound is  2929057.40479  convergence is 0.000495925813536
Computing bound, all times
2  iteration lda seq bound is  2929402.59379  convergence is 0.00011784985956
Fitting topic number 2
Computing bound, all times
initial sslm bound is  3004352.00625
Computing bound, all times
1  iteration lda seq bound is  3005350.5243  convergence is 0.000332357209613
Computing bound, all times
2  iteration lda seq bound is  3005435.11891  convergence is 2.81480023821e-05
Fitting topic n

In [16]:
# `print_topics(time)` lists the top words of every topic at one time slice;
# here we look at the first time slice (time 0).

ldaseq.print_topics(0)

[[(0.0040000000000000001, 'use'),
  (0.0040000000000000001, 'users'),
  (0.0040000000000000001, 'mobile'),
  (0.0040000000000000001, 'technology'),
  (0.0040000000000000001, 'net'),
  (0.0030000000000000001, 'security'),
  (0.0030000000000000001, 'software'),
  (0.0030000000000000001, 'information'),
  (0.0030000000000000001, 'using'),
  (0.0030000000000000001, 'used'),
  (0.0030000000000000001, 'like'),
  (0.0030000000000000001, 'make'),
  (0.0030000000000000001, 'digital'),
  (0.0030000000000000001, 'internet'),
  (0.0030000000000000001, 'phone'),
  (0.0030000000000000001, 'online'),
  (0.0030000000000000001, 'computer'),
  (0.0030000000000000001, 'search'),
  (0.0030000000000000001, 'system'),
  (0.0030000000000000001, 'service')],
 [(0.0070000000000000001, 'government'),
  (0.0040000000000000001, 'blair'),
  (0.0040000000000000001, 'minister'),
  (0.0040000000000000001, 'labour'),
  (0.0030000000000000001, 'year'),
  (0.0030000000000000001, 'public'),
  (0.0030000000000000001, 'las

In [5]:
# To fix a topic and see how it evolves across time slices, use `print_topic_times`.

ldaseq.print_topic_times(0) # top words of topic 0, one list per time slice

[[(0.0040000000000000001, 'use'),
  (0.0040000000000000001, 'users'),
  (0.0040000000000000001, 'mobile'),
  (0.0040000000000000001, 'technology'),
  (0.0040000000000000001, 'net'),
  (0.0030000000000000001, 'security'),
  (0.0030000000000000001, 'software'),
  (0.0030000000000000001, 'information'),
  (0.0030000000000000001, 'using'),
  (0.0030000000000000001, 'used'),
  (0.0030000000000000001, 'like'),
  (0.0030000000000000001, 'make'),
  (0.0030000000000000001, 'digital'),
  (0.0030000000000000001, 'internet'),
  (0.0030000000000000001, 'phone'),
  (0.0030000000000000001, 'online'),
  (0.0030000000000000001, 'computer'),
  (0.0030000000000000001, 'search'),
  (0.0030000000000000001, 'system'),
  (0.0030000000000000001, 'service')],
 [(0.0040000000000000001, 'use'),
  (0.0040000000000000001, 'technology'),
  (0.0040000000000000001, 'users'),
  (0.0040000000000000001, 'mobile'),
  (0.0040000000000000001, 'net'),
  (0.0030000000000000001, 'software'),
  (0.0030000000000000001, 'informa

In [6]:
# To check the document-topic proportions of a training document, use `doc_topics`.

ldaseq.doc_topics(244) # topic distribution of document number 244 in the corpus

array([  4.94926998e-05,   4.94926998e-05,   9.99802029e-01,
         4.94926998e-05,   4.94926998e-05])

In [7]:
# To get the topic distribution of an unseen document, index the model with the
# document's bag-of-words representation (a list of (word_id, count) pairs).

ldaseq[[(1, 1), (4, 2)]]

array([ 0.00327869,  0.98688525,  0.00327869,  0.00327869,  0.00327869])

In [8]:
# Now let's compare this to the DTM wrapper, which runs an external DTM executable.
import os

from gensim.models.wrappers.dtmmodel import DtmModel

# Location of the compiled DTM executable. Set the DTM_PATH environment variable
# to point at your own build; the fallback is the path used when this notebook
# was originally written and will not exist on other machines.
dtm_path = os.environ.get("DTM_PATH", "/Users/bhargavvader/Downloads/dtm_release/dtm/main")
dtm_model = DtmModel(dtm_path, corpus, time_slice, num_topics=5, id2word=dictionary, initialize_lda=True)

# Save both trained models to disk under the names 'dtm_news' and 'ldaseq_news'.
dtm_model.save('dtm_news')
ldaseq.save('ldaseq_news')

In [11]:
# Number of topics; this is the same value (5) passed as `num_topics` when the models were built.
num_topics = 5
# Take the slice of `lambda_` at last index 0. Judging from how it is used below,
# rows are topics and columns are term ids, and the values are log-weights
# (they are exponentiated in `validate`). Presumably the last axis is the time slice - TODO confirm.
topic_term = dtm_model.lambda_[:,:,0] # slice for (presumably) the first time slice

def validate(topic_term):
    """Convert a matrix of log-weights into row-wise probability distributions.

    Parameters
    ----------
    topic_term : numpy.ndarray
        2-D array with one row per topic and one column per term, holding
        (possibly unnormalised) log-weights.

    Returns
    -------
    numpy.ndarray
        Array of the same shape in which every row sums to 1, as pyLDAvis
        requires for `topic_term_dists`.
    """
    topic_term = numpy.exp(topic_term)
    # Normalise each topic separately. Dividing by the global sum and scaling by
    # the number of topics only makes the rows sum to 1 *on average*, and it
    # silently depends on a module-level `num_topics`.
    return topic_term / topic_term.sum(axis=1, keepdims=True)

def get_topics(topic_terms, topic_number):
    """Return the 20 highest-weighted words of one topic.

    `topic_terms` is indexed by topic number to get that topic's term weights;
    the term ids with the largest weights are looked up in the notebook-level
    `dictionary` and returned as a list of words, best first.
    """
    weights = topic_terms[topic_number]
    top_ids = matutils.argsort(weights, 20, reverse=True)
    return [dictionary[term_id] for term_id in top_ids]

# Topic-term weights of the wrapper model, exponentiated and rescaled by `validate`.
topic_term = validate(topic_term)
# Document-topic distribution, taken from the wrapper model.
doc_topic = dtm_model.gamma_
# Vocabulary in id order: position i holds the word whose id is i.
vocab = [dictionary[word_id] for word_id in range(len(dictionary))]

# we now need term-frequency and doc_lengths

def term_frequency(corpus, dictionary):
    """Compute corpus-wide term frequencies and per-document lengths.

    Parameters
    ----------
    corpus : iterable of list of (int, number)
        Bag-of-words documents as (term_id, count) pairs.
    dictionary : sized
        Vocabulary; only its length is used, to size the frequency list.

    Returns
    -------
    (list, list)
        `term_counts[i]` is the total count of term id `i` over the corpus;
        `doc_lengths[j]` is the total number of tokens in document `j`.
    """
    # Named differently from the function so the local does not shadow it.
    term_counts = [0] * len(dictionary)
    doc_lengths = []
    for doc in corpus:
        doc_total = 0
        for term_id, count in doc:
            term_counts[term_id] += count
            doc_total += count
        # A document's length is its token count, not len(doc), which would
        # only count the distinct terms of the bag-of-words vector.
        doc_lengths.append(doc_total)
    return term_counts, doc_lengths

# Top words of every wrapper topic, one list of words per topic.
topics_wrapper = [get_topics(topic_term, topic_id) for topic_id in range(num_topics)]

# NOTE: from this line on, the name `term_frequency` refers to the list of
# counts returned here and no longer to the helper function defined above.
term_frequency, doc_lengths = term_frequency(corpus, dictionary)

In [13]:
import pyLDAvis

# Build the pyLDAvis data for the DTM wrapper's topics and render it inline.
vis_wrapper = pyLDAvis.prepare(
    topic_term_dists=topic_term,
    doc_topic_dists=doc_topic,
    doc_lengths=doc_lengths,
    vocab=vocab,
    term_frequency=term_frequency,
)
pyLDAvis.display(vis_wrapper)

In [31]:
# Now let us visualize the DTM Python port (LdaSeqModel).

# Keep just the words of each topic, dropping the probabilities that
# `print_topics` returns alongside them.
dtm_tp = ldaseq.print_topics()
dtm_topics = [[word for _prob, word in topic] for topic in dtm_tp]
    
# Document-topic proportions of the Python port: work on a copy of the model's
# gammas and scale every row (document) so that it sums to 1.
gammas = numpy.copy(ldaseq.gammas)
doc_topic = gammas / gammas.sum(axis=1, keepdims=True)

# getting dtm topic_word proportions for first time_slice
def get_topic_term(ldaseq, topic, time=0):
    """Return the normalised word distribution of one topic at one time slice.

    Reads `e_log_prob` from `ldaseq.topic_chains[topic]`, transposes it so the
    first axis can be indexed by `time`, exponentiates the selected row and
    rescales it to sum to 1.
    """
    log_probs = ldaseq.topic_chains[topic].e_log_prob
    weights = numpy.exp(numpy.transpose(log_probs)[time])
    return weights / weights.sum()

# Stack the per-topic word distributions for the first time slice into one
# matrix with a row per topic. Building it from `num_topics` replaces five
# copy-pasted calls and a concatenate/split round-trip tied to exactly 5 topics.
topic_term = numpy.array([get_topic_term(ldaseq, topic_id) for topic_id in range(num_topics)])

# Build the pyLDAvis data for the Python port's topics and render it inline.
vis_dtm = pyLDAvis.prepare(topic_term_dists=topic_term, doc_topic_dists=doc_topic, doc_lengths=doc_lengths, vocab=vocab, term_frequency=term_frequency)
pyLDAvis.display(vis_dtm)

In [None]:
# Compare the u_mass topic coherence of the DTM wrapper and the Python port.
# Both models are scored on the same `corpus` and `dictionary` loaded at the top
# of the notebook; `topics_wrapper` and `dtm_topics` are the per-topic word
# lists built in the visualisation cells.
cm_wrapper = CoherenceModel(topics=topics_wrapper, corpus=corpus, dictionary=dictionary, coherence='u_mass')
cm_DTM = CoherenceModel(topics=dtm_topics, corpus=corpus, dictionary=dictionary, coherence='u_mass')

print(cm_wrapper.get_coherence())
print(cm_DTM.get_coherence())