In [64]:
import gensim
import logging
# Surface gensim's progress messages (INFO and above) in the notebook.
# basicConfig() does nothing when the root logger already has handlers; the
# captured log lines in this notebook do not use the format requested here,
# which suggests that is the case in this kernel. The root level is therefore
# also set explicitly, through the public setLevel() API rather than by
# assigning the `level` attribute directly.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logging.getLogger().setLevel(logging.INFO)

In [65]:
# Toy corpus: nine short titles, one string per document.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

In [66]:
# Tokenize: lowercase each document, split it on whitespace, and drop any
# token found in the stoplist. The stoplist only contains the stand-alone
# punctuation tokens '.', '?' and '|'; ordinary words such as "of" or "the"
# are NOT removed here.
stoplist = {'.', '?', '|'}
texts = []
for document in documents:
    texts.append([word for word in document.lower().split()
                  if word not in stoplist])

In [67]:
# Inspect the tokenized documents. The function-call form of print is used so
# the cell also parses on Python 3 and matches the print(...) calls in the
# other cells; for a single argument the output is the same on Python 2.
print(texts)

[['human', 'machine', 'interface', 'for', 'lab', 'abc', 'computer', 'applications'], ['a', 'survey', 'of', 'user', 'opinion', 'of', 'computer', 'system', 'response', 'time'], ['the', 'eps', 'user', 'interface', 'management', 'system'], ['system', 'and', 'human', 'system', 'engineering', 'testing', 'of', 'eps'], ['relation', 'of', 'user', 'perceived', 'response', 'time', 'to', 'error', 'measurement'], ['the', 'generation', 'of', 'random', 'binary', 'unordered', 'trees'], ['the', 'intersection', 'graph', 'of', 'paths', 'in', 'trees'], ['graph', 'minors', 'iv', 'widths', 'of', 'trees', 'and', 'well', 'quasi', 'ordering'], ['graph', 'minors', 'a', 'survey']]


In [68]:
from collections import defaultdict

# Tally how many times each token occurs across all documents.
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] = frequency[token] + 1

# Keep only tokens that occur at least twice corpus-wide; tokens seen a
# single time are discarded. Note that this rebinds `texts`.
texts = [
    [token for token in text if frequency[token] >= 2]
    for text in texts
]

In [69]:
# Display the token lists after the frequency filter (tokens that occurred
# only once in the whole corpus have been removed).
texts

[['human', 'interface', 'computer'],
 ['a', 'survey', 'of', 'user', 'of', 'computer', 'system', 'response', 'time'],
 ['the', 'eps', 'user', 'interface', 'system'],
 ['system', 'and', 'human', 'system', 'of', 'eps'],
 ['of', 'user', 'response', 'time'],
 ['the', 'of', 'trees'],
 ['the', 'graph', 'of', 'trees'],
 ['graph', 'minors', 'of', 'trees', 'and'],
 ['graph', 'minors', 'a', 'survey']]

In [70]:
# Build the token -> integer id mapping from the filtered token lists.
# The notebook only does `import gensim`; the bare name `corpora` is never
# imported, so it is referenced through the package here to avoid a NameError
# when the notebook is run from a fresh kernel.
dictionary = gensim.corpora.Dictionary(texts)
dictionary.save('/tmp/dicty.dict') # store the dictionary, for future reference
print(dictionary.token2id)

INFO:gensim.corpora.dictionary:adding document #0 to Dictionary(0 unique tokens: [])
INFO:gensim.corpora.dictionary:built Dictionary(16 unique tokens: [u'a', u'and', u'minors', u'graph', u'of']...) from 9 documents (total 43 corpus positions)
INFO:gensim.utils:saving Dictionary object under /tmp/dicty.dict, separately None


{u'a': 3, u'and': 12, u'minors': 15, u'graph': 14, u'of': 4, u'system': 5, u'trees': 13, u'eps': 11, u'computer': 1, u'survey': 6, u'user': 7, u'human': 2, u'time': 8, u'interface': 0, u'the': 10, u'response': 9}


In [71]:
# Convert every token list into a bag-of-words vector: a list of
# (token_id, count) pairs using the ids assigned by `dictionary`.
corpus = [dictionary.doc2bow(text) for text in texts]
# `corpora` is never imported on its own in this notebook, so go through the
# `gensim` package to keep this cell runnable from a fresh kernel.
gensim.corpora.MmCorpus.serialize('/tmp/corpusy.mm', corpus) # store to disk, for later use
print(corpus)

INFO:gensim.corpora.mmcorpus:storing corpus in Matrix Market format to /tmp/corpusy.mm
INFO:gensim.matutils:saving sparse matrix to /tmp/corpusy.mm
INFO:gensim.matutils:PROGRESS: saving document #0
INFO:gensim.matutils:saved 9x16 matrix, density=28.472% (41/144)
INFO:gensim.corpora.indexedcorpus:saving MmCorpus index to /tmp/corpusy.mm.index


[[(0, 1), (1, 1), (2, 1)], [(1, 1), (3, 1), (4, 2), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)], [(0, 1), (5, 1), (7, 1), (10, 1), (11, 1)], [(2, 1), (4, 1), (5, 2), (11, 1), (12, 1)], [(4, 1), (7, 1), (8, 1), (9, 1)], [(4, 1), (10, 1), (13, 1)], [(4, 1), (10, 1), (13, 1), (14, 1)], [(4, 1), (12, 1), (13, 1), (14, 1), (15, 1)], [(3, 1), (6, 1), (14, 1), (15, 1)]]


In [79]:
# Passed as `time_slices` to DtmModel in the next cell. The entries sum to 9,
# which equals the number of documents; presumably each entry is the number
# of consecutive documents in one time slice — confirm against the gensim
# DtmModel documentation.
timeslice = [1,3,5]

In [80]:
# Train a dynamic topic model through gensim's DTM wrapper. The first
# argument is the path to the DTM executable; the wrapper writes its working
# files (e.g. train-mult.dat) under `prefix`.
# NOTE(review): `prefix` is an absolute, machine-specific directory, so this
# cell will not run unchanged on another machine.
# NOTE(review): num_topics=20 exceeds the 16 unique tokens in the dictionary;
# most printed topics look uniform — consider a smaller value.
model = gensim.models.wrappers.DtmModel(
    './dtm_release/dtm/main',
    corpus=corpus,
    id2word=dictionary,
    time_slices=timeslice,
    num_topics=20,
    prefix="/Users/brandonosis/Projects/dim_dtm/",
)

INFO:gensim.models.wrappers.dtmmodel:serializing temporary corpus to /Users/brandonosis/Projects/dim_dtm/train-mult.dat
INFO:gensim.corpora.bleicorpus:no word id mapping provided; initializing from corpus
INFO:gensim.corpora.bleicorpus:storing corpus in Blei's LDA-C format into /Users/brandonosis/Projects/dim_dtm/train-mult.dat
INFO:gensim.corpora.bleicorpus:saving vocabulary of 16 words to /Users/brandonosis/Projects/dim_dtm/train-mult.dat.vocab
INFO:gensim.models.wrappers.dtmmodel:training DTM with args --ntopics=20 --model=dtm  --mode=fit --initialize_lda=false --corpus_prefix=/Users/brandonosis/Projects/dim_dtm/train --outname=/Users/brandonosis/Projects/dim_dtm/train_out --alpha=0.01 --lda_max_em_iter=10 --lda_sequence_min_iter=6  --lda_sequence_max_iter=20 --top_chain_var=0.005 --rng_seed=0 


In [78]:
# Show the model's topics as "weight*word + ..." strings.
# NOTE(review): this cell's execution count (78) is lower than that of the
# cell that builds `model` (80), so the output displayed below may belong to
# an earlier model — re-run this cell after training to refresh it.
model.print_topics()

[u'0.062*response + 0.062*user + 0.062*graph + 0.062*trees + 0.062*and + 0.062*eps + 0.062*the + 0.062*interface + 0.062*time + 0.062*of',
 u'0.062*response + 0.062*user + 0.062*graph + 0.062*trees + 0.062*and + 0.062*eps + 0.062*the + 0.062*interface + 0.062*time + 0.062*of',
 u'0.062*response + 0.062*user + 0.062*graph + 0.062*trees + 0.062*and + 0.062*eps + 0.062*the + 0.062*interface + 0.062*time + 0.062*of',
 u'0.062*response + 0.062*user + 0.062*graph + 0.062*trees + 0.062*and + 0.062*eps + 0.062*the + 0.062*interface + 0.062*time + 0.062*of',
 u'0.229*trees + 0.229*of + 0.152*graph + 0.152*the + 0.020*response + 0.020*user + 0.020*and + 0.020*eps + 0.020*interface + 0.020*system',
 u'0.062*response + 0.062*user + 0.062*graph + 0.062*trees + 0.062*and + 0.062*eps + 0.062*the + 0.062*interface + 0.062*time + 0.062*of',
 u'0.062*response + 0.062*user + 0.062*graph + 0.062*trees + 0.062*and + 0.062*eps + 0.062*the + 0.062*interface + 0.062*time + 0.062*of',
 u'0.255*of + 0.050*respo