In [16]:
# -*- coding:utf-8 -*-
%matplotlib inline

import sys, os
# NOTE: '__file__' is a quoted string literal, not the __file__ variable, so
# realpath() resolves it against the current working directory and dirname()
# yields that directory. root_dir is therefore the parent of the working
# directory. (Presumably written this way because __file__ is not defined
# inside a notebook -- confirm before moving this code into a module.)
root_dir = os.path.join(os.path.dirname(os.path.realpath('__file__')), '..')
# Extend the import search path with root_dir before the `import dmr` below.
sys.path.append(root_dir)
import dmr
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

def init_lda(datfilepath, K=5, alpha=0.1, beta=0.01):
    """Load a corpus file and build an (untrained) ``dmr.LDA`` model for it.

    The corpus is read with ``dmr.Corpus.read``, converted to documents by a
    fresh ``dmr.Vocabulary`` through ``read_corpus``, and handed to
    ``dmr.LDA`` together with the vocabulary size.

    Args:
        datfilepath: Path of the corpus file passed to ``dmr.Corpus.read``.
        K: Forwarded to ``dmr.LDA`` as its first argument (presumably the
            number of topics -- confirm against ``dmr.LDA``).
        alpha: Forwarded to ``dmr.LDA`` unchanged (presumably a Dirichlet
            hyperparameter -- confirm against ``dmr.LDA``).
        beta: Forwarded to ``dmr.LDA`` unchanged (presumably a Dirichlet
            hyperparameter -- confirm against ``dmr.LDA``).

    Returns:
        The tuple ``(corpus, voca, docs, lda)``.
    """
    raw_corpus = dmr.Corpus.read(datfilepath)
    vocabulary = dmr.Vocabulary()
    documents = vocabulary.read_corpus(raw_corpus)
    # The vocabulary size is only taken after read_corpus() has run.
    model = dmr.LDA(K, alpha, beta, documents, vocabulary.size())
    result = (raw_corpus, vocabulary, documents, model)
    return result

# Corpus file used for this run: <root_dir>/dat/LDA.dat
datfilepath = os.path.join(root_dir, 'dat', 'LDA.dat')

# learning
# Build the corpus, vocabulary, documents and model with init_lda's default
# K/alpha/beta; these module-level names are reused by the reporting code below.
corpus, voca, docs, lda = init_lda(datfilepath)
# Call the model's learning routine with iteration=100, passing the vocabulary.
lda.learning(iteration=100, voca=voca)

# word probability of each topic
# For every topic key, print a header line, then a tab-separated row of the
# words and a matching tab-separated row of their values (two decimals).
wdist = lda.word_dist_with_voca(voca)
for k in wdist:
    topic_words = wdist[k]
    word_row = "\t".join(topic_words)
    prob_row = "\t".join("%0.2f" % topic_words[w] for w in topic_words)
    print("TOPIC", k)
    print(word_row)
    print(prob_row)

# Blank line separating this report from the next one.
print()

# topic probability of each document
# Documents are reported grouped by the first letter of their first word
# ("a" through "e", in that order); within a group the corpus order is kept.
tdist = lda.topicdist()
for first_letter in "abcde":
    for doc, td in zip(corpus, tdist):
        if not doc[0].startswith(first_letter):
            continue
        print(
            "DOC",
            "Words: ", doc,
            "Max topic: ", np.argmax(td),
            "Max prob.: ", np.max(td),
        )

TOPIC 0
di	dd	dg	dc	de	da	db	dh	df	dj
0.08	0.14	0.10	0.09	0.10	0.08	0.10	0.10	0.09	0.11
TOPIC 1
ci	ch	cb	ce	ca	cj	cc	cg	cf	cd
0.07	0.11	0.07	0.11	0.13	0.07	0.10	0.11	0.11	0.12
TOPIC 2
bg	bc	bi	bh	be	bj	bb	ba	bf	bd
0.09	0.15	0.07	0.07	0.13	0.06	0.13	0.12	0.08	0.10
TOPIC 3
ac	ab	aj	af	ad	ah	ae	ai	ag	aa
0.08	0.08	0.08	0.11	0.11	0.08	0.09	0.17	0.09	0.09
TOPIC 4
ed	ei	eh	ee	ej	ec	ef	eb	eg	ea
0.08	0.07	0.13	0.09	0.13	0.08	0.12	0.12	0.07	0.11

DOC Words:  ['ai', 'ab', 'ag', 'ah', 'ah', 'ai', 'ab', 'af', 'aj', 'ai'] Max topic:  3 Max prob.:  0.961904761905
DOC Words:  ['ai', 'ai', 'ac', 'ad', 'ac', 'aa', 'ai', 'ai', 'ad', 'ai'] Max topic:  3 Max prob.:  0.961904761905
DOC Words:  ['ai', 'af', 'aj', 'aa', 'aj', 'ag', 'af', 'ad', 'ab', 'ai'] Max topic:  3 Max prob.:  0.961904761905
DOC Words:  ['ae', 'aj', 'ag', 'af', 'ah', 'ai', 'ai', 'aj', 'ac', 'ai'] Max topic:  3 Max prob.:  0.961904761905
DOC Words:  ['ai', 'ab', 'ab', 'aa', 'ad', 'ai', 'ai', 'ae', 'ae', 'aa'] Max topic:  3 Max prob.:  0.96