# Example of HMM-LDA 

In [1]:
import logging
import sys
sys.path.append('..')
from ptm.nltk_corpus import get_reuters_token_list_by_sentence
from ptm import HMM_LDA
from ptm.utils import get_top_words

# Fetch the logger registered under the name 'HMM_LDA' and turn off
# propagation, so its records are not handed to ancestor loggers' handlers
# (presumably to keep training messages out of the notebook output).
logger = logging.getLogger('HMM_LDA')
logger.propagate = False

## Read corpus

`corpus` is a nested list: a list of documents, each of which is a list of sentences, each of which is a list of word tokens.

In [2]:
# Number of documents requested from the loader (passed as `num_doc`).
n_docs = 1000

# The loader hands back two values: the vocabulary first, then the corpus.
reuters_data = get_reuters_token_list_by_sentence(num_doc=n_docs)
voca, corpus = reuters_data

print('Vocabulary size', len(voca))

Vocabulary size 3859


## Training HMM LDA

In [3]:
# Model dimensions. The document count is re-derived from the corpus that was
# actually loaded instead of reusing the number that was requested.
n_docs, n_voca = len(corpus), len(voca)
n_topic, n_class = 50, 20

# Iteration count handed to `fit`.
max_iter = 100

# Hyper-parameters forwarded by keyword to HMM_LDA.
# NOTE(review): which prior each one controls is defined inside ptm.HMM_LDA;
# check the implementation before tuning them.
alpha, beta = 0.1, 0.01
gamma, eta = 0.1, 0.1

model = HMM_LDA(
    n_docs,
    n_voca,
    n_topic,
    n_class,
    alpha=alpha,
    beta=beta,
    gamma=gamma,
    eta=eta,
    verbose=False,
)
model.fit(corpus, max_iter=max_iter)

## Print the top 10 words for each topic and class

In [4]:
# For every topic index, look up ten words via `get_top_words` on `model.TW`
# (presumably the topic-word matrix -- confirm in ptm) and print them on one
# comma-separated line.
for topic_id in range(n_topic):
    words = get_top_words(model.TW, voca, topic_id, n_words=10)
    print('Topic', topic_id, ': ', ','.join(words))

Topic 0 :  will,on,its,must,throughout,same,by,traditional,loss,background
Topic 1 :  future,should,are,charge,higher,sulphur,first,an,company,letter
Topic 2 :  ready,same,be,basis,it,will,for,at,registered,capital
Topic 3 :  alone,great,specialty,would,unreasonable,falling,say,formed,top,declined
Topic 4 :  offer,do,although,on,over,would,much,by,fiscal,objective
Topic 5 :  barring,did,bearing,may,but,its,narrow,target,leading,same
Topic 6 :  for,two,meeting,may,still,at,six,whose,become,marked
Topic 7 :  stimulate,each,under,satisfied,at,transition,distribution,activity,for,provision
Topic 8 :  is,difficulty,effect,top,from,nine,price,deficit,agreed,only
Topic 9 :  for,country,pressure,increasing,will,government,its,quietly,nil,report
Topic 10 :  petroleum,per,expectation,pollard,weight,textile,from,cocoa,absorbing,remainder
Topic 11 :  should,but,set,shipment,much,term,same,be,practice,its
Topic 12 :  offer,present,at,this,they,help,name,an,time,show
Topic 13 :  would,rating,current

In [5]:
# Same listing for the classes, read from `model.CW`. The loop starts at 1,
# so class index 0 is never printed.
# NOTE(review): presumably class 0 is the class reserved for topic (content)
# words in this HMM-LDA implementation -- confirm against ptm.HMM_LDA.
for class_id in range(1, n_class):
    words = get_top_words(model.CW, voca, class_id, n_words=10)
    print('Class', class_id, ': ', ','.join(words))

Class 1 :  were,on,per,be,it,will,an,is,year,company
Class 2 :  at,was,have,billion,not,an,is,will,it,be
Class 3 :  trade,will,on,is,loss,be,have,it,from,this
Class 4 :  by,also,would,for,will,were,this,have,are,from
Class 5 :  the,about,be,on,year,company,would,by,with,loss
Class 6 :  the,he,billion,is,be,it,will,an,not,at
Class 7 :  the,he,from,were,an,loss,be,it,will,nil
Class 8 :  one,with,for,company,an,nil,billion,it,be,loss
Class 9 :  the,be,as,was,not,will,it,nil,at,an
Class 10 :  on,last,for,at,company,will,it,billion,be,by
Class 11 :  the,year,for,would,from,was,be,it,will,an
Class 12 :  or,are,it,will,for,not,at,billion,by,its
Class 13 :  as,is,not,company,were,will,it,be,loss,at
Class 14 :  was,its,it,be,quarter,for,billion,from,would,on
Class 15 :  market,last,is,with,on,would,share,by,billion,be
Class 16 :  last,on,an,its,loss,be,it,will,company,is
Class 17 :  the,trade,this,be,was,it,will,company,for,not
Class 18 :  the,last,will,from,billion,an,loss,be,it,its
Class 19 :

**Function words belong to classes and content words belong to topics.**

In this example, function words are not divided very well by their roles. As in the original paper, fine-tuning, sampling the hyper-parameters, or using an n-th order Markov assumption may help improve performance.