In [11]:
from numpy import zeros, int8, log
from pylab import random
import re
import time
import codecs

In [12]:
def preProcessing(dataset, stopwords):
    file = codecs.open(stopwords, 'r', 'utf-8')
    sw = [line.strip() for line in file] 
    file.close()
    file = codecs.open(dataset, 'r', 'utf-8')
    documents = [document.strip() for document in file] 
    file.close()

    len_doc = len(documents)
    word_counts = [];
    word_to_id = {}
    id_to_word = {}
    init_id = 0;
    
    for document in documents:
        word_count = {}
        for word in document:
            word = word.lower().strip()
            if len(word) > 1 and not re.search('[0-9]', word) and word not in sw:               
                if word not in word_to_id.keys():
                    word_to_id[word] = init_id;
                    id_to_word[init_id] = word;
                    init_id += 1;
                if word in word_counts:
                    word_count[word] += 1
                else:
                    word_count[word] = 1
        word_counts.append(word_count);
    
    len_word_to_id = len(word_to_id)  
    X = zeros([len_doc, len_word_to_id], int8)
    
    for word in word_to_id.keys():
        j = word_to_id[word]
        for i in range(0, len_doc):
            if word in word_counts[i]:
                X[i, j] = word_counts[i][word];    

    return len_doc, len_word_to_id, word_to_id, id_to_word, X

In [13]:
def initializeParameters():
    for i in range(0, N):
        normalization = sum(lamda[i, :])
        for j in range(0, K):
            lamda[i, j] /= normalization;

    for i in range(0, K):
        normalization = sum(theta[i, :])
        for j in range(0, M):
            theta[i, j] /= normalization;

In [14]:
def logLikelihood():
    log_l = 0
    for i in range(0, N):
        for j in range(0, M):
            tmp = 0
            for k in range(0, K):
                tmp += theta[k, j] * lamda[i, k]
            if tmp > 0:
                log_l += X[i, j] * log(tmp)
    return log_l

In [15]:
def eStep():
    for i in range(0, N):
        for j in range(0, M):
            denom = 0;
            for k in range(0, K):
                p[i, j, k] = theta[k, j] * lamda[i, k];
                denom += p[i, j, k];
            if denom == 0:
                for k in range(0, K):
                    p[i, j, k] = 0;
            else:
                for k in range(0, K):
                    p[i, j, k] /= denom;

In [16]:
def mStep():
    for i in range(0, N):
        for k in range(0, K):
            lamda[i, k] = 0
            denom = 0
            for j in range(0, M):
                lamda[i, k] += X[i, j] * p[i, j, k]
                denom += X[i, j];
            if denom == 0:
                lamda[i, k] = 1.0 / K
            else:
                lamda[i, k] /= denom
                
    for k in range(0, K):
        denom = 0
        for j in range(0, M):
            theta[k, j] = 0
            for i in range(0, N):
                theta[k, j] += X[i, j] * p[i, j, k]
            denom += theta[k, j]
        if denom == 0:
            for j in range(0, M):
                theta[k, j] = 1.0 / M
        else:
            for j in range(0, M):
                theta[k, j] /= denom    

In [17]:
def result():
    file = codecs.open(dictionary,'w','utf-8')
    for i in range(0, M):
        file.write(id2word[i] + '\n')
    file.close()
    
    file = codecs.open(topWords,'w','utf-8')
    for i in range(0, K):
        topics = []
        ids = theta[i, :].argsort()
        for j in ids:
            topics.insert(0, id2word[j])
        tmp = ''
        for word in topics[0:min(topicWordsNum, len(topics))]:
            tmp += word + ' '
        file.write(tmp + '\n')
    file.close()
        
    file = codecs.open(docTopicDistDistribution,'w','utf-8')
    for i in range(0, N):
        tmp = ''
        for j in range(0, K):
            tmp += str(lamda[i, j]) + ' '
        file.write(tmp + '\n')
    file.close()
    
    file = codecs.open(topicWordDistribution,'w','utf-8')
    for i in range(0, K):
        tmp = ''
        for j in range(0, M):
            tmp += str(theta[i, j]) + ' '
        file.write(tmp + '\n')
    file.close()

In [10]:
dataset = 'dataset.txt'
stopwords = 'stopwords.dic'

docTopicDistDistribution = 'docTopicDistribution.txt'
topicWordDistribution = 'topicWordDistribution.txt'
dictionary = 'dictionary.dic'
topWords = 'topics.txt'

N = 5
maxIteration = 25
threshold = 5.0
topicWordsNum = 5
    
# preprocessing
len_doc, len_word_to_id, word_to_id, id_to_word, X = preProcessing(dataset, stopwords)

# lamda[i, j]=>p(zj|di)
lamda = random([len_doc, N])

# theta[i, j]=>p(wj|zi)
theta = random([N, len_word_to_id])

# p[i, j, k]=>p(zk|di,wj)
p = zeros([len_doc, len_word_to_id, N])

initializeParameters()
oldLoglikelihood = 1
newLoglikelihood = 1
for i in range(0, maxIteration):
    eStep()
    mStep()
    newLoglikelihood = logLikelihood()
    print("[", time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())), "] ", i+1, " iteration  ", str(newLoglikelihood))
    if((newLoglikelihood - oldLoglikelihood) < thresholdold and Loglikelihood != 1):
        break
    oldLoglikelihood = newLoglikelihood

result()

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Ospan\AppData\Local\Temp\jieba.cache
Loading model cost 0.745 seconds.
Prefix dict has been built successfully.


[ 2020-12-27 22:02:26 ]  1  iteration   -147485.621660961
[ 2020-12-27 22:03:16 ]  2  iteration   -145296.73814273378
[ 2020-12-27 22:04:06 ]  3  iteration   -142487.00182718155
[ 2020-12-27 22:04:59 ]  4  iteration   -139216.32777316848
[ 2020-12-27 22:05:54 ]  5  iteration   -135913.96495237478
[ 2020-12-27 22:06:50 ]  6  iteration   -133017.4139328115
[ 2020-12-27 22:07:44 ]  7  iteration   -130727.37988343116
[ 2020-12-27 22:08:37 ]  8  iteration   -129025.38435876224
[ 2020-12-27 22:09:29 ]  9  iteration   -127775.20275849756
[ 2020-12-27 22:10:21 ]  10  iteration   -126827.7694353583
[ 2020-12-27 22:11:12 ]  11  iteration   -126100.24492361289
[ 2020-12-27 22:12:07 ]  12  iteration   -125517.76741863204
[ 2020-12-27 22:13:00 ]  13  iteration   -125053.88031924044
[ 2020-12-27 22:13:51 ]  14  iteration   -124696.47202140526
[ 2020-12-27 22:14:43 ]  15  iteration   -124421.87665048064
[ 2020-12-27 22:15:38 ]  16  iteration   -124225.73797153155
[ 2020-12-27 22:16:41 ]  17  iteratio