# Import Packages

In [18]:
from os.path import join
from numba import jit
import numpy as np
import datetime

%run PLSA_model.ipynb

# Data Loading and Timing Helper Functions

In [19]:
def get_file_list(file_list_path):
    """Read a UTF-8 text file and return its lines as a list of IDs.

    Parameters
    ----------
    file_list_path : str
        Path of the list file, one ID per line.

    Returns
    -------
    list of str
        Every line of the file, in file order, with newline characters
        stripped from both ends. Blank lines are kept as empty strings.
    """
    with open(file_list_path, 'r', encoding='UTF-8') as list_file:
        # The context manager closes the file on exit.
        return [line.strip('\n') for line in list_file]

def load_data(file_list, file_path):
    """Load the full text of every listed file from one directory.

    Parameters
    ----------
    file_list : iterable of str
        File IDs; '.txt' is appended to each to form the file name.
    file_path : str
        Directory that contains the '<id>.txt' files.

    Returns
    -------
    list of str
        The complete contents of each file, in the order of ``file_list``.
    """
    contents = []
    for name in file_list:
        full_path = join(file_path, name + '.txt')
        # The context manager closes each file as soon as it has been read.
        with open(full_path, 'r', encoding='UTF-8') as handle:
            text = handle.read()
        contents.append(text)
    return contents

def now_time():
    """Return the current local date and time as a ``datetime.datetime``."""
    current = datetime.datetime.now()
    return current

def cost_time(start_time, end_time):
    """Print the elapsed time between two timestamps.

    Parameters
    ----------
    start_time, end_time
        Values supporting ``end_time - start_time`` (the notebook passes
        ``datetime.datetime`` objects from ``now_time``).

    Returns
    -------
    None
        The difference is written to stdout as 'Cost time: <difference>'.
    """
    elapsed = end_time - start_time
    message = 'Cost time: %s' % elapsed
    print(message)

# PLSA Function

In [20]:
def random_initial(row, col):
    """Create a random (row, col) matrix whose columns each sum to 1.

    Entries are drawn from the uniform distribution on [0, 1) with
    ``np.random.rand`` and every column is then divided by its own total.

    Example for one column::

        samples = [0.7, 0.6, 0.3]      total = 1.6
        result  = [0.4375, 0.375, 0.1875]   (sums to 1)

    Parameters
    ----------
    row, col : int
        Shape of the matrix.

    Returns
    -------
    numpy.ndarray
        Array of shape (row, col) with column sums equal to 1.
    """
    samples = np.random.rand(row, col)
    # axis=0 adds down the rows, giving one total per column; keepdims keeps
    # the result as shape (1, col) so it broadcasts against `samples`.
    column_totals = samples.sum(axis=0, keepdims=True)
    return samples / column_totals
        
# Probabilistic Latent Semantic Analysis
@jit(nopython=True)
def plsa(word_topic_prob, topic_doc_prob, coo_row, coo_col, coo_data, topic=1, iter_num=1):
    """Fit a PLSA topic model with EM on a sparse term-document matrix.

    The term-document counts are given in coordinate form: entry ``i`` says
    that word ``coo_row[i]`` occurs ``coo_data[i]`` times in document
    ``coo_col[i]``.

    Parameters
    ----------
    word_topic_prob : 2-D float array, indexed [word, topic]
        Initial P(w_i | T_k). Overwritten in place on every iteration.
    topic_doc_prob : 2-D float array, indexed [topic, document]
        Initial P(T_k | d_j). Overwritten in place on every iteration.
    coo_row, coo_col : 1-D integer arrays
        Word and document index of each stored entry.
    coo_data : 1-D array
        Term frequency of each stored entry.
    topic : int
        Number of latent topics; must equal the topic dimension of both
        probability arrays.
    iter_num : int
        Number of EM iterations to run.

    Returns
    -------
    (word_topic_prob, topic_doc_prob)
        The same two arrays that were passed in, holding the updated
        estimates. The log-likelihood is printed after every iteration.
    """
    # term_doc matrix's row is word w(i)
    # term_doc matrix's column is document d(j)
    entry_num = len(coo_data)

    # P(Tk | wi, dj), one row per stored entry. The width comes from the
    # `topic` argument rather than a notebook global, so the buffer always
    # matches the loops below that index it with range(topic).
    topic_word_doc_prob = np.zeros((entry_num, topic))
    topics_sum = np.zeros(topic)
    docs_len = np.zeros(topic_doc_prob.shape[1])
    # Scratch buffer for the E step; every slot is rewritten for each entry,
    # so it is allocated once instead of once per entry per iteration.
    topic_prob = np.zeros(topic)

    # EM Algorithm
    print("Iteration Start:")
    for iter_index in range(iter_num):
        # E step: P(Tk | wi, dj) is proportional to P(wi | Tk) * P(Tk | dj)
        for i in range(entry_num):
            w_coord = coo_row[i]
            d_coord = coo_col[i]
            topic_sum_prob = 0.0

            for k in range(topic):
                topic_prob[k] = word_topic_prob[w_coord, k] * topic_doc_prob[k, d_coord]
                topic_sum_prob += topic_prob[k]

            for k in range(topic):
                # A zero total means every numerator is zero too; store 0
                # instead of dividing by zero.
                if topic_sum_prob != 0.0:
                    topic_word_doc_prob[i, k] = topic_prob[k] / topic_sum_prob
                else:
                    topic_word_doc_prob[i, k] = 0.0

        # M step

        # Reset the accumulators; the input arrays are reused as output.
        topics_sum.fill(0)
        word_topic_prob.fill(0)
        topic_doc_prob.fill(0)
        docs_len.fill(0)

        for i in range(entry_num):
            w_coord = coo_row[i]
            d_coord = coo_col[i]
            term_freq = coo_data[i]
            for k in range(topic):
                tf_twd = term_freq * topic_word_doc_prob[i, k]
                # unnormalised P(wi | Tk)
                word_topic_prob[w_coord, k] += tf_twd
                # unnormalised P(Tk | dj)
                topic_doc_prob[k, d_coord] += tf_twd
                topics_sum[k] += tf_twd
            docs_len[d_coord] += term_freq

        # Normalise P(wi | Tk) by the total mass assigned to topic k.
        # A topic with zero mass keeps its all-zero column.
        for i in range(word_topic_prob.shape[0]):
            for k in range(topic):
                if topics_sum[k] != 0.0:
                    word_topic_prob[i, k] /= topics_sum[k]

        # Normalise P(Tk | dj) by the document length. A document with no
        # stored terms keeps its all-zero column instead of dividing by zero.
        for k in range(topic):
            for j in range(topic_doc_prob.shape[1]):
                if docs_len[j] != 0.0:
                    topic_doc_prob[k, j] /= docs_len[j]

        # log likelihood: sum over entries of tf * log(sum_k P(wi|Tk) P(Tk|dj))
        likelihood_sum = 0.0
        for i in range(entry_num):
            w_coord = coo_row[i]
            d_coord = coo_col[i]
            term_freq = coo_data[i]

            likelihood = 0.0
            for k in range(topic):
                likelihood += word_topic_prob[w_coord, k] * topic_doc_prob[k, d_coord]

            likelihood_sum += term_freq * np.log(likelihood)
        print("Iteration #", iter_index+1, '=', likelihood_sum)
    # final
    return word_topic_prob, topic_doc_prob


# Loading Data

In [21]:
# main function
# Entry cell: reads the document and query ID lists, then loads the text of
# every listed file from the dataset directory.
if __name__ == '__main__':
    # Timestamp used to report how long loading takes.
    start_time = now_time()    

    # Load Data
    print('load data ...')    
    # Each list file is read as one ID per line.
    doc_list = get_file_list('ntust-ir-2020_hw4_v2/doc_list.txt')
    query_list = get_file_list('ntust-ir-2020_hw4_v2/query_list.txt')
    # load_data opens '<directory>/<id>.txt' for every ID and returns the
    # raw file contents in list order.
    docs = load_data(doc_list, 'ntust-ir-2020_hw4_v2/docs/')
    queries = load_data(query_list, 'ntust-ir-2020_hw4_v2/queries/')  
    print('load data finish！\n')
    
    # Print the elapsed loading time.
    cost_time(start_time, now_time())

load data ...
load data finish！

Cost time: 0:00:03.003059


# Topic Model

In [22]:
    # Model object used by all remaining cells. TopicModel is not defined in
    # this notebook; presumably it comes from `%run PLSA_model.ipynb` in the
    # first cell — confirm.
    topic = TopicModel()

##### data preprocess | background modeling | document modeling

In [23]:
    # Timestamp for the combined preprocessing / modelling stage.
    start_time = now_time()    
    
    # Data Preprocess
    # Hands the ID lists and raw texts loaded above to the model.
    # NOTE(review): later cells read topic.vocab, topic.documents and
    # topic.term_doc; presumably they are built here — confirm in
    # PLSA_model.ipynb.
    print('data preprocess ...')    
    topic.data_preprocess(query_list, doc_list, queries, docs)    
    print('data preprocess finish！\n')

    # background modeling  
    # NOTE(review): implementation lives in TopicModel; not visible here.
    print('background modeling ...')
    topic.background_model()    
    print('background modeling finish！\n')
    
    # document modeling  
    # NOTE(review): implementation lives in TopicModel; not visible here.
    print('document modeling  ...')
    topic.doc_model()    
    print('document modeling  finish！\n')
    
    # Print the elapsed time of the three steps above.
    cost_time(start_time, now_time())

data preprocess ...
data preprocess finish！

background modeling ...
background modeling finish！

document modeling  ...
document modeling  finish！

Cost time: 0:00:20.244428


#### PLSA training

In [27]:
    # Timestamp for the PLSA training stage.
    start_time = now_time()    

    # dimensional length
    word_size = len(topic.vocab)
    doc_size  = len(topic.documents)
    # Number of latent topics.
    topic_num = 128
    # random initial 
    # random_initial normalises each column to sum to 1, so the first matrix
    # is a distribution over words per topic and the second a distribution
    # over topics per document.
    word_topic_prob = random_initial(word_size, topic_num)
    topic_doc_prob  = random_initial(topic_num, doc_size)
    # print('for each topic k:', word_topic_prob.sum(axis=0))
    # print('for each document j:', topic_doc_prob.sum(axis=0))
    
    # print(topic.term_doc)
    # Coordinate arrays of the term-document matrix: plsa treats coo_row as
    # word indices, coo_col as document indices and coo_data as term counts.
    # NOTE(review): assumes topic.term_doc is a COO-format sparse matrix
    # (it exposes .row/.col/.data) — confirm.
    coo_row = topic.term_doc.row
    coo_col = topic.term_doc.col
    coo_data = topic.term_doc.data
    
    # Run 1000 EM iterations; plsa prints the log-likelihood after each one
    # and returns the two updated probability matrices.
    word_topic_prob, topic_doc_prob = plsa(word_topic_prob, topic_doc_prob, coo_row, coo_col, coo_data, topic_num, iter_num=1000)
    
    # Print the elapsed training time.
    cost_time(start_time, now_time())

Iteration Start:
Iteration # 1 = -59254851.7958059
Iteration # 2 = -59147934.14887507
Iteration # 3 = -58991944.901428804
Iteration # 4 = -58730856.09393305
Iteration # 5 = -58403985.64626917
Iteration # 6 = -58021409.79121222
Iteration # 7 = -57504532.27076275
Iteration # 8 = -56812680.589041464
Iteration # 9 = -56005904.11213733
Iteration # 10 = -55219813.75002756
Iteration # 11 = -54556094.38283256
Iteration # 12 = -54031164.9814237
Iteration # 13 = -53617812.65035625
Iteration # 14 = -53287046.39604123
Iteration # 15 = -53017568.17420252
Iteration # 16 = -52794203.53412353
Iteration # 17 = -52606408.31182265
Iteration # 18 = -52446611.859216854
Iteration # 19 = -52309134.19284257
Iteration # 20 = -52189591.16983307
Iteration # 21 = -52084675.344977155
Iteration # 22 = -51992131.912899666
Iteration # 23 = -51910398.62541649
Iteration # 24 = -51837915.83754306
Iteration # 25 = -51773276.59394746
Iteration # 26 = -51715386.281831406
Iteration # 27 = -51663536.241656
Iteration # 28 = -

#### P(q|dj)

In [28]:
    # Timestamp for the retrieval stage.
    start_time = now_time()    

    print('query likelihood measure ...')
    # parameter: map@num, alpha, beta
    # i.e. after the trained matrices and topic_num, the trailing arguments
    # are map@num=1000, alpha=0.4, beta=0.5.
    # NOTE(review): the meaning of alpha/beta is defined in TopicModel and is
    # not visible in this notebook — confirm there.
    topic.query_likelihood(word_topic_prob, topic_doc_prob, topic_num, 1000, 0.4, 0.5)
    print('query likelihood measure finish！\n')

    # Print the elapsed retrieval time.
    cost_time(start_time, now_time())

query likelihood measure ...
query likelihood measure finish！

Cost time: 0:05:14.818828


In [29]:
    # NOTE(review): presumably writes the retrieval results to
    # 'PLSA_model.csv'; TopicModel.output is not visible here — confirm.
    topic.output('PLSA_model.csv')