# Import Packages

In [1]:
from os.path import join
from numba import jit
from numba import njit
import numpy as np
import pandas as pd
import datetime

%run PLSA_model.ipynb

# Data Loading and Timing Helper Functions

In [2]:
def get_file_list(file_list_path):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Parameters
    ----------
    file_list_path : str
        Path of the list file, one entry per line.

    Returns
    -------
    list of str
        Every line of the file with newline characters stripped from both
        ends. Blank lines are kept as empty strings.
    """
    with open(file_list_path, 'r', encoding='UTF-8') as handle:
        # The context manager closes the file on exit.
        return [line.strip('\n') for line in handle]

def load_data(file_list, file_path):
    """Load the full text of every listed file from one directory.

    Parameters
    ----------
    file_list : iterable of str
        File names without the ``.txt`` extension.
    file_path : str
        Directory that contains the ``<name>.txt`` files.

    Returns
    -------
    list of str
        File contents, in the same order as ``file_list``.
    """
    contents = []
    for name in file_list:
        full_path = join(file_path, name + '.txt')
        # The context manager closes each file on exit.
        with open(full_path, 'r', encoding='UTF-8') as handle:
            contents.append(handle.read())
    return contents

def now_time():
    """Return the current local date and time as a ``datetime.datetime``."""
    current = datetime.datetime.now()
    return current

def cost_time(start_time, end_time):
    """Print the elapsed time between two timestamps.

    Parameters
    ----------
    start_time, end_time
        Values that support subtraction (``end_time - start_time``); the
        notebook passes the results of ``now_time()``.

    Returns
    -------
    None
        The difference is only printed, followed by a blank line.
    """
    elapsed = end_time - start_time
    print('Cost time: %s\n' % elapsed)

# PLSA Function

In [3]:
def random_initial(row, col):
    """Create a random ``(row, col)`` matrix whose columns each sum to 1.

    Values are drawn from the uniform distribution on [0, 1) and every column
    is divided by its own total.

    Example for one column: ``[0.7, 0.6, 0.3]`` has total 1.6 and becomes
    ``[0.4375, 0.375, 0.1875]``, which sums to 1.

    Parameters
    ----------
    row, col : int
        Number of rows and columns of the matrix.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(row, col)`` with column sums equal to 1.
    """
    samples = np.random.rand(row, col)
    # Summing along axis 0 collapses the rows and yields one total per column;
    # keepdims=True keeps the shape (1, col) so the division broadcasts.
    column_totals = samples.sum(axis=0, keepdims=True)
    return samples / column_totals
        
# Probabilistic Latent Semantic Analysis
@jit(nopython=True)
def plsa(word_topic_prob, topic_doc_prob, coo_row, coo_col, coo_data, topic=1, iter_num=1):
    """Fit PLSA with the EM algorithm on a term-document matrix in COO form.

    The term-document matrix has words w(i) as rows and documents d(j) as
    columns; only its non-zero entries are visited.

    Parameters
    ----------
    word_topic_prob : 2-D float array, indexed ``[word, topic]``
        Initial P(wi | Tk). Overwritten in place on every iteration.
    topic_doc_prob : 2-D float array, indexed ``[topic, document]``
        Initial P(Tk | dj). Overwritten in place on every iteration.
    coo_row, coo_col : 1-D integer arrays
        Word index and document index of each non-zero entry.
    coo_data : 1-D array
        Term frequency of each non-zero entry.
    topic : int
        Number of latent topics; must match the topic dimension of both
        probability arrays.
    iter_num : int
        Number of EM iterations to run.

    Returns
    -------
    (word_topic_prob, topic_doc_prob)
        The same two arrays that were passed in, holding the final estimates.

    Notes
    -----
    The log-likelihood is printed after every iteration.
    """
    nnz = len(coo_data)
    word_count = word_topic_prob.shape[0]
    doc_count = topic_doc_prob.shape[1]

    # P(Tk | wi, dj) for every non-zero entry. Sized from the `topic`
    # argument so the kernel does not depend on any notebook-level global.
    topic_word_doc_prob = np.zeros((nnz, topic))
    topics_sum = np.zeros(topic)
    docs_len = np.zeros(doc_count)
    # Scratch buffer for the E step; every slot is rewritten before it is
    # read, so a single allocation can be reused for all entries.
    topic_prob = np.zeros(topic)

    # EM Algorithm
    print("Iteration Start:")
    for iter_index in range(iter_num):
        # E step: posterior over topics for each (word, document) pair
        for i in range(nnz):
            w_coord = coo_row[i]
            d_coord = coo_col[i]
            topic_sum_prob = 0.0

            for k in range(topic):
                # P(wi | Tk) * P(Tk | dj)
                topic_prob[k] = word_topic_prob[w_coord, k] * topic_doc_prob[k, d_coord]
                topic_sum_prob += topic_prob[k]

            for k in range(topic):
                topic_word_doc_prob[i, k] = topic_prob[k] / topic_sum_prob

        # M step: reset the accumulators, then re-estimate both distributions
        topics_sum.fill(0)
        word_topic_prob.fill(0)
        topic_doc_prob.fill(0)
        docs_len.fill(0)

        for i in range(nnz):
            w_coord = coo_row[i]
            d_coord = coo_col[i]
            term_freq = coo_data[i]
            for k in range(topic):
                tf_twd = term_freq * topic_word_doc_prob[i, k]
                # unnormalised P(wi | Tk)
                word_topic_prob[w_coord, k] += tf_twd
                # unnormalised P(Tk | dj)
                topic_doc_prob[k, d_coord] += tf_twd
                topics_sum[k] += tf_twd
            docs_len[d_coord] += term_freq

        # Normalise P(wi | Tk) by the total mass of each topic. A topic with
        # no mass has an all-zero column already, so the division is skipped
        # instead of dividing by zero.
        for i in range(word_count):
            for k in range(topic):
                if topics_sum[k] > 0:
                    word_topic_prob[i, k] /= topics_sum[k]

        # Normalise P(Tk | dj) by the document length. A document without any
        # term has an all-zero column already; skip it to avoid dividing by zero.
        for k in range(topic):
            for j in range(doc_count):
                if docs_len[j] > 0:
                    topic_doc_prob[k, j] /= docs_len[j]

        # log likelihood: sum over entries of tf * log(sum_k P(wi|Tk) P(Tk|dj))
        likelihood_sum = 0.0
        for i in range(nnz):
            w_coord = coo_row[i]
            d_coord = coo_col[i]
            term_freq = coo_data[i]

            likelihood = 0.0
            for k in range(topic):
                likelihood += word_topic_prob[w_coord, k] * topic_doc_prob[k, d_coord]

            likelihood_sum += term_freq * np.log(likelihood)
        print("Iteration #", iter_index+1, '=', likelihood_sum)
    # final
    return word_topic_prob, topic_doc_prob

# @njit
# def plsa_modeling(word_size, doc_size, unigram_prob, word_doc_prob, background_prob, alpha, beta):
#     print('#')
#     for i in range(word_size):
#         for j in range(doc_size):
#             key = (np.int64(i), np.int64(j))
#             if unigram_prob.get(key) is not None:
#                 word_doc_prob[i, j] = alpha*unigram_prob[key] + beta*word_doc_prob[i, j] + (1-alpha-beta)*background_prob[i]
#             else:
#                 word_doc_prob[i, j] = beta*word_doc_prob[i, j] + (1-alpha-beta)*background_prob[i]

#    return word_doc_prob

@jit(nopython=True)
def plsa_modeling(word_size, doc_size, unigram_prob, word_doc_prob, background_prob, alpha, beta):
    """Blend unigram, PLSA and background probabilities for every (word, document).

    Each entry becomes
    ``alpha * unigram_prob[w, d] + beta * word_doc_prob[w, d]
    + (1 - alpha - beta) * background_prob[w]``.

    Parameters
    ----------
    word_size, doc_size : int
        Number of rows (words) and columns (documents) to process.
    unigram_prob : 2-D array, indexed ``[word, document]``
        Document unigram probabilities.
    word_doc_prob : 2-D array, indexed ``[word, document]``
        PLSA probabilities; overwritten in place with the blended values.
    background_prob : array indexed by word
        Background probability of each word.
    alpha, beta : float
        Weights of the unigram and PLSA terms.

    Returns
    -------
    The ``word_doc_prob`` array that was passed in, after the in-place update.
    """
    print('#')
    # The background weight does not change inside the loops.
    background_weight = 1 - alpha - beta
    for w in range(word_size):
        background_term = background_weight * background_prob[w]
        for d in range(doc_size):
            word_doc_prob[w, d] = alpha*unigram_prob[w, d] + beta*word_doc_prob[w, d] + background_term

    return word_doc_prob

# Loading Data

In [4]:
# main function
# Reads the document and query ID lists, then loads the text of every listed
# file. The names defined here (doc_list, query_list, docs, queries) are used
# by later cells of the notebook.
if __name__ == '__main__':
    start_time = now_time()    

    # Load Data
    print('load data ...')    
    # ID lists: one file name (without extension) per line
    doc_list = get_file_list('ntust-ir-2020_hw5_new/doc_list.txt')
    query_list = get_file_list('ntust-ir-2020_hw5_new/query_list.txt')
    # load_data appends '.txt' to every ID and reads the whole file
    docs = load_data(doc_list, 'ntust-ir-2020_hw5_new/docs/')
    queries = load_data(query_list, 'ntust-ir-2020_hw5_new/queries/')  
    print('load data finish！\n')
    
    # Report how long the loading took
    cost_time(start_time, now_time())

load data ...
load data finish！

Cost time: 0:00:00.432246



# Topic Model

In [5]:
    # TopicModel is not defined in this notebook; presumably it is provided by
    # the `%run PLSA_model.ipynb` line in the import cell — verify.
    topic = TopicModel()

##### data preprocess | background modeling | document modeling

In [6]:
    # Runs the three preparation steps on the TopicModel instance and keeps the
    # results (background_prob, doc_unigram_prob) as notebook-level variables
    # that the PLSA mixing and saving cells read later.
    start_time = now_time()    
    
    # Data Preprocess
    print('data preprocess ...')    
    topic.data_preprocess(query_list, doc_list, queries, docs)    
    print('data preprocess finish！\n')

    # background modeling  
    print('background modeling ...')
    topic.background_model()    
    # presumably one probability per vocabulary word (it is indexed by word in
    # plsa_modeling) — confirm against TopicModel
    background_prob = topic.background_prob
    print('background modeling finish！\n')
    
    # document modeling  
    print('document modeling  ...')
    topic.doc_model()
    # .toarray() is called on the stored value, so it is presumably a scipy
    # sparse matrix that is densified here — TODO confirm
    doc_unigram_prob = topic.doc_unigram_prob.toarray()
    print('document modeling  finish！\n')
    
    cost_time(start_time, now_time())

data preprocess ...
data preprocess finish！

background modeling ...
background modeling finish！

document modeling  ...
document modeling  finish！

Cost time: 0:00:46.081368



#### PLSA training

In [7]:
    # Trains PLSA: builds random column-normalised starting distributions and
    # runs 150 EM iterations over the non-zero entries of topic.term_doc.
    start_time = now_time()    
    
    print('PLSA training ...')
    # dimensional length
    word_size = len(topic.vocab)
    doc_size  = len(topic.documents)
    # number of latent topics
    topic_num = 256
    print(word_size)
    print(doc_size)
    
    # Alternative start: reload previously saved parameters instead of a random start
#     word_topic_prob = np.load('plsa_word_topic_prob.npy')
#     topic_doc_prob = np.load('plsa_topic_doc_prob.npy')
    # random initial 
    # NOTE(review): no random seed is set in this notebook, so every run starts
    # from a different initialisation.
    word_topic_prob = random_initial(word_size, topic_num)
    topic_doc_prob  = random_initial(topic_num, doc_size)
    # print('for each topic k:', word_topic_prob.sum(axis=0))
    # print('for each document j:', topic_doc_prob.sum(axis=0))
    
    # print(topic.term_doc)
    # .row / .col / .data are read below, so term_doc is presumably a scipy
    # COO sparse matrix (words x documents) — TODO confirm
    coo_row = topic.term_doc.row
    coo_col = topic.term_doc.col
    coo_data = topic.term_doc.data
    
    # plsa updates both probability arrays in place and also returns them
    word_topic_prob, topic_doc_prob = plsa(word_topic_prob, topic_doc_prob, coo_row, coo_col, coo_data, topic_num, iter_num=150)
    print('PLSA training finish！\n')
    cost_time(start_time, now_time())

PLSA training ...
154240
30000
Iteration Start:
Iteration # 1 = -100838261.41036318
Iteration # 2 = -100721458.46087986
Iteration # 3 = -100580098.62736395
Iteration # 4 = -100371546.74157852
Iteration # 5 = -100040043.07167366
Iteration # 6 = -99530295.80967695
Iteration # 7 = -98761336.49125214
Iteration # 8 = -97645751.25671434
Iteration # 9 = -96211888.45785245
Iteration # 10 = -94658995.89060335
Iteration # 11 = -93229146.72616015
Iteration # 12 = -92044877.71902134
Iteration # 13 = -91105445.53244871
Iteration # 14 = -90360923.27914272
Iteration # 15 = -89761566.61258103
Iteration # 16 = -89270366.07052375
Iteration # 17 = -88861444.2164478
Iteration # 18 = -88516806.55574057
Iteration # 19 = -88223545.28363496
Iteration # 20 = -87971771.11254317
Iteration # 21 = -87753687.59785198
Iteration # 22 = -87563158.8364695
Iteration # 23 = -87395331.07962506
Iteration # 24 = -87246302.32337488
Iteration # 25 = -87113193.89765729
Iteration # 26 = -86994033.6178564
Iteration # 27 = -86887

#### Summation over Topics: $P(w_i \mid d_j) = \sum_k P(w_i \mid T_k)\, P(T_k \mid d_j)$

In [8]:
    # Sums over the topic dimension with one matrix product:
    # (word_size x topic_num) . (topic_num x doc_size) -> (word_size x doc_size).
    # The result is a dense array, so it can be very large in memory.
    start_time = now_time()    
    print('PLSA summation Topic k ...')    
    plsa_prob = np.dot(word_topic_prob, topic_doc_prob)
    print('PLSA summation Topic k finish！\n')
    cost_time(start_time, now_time())

PLSA summation Topic k ...
PLSA summation Topic k finish！

Cost time: 0:00:38.433107



#### PLSA Probability

In [9]:
#     doc_tuple_unigram_prob = Dict.empty(key_type=types.UniTuple(types.int64, 2), value_type=types.float64,)
#     for i in range(len(doc_unigram_prob.data)):
#         w_coord = doc_unigram_prob.row[i]
#         d_coord = doc_unigram_prob.col[i]
#         doc_tuple_unigram_prob[(w_coord, d_coord)] = doc_unigram_prob.data[i]

In [10]:
    # Mixes the three models entry by entry: alpha = 0.4 weights the document
    # unigram model, beta = 0.5 weights the PLSA model, and the remaining
    # 1 - alpha - beta weights the background model.
    # plsa_modeling overwrites plsa_prob in place, so re-running this cell
    # without re-running the summation cell mixes the values a second time.
    start_time = now_time()    
    print('PLSA likelihood ...')    
    plsa_prob = plsa_modeling(word_size, doc_size, doc_unigram_prob, plsa_prob, background_prob, 0.4, 0.5)
    print('PLSA likelihood finish！\n')
    cost_time(start_time, now_time())

PLSA likelihood ...
#
PLSA likelihood finish！

Cost time: 0:00:07.432730



In [11]:
    # Sanity check: the mixed model should have shape (word_size, doc_size)
    print(plsa_prob.shape)

(154240, 30000)


#### Query likelihood measure P(q|dj)

In [12]:
    # Scores the queries against the documents with the mixed model.
    # query_likelihood is a TopicModel method that is not visible in this
    # notebook; its behaviour should be checked in PLSA_model.ipynb.
    start_time = now_time()    

    print('query likelihood measure ...')
    # parameter: map@num
    # presumably 5000 is the number of ranked documents kept per query — TODO confirm
    topic.query_likelihood(plsa_prob, 5000)
    print('query likelihood measure finish！\n')

    cost_time(start_time, now_time())

query likelihood measure ...
query likelihood measure finish！

Cost time: 0:00:17.325944



## Saving Model

In [13]:
    # Save the dense document unigram model (np.save appends '.npy' to the name)
    np.save('plsa_document_unigram_model', doc_unigram_prob)

In [14]:
    # Save the background model (np.save appends '.npy' to the name)
    np.save('plsa_background_model', background_prob)

In [15]:
    # Write the vocabulary keys as a one-column CSV; the output file is named
    # 'plsa_vocab' with no extension.
    pd.DataFrame({'vocab':list(topic.vocab.keys())}).to_csv('plsa_vocab', index=False)

In [16]:
    # Save the trained P(w | T) matrix; the commented-out np.load in the
    # training cell reads it back as 'plsa_word_topic_prob.npy'.
    np.save('plsa_word_topic_prob', word_topic_prob)

In [17]:
    # Save the trained P(T | d) matrix; the commented-out np.load in the
    # training cell reads it back as 'plsa_topic_doc_prob.npy'.
    np.save('plsa_topic_doc_prob', topic_doc_prob)

In [18]:
    # Save the final mixed (word_size x doc_size) model. This is a dense array,
    # so the file on disk is as large as the array in memory.
    np.save('plsa_prob', plsa_prob)

In [19]:
    # Write the results through TopicModel.output; presumably this exports the
    # ranking produced by query_likelihood to 'PLSA_model.csv' — TODO confirm.
    topic.output('PLSA_model.csv')