# Import Package

In [2]:
import sys
from os.path import join
from numba import jit
import numpy as np
import pandas as pd
import datetime

%run psesudo_relevance_feedback.ipynb

# Load Data and Preprocess and Calculate Time Function

In [3]:
def get_file_list(file_list_path):

    file_list = list()        
    with open(file_list_path, 'r', encoding='UTF-8') as f:
        for file_id in f.readlines():
            file_list.append(file_id.strip('\n'))
    f.close()        
    return file_list

def load_data(file_list, file_path):
    
    data = list()
    for file_id in file_list:
        with open(join(file_path, file_id+'.txt'), 'r', encoding='UTF-8') as f:
            data.append(f.read())
        f.close()
    return data

def load_pseudo_relevant_docs(file_path, top_ranked=10, odd=False):
    prdocs_list = dict()
    df = pd.read_csv(file_path)
    
    for indexs in df.index:
        rowData = df.loc[indexs].values[0:2]
        rowData = rowData.tolist()
        query_id = rowData[0]
        if odd:
            rowData = rowData[1].split()[::2]
            docs_id = rowData[0:top_ranked]
        else:
            docs_id = rowData[1].split()[0:top_ranked]
        prdocs_list[query_id] = docs_id
        
    return prdocs_list

def cut(text):
    return text.split()

def now_time():
    return datetime.datetime.now()

def cost_time(start_time, end_time):
    cost_time = end_time - start_time
    print('Cost time: %s\n' % cost_time)
    
def progressbar(it, prefix="", size=60, file=sys.stdout):
    count = len(it)
    def show(j):
        x = int(size*j/count)
        file.write("%s[%s%s] %i/%i\r" % (prefix, "#"*x, "."*(size-x), j, count))
        file.flush()        
    show(0)
    for i, item in enumerate(it):
        yield item
        show(i+1)
    file.write("\n")
    file.flush()

# SMM Function

In [4]:
def random_initial(row):  
    # uniform distribution
    np_array = np.random.rand(row)
    '''
    e.g prob = [ 0.7, 0.6, 0.3 ]
    sum = 1.6
    x1 = 0.7 / 1.6 = 0.4375
    x2 = 0.6 / 1.6 = 0.375
    x3 = 0.3 / 1.6 = 0.1875
    Σ summation(x1, x2, x3) = 1 
    '''
    # axis = 0:sum of row
    # axis = 1: sum of column        
    return np_array / np_array.sum(axis=0, keepdims=True)


# Probabilistic Simple Mixture Model
@jit(nopython=True)
def smm(word_Tsmm_prob, word_bg_prob, coo_row, coo_col, coo_data, alpha, iter_num):
    
    # term_doc matrix's row is word w(i) 
    # term_doc matrix's column is document d(j)
    
    # print(len(coo_row), len(coo_col), len(coo_data))
    ''' P(Tsmm | wi) for each query '''
    Tsmm_word_prob = np.zeros((word_Tsmm_prob.shape[0]))
    ''' P(Tsmm) '''
    Tsmm_prob = 1-alpha
    ''' P(BG) '''
    bg_prob = alpha
    
    # EM Algorithm
    print("Iteration Start:")
    for iter_index in range(iter_num):
        # E step
        for i in range(word_Tsmm_prob.shape[0]):      
            '''  P(wi | Tsmm) * P(Tsmm) / ( P(wi | Tsmm) * P(Tsmm) + P(wi | BG) * P(BG) ) '''
            Tsmm_word_prob[i] = (word_Tsmm_prob[i] * Tsmm_prob) / (word_Tsmm_prob[i] * Tsmm_prob + word_bg_prob[i] * bg_prob)
                
        # M step
        
        # initial zero
        word_Tsmm_prob.fill(0)
        Tsmm_sum = 0
        
        for i in range(len(coo_data)):
            w_coord = coo_row[i]
            # d_coord = coo_col[i]
            term_freq = coo_data[i]
            # 
            tf_tw = term_freq * Tsmm_word_prob[w_coord]
            ''' P(wi | Tsmm) '''
            word_Tsmm_prob[w_coord] += tf_tw
            Tsmm_sum += tf_tw

        for i in range(word_Tsmm_prob.shape[0]):
             word_Tsmm_prob[i] /= Tsmm_sum
                
        # log likelihood (Multiply to Add)
        likelihood_sum = 0
        for i in range(len(coo_data)):            
            w_coord = coo_row[i]
            # d_coord = coo_col[i]
            term_freq = coo_data[i]
            ''' P(wi | Tsmm) * P(Tsmm) + P(wi | BG) * P(BG) '''
            likelihood_sum += term_freq * np.logaddexp(np.log(word_Tsmm_prob[w_coord] * Tsmm_prob), np.log(word_bg_prob[w_coord] * bg_prob))
        print("Iteration #", iter_index+1, '=', likelihood_sum)
    # final
    return word_Tsmm_prob


# Query Model and Document Model Function

In [5]:
''' Query Unigram Model '''
@jit(nopython=True)
def query_modling(vocab, unigram_prob, Psmm_prob, alpha, beta, bg_prob):
    query_model = np.zeros((vocab))
    ''' [ alpha * PULM(w) + beta * Psmm(w) + (1-alpha-beta) * PBG(w) ] '''
    for i in range(vocab):
        query_model[i] = alpha * unigram_prob[i] + beta * Psmm_prob[i] + (1-alpha-beta) * bg_prob[i]
    return query_model

# KL-Divergence Function

In [6]:
# Probabilistic KL-Divergence
@jit(nopython=True)
def KL_Divergence(vocab, j, query_model, word_doc_prob, background_prob):
    prob = 0
    for i in range(vocab):
        ''' -Σ(wi->V) Psmm(w|q)* logP(w|dj) ''' 
        prob += query_model[i] * np.log(word_doc_prob[i, j])
    return -prob

@jit(nopython=True)
def KL_Divergence_by_unigram(vocab, j, query_model, word_doc_prob, background_prob, gamma):
    prob = 0
    for i in range(vocab):
        ''' -Σ(wi->V) Psmm(w|q)* logP(w|dj) ''' 
        # prob += query_model[i] * np.log(word_doc_prob[i, j])
        prob += query_model[i] * np.log(gamma * word_doc_prob[i, j] + (1-gamma)*background_prob[i])                                                               
    return -prob

def rank(query_doc_prob, map_at, rev=False):
    return [ (key, value) for key, value in sorted(query_doc_prob.items(),
              key = lambda item:item[1], reverse=rev)[:map_at] ]    

# Loading Data

In [7]:
# main function
if __name__ == '__main__':
    
    start_time = now_time()    
    # Load Data
    print('load data ...')    
    doc_list = get_file_list('ntust-ir-2020_hw5_new/doc_list.txt')
    query_list = get_file_list('ntust-ir-2020_hw5_new/query_list.txt')
    documents = load_data(doc_list, 'ntust-ir-2020_hw5_new/docs/')
    queries = load_data(query_list, 'ntust-ir-2020_hw5_new/queries/')  
    print('load data finish！\n')
    cost_time(start_time, now_time())

load data ...
load data finish！

Cost time: 0:00:19.160419



# Data Preprocessing

In [8]:
    start_time = now_time()    
    # tokenization
    print('data preprocessing ...') 
    documents = [ cut(doc) for doc in documents ]
    queries = [ cut(query) for query in queries ]
    vocab_list = list(pd.read_csv('plsa_vocab', dtype=str).vocab)
    # get index term and collection
    corpus = Corpus()
    vocab_index, docs_index =  corpus.indexing(vocab_list, doc_list)
    print('data preprocessing finish！\n')
    cost_time(start_time, now_time())

data preprocessing ...
data preprocessing finish！

Cost time: 0:00:01.430626



# Document Model

In [9]:
    start_time = now_time()   
    # Load PLSA Document Model
    print('loading PLSA document model ...')
    plsa_prob = np.load('plsa_prob.npy')
    # plsa_prob = np.load('plsa_document_model.npy')
    print('loading PLSA document model finish！\n')
    cost_time(start_time, now_time())

loading PLSA document model ...
loading PLSA document model finish！

Cost time: 0:05:32.590588



# Background Model

In [10]:
    # background modeling  
    print('background modeling ...')
    bg_prob = np.load('plsa_background_model.npy')    
    print('background modeling finish！\n')

background modeling ...
background modeling finish！



# Query Model

### Query Unigram Model

In [11]:
    query_unigram_model = dict()
    for i in range(len(query_list)):
        unigram_prob = UnigramLanguageModel(vocab_index, queries[i])
        unigram_prob.modeling()
        query_unigram_model[query_list[i]] = unigram_prob.get_unigram_prob()

### Select Pseudo Relevant Docs

In [29]:
    start_time = now_time()    
    # select pseudo relevant
    print('load pseudo relevant docs ...')    
    pseudo_list = load_pseudo_relevant_docs('PLSA_model.csv', top_ranked=4)
    print('load pseudo relevant docs finish！\n')
    print(pseudo_list)

load pseudo relevant docs ...
load pseudo relevant docs finish！

{301: ['FBIS4-41684', 'FBIS4-26397', 'FBIS4-45564', 'FBIS3-19199'], 302: ['LA043090-0036', 'FBIS4-67720', 'FBIS4-67701', 'FBIS3-41672'], 303: ['LA041990-0151', 'FT921-7107', 'FT941-15661', 'LA122990-0029'], 304: ['LA051890-0005', 'FR940713-1-00064', 'FR940616-0-00132', 'FR940816-1-00069'], 305: ['LA112489-0003', 'LA031689-0177', 'FT944-9883', 'FBIS3-59951'], 306: ['LA021790-0114', 'LA123190-0062', 'FBIS3-44115', 'FT942-9707'], 307: ['FBIS4-21356', 'FBIS3-6545', 'FBIS3-20665', 'FBIS4-33116'], 308: ['LA070489-0051', 'FR940214-1-00036', 'FR941027-2-00067', 'LA011590-0099'], 309: ['LA082490-0024', 'LA091089-0074', 'LA040189-0066', 'LA041990-0241'], 310: ['FT931-11958', 'LA100889-0041', 'FT934-14064', 'LA101089-0095'], 311: ['FT944-15440', 'FBIS3-58751', 'LA081789-0098', 'FT944-8837'], 312: ['LA051090-0097', 'LA040289-0050', 'LA052490-0029', 'FBIS4-67023'], 313: ['FBIS3-59655', 'FBIS4-66412', 'LA102690-0187', 'FT944-3015'], 31

In [30]:
    print('pseudo relevance docs preprocess ...')
    pseudo_query = dict()
    for query_id in pseudo_list:
        # for each query's pseudo relevant document's ID
        pseudo_doc_index_list = [ docs_index[doc_id] for doc_id in pseudo_list[query_id] ]  
        # ex: FBIS4-41684 FBIS3-23986 FBIS3-19646
        # term-pseudo_doc sparse matrix
        row = list()
        col = list()
        term_freq = list()
        
        for doc_index in pseudo_doc_index_list:
            tf = np.zeros(len(vocab_index))
            # calculate tf(i)
            for word in documents[doc_index]:
                tf[vocab_index[str(word)]] += 1
            # store sparse
            for word_index in tf.nonzero()[0]:
                row.append(word_index)
                col.append(doc_index)
                term_freq.append(tf[vocab_index[word]])
                
        sparse_term_pseudo_doc = sparse.coo_matrix((term_freq, (np.array(row), np.array(col))))
        # pseudo relevant document's info
        # put into for each query
        pseudo_query[query_id] = sparse_term_pseudo_doc
    print('pseudo relevance docs preprocess！\n')
    cost_time(start_time, now_time())

pseudo relevance docs preprocess ...
pseudo relevance docs preprocess！

Cost time: 0:00:09.594654



### Query Simple Mixture Model

In [32]:
    start_time = now_time()    

    print('SMM training ...')
    smm_model = dict()

    for query_id in pseudo_query:
        
        # dimensional length
        word_size = len(vocab_index)
        # random initial (for each query)
        word_Tsmm_prob = random_initial(word_size)
        # print('for each query Tsmm:', word_Tsmm_prob.sum(axis=0))

        # print(sparse_term_doc)        
        # Psmm(wi)
        smm_model[str(query_id)] = smm(word_Tsmm_prob, bg_prob, 
                                       pseudo_query[query_id].row, 
                                       pseudo_query[query_id].col, 
                                       pseudo_query[query_id].data, 
                                       alpha=0.1, iter_num=30)
        print()
    print('SMM training finish！\n')
    cost_time(start_time, now_time())

SMM training ...
Iteration Start:
Iteration # 1 = -11051.805216494147
Iteration # 2 = -9840.696318987935
Iteration # 3 = -9813.020376381437
Iteration # 4 = -9812.152154142412
Iteration # 5 = -9811.869598421597
Iteration # 6 = -9811.713157777955
Iteration # 7 = -9811.630718103133
Iteration # 8 = -9811.579188069825
Iteration # 9 = -9811.55207576207
Iteration # 10 = -9811.541524463883
Iteration # 11 = -9811.538220287226
Iteration # 12 = -9811.537264849674
Iteration # 13 = -9811.536974749926
Iteration # 14 = -9811.536874359112
Iteration # 15 = -9811.536834398783
Iteration # 16 = -9811.536816844533
Iteration # 17 = -9811.536808712628
Iteration # 18 = -9811.536804853553
Iteration # 19 = -9811.536803004386
Iteration # 20 = -9811.53680211537
Iteration # 21 = -9811.536801687618
Iteration # 22 = -9811.536801481827
Iteration # 23 = -9811.536801382867
Iteration # 24 = -9811.536801335284
Iteration # 25 = -9811.536801312433
Iteration # 26 = -9811.536801301447
Iteration # 27 = -9811.536801296166
Iter

### Query Modeling

In [40]:
    query_model = dict()
    for query_id in query_list:
        unigram_prob = query_unigram_model[query_id]
        smm_prob = smm_model[query_id]
        vocab = len(list(vocab_index.values()))
        # weight unigram, smm, background
        query_model[str(query_id)] = query_modling(vocab, unigram_prob, smm_prob, 0.5, 0.4, bg_prob)

In [34]:
    print(KL_Divergence(len(list(vocab_index.values())), docs_index['FBIS4-41684'], query_model['301'], plsa_prob, bg_prob))
    print(KL_Divergence(len(list(vocab_index.values())), docs_index['FBIS4-26397'], query_model['301'], plsa_prob, bg_prob))
    print(KL_Divergence(len(list(vocab_index.values())), docs_index['FBIS4-45564'], query_model['301'], plsa_prob, bg_prob))
    print(KL_Divergence(len(list(vocab_index.values())), docs_index['FBIS3-19199'], query_model['301'], plsa_prob, bg_prob))
    print(KL_Divergence(len(list(vocab_index.values())), docs_index['FBIS3-19646'], query_model['301'], plsa_prob, bg_prob))
    print(KL_Divergence(len(list(vocab_index.values())), docs_index['FBIS3-21961'], query_model['301'], plsa_prob, bg_prob))
    print(KL_Divergence(len(list(vocab_index.values())), docs_index['FBIS4-46734'], query_model['301'], plsa_prob, bg_prob))
    print(KL_Divergence(len(list(vocab_index.values())), docs_index['FBIS3-23986'], query_model['301'], plsa_prob, bg_prob))
    print(KL_Divergence(len(list(vocab_index.values())), docs_index['FBIS4-62079'], query_model['301'], plsa_prob, bg_prob))
    print(KL_Divergence(len(list(vocab_index.values())), docs_index['FBIS4-43965'], query_model['301'], plsa_prob, bg_prob))

6.533684558822817
5.61710878436304
5.617091183427182
6.598010382862521
6.684149190533936
6.684119688155167
6.895033047443415
6.820608615336265
6.933155121879054
6.720297673147918


In [42]:
    pseudo_doc_list = load_pseudo_relevant_docs('smm_model_v3.csv', top_ranked=5000)

# KL-Divergence P(q||dj)

In [43]:
    start_time = now_time()    
    
    print('KL-Divergence measure ...')
    rank_prob = dict()
    for i in range(len(query_list)):
        query_id = int(query_list[i])
        print('query:#', query_id)
        word_query_prob = query_model[query_list[i]]
        query_doc_prob = { doc_id:0 for doc_id in pseudo_doc_list[query_id] }
        for j in progressbar(range(len(pseudo_doc_list[query_id])), "KL-Divergence measure: ", 50):
            '''parameters:'''
            doc_id = pseudo_doc_list[query_id][j]
            prob = KL_Divergence(len(list(vocab_index.values())), docs_index[doc_id], word_query_prob, plsa_prob, bg_prob)
            query_doc_prob[doc_id] = prob
        rank_prob[query_list[i]] = rank(query_doc_prob, 5000, rev=False)
        print()
#         print(rank_prob[query_list[i]])
#         break
    print('KL-Divergence measure finish！\n')
    cost_time(start_time, now_time())

KL-Divergence measure ...
query:# 301
KL-Divergence measure: [##################################################] 5000/5000

query:# 302
KL-Divergence measure: [##################################################] 5000/5000

query:# 303
KL-Divergence measure: [##################################################] 5000/5000

query:# 304
KL-Divergence measure: [##################################################] 5000/5000

query:# 305
KL-Divergence measure: [##################################################] 5000/5000

query:# 306
KL-Divergence measure: [##################################################] 5000/5000

query:# 307
KL-Divergence measure: [##################################################] 5000/5000

query:# 308
KL-Divergence measure: [##################################################] 5000/5000

query:# 309
KL-Divergence measure: [##################################################] 5000/5000

query:# 310
KL-Divergence measure: [###############################################

KeyboardInterrupt: 

In [37]:
    print(pseudo_doc_list[301])

['FBIS4-41684', 'FBIS4-45564', 'FBIS4-26397', 'FBIS3-19199', 'FBIS3-19646', 'FBIS3-21961', 'FBIS4-46734', 'FBIS3-23986', 'FBIS3-41247', 'FBIS3-26415', 'FBIS4-43965', 'FBIS4-62079', 'FBIS4-7811', 'FBIS3-55219', 'FBIS4-61760', 'FT923-3929', 'FBIS3-60984', 'FBIS3-58028', 'FBIS3-58058', 'FBIS3-58055', 'FBIS4-54904', 'FBIS4-46846', 'FBIS4-67072', 'FBIS4-56982', 'FBIS4-33857', 'FBIS3-26597', 'FBIS3-41246', 'FBIS4-45333', 'FBIS3-55171', 'FBIS3-24145', 'FBIS4-64135', 'FBIS4-26902', 'FBIS3-46172', 'FBIS3-41103', 'FBIS3-41272', 'FBIS4-38364', 'FBIS4-45360', 'FBIS4-31295', 'FBIS3-21770', 'FBIS4-43533', 'FBIS3-46302', 'FBIS3-41285', 'FBIS3-42315', 'FBIS4-62084', 'FBIS3-21765', 'FBIS4-45477', 'FBIS4-2802', 'FBIS4-68801', 'FBIS4-43801', 'FBIS4-8957', 'FBIS4-21321', 'FBIS3-54773', 'FBIS4-47045', 'FBIS3-41387', 'FBIS3-31267', 'FBIS3-35561', 'FBIS3-24182', 'FBIS3-38070', 'FBIS3-60083', 'FBIS4-11948', 'FBIS4-26653', 'FBIS4-21166', 'FBIS4-12999', 'FBIS4-41687', 'FBIS3-60093', 'FBIS3-49567', 'FBIS3-24247'

In [38]:
    print(rank_prob['301'])

[('FBIS4-45564', 5.617091183427182), ('FBIS4-26397', 5.61710878436304), ('FBIS4-26653', 6.498337113407309), ('FBIS4-41684', 6.533684558822817), ('FBIS4-45563', 6.559966950461137), ('FBIS4-26902', 6.595361888972377), ('FBIS3-19199', 6.598010382862521), ('FBIS3-41247', 6.600930103503086), ('FBIS3-26415', 6.606502716574959), ('FBIS3-21961', 6.684119688155167), ('FBIS3-19646', 6.684149190533936), ('FBIS3-26597', 6.687827722971355), ('FBIS4-2802', 6.688392425361511), ('FBIS3-41246', 6.690684926240563), ('FBIS4-43965', 6.720297673147918), ('FBIS4-21166', 6.74302163300122), ('FBIS3-46172', 6.754885113679252), ('FBIS3-41103', 6.758808256443388), ('FBIS3-60984', 6.77141425852177), ('FBIS4-33857', 6.7729518942109355), ('FBIS3-46302', 6.784302220287814), ('FT923-3929', 6.787528614624192), ('FBIS3-58055', 6.7932384678132856), ('FBIS4-61760', 6.795709771962749), ('FBIS4-67209', 6.796892257754042), ('FBIS4-52946', 6.797389851627944), ('FBIS3-55219', 6.799880311615778), ('FBIS4-67072', 6.808342008723

In [39]:
    with open('smm_model.csv', 'w', encoding='UTF-8') as f:
        f.write("Query,RetrievedDocuments\n")
        for query_id in rank_prob:
            # output
            f.write("%s," % query_id)
            for rank_id, rank_score in rank_prob[query_id]:           
                # print(rank_id, rank_score)
                f.write("%s " % rank_id)
#             for rank_id in after_doc[int(query_id)]:
#                 f.write("%s " % rank_id)
            f.write('\n')
    f.close()