In [1]:
import numpy as np
from scipy import sparse

%run tokenization.ipynb

In [3]:
class TopicModel():
    """Query-likelihood ranking with a three-way mixture language model.

    Each query word is scored against each document as

        log(alpha * P(w|d) + beta * sum_k P(w|T_k) * P(T_k|d)
            + (1 - alpha - beta) * P(w|collection))

    where P(w|d) is the document unigram model, the middle term uses
    externally supplied topic matrices, and P(w|collection) is the
    background model. Typical use:

        data_preprocess(...) -> background_model() -> doc_model()
        -> query_likelihood(...) -> output(path)
    """

    def __init__(self):
        """Create an empty model; load a corpus with data_preprocess()."""
        # data
        self.doc_list = list()      # document ids, parallel to self.documents
        self.query_list = list()    # query ids, parallel to self.queries
        self.queries = list()       # tokenized queries
        self.documents = list()     # tokenized documents

        # mapping data: word -> row index in the term matrices
        self.vocab = dict()

        # document unigram language model (set by doc_model)
        self.doc_unigram_prob = None

        # background language model (set by background_model)
        self.collection = list()    # every token of every document
        self.background_prob = None

        # term-document frequency matrix (set by doc_model)
        self.term_doc = None

        # scoring parameters (set by query_likelihood)
        self.topic_num = None
        self.alpha = None
        self.beta = None
        self.word_topic_prob = None
        self.topic_doc_prob = None

        # score variables
        self.query_doc_prob = dict()
        self.rank = dict()

        # preprocess tool
        # NOTE(review): Tokenization is not defined in this file; presumably
        # it is provided by the `%run tokenization.ipynb` cell -- confirm.
        self.tokenize = Tokenization()

    def cut(self, text):
        """Split ``text`` on whitespace and return the list of tokens."""
        # Plain whitespace splitting is used; self.tokenize is not consulted.
        return text.split()

    def data_preprocess(self, query_list, doc_list, queries, documents):
        """Load and tokenize a corpus, replacing any previously loaded one.

        Args:
            query_list: query ids, one per entry of ``queries``.
            doc_list: document ids, one per entry of ``documents``.
            queries: raw query strings.
            documents: raw document strings.
        """
        self.query_list = query_list
        self.doc_list = doc_list
        self.query_doc_prob = {doc_id: 0 for doc_id in self.doc_list}

        # Start from fresh containers so a second call cannot leave the
        # token lists out of step with the id lists assigned above.
        self.queries = [self.cut(query) for query in queries]
        self.documents = [self.cut(doc) for doc in documents]
        self.collection = list()
        for tokens in self.documents:
            self.collection.extend(tokens)
        self.rank = dict()

        # index term: sorted so the word -> index mapping is reproducible
        # across runs (set iteration order is not).
        self.vocab = {
            word: index
            for index, word in enumerate(sorted(set(self.collection)))
        }

    # background language model function
    def background_model(self):
        """Compute P(w|collection) = cf(w) / |collection| for every word."""
        vocab_size = len(self.vocab)
        total = len(self.collection)
        if total == 0:
            # Nothing to count; avoid dividing by zero.
            self.background_prob = np.zeros(vocab_size)
            return

        term_ids = np.fromiter(
            (self.vocab[word] for word in self.collection),
            dtype=np.intp,
            count=total,
        )
        # cf(i) is the collection frequency of term i
        collection_freq = np.bincount(term_ids, minlength=vocab_size)
        self.background_prob = collection_freq / total

    def get_background_prob(self, word):
        """Return the background probability of ``word`` (must be in vocab)."""
        return self.background_prob[self.vocab[word]]

    def sparse_matrix(self, data, row, column, shape=None):
        """Build a COO sparse matrix from parallel value/coordinate lists.

        Args:
            data: non-zero values.
            row: row index of each value.
            column: column index of each value.
            shape: optional ``(n_rows, n_columns)``. When omitted the shape
                is inferred from the largest indices, which drops trailing
                all-zero rows/columns and fails on empty input.
        """
        coords = (
            np.asarray(row, dtype=np.intp),
            np.asarray(column, dtype=np.intp),
        )
        return sparse.coo_matrix((data, coords), shape=shape)

    # document unigram language model function
    def doc_model(self):
        """Build the term-document count matrix and the unigram model.

        Sets ``self.term_doc`` (COO, raw term frequencies) and
        ``self.doc_unigram_prob`` (CSR, tf / document length). Both are
        ``len(vocab) x len(documents)``.
        """
        # row/column coordinates and values of the non-zero entries
        row = list()
        column = list()
        term_freq = list()
        unigram_prob = list()

        for j, tokens in enumerate(self.documents):
            if not tokens:
                # An empty document contributes an all-zero column.
                continue
            # np.unique returns the term ids in ascending order with tf(i).
            term_ids, counts = np.unique(
                [self.vocab[word] for word in tokens], return_counts=True
            )
            row.extend(term_ids.tolist())
            column.extend([j] * len(term_ids))
            term_freq.extend(counts.astype(float).tolist())
            unigram_prob.extend((counts / len(tokens)).tolist())

        # Explicit shape keeps columns for trailing empty documents.
        shape = (len(self.vocab), len(self.documents))
        self.term_doc = self.sparse_matrix(term_freq, row, column, shape)
        self.doc_unigram_prob = self.sparse_matrix(
            unigram_prob, row, column, shape
        ).tocsr()

    def get_unigram_prob(self, word, doc):
        """Return P(word|doc) where ``doc`` is the document's position."""
        return self.doc_unigram_prob[self.vocab[word], doc]

    def sum_topic(self, word, doc):
        """Return sum over topics k of P(word|T_k) * P(T_k|doc)."""
        word_id = self.vocab[word]
        return sum(
            self.word_topic_prob[word_id, k] * self.topic_doc_prob[k, doc]
            for k in range(self.topic_num)
        )

    def prob(self, query, map_at):
        """Accumulate log-likelihoods of ``query`` and return the top docs.

        Scores are added into ``self.query_doc_prob`` (query_likelihood
        resets it before each query). Query words missing from the
        vocabulary are ignored.

        Args:
            query: list of query tokens.
            map_at: maximum number of documents to return.

        Returns:
            List of ``(doc_id, score)`` tuples, highest score first.
        """
        background_weight = 1 - self.alpha - self.beta
        for word in query:
            if word not in self.vocab:
                continue
            # The background term does not depend on the document.
            background_term = background_weight * self.get_background_prob(word)
            for j in range(len(self.documents)):
                mixture = (
                    self.alpha * self.get_unigram_prob(word, j)
                    + self.beta * self.sum_topic(word, j)
                    + background_term
                )
                self.query_doc_prob[self.doc_list[j]] += np.log(mixture)

        ranked = sorted(
            self.query_doc_prob.items(),
            key=lambda item: item[1],
            reverse=True,
        )
        return ranked[:map_at]

    def query_likelihood(self, word_topic_prob, topic_doc_prob, topic_num, map_at=1000, alpha=None, beta=None):
        """Rank the documents for every loaded query.

        Args:
            word_topic_prob: matrix indexed ``[word_index, topic]``.
            topic_doc_prob: matrix indexed ``[topic, doc_position]``.
            topic_num: number of topics to sum over.
            map_at: number of documents kept per query.
            alpha: weight of the document unigram model; must be a number
                (the ``None`` default cannot be used for scoring).
            beta: weight of the topic model; must be a number.

        The result is stored in ``self.rank`` keyed by query id.
        """
        # tuning parameters
        self.topic_num = topic_num
        self.alpha = alpha
        self.beta = beta
        # PLSA prob
        self.word_topic_prob = word_topic_prob
        self.topic_doc_prob = topic_doc_prob

        # query likelihood measure
        for i, query in enumerate(self.queries):
            # Each query starts from zeroed scores.
            self.query_doc_prob = {doc_id: 0 for doc_id in self.doc_list}
            self.rank[self.query_list[i]] = self.prob(query, map_at)

    def output(self, result_path):
        """Write the rankings to ``result_path`` as a two-column CSV.

        The header is ``Query,RetrievedDocuments``; each following line is
        the query id, a comma, then the ranked document ids each followed
        by a single space.
        """
        # The with-block closes the file; no explicit close is needed.
        with open(result_path, 'w', encoding='UTF-8') as f:
            f.write("Query,RetrievedDocuments\n")
            for query_id, ranking in self.rank.items():
                f.write("%s," % query_id)
                f.write("".join("%s " % doc_id for doc_id, _score in ranking))
                f.write('\n')
        