In [1]:
from os.path import join
from math import log

%run tokenization.ipynb

class BestMatchModel:
    """Best-match (BM25-style) ranking of text documents against queries.

    Usage: call ``set_docs_path`` and ``set_queries_path``, then ``model`` to
    build the inverted index and rank every query, then ``output`` to write
    the rankings to a file.

    State built by ``model``:
        vocab: term -> {'posting_list': {doc_id: weight}, 'df': int,
            'idf': float}.
        docs_length: doc_id -> document length used for normalisation.
        queries: query_id -> {term: query-side weight}.
        rank: query_id -> list of (doc_id, score), best score first.
    """

    def __init__(self):
        self._reset_state()

        # preprocess tool (provided by the tokenization notebook)
        self.tokenize = Tokenization()

    def _reset_state(self):
        """Clear everything ``model`` computes so it can be re-run safely."""
        self.total_doc_length = 0
        self.avg_doclen = 0
        self.docs_length = dict()
        self.doc_list = dict()
        self.query_list = dict()
        self.vocab = dict()
        self.queries = dict()

        # score variables
        self.score_list = dict()
        self.rank = dict()

    def set_docs_path(self, doc_list_path, doc_fpath):
        """Store the document id-list file and the documents directory."""
        self.doc_list_path = doc_list_path
        self.doc_fpath = doc_fpath

    def set_queries_path(self, query_list_path, query_fpath):
        """Store the query id-list file and the queries directory."""
        self.query_list_path = query_list_path
        self.query_fpath = query_fpath

    def get_file_list(self, file_list_path):
        """Return the ids listed one per line in ``file_list_path``.

        Blank lines are skipped so they do not turn into an empty id (which
        would later be resolved to a bogus ``.txt`` path).
        """
        with open(file_list_path, 'r', encoding='UTF-8') as f:
            stripped = (line.strip('\n') for line in f)
            return [file_id for file_id in stripped if file_id]

    def read_tokenize_data(self, file_path):
        """Read ``file_path``, run it through the tokenizer, return tokens."""
        with open(file_path, 'r', encoding='UTF-8') as f:
            text = self.tokenize.cut(f.read(), stopword=False, splitnum=False)
        return text.split()

    def view_df(self):
        """Print every vocabulary term with its document frequency.

        Terms are printed in descending order of document frequency.
        """
        by_df = sorted(
            ((term, info['df']) for term, info in self.vocab.items()),
            key=lambda item: item[1],
            reverse=True,
        )
        for term, doc_freq in by_df:
            print(term, doc_freq)

    # ---- BM model factors ----

    def calc_doc_tf(self, tf, doc_len):
        """Return the length-normalised document term-frequency factor.

        Uses ``self.k1``, ``self.b``, ``self.delta`` and ``self.avg_doclen``.
        """
        length_norm = (1 - self.b) + self.b * doc_len / self.avg_doclen
        return ((self.k1 + 1) * (tf + self.delta)
                / (self.k1 * length_norm + tf + self.delta))

    def calc_query_tf(self, tf):
        """Return the query term-frequency factor controlled by ``self.k3``."""
        return ((self.k3 + 1) * tf) / (self.k3 + tf)

    def calc_idf(self, df):
        """Return the base-10 inverse document frequency for ``df``.

        Terms found in more than half of the documents get 0, which keeps
        the factor from going negative.
        """
        doc_count = len(self.docs_length)
        if df > doc_count / 2:
            return 0
        return log((doc_count - df + 0.5) / (df + 0.5), 10)

    # ---- scoring ----

    def calc_BM_score(self, query_id, query):
        """Add the query's contribution to ``self.score_list`` and rank it.

        Each distinct term is scored once: repetition inside the query is
        already expressed by its weight in ``self.queries[query_id]``, so
        walking the raw token list would count repeated terms twice.

        Returns:
            List of (doc_id, score) for every entry of ``self.score_list``,
            sorted by score in descending order.
        """
        query_weights = self.queries[query_id]
        for term in dict.fromkeys(query):  # de-duplicate, keep order
            entry = self.vocab.get(term)
            if entry is None:
                print("%s's %s is not in docs" % (query_id, term))
                continue
            for doc_id, doc_weight in entry['posting_list'].items():
                self.score_list[doc_id] += (
                    query_weights[term] * doc_weight * entry['idf']
                )

        return sorted(self.score_list.items(),
                      key=lambda item: item[1], reverse=True)

    def score(self, query_id, query_terms):
        """Zero all document scores, then score and rank one query."""
        self.score_list = {doc_id: 0 for doc_id in self.score_list}
        return self.calc_BM_score(query_id, query_terms)

    # ---- index construction ----

    def _index_documents(self, doc_ids, known_terms_only):
        """Read each document and record raw tf, df and document length.

        When ``known_terms_only`` is true, only terms already present in
        ``self.vocab`` are counted (both for postings and for the length).
        """
        for doc_id in doc_ids:
            self.score_list[doc_id] = 0
            terms = self.read_tokenize_data(join(self.doc_fpath, doc_id + '.txt'))
            self.doc_list[doc_id] = terms

            term_counts = dict()
            for term in terms:
                if known_terms_only and term not in self.vocab:
                    continue
                term_counts[term] = term_counts.get(term, 0) + 1
            self.docs_length[doc_id] = sum(term_counts.values())

            for term, tf in term_counts.items():
                entry = self.vocab.get(term)
                if entry is None:
                    entry = self.vocab[term] = {'posting_list': dict(), 'df': 0}
                postings = entry['posting_list']
                if doc_id not in postings:
                    entry['df'] += 1
                postings[doc_id] = tf

    def calc_doc_weight(self):
        """Build the inverted index and convert raw tf into BM weights.

        With ``corpus_type == 'document'`` the vocabulary is every term found
        in the documents. With ``corpus_type == 'query'`` the vocabulary is
        limited to the terms of the queries, and document lengths count only
        those terms.
        """
        doc_ids = self.get_file_list(self.doc_list_path)

        if self.corpus_type == 'document':
            self._index_documents(doc_ids, known_terms_only=False)
        elif self.corpus_type == 'query':
            # the vocabulary (index terms) comes from the queries
            for query_id in self.get_file_list(self.query_list_path):
                terms = self.read_tokenize_data(
                    join(self.query_fpath, query_id + '.txt'))
                self.query_list[query_id] = terms
                for term in terms:
                    if term not in self.vocab:
                        self.vocab[term] = {'posting_list': dict(), 'df': 0}
            self._index_documents(doc_ids, known_terms_only=True)

        doc_count = len(self.docs_length)
        self.total_doc_length = sum(self.docs_length.values())
        # an empty collection has no average length; avoid dividing by zero
        self.avg_doclen = self.total_doc_length / doc_count if doc_count else 0

        # Walk the posting lists directly instead of testing every
        # (document, term) pair.
        for entry in self.vocab.values():
            entry['idf'] = self.calc_idf(entry['df'])
            postings = entry['posting_list']
            for doc_id, tf in postings.items():
                postings[doc_id] = self.calc_doc_tf(tf, self.docs_length[doc_id])

    def _weight_and_rank_query(self, query_id, query_terms):
        """Compute the query-side weights of one query and store its ranking.

        Terms present in the vocabulary get ``calc_query_tf`` applied to
        their count exactly once; other terms keep their raw count.
        """
        term_counts = dict()
        for term in query_terms:
            term_counts[term] = term_counts.get(term, 0) + 1

        self.queries[query_id] = {
            term: self.calc_query_tf(tf) if term in self.vocab else tf
            for term, tf in term_counts.items()
        }
        self.rank[query_id] = self.score(query_id, query_terms)

    def calc_query_weight(self):
        """Weight every query and fill ``self.rank`` with its ranking.

        In 'document' mode the queries are read from disk here; in 'query'
        mode the tokens already stored in ``self.query_list`` are reused.
        """
        if self.corpus_type == 'document':
            for query_id in self.get_file_list(self.query_list_path):
                query_terms = self.read_tokenize_data(
                    join(self.query_fpath, query_id + '.txt'))
                self._weight_and_rank_query(query_id, query_terms)
        elif self.corpus_type == 'query':
            for query_id, query_terms in self.query_list.items():
                self._weight_and_rank_query(query_id, query_terms)

    def output(self, result_path):
        """Write the rankings to ``result_path``.

        The file starts with a ``Query,RetrievedDocuments`` header, followed
        by one line per query: the query id, a comma, then the ranked
        document ids, each followed by a single space.
        """
        with open(result_path, 'w', encoding='UTF-8') as f:
            f.write("Query,RetrievedDocuments\n")
            for query_id, ranking in self.rank.items():
                f.write("%s," % query_id)
                f.write("".join("%s " % doc_id for doc_id, _ in ranking))
                f.write('\n')

    def model(self, b, k1, k3, delta=0, corpus_type='document'):
        """Build the index and rank all queries with the given parameters.

        Previously computed state is discarded first, so the method can be
        called repeatedly (e.g. while tuning parameters) on one instance.

        Args:
            b: document length normalisation parameter.
            k1: document term-frequency parameter.
            k3: query term-frequency parameter.
            delta: constant added to the document tf in ``calc_doc_tf``.
            corpus_type: 'document' or 'query'; selects where the vocabulary
                is taken from.

        Raises:
            ValueError: if ``corpus_type`` is neither 'document' nor 'query'.
        """
        if corpus_type not in ('document', 'query'):
            raise ValueError(
                "corpus_type must be 'document' or 'query', got %r" % (corpus_type,))

        self._reset_state()

        # tunable parameters
        self.b = b
        self.k1 = k1
        self.k3 = k3
        self.delta = delta
        self.corpus_type = corpus_type

        # calculate doc and query weightings
        print('Building Model ...')
        self.calc_doc_weight()
        self.calc_query_weight()
        print('Model construction completed !')
        