In [1]:
from collections import Counter
from math import log
from os.path import join

%run tokenization.ipynb

class VectorSpaceModel:
    """Vector space retrieval model with tf-idf weights and cosine ranking.

    Documents are indexed into ``self.vocab`` (an inverted index), each
    document vector is normalised to unit length, and queries are ranked by
    the dot product of the normalised query and document vectors, scaled by
    the fraction of distinct query terms a document matches.
    """

    def __init__(self):
        # number N of indexed documents and the ids of documents / queries
        self.doc_num = 0
        self.doc_list = list()
        self.query_list = list()
        # vocab[term] = {'posting_list': {doc_id: weight}, 'df': int, 'idf': float}
        self.vocab = dict()
        # queries[query_id] = {term: normalised query weight}
        self.queries = dict()
        # per-document accumulators used while ranking one query
        self.score = dict()
        self.match_num = dict()

        # preprocess tool; Tokenization is not defined in this file
        # (presumably supplied by tokenization.ipynb - confirm)
        self.tokenize = Tokenization()

    def get_file_list(self, file_list_path):
        """Return the ids listed one per line in *file_list_path*.

        Blank lines are skipped so they cannot turn into an empty id.
        """
        with open(file_list_path, 'r', encoding='UTF-8') as f:
            stripped = (line.strip('\n') for line in f)
            return [file_id for file_id in stripped if file_id.strip()]

    def read_tokenize_data(self, file_path):
        """Read *file_path*, run it through the tokenizer and return the terms.

        The tokenizer output is split on whitespace, so ``cut`` is assumed to
        return a whitespace-delimited string (defined outside this file).
        """
        with open(file_path, 'r', encoding='UTF-8') as f:
            text = self.tokenize.cut(f.read(), splitnum=False, stopword=False)

        return text.split()

    def calc_cosine_similarity(self, query_id, query):
        """Rank all documents against one query.

        Adds to ``self.score`` / ``self.match_num``; the caller is expected to
        reset both before each query. ``self.queries[query_id]`` must already
        hold the query weights.

        Args:
            query_id: key of the query in ``self.queries``.
            query: the query terms; repeated terms are considered once.

        Returns:
            ``(doc_id, score)`` tuples sorted by descending score.
        """
        # Every distinct term is one vector dimension. Term frequency is
        # already part of the query weight, so a repeated term must not be
        # added to the dot product more than once.
        unique_terms = list(dict.fromkeys(query))

        for term in unique_terms:
            if term in self.vocab:
                query_weight = self.queries[query_id][term]
                for doc_id, doc_weight in self.vocab[term]['posting_list'].items():
                    self.score[doc_id] += query_weight * doc_weight
                    self.match_num[doc_id] += 1
            else:
                print("%s's %s is not in docs" % (query_id, term))

        # scale by the share of distinct query terms found in the document;
        # an empty query has nothing to scale (and would divide by zero)
        if unique_terms:
            for doc_id in self.score:
                self.score[doc_id] *= self.match_num[doc_id] / len(unique_terms)

        return sorted(self.score.items(), key=lambda item: item[1], reverse=True)

    def view_df(self):
        """Print every term with its document frequency, highest first."""
        df_by_term = {term: info['df'] for term, info in self.vocab.items()}
        ordered = sorted(df_by_term.items(), key=lambda item: item[1], reverse=True)

        for term, doc_freq in ordered:
            print(term, doc_freq)

    def calc_unit_vector_length(self, doc_id):
        """Print and return the Euclidean length of *doc_id*'s weight vector.

        After normalisation this is expected to be close to 1 (or 0 for a
        document whose weights are all zero).
        """
        square_sum = 0
        for info in self.vocab.values():
            if doc_id in info['posting_list']:
                square_sum += info['posting_list'][doc_id] ** 2
        unit_vector_length = square_sum ** 0.5
        print(unit_vector_length)
        return unit_vector_length

    def calc_doc_tf(self, tf):
        """Return the log-scaled document term frequency ``1 + log2(tf)``."""
        return (1 + log(tf, 2))

    def calc_doc_idf(self, df):
        """Return the inverse document frequency ``log10(N / df)``."""
        return (log(self.doc_num / df, 10))

    def calc_query_tf(self, tf, tf_list):
        """Return the query term frequency ``0.4 + 0.6 * tf / max(tf_list)``."""
        return (0.4 + 0.6 * (tf / max(tf_list)))

    def calc_query_idf(self, df):
        """Return the inverse document frequency ``log10(N / df)``."""
        return log((self.doc_num / df), 10)

    def _add_document(self, doc_id, doc_terms):
        """Record raw term frequencies of one document and update df."""
        for term, tf in Counter(doc_terms).items():
            if term not in self.vocab:
                self.vocab[term] = {'posting_list': dict(), 'df': 0}
            self.vocab[term]['df'] += 1
            self.vocab[term]['posting_list'][doc_id] = tf

    def _weight_documents(self):
        """Turn raw tf into unit-length tf-idf weights for ``self.doc_list``."""
        # Walk the postings term by term instead of scanning the whole
        # vocabulary once per document; each document still accumulates its
        # squares in vocabulary order.
        square_sum = dict.fromkeys(self.doc_list, 0)
        for info in self.vocab.values():
            idf = self.calc_doc_idf(info['df'])
            info['idf'] = idf
            posting_list = info['posting_list']
            for doc_id, raw_tf in posting_list.items():
                if doc_id not in square_sum:
                    continue
                weight = self.calc_doc_tf(raw_tf) * idf
                posting_list[doc_id] = weight
                square_sum[doc_id] += weight ** 2

        vector_length = {doc_id: total ** 0.5 for doc_id, total in square_sum.items()}

        # normalize; a zero-length vector (all weights 0, e.g. every term has
        # idf 0) is left as is instead of dividing by zero
        for info in self.vocab.values():
            posting_list = info['posting_list']
            for doc_id in posting_list:
                length = vector_length.get(doc_id)
                if length:
                    posting_list[doc_id] /= length

    def _weight_query(self, query_terms):
        """Return ``{term: weight}`` for a query, normalised to unit length.

        Terms missing from ``self.vocab`` have no idf and get weight 0.
        """
        raw_tf = Counter(query_terms)
        if not raw_tf:
            return dict()

        # Take the maximum from the raw counts once, before any weight is
        # computed, so that weights can never leak into the tf formula.
        max_tf = max(raw_tf.values())

        weights = dict()
        square_sum = 0
        for term, tf in raw_tf.items():
            if term in self.vocab:
                weight = self.calc_query_tf(tf, (max_tf,)) * self.vocab[term]['idf']
            else:
                weight = 0.0
            weights[term] = weight
            square_sum += weight ** 2

        vector_length = square_sum ** 0.5
        # normalize; skipped when no term carries weight (avoids dividing by zero)
        if vector_length:
            for term in weights:
                weights[term] /= vector_length

        return weights

    def calc_doc_weight(self, doc_list_path, doc_fpath):
        """Index the documents and compute their normalised tf-idf weights.

        Args:
            doc_list_path: file listing one document id per line.
            doc_fpath: directory containing ``<doc_id>.txt`` for every id.

        Note: ``self.vocab`` is not cleared first, so calling this again on
        the same instance adds to the existing document frequencies.
        """
        # tf(i): term(i) 's frequency
        # d(j): document j
        # df(i): number of documents vocabulary(i) appears in

        self.doc_list = self.get_file_list(doc_list_path)

        for doc_id in self.doc_list:
            self.score[doc_id] = 0
            self.match_num[doc_id] = 0
            doc_terms = self.read_tokenize_data(join(doc_fpath, doc_id + '.txt'))
            self._add_document(doc_id, doc_terms)

        # doc_num: the number N of document
        # idf(i): inverse document frequency
        # tf-idf(i): term(i) 's tf(i) * idf(i)

        self.doc_num = len(self.doc_list)
        self._weight_documents()

    def output(self, query_list_path, query_fpath, result_path):
        """Rank the documents for every query and write the result file.

        The file starts with the header ``Query,RetrievedDocuments``; every
        following line is ``<query_id>,`` followed by all document ids in
        descending score order, each id followed by a space.

        Args:
            query_list_path: file listing one query id per line.
            query_fpath: directory containing ``<query_id>.txt`` for every id.
            result_path: path of the result file to (over)write.
        """
        self.query_list = self.get_file_list(query_list_path)

        with open(result_path, 'w', encoding='UTF-8') as f:
            f.write("Query,RetrievedDocuments\n")
            for query_id in self.query_list:
                query_terms = self.read_tokenize_data(join(query_fpath, query_id + '.txt'))
                self.queries[query_id] = self._weight_query(query_terms)

                # start every query from clean accumulators
                self.score = {doc_id: 0 for doc_id in self.score}
                self.match_num = {doc_id: 0 for doc_id in self.match_num}
                rank = self.calc_cosine_similarity(query_id, query_terms)

                # output
                f.write("%s," % query_id)
                for rank_id, rank_score in rank:
                    f.write("%s " % rank_id)
                f.write('\n')

    def model(self, doc_list_path, doc_fpath):
        """Build the model: index the documents and compute their weights."""
        print('model constucting ...')
        self.calc_doc_weight(doc_list_path, doc_fpath)
        print('model constructed!')
            