In [1]:
from os import listdir, mkdir
from os.path import join, isfile, basename
from math import log

%run tokenization.ipynb

class VectorSpaceModel:
    def __init__(self):
        """Create an empty model and set up the tokenizer.

        All corpus state starts empty and is filled in later by the other
        methods of this class. The tokenizer is an instance of
        ``Tokenization`` (presumably provided by the ``%run
        tokenization.ipynb`` cell above -- confirm) with the stop-word /
        user dictionary loaded from ``stopwords.txt``.
        """
        # corpus state
        self.doc_num = 0        # number of indexed documents
        self.doc_list = {}      # replaced by a list of paths in calc_doc_weight
        self.query_list = {}
        self.vocab = {}         # term -> {'posting_list': {...}, 'df': int, ...}
        self.queries = {}
        self.score = {}         # doc_id -> score

        # preprocessing tool
        self.tokenize = Tokenization()
        self.tokenize.load_stopword_userdict('stopwords.txt')
        
    def get_file_list(self, file_list_path):
        """Return the paths of the regular files directly inside a directory.

        Each returned path is ``file_list_path`` joined with the entry name.
        Sub-directories are skipped. The order is whatever ``listdir``
        yields.
        """
        paths = []
        for entry in listdir(file_list_path):
            candidate = join(file_list_path, entry)
            if isfile(candidate):
                paths.append(candidate)
        return paths
    
    def read_tokenize_data(self, file_path):
        
        with open(file_path, 'r', encoding='UTF-8') as f:
            text = self.tokenize.cut(f.read().strip('\n'))
        f.close()
            
        return text.split()
    
#     def binary_search(self, arr, left, right, x):
        
#         if right >= left:   
#             mid = left + (right - left) // 2
#             if arr[mid] == x:
#                 return mid            
#             elif arr[mid] > x: 
#                 return binary_search(arr, left, mid-1, x) 
#             else: 
#                 return binary_search(arr, mid + 1, right, x)
    
    def view_df(self):
        df = dict()
        for term in self.vocab:
            df[term] = self.vocab[term]['df']
            
        df_list = [ (key, value) for key, value in sorted(df.items(),
                      key = lambda item:item[1], reverse=True) ]
        
        for term, df in df_list:
            print(term, df)
    
    def calc_unit_vector_length(self, doc_id):
        
        square_sum = 0
        for term in self.vocab:
            if doc_id in self.vocab[term]['posting_list']:
                square_sum += (self.vocab[term]['posting_list'][doc_id])**2
        unit_vector_length = (square_sum)**(0.5)
        print(unit_vector_length)
    
    def calc_doc_term_num(self, doc_id):
        num = 0
        for term in self.vocab:
            if doc_id in self.vocab[term]['posting_list']:
                num += 1
        return num
    
    def calc_doc_tf(self, tf):
        """Return the term-frequency weight for a raw count.

        The raw count is used unchanged; this method is the single place to
        swap in a different tf scheme.
        """
        tf_weight = tf
        return tf_weight
    
    def calc_doc_idf(self, df):
        return (log(self.doc_num / df, 10))
    
    def calc_doc_weight(self, doc_fpath): 
        
        # tf(i): term(i) 's frequency
        # d(j): document j
        # df(i): vocabulary(i) appear in all document's frequency        
        
        self.doc_list = self.get_file_list(doc_fpath)        
        
        for doc_path in self.doc_list:
            doc_id = basename(doc_path).split('.')[0]            
            self.score[doc_id] = 0
            doc_words = set()
            doc_terms = self.read_tokenize_data(doc_path)
            for term in doc_terms:
                # add vocabulary and calculate df(i)
                if term not in self.vocab:
                    inverted_index_info = dict()
                    inverted_index_info['posting_list'] = dict()
                    inverted_index_info['df'] = 1              
                    self.vocab[term] = inverted_index_info
                elif term not in doc_words:
                    self.vocab[term]['df'] += 1
                # calculate tf(i) in d(j) 
                if term not in doc_words:
                    self.vocab[term]['posting_list'][doc_id] = 1
                else:
                    self.vocab[term]['posting_list'][doc_id] += 1
                doc_words.add(term)
        
        
        # doc_num: the number N of document
        # idf(i): inverse document frequency
        # tf-idf(i): term(i) 's tf(i) * idf(i)

        self.doc_num = len(self.doc_list)
        
        for doc_path in self.doc_list:
            doc_id = basename(doc_path).split('.')[0]      
            # calculate weighting(current is tfidf(i)) in d(j)  and d(j)'s vector length
            square_sum = 0 
            for index_term in self.vocab:
                if doc_id in self.vocab[index_term]['posting_list']:
                    tf = self.calc_doc_tf(self.vocab[index_term]['posting_list'][doc_id])
                    idf = self.calc_doc_idf(self.vocab[index_term]['df'])
                    self.vocab[index_term]['posting_list'][doc_id] = tf * idf
                    square_sum += (tf * idf)**2      
                    if 'idf' not in self.vocab[index_term]:
                        self.vocab[index_term]['idf'] = idf

            vector_length = square_sum**(0.5)

            # normalize
            for index_term in self.vocab:
                if doc_id in self.vocab[index_term]['posting_list']:
                    self.vocab[index_term]['posting_list'][doc_id] /= vector_length   
                    # print(self.vocab[index_term]['posting_list'][doc_id])
        
        # check unit vector length
#         for doc_id in self.doc_list:
#             doc_id = basename(doc_path).split('.')[0]      
#             self.calc_unit_vector_length(doc_id)
                
    def calc_cosine_similarity(self, doc_x_path, doc_y_path):
        
        score = 0
        tfidf_path = 'Docs_TF-IDF/'
        with open(tfidf_path+doc_x_path, 'r', encoding='UTF-8') as x:
            with open(tfidf_path+doc_y_path, 'r', encoding='UTF-8') as y:
                for x_line in x.readlines()[2:]:
                    for y_line in y.readlines()[2:]:
                        x_tfidf = x_line.split()
                        y_tfidf = y_line.split()
                        if x_tfidf[0] == y_tfidf[0]:
                            score += float(x_tfidf[1]) * float(y_tfidf[1])
            y.close()
        x.close()
        
        return score
    
    def output_dictionary(self):
        
        with open('dictionary.txt', 'w', encoding='UTF-8') as f:
            f.write("%-8s\t%-20s\t%-3s\n" % ("t_index", "term", "df"))
            t_index = 0
            self.vocab = dict(sorted(self.vocab.items()))
            for index_term in self.vocab:
                t_index += 1
                f.write("%-8d\t%-20s\t%-3d\n" % (t_index, index_term, self.vocab[index_term]['df']))
        f.close()
        
    def output_doc_weight(self, file_path):
        try:
            mkdir(file_path)
            for doc_path in self.doc_list:
                doc_id = basename(doc_path).split('.')[0]
                with open(file_path+doc_id+'.txt', 'w', encoding='UTF-8') as f:
                    f.write("%-8d\n" % self.calc_doc_term_num(doc_id))
                    f.write("%-8s\t\t%s\n" % ("t_index", "tf-idf"))
                    vocab = list(self.vocab.keys())
                    for i in range(1, len(vocab)):
                        term = vocab[i-1]
                        if doc_id in self.vocab[term]['posting_list']:
                            f.write("%-8d\t\t%f\n" % ((i-1), self.vocab[term]['posting_list'][doc_id]))
                f.close()
        except FileExistsError:
            print('directory %s already exists' % file_path)
        
    def model(self, doc_fpath):
        
        print('model constucting ...')
        self.calc_doc_weight(doc_fpath)
        # self.view_df()
        self.output_dictionary()
        self.output_doc_weight('Docs_TF-IDF/')
        print('model constructed!')
            