In [8]:
import collections
import operator
import os
import pickle

import numpy as np
import pandas as pd
from tqdm import tqdm

%run preprocessing_ti.ipynb
%run basicVSM_TIversion.ipynb
%run utils.ipynb

In [9]:
class TierIndexBuilder:
    """Build a tf-weighted inverted index and split its posting lists into tiers.

    The inverted index maps ``term -> {doc_id: tf}``. A "global tier" structure
    maps ``tier_id -> {term: [(doc_id, tf), ...]}``, where every posting list is
    sorted by ``doc_id``.
    """

    # Legacy class-level placeholders. No method of this class reads or writes
    # them; they are kept only so existing attribute lookups keep working.
    inverted_index = ''
    number_tier = ''
    global_tier={}

    def __init__(self, path_to_docs, path_to_index, path_to_tw_matrix, stop_words_file_path, rare_words_file_path):
        """Load the document table from ``path_to_docs`` (a CSV file).

        The remaining parameters are accepted for interface compatibility but
        are not used by this class.
        """
        self.df_docs = pd.read_csv(path_to_docs)

    def raw_frequency(self, term, doc):
        """Return how many whitespace-separated tokens of ``doc`` equal ``term``.

        Returns 0 when ``doc`` is not a string.
        """
        if not isinstance(doc, str):
            return 0
        return doc.split().count(term)

    def get_most_freq_term(self, doc):
        """Return the occurrence count of the most frequent token in ``doc``.

        Returns 0 when ``doc`` is not a string or contains no tokens.
        """
        if not isinstance(doc, str):
            return 0
        counts = collections.Counter(doc.split())
        if not counts:
            return 0
        return counts.most_common(1)[0][1]

    def compute_tf(self, term, doc):
        """Return the log-scaled term frequency of ``term`` in ``doc``.

        The value is ``(1 + log10(f)) / (1 + log10(f_max))`` where ``f`` is the
        raw frequency of ``term`` and ``f_max`` is the raw frequency of the most
        frequent token in ``doc``. Returns 0 when the term does not occur.
        """
        frequency = self.raw_frequency(term, doc)
        if frequency <= 0:
            return 0
        return (1 + np.log10(frequency)) / (1 + np.log10(self.get_most_freq_term(doc)))

    def build_inverted_index(self, docs_df:pd.DataFrame):
        """Build ``term -> {doc_id: tf}`` from the ``id`` and ``text`` columns.

        Token counts are computed once per document, so the cost is linear in
        the document length. Rows are read positionally, so a frame with a
        non-default index works as well. Rows whose text is missing (NaN/None)
        are skipped instead of being indexed under the literal token "nan".

        Returns the index with every posting dict sorted by descending tf (see
        ``sort_inverted_index``).
        """
        print('building inverted_index...')
        inverted_index = collections.defaultdict(dict)
        rows = zip(docs_df['id'], docs_df['text'])
        for doc_id, doc_text in tqdm(rows, total=len(docs_df)):
            if pd.isna(doc_text):
                continue
            term_counts = collections.Counter(str(doc_text).split())
            if not term_counts:
                continue
            # Same formula as compute_tf, with the denominator hoisted out of the term loop.
            denominator = 1 + np.log10(max(term_counts.values()))
            for term, frequency in term_counts.items():
                inverted_index[term][doc_id] = (1 + np.log10(frequency)) / denominator
        return self.sort_inverted_index(inverted_index)

    def sort_inverted_index(self, inverted_index):
        """Return a copy of the index whose posting dicts are ordered by descending weight."""
        sorted_inverted_index = collections.defaultdict(dict)
        for term, postings in inverted_index.items():
            sorted_inverted_index[term] = dict(
                sorted(postings.items(), key=operator.itemgetter(1), reverse=True)
            )
        return sorted_inverted_index

    def store_sort_inverted_index(self, inverted_index, inverted_index_path):
        """Pickle ``inverted_index`` to ``inverted_index_path``."""
        with open(inverted_index_path, 'wb') as handle:
            pickle.dump(inverted_index, handle)

    def build_tiers(self, inverted_index,tier_type, fix_length, threshold):
        """Split every posting list into tiers and group the tiers across terms.

        ``tier_type == "fixedLength"`` cuts each posting list into chunks of
        ``fix_length`` entries; any other value partitions it by the weight
        bounds in ``threshold``.

        Returns ``tier_id -> {term: [(doc_id, weight), ...]}`` with every
        posting list sorted by ``doc_id``.
        """
        print('building tiers..')
        global_tier = collections.defaultdict(dict)
        for term in tqdm(inverted_index):
            postings = list(inverted_index[term].items())
            if tier_type == "fixedLength":
                local_tier = self.build_fixed_local_tier(postings, fix_length)
            else:
                local_tier = self.build_threshold_local_tier(postings, threshold)
            for tier, tier_postings in local_tier.items():
                global_tier[tier][term] = sorted(tier_postings, key=operator.itemgetter(0))

        return global_tier

    def build_fixed_local_tier(self,inverted_index_list:list, fix_length):
        """Cut a posting list into consecutive tiers of at most ``fix_length`` entries.

        Returns ``{tier_id: [posting, ...]}``; the order of the input list is
        preserved and no empty tier is created.

        Raises:
            ValueError: if ``fix_length`` is smaller than 1.
        """
        if fix_length < 1:
            raise ValueError('fix_length must be a positive integer')
        local_tier = {}
        for tier_id, start in enumerate(range(0, len(inverted_index_list), fix_length)):
            local_tier[tier_id] = list(inverted_index_list[start:start + fix_length])
        return local_tier

    def build_threshold_local_tier(self, inverted_index_list:list, threshold):
        """Partition a posting list into tiers delimited by the weight bounds in ``threshold``.

        With bounds ``t0, t1, ..., t(k-1)`` the tiers are:
          * tier 0:     weight > t0
          * tier i:     t(i) < weight <= t(i-1)   for 0 < i < k
          * tier k:     weight <= t(k-1)

        Tiers that receive no posting are omitted from the result. With an
        empty ``threshold`` every posting is placed in tier 0.

        Returns ``{tier_id: [(doc_id, weight), ...]}`` in input order.
        """
        postings = [tuple(entry) for entry in inverted_index_list]
        bounds = list(threshold)
        if not bounds:
            return {0: postings} if postings else {}

        last_tier = len(bounds)
        local_tier = {}
        for tier in range(last_tier + 1):
            if tier == 0:
                members = [p for p in postings if p[1] > bounds[0]]
            elif tier == last_tier:
                members = [p for p in postings if p[1] <= bounds[tier - 1]]
            else:
                members = [p for p in postings if bounds[tier] < p[1] <= bounds[tier - 1]]
            if members:
                local_tier[tier] = members
        return local_tier

    def storeGlobalTier(self,global_tier,global_tier_path):
        """Pickle ``global_tier`` to ``global_tier_path``."""
        with open(global_tier_path, 'wb') as handle:
            pickle.dump(global_tier, handle)

    def countNumTierTerm(self,term, global_tier):
        """Return the number of tiers in ``global_tier`` that contain ``term``."""
        return sum(1 for tier in global_tier if term in global_tier[tier])

    def createterm_numberoftier(self,inverted_index,global_tier):
        """Return ``term -> number of tiers containing it`` for every indexed term.

        The tiers are scanned once instead of once per term. The result is a
        ``defaultdict(int)``, so unknown terms read as 0.
        """
        tiers_per_term = collections.Counter()
        for tier in global_tier:
            tiers_per_term.update(global_tier[tier].keys())
        number_tier = collections.defaultdict(int)
        for term in inverted_index.keys():
            number_tier[term] = tiers_per_term[term]
        return number_tier

    def storeNumbertier(self,number_tier,number_tier_path):
        """Pickle ``number_tier`` to ``number_tier_path``."""
        with open(number_tier_path, 'wb') as handle:
            pickle.dump(number_tier, handle)
        

In [10]:
# if __name__=='__main__':
#     # When this notebook is run directly, __name__ == '__main__'.
#     # When it is imported from another file, __name__ is the module name (e.g. 'TieredIndex').
#     path_to_inverted_index=r'../data/inverted_index.p'
#     path_to_index=r'../data/index_all_docs.p'
#     path_to_tw_matrix=r'../data/matrix.p'
#     stop_words_file_path=r'../data/nfcorpus\raw\stopwords.large' 
#     rare_words_file_path=r'../data/rare_tokens.txt'
#     path_to_docs=r'../data/dev.df'
#     builder=TierIndexBuilder(path_to_docs, path_to_index, path_to_tw_matrix, stop_words_file_path, rare_words_file_path)
#     inverted_index = builder.build_inverted_index(pd.read_csv(path_to_docs))
#     builder.store_sort_inverted_index(inverted_index, path_to_inverted_index)  # persist the sorted inverted index
#     global_tier = builder.build_tiers(inverted_index,'Threshold',40,[0.7,0.6,0.2])
#     global_tier_path=r'../data/global_tier.p'
#     builder.storeGlobalTier(global_tier,global_tier_path)
#     number_tier_path =r'../data/number_tier.p'
#     number_tier=builder.createterm_numberoftier(inverted_index,global_tier)
#     builder.storeNumbertier(number_tier,number_tier_path)

In [11]:
class tierIndexSearch:
    """Answer queries against a tiered inverted index loaded from pickle files.

    ``global_tier`` maps ``tier_id -> {term: [(doc_id, weight), ...]}``.
    A search starts with tier 0 and pulls in lower tiers until enough candidate
    documents are found; candidates are then ranked by cosine similarity
    between the query vector and cached document vectors.
    """

    # Default location of the cached document vectors. Can be overridden per
    # instance through the ``doc_vectors_path`` constructor argument.
    doc_vectors_path = '/data/doc_vectors.psave'

    def __init__(self,stop_words_path, rare_words_path,inverted_index_path,number_tier_path
               ,global_tier_path, doc_vectors_path=None):
        """Load the index structures from disk.

        Args:
            stop_words_path, rare_words_path: forwarded to ``Preprocessor``.
            inverted_index_path: pickle holding the inverted index.
            number_tier_path: pickle holding ``term -> number of tiers``.
            global_tier_path: pickle holding the tier structure.
            doc_vectors_path: optional location of the document-vector cache;
                defaults to the class-level ``doc_vectors_path``.
        """
        self.preprocess=Preprocessor(stop_words_path, rare_words_path)
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # index files that were produced by a trusted source.
        self.inverted_index = self._load_pickle(inverted_index_path)
        self.global_tier = self._load_pickle(global_tier_path)
        self.number_tiers = self._load_pickle(number_tier_path)
        self.maxnumberofTier=len(self.global_tier)
        if doc_vectors_path is not None:
            self.doc_vectors_path = doc_vectors_path

        # Filled lazily by getSortedRevelanceDoc.
        self.doc_vector = None

    @staticmethod
    def _load_pickle(path):
        """Unpickle ``path`` and close the file handle afterwards."""
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    def _tier(self, tier):
        """Return the ``term -> postings`` mapping of ``tier`` ({} when absent).

        Uses ``get`` so that probing a missing tier neither raises on a plain
        dict nor inserts an empty tier into a defaultdict.
        """
        return self.global_tier.get(tier, {})

    def search(self, query, number_documents, inverted_index, doc_processed, idf):
        """Return document ids relevant to ``query``, best match first.

        The query is preprocessed and reduced to the terms present in the
        loaded inverted index. An index with more than 50 tiers is searched
        with ``searchByFixlength``, otherwise with ``searchByThreshold``.
        ``inverted_index``, ``doc_processed`` and ``idf`` are forwarded to the
        ranking step.
        """
        query=self.preprocess.preprocess_line(query)
        query_list = [term for term in query.split() if term in self.inverted_index]

        revelance_doc=list()
        if len(query_list)!=0:
            if self.maxnumberofTier>50:
                revelance_doc=self.searchByFixlength(query_list,number_documents)
            else:
                revelance_doc=self.searchByThreshold(query_list,number_documents)
        sortRevelanceDocs=self.getSortedRevelanceDoc(query, revelance_doc, inverted_index, doc_processed, idf)
        return sortRevelanceDocs


    def searchByFixlength(self,query, number_documents):
        """Collect ``(doc_id, weight)`` candidates from a fixed-length tiered index.

        Starts from tier 0 and merges the next tier of every term that still
        has tiers left until at least ``number_documents`` candidates exist or
        no term has further tiers.
        """
        local = self.searchFirstTier(query)
        relevant_docs = self.mergeTierQUery(local)
        tier = 0
        while len(relevant_docs) < number_documents:
            remaining_terms = self.pruneexistTiers(query, tier)
            if not remaining_terms:
                break
            # The candidate list is rebuilt from `local`; it must not be
            # cleared in place because it may alias a posting list.
            local = self.mergeTierTerm(remaining_terms, local, tier)
            relevant_docs = self.mergeTierQUery(local)
            tier += 1
        return relevant_docs

    def searchByThreshold(self,query, number_documents):
        """Collect ``(doc_id, weight)`` candidates from a threshold tiered index.

        Returns an empty list when no query term occurs in tier 0. Otherwise
        tiers are visited in increasing order and every query term present in
        the next tier is merged in, until at least ``number_documents``
        candidates exist or the last tier has been consumed. A term that skips
        a tier (e.g. present in tiers 0 and 2 only) is therefore still
        expanded.
        """
        if not any(self.checkTermInTier(term, 0) for term in query):
            return list()

        local = self.searchFirstTier(query)
        relevant_docs = self.mergeTierQUery(local)
        # Tier ids may have gaps, so use the largest id rather than the count.
        last_tier = max(self.global_tier, default=0)
        tier = 0
        while len(relevant_docs) < number_documents and tier < last_tier:
            local = self.mergeTierTerm(query, local, tier)
            relevant_docs = self.mergeTierQUery(local)
            tier += 1
        return relevant_docs

    def save_vector_of_doc(self, inverted_index, doc_processed, idf):
        """Compute a vector for every document and pickle them to ``doc_vectors_path``.

        The cache maps ``doc[0]`` of each entry in ``doc_processed`` to the
        result of ``createVectorTI`` for that entry's position.
        """
        vec = {}
        for i,doc in enumerate(tqdm(doc_processed)):
            vec[doc[0]] = createVectorTI(i, inverted_index, doc_processed, idf)
        with open(self.doc_vectors_path, 'wb') as handle:
            pickle.dump(vec, handle)


    def getSortedRevelanceDoc(self,query,revelance_doc, inverted_index,doc_processed, idf):
        """Rank candidate documents by cosine similarity to the query.

        Args:
            query: preprocessed query string.
            revelance_doc: list of ``(doc_id, weight)`` candidates.
            inverted_index, doc_processed, idf: used to build the vector cache
                when it does not exist yet, and the query vector.

        Returns:
            Candidate document ids ordered by descending similarity. An empty
            candidate list yields an empty result without touching the cache.
        """
        if not revelance_doc:
            return []

        if self.doc_vector is None:
            if not os.path.exists(self.doc_vectors_path):
                self.save_vector_of_doc(inverted_index, doc_processed, idf)
            with open(self.doc_vectors_path, 'rb') as handle:
                self.doc_vector = pickle.load(handle)

        revelance_doc_id = [posting[0] for posting in revelance_doc]
        docVecs = [self.doc_vector[doc_id] for doc_id in revelance_doc_id]

        queryVec = [createQueryVectorTI(query.split(),inverted_index)]
        # Score every candidate against the query vector.
        results = {}
        for doc_id, doc_vec in zip(revelance_doc_id, docVecs):
            cos = cosine_similarity(queryVec, [doc_vec])
            results[doc_id] = cos[0][0]
        # Highest similarity first.
        ranked = sorted(results.items(), key=lambda kv: kv[1], reverse=True)
        return [doc_id for doc_id, _ in ranked]


    def searchFirstTier(self,query):
        """Return ``term -> postings`` for the query terms present in tier 0.

        Each posting list is copied so later merging can never modify the
        loaded index.
        """
        local=collections.defaultdict(list)
        first_tier = self._tier(0)
        for term in query:
            if term in first_tier:
                local[term] = list(first_tier[term])
        return local


    def mergeUnion(self,a,b):
        """Union two posting lists, summing the weights of shared documents.

        The result keeps the order of ``a`` followed by the entries of ``b``
        whose document is not in ``a``.
        """
        weight_in_b = {entry[0]: entry[1] for entry in b}
        docs_in_a = {entry[0] for entry in a}
        merge_list = list()
        for entry in a:
            doc_id, weight = entry[0], entry[1]
            if doc_id in weight_in_b:
                weight = weight + weight_in_b[doc_id]
            merge_list.append((doc_id, weight))
        for entry in b:
            if entry[0] not in docs_in_a:
                merge_list.append((entry[0], entry[1]))
        return merge_list


    def checkTermInTier(self,term,tier):
        """Return True when ``term`` has postings in ``tier``."""
        return term in self._tier(tier)

    def pruneQuerySubsequentiers(self, query, tier):
        """Return the query terms present in both ``tier`` and ``tier + 1``."""
        return [
            term for term in query
            if self.checkTermInTier(term, tier) and self.checkTermInTier(term, tier + 1)
        ]

    def pruneexistTiers(self,query,tier):
        """Return the query terms that occur in more than ``tier + 1`` tiers.

        Terms missing from ``number_tiers`` count as occurring in no tier.
        """
        return [term for term in query if self.number_tiers.get(term, 0) > (tier + 1)]


    def mergeTierTerm(self, query,local, tier):
        """Merge the postings of tier ``tier + 1`` into ``local`` for every term in ``query``.

        When ``local`` is empty it is seeded with the union of tiers ``tier``
        and ``tier + 1``; otherwise only terms present in tier ``tier + 1`` are
        updated. ``local`` is modified and returned.
        """
        next_tier = self._tier(tier + 1)
        if not local:
            current_tier = self._tier(tier)
            for term in query:
                local[term] = self.mergeUnion(current_tier.get(term, []), next_tier.get(term, []))
        else:
            for term in query:
                if term in next_tier:
                    local[term] = self.mergeUnion(local.get(term, []), next_tier[term])
        return local


    def mergeTierQUery(self, local):
        """Union the posting lists of all terms in ``local``, shortest list first.

        Returns an empty list when ``local`` holds no term.
        """
        if not local:
            return []
        ordered = sorted(local.values(), key=len)
        document_merged = ordered[0]
        for postings in ordered[1:]:
            document_merged = self.mergeUnion(document_merged, postings)
        return document_merged