In [2]:
import os
import random
import math
import scipy.sparse as sps
from sklearn.preprocessing import normalize
import pandas as pd
%run utils.ipynb
%run basicVSM.ipynb


class SparsePreclustering:
    '''
    Retrieval with preclustering on sparse tf-idf document vectors.

    On construction a random sample of sqrt(N) documents is chosen as leaders,
    every document is assigned to the leader it is most similar to, and one
    row-normalised CSR matrix is built per resulting cluster. A query is then
    only compared against the documents of the cluster(s) whose leader is most
    similar to the query.

    NOTE(review): `np`, `createVector`, `createQueryVector` and
    `cosine_similarity` are not imported in this cell; presumably they are
    provided by the notebooks executed via %run - confirm.
    '''

    def __init__(self, collection, inverted_index, idf):
        '''
        Sets up everything needed to retrieve with preclustering.
        Attention, takes some time to execute.

        collection       object with a 'text' and an 'id' column
                         (please provide docs_preprocessed)
        inverted_index   mapping whose keys are the vocabulary terms; the key
                         order defines the column order of all matrices
        idf              lookup term -> idf weight
        '''
        self.collection = list(collection['text'])
        self.docIDs = list(collection['id'])
        self.idf = idf
        self.inverted_index = inverted_index
        print('get Leaders')
        self.leader_list, self.leader_matrix = self.getLeaders()
        print('compute similarities of documents to leaders')
        self.leader_similarity = self.getDocLeaderSimilarity()
        print('compute cluster matricies')
        self.clusters, self.clusterMatrices = self.createClusters()

    def createNormalisedCSRMatrix(self, collection, inverted_index):
        '''
        Takes a collection of docs (each a bag of words) and outputs the
        corresponding tf-idf CSR matrix, row normalised.

        Row i belongs to collection[i]; the column of a term is its position
        among the keys of inverted_index. Empty documents produce an all-zero
        row, and an empty collection produces a matrix with zero rows.

        Raises ValueError if a document contains a term that is not a key of
        inverted_index.
        '''
        data = []
        rows = []
        cols = []
        # Build the term -> column lookup once instead of scanning the whole
        # vocabulary for every single term. setdefault keeps the first
        # position of a key, which is what list.index() would return.
        column_of = {}
        for pos, term in enumerate(inverted_index.keys()):
            column_of.setdefault(term, pos)

        for i, doc in enumerate(collection):
            # bag of words of the doc with the count of every term
            terms, counts = np.unique(doc, return_counts=True)
            if len(terms) == 0:
                # nothing to insert (and np.max would fail on an empty array)
                continue
            # log-scaled term frequency, scaled by the most frequent term
            tf = (1 + np.log(counts)) / (1 + np.log(np.max(counts)))
            for term, weight in zip(terms, tf):
                pos = column_of.get(term)
                if pos is None:
                    raise ValueError('%r is not in the inverted index' % (term,))
                # insert tf-idf of term into the sparse matrix base structures
                data.append(self.idf[term] * weight)
                rows.append(i)
                cols.append(pos)

        matrixCOO = sps.coo_matrix(
            (data, (rows, cols)),
            shape=(len(collection), len(inverted_index.keys())))
        matrixCSR = matrixCOO.tocsr()
        # normalize() rejects input without any rows, which happens for a
        # cluster that no document was assigned to
        if matrixCSR.shape[0] > 0:
            normalize(matrixCSR, copy=False)
        return matrixCSR

    def getLeaders(self):
        '''
        Randomly selects int(sqrt(len(collection))) documents as leaders.

        Returns a tuple (leader_list, leader_matrix): the sampled documents
        themselves (not their indices) and the normalised sparse matrix
        containing their document vectors, one row per leader.
        '''
        n_leaders = int(math.sqrt(len(self.collection)))
        leader_list = random.sample(self.collection, n_leaders)
        leader_matrix = self.createNormalisedCSRMatrix(leader_list, self.inverted_index)
        return (leader_list, leader_matrix)

    # function of cosine ranking---query and leader comparison
    def leader_retrieveCosine(self, query):
        '''
        deprecated

        Ranks documents by cosine similarity to the query and returns a list
        of (doc, similarity) pairs sorted by descending similarity.
        '''
        # get all relevant docs with boolean retrieval
        # NOTE(review): `index` is not defined anywhere in this cell; unless
        # one of the %run notebooks provides it, this method cannot run.
        boolResult = index
        # get vectors for all docs
        docVecs = [createVector(self.collection[d], self.inverted_index, self.idf) for d in boolResult]
        # get query vector (use the index of this instance, as the other methods do)
        queryVec = [createQueryVector(query, self.inverted_index)]
        # rank them
        results = {}
        for i in range(len(docVecs)):
            v = [docVecs[i]]
            # compute cosine
            cos = cosine_similarity(queryVec, v)
            # save
            results[boolResult[i]] = cos[0][0]
        # sort rtn by similarity
        rtn = sorted(results.items(), key=lambda kv: kv[1], reverse=True)
        return rtn

    def getDocLeaderSimilarity(self):
        '''
        Compares each document to the leaders (by computing cosine similarity)
        and assigns it to the most similar one.

        Returns a list in which position d holds the row of leader_matrix
        (i.e. the leader) document d is assigned to. Prints a progress
        counter every 100 documents.
        '''
        leader_similarity = []
        for d, doc in enumerate(self.collection):
            docVec = createVector(doc, self.inverted_index, self.idf).reshape(-1, 1)
            normalize(docVec, axis=0, copy=False)
            # similarity of the document to every leader, one row per leader
            leaderSimilarities = self.leader_matrix.dot(docVec)
            # the row with the highest similarity is the leader of the doc
            leader_similarity.append(np.argmax(leaderSimilarities))
            if d % 100 == 0:
                print(d)
        return leader_similarity

    def clusterDocs(self, leader_similarities, leader_list):
        '''
        Groups the documents by the leader they were assigned to.

        leader_similarities   position i holds the leader of document i
        leader_list           the leaders; only its length is used

        Returns a dict leader position -> list of document positions. Every
        leader gets an entry, even if no document was assigned to it.
        '''
        # initialise dict with empty lists and leader positions as keys
        leader_cluster = {key: [] for key in np.arange(len(leader_list))}
        for doc_pos, leader in enumerate(leader_similarities):
            leader_cluster[leader].append(doc_pos)
        return leader_cluster

    def createClusters(self):
        '''
        Creates a sparse matrix for every cluster containing the vectors of
        all documents of that cluster.

        Returns a tuple (clusters, matrices): the dict produced by clusterDocs
        and a list in which the matrix of leader k is stored at position k
        (the dict is iterated in insertion order, which is the leader order).
        Prints a progress counter every 10 clusters.
        '''
        clusters = self.clusterDocs(self.leader_similarity, self.leader_list)
        result = []
        print('in total there are ',len(clusters),' clusters')
        for counter, members in enumerate(clusters.values(), start=1):
            # subset the collection with the docs assigned to this leader
            clusterAsCollection = [self.collection[i] for i in members]
            result.append(self.createNormalisedCSRMatrix(clusterAsCollection, self.inverted_index))
            if counter % 10 == 0:
                print(counter)
        return (clusters, result)

    def _collectCluster(self, leader, queryVec, results):
        '''
        Scores all documents of the cluster of `leader` against queryVec and
        stores the scores in `results`, keyed by document id.
        '''
        sim = self.clusterMatrices[leader].dot(queryVec)
        # row i of the cluster matrix belongs to the i-th document of the
        # cluster, so the cluster list translates rows back to collection
        # positions
        for doc_pos, score in zip(self.clusters[leader], sim):
            results[self.docIDs[doc_pos]] = score

    def retrieveWithPreclustering(self, query, amount=100):
        '''
        Retrieves the documents most similar to the query, looking only at
        the cluster of the most similar leader. Further clusters (in order of
        decreasing leader similarity) are only searched while fewer than
        `amount` results have been found.

        query    list of unique terms
        amount   maximal number of results to return

        Returns a list of (doc id, similarity) pairs sorted by descending
        similarity. The similarity is stored exactly as the row of the
        matrix-vector product delivers it.
        '''
        if not self.clusterMatrices:
            # no leaders, hence nothing that could be retrieved
            return []

        # get query vector
        queryVec = createQueryVector(query, self.inverted_index).reshape(-1, 1)
        normalize(queryVec, axis=0, copy=False)
        # compute similarity of query to leaders
        leaderSimilarities = self.leader_matrix.dot(queryVec)

        # fast path: retrieve only on the cluster of the best leader
        best = np.argmax(leaderSimilarities)
        results = {}
        self._collectCluster(best, queryVec, results)

        # if there are not enough results in the first cluster
        if len(results) < amount:
            scores = np.asarray(leaderSimilarities).ravel()
            # A stable sort keeps tied leaders in index order. The cluster
            # searched above is skipped explicitly instead of assuming that
            # it is the first element of the ranking.
            for leader in np.argsort(-scores, kind='stable'):
                if len(results) >= amount:
                    break
                if leader == best:
                    continue
                self._collectCluster(leader, queryVec, results)

        rtn = sorted(results.items(), key=lambda kv: kv[1], reverse=True)
        # output only first elements
        return rtn[:amount]