In [3]:
#import stuff
import pickle
import numpy as np
from sklearn.preprocessing import normalize
from scipy.spatial.distance import hamming as hammingDist
import pandas as pd
%run basicVSM.ipynb

In [5]:
class RandomProjection:
    '''Approximate vector-space retrieval based on random projections.

    Every document vector is L2-normalised and projected onto a small number of
    random vectors. Queries are projected the same way and compared with the
    projected documents, either by dot product (no hashing) or by Hamming
    similarity of median-thresholded bit signatures (hashing).

    Relies on createVector and createQueryVector, which are not defined in this
    class (presumably provided by basicVSM.ipynb - verify).
    '''

    #setup code to project the collection
    def __init__(self,collection: pd.DataFrame,inverted_index,idf,doHash = False, numberOfRandomVectors = 10, distribution = 'achlioptas'):
        '''sets up everything in order to retrieve with random projections. Attention, takes some time to execute.

        collection             data frame with preprocessed documents; the columns 'text' and 'id' are read
        inverted_index         the inverted index; its length is used as the number of terms
        idf                    the IDF scores (handed through to createVector)
        doHash                 whether to hash the projections as presented in the lecture. Not hashing
                               may be a little slower, but is way more accurate and is the usual
                               approach in the literature
        numberOfRandomVectors  number of random vectors, i.e. the dimension of the projected space
        distribution           random vectors are generated from one of three possibilities:
                               'achlioptas': a distribution considered most efficient in the literature, see e.g. https://cs.gmu.edu/~jessica/publications/lsi_sdm_workshop03.pdf
                               'normal': a normal distribution
                               'randomSamples': chooses random documents as random vectors

        There is no explicit load and save anymore, as the class can simply be pickled.'''
        #split collection in id column and collection as list
        self.collection = list(collection['text'])
        self.docIDs = list(collection['id'])

        self.inverted_index = inverted_index
        self.idf = idf

        print('create random matrix with ',distribution,' distribution')
        self.randomMatrix = self.createRandomMatrix(distribution,numberOfRandomVectors)

        if doHash:
            print('project the collection with ',numberOfRandomVectors, "random vectors and hashing")
        else:
            print('project the collection with ',numberOfRandomVectors, "random vectors without hashing")
        self.docMatrix, self.threshold = self.projectCollection(numberOfRandomVectors, doHash)

    #create matrix of random vectors
    def createRandomMatrix(self,distribution,numberOfRandVecs):
        '''creates the matrix of random vectors, one L2-normalised vector per row

        distribution       'achlioptas', 'normal' or 'randomSamples'
        numberOfRandVecs   number of rows of the returned matrix

        returns an array of shape (numberOfRandVecs, len(self.inverted_index))
        raises ValueError if distribution is none of the accepted names'''
        NumberOfTerms = len(self.inverted_index)
        shape = (numberOfRandVecs, NumberOfTerms)
        if distribution == 'achlioptas':
            #dense version with entries +1/-1. The sparse version
            #(values sqrt(3), 0, -sqrt(3) with weights 1/6, 2/3, 1/6) is not
            #suitable here as the resulting matrix is too sparse.
            #Cast to float so that the row normalisation below is meaningful.
            R = np.random.choice(a = (1,-1), size = shape).astype(float)
        elif distribution == 'normal':
            #standard normal entries (np.random.random would be uniform on [0, 1))
            R = np.random.normal(size = shape)
        elif distribution == 'randomSamples':
            #use the vectors of randomly drawn documents (with replacement)
            docs = np.random.randint(len(self.collection), size = numberOfRandVecs)
            R = np.zeros(shape)
            for i, doc in enumerate(docs):
                R[i] += createVector(self.collection[doc],self.inverted_index,self.idf)
        else:
            raise ValueError("The distribution param was not specified correctly. It accepts 'achlioptas', 'normal', 'randomSamples'")
        #use the returned array: with copy=False an integer input would be
        #normalised on an internal float copy and the result thrown away
        return normalize(R,axis = 1)

    #project the collection, optionally hashing against a per-column threshold
    def projectCollection(self, numberOfRandVecs, doHash = False):
        '''projects every document of self.collection onto the random vectors

        numberOfRandVecs   number of random vectors (columns of the result)
        doHash             if truthy, every column is thresholded at its median

        returns a tuple (matrix, threshold):
            without hashing: float matrix of the projections and None
            with hashing:    boolean matrix (projection > column median) and the
                             medians, which have to be reused to hash queries'''
        NumberOfDocs = len(self.collection)
        rtn = np.zeros((NumberOfDocs,numberOfRandVecs))
        #for every document
        for i in range(NumberOfDocs):
            if i % 100 == 0:
                print(i) # to see that it didn't crash
            d = createVector(self.collection[i],self.inverted_index,self.idf) # create vector for document
            d = normalize(d.reshape(1,-1))[0] # unit length before projecting
            #dot product with all random vectors gives one row of the result
            rtn[i] = d@self.randomMatrix.T
        if not doHash:
            return (rtn,None)
        #hash against the median of each column to achieve maximum entropy
        threshold = np.median(rtn, axis=0)
        return (rtn>threshold,threshold)

    def projectQuery(self, query):
        '''projects a query into the random space

        query   the query, handed to createQueryVector

        returns a 1-D boolean array if the collection was hashed, otherwise
        the L2-normalised projection as a column vector of shape (k, 1)'''
        d = createQueryVector(query,self.inverted_index)
        d = normalize(d.reshape(1,-1))[0]
        #calc dot product with all random vectors
        inner = d@self.randomMatrix.T
        #hash
        if self.threshold is not None:
            return inner > self.threshold
        #axis=0 normalises the column as a whole; the default axis=1 would
        #treat every entry as its own row and reduce the projection to signs
        return normalize(inner.reshape(-1,1), axis = 0)

    #retrieval on new matrix
    def retrieveWithRandom(self, query, amount= 100):
        '''
        retrieval method

        query    as list of unique terms
        amount   maximum number of results to return

        returns a list of (docID, score) tuples sorted by descending score.
        Without hashing the score is the dot product of the projections (a
        1-element array, as the query projection is a column vector); with
        hashing it is 1 - the fraction of differing bits.
        Documents sharing a docID collapse to one entry (the last one wins).'''
        #project the query
        q = self.projectQuery(query)
        if self.threshold is None:
            #compute similarity of all projected data
            sim = self.docMatrix.dot(q)
        else:
            #Hamming similarity of all documents at once
            sim = 1 - np.mean(self.docMatrix != q, axis=1)
        #sort output and cut to the first `amount` results
        results = dict(zip(self.docIDs, sim))
        rtn = sorted(results.items(), key=lambda kv: kv[1],reverse = True)
        return rtn[:amount]

#project the collection onto the random vectors (optionally hashed against the per-column median)
def projectCollection(randomMatrix, collection, inverted_index, NumberOfRandVecs, doHash = False):
    '''projects every document of the collection onto the random vectors

    randomMatrix       matrix of random vectors, one per row
    collection         the documents; only its length is used here, it is
                       otherwise handed to createVector
    inverted_index     the inverted index (handed to createVector)
    NumberOfRandVecs   number of random vectors (columns of the result)
    doHash             if equal to True, every column is thresholded at its median

    returns a tuple (matrix, threshold); threshold is None without hashing,
    otherwise the per-column medians used to turn the matrix into booleans'''
    doc_count = len(collection)
    projected = np.zeros((doc_count, NumberOfRandVecs))
    for doc_idx in range(doc_count):
        # progress output, to see that it didn't crash
        if not doc_idx % 100:
            print(doc_idx)
        # NOTE(review): createVector is called with (index, inverted_index, collection)
        # here, while RandomProjection calls it with (text, inverted_index, idf) -
        # confirm which signature the helper currently has.
        vec = createVector(doc_idx, inverted_index, collection)
        unit_vec = normalize(vec.reshape(1, -1))[0]
        # dot product with all random vectors, stored as one row of the result
        projected[doc_idx] += unit_vec @ randomMatrix.T
    if not (doHash == True):
        return (projected, None)
    # hash against the median of each column to achieve maximum entropy;
    # the medians are returned so that queries can be hashed the same way
    medians = np.median(projected, axis=0)
    return (projected > medians, medians)

def projectQuery(query, randomMatrix, inverted_index,threshold = None):
    '''projects a query into the random space

    query            the query, handed to createQueryVector
    randomMatrix     matrix of random vectors, one per row
    inverted_index   the inverted index (handed to createQueryVector)
    threshold        per-column thresholds of a hashed collection, or None

    returns a 1-D boolean array (projection > threshold) if a threshold is
    given, otherwise the L2-normalised projection as a column vector of
    shape (k, 1)'''
    d = createQueryVector(query,inverted_index)
    d = normalize(d.reshape(1,-1))[0]
    #calc dot product with all random vectors
    inner = d@randomMatrix.T
    #hash
    if threshold is not None:
        return inner > threshold
    #axis=0 normalises the column as a whole; the default axis=1 would treat
    #every entry as its own row and reduce the projection to its signs
    return normalize(inner.reshape(-1,1), axis = 0)

#retrieval on new matrix
def retrieveWithRandom(query, docMatrix, randomMatrix, inverted_index,docIDs, amount= 100, threshold = None):
    '''ranks the projected documents against a query

    query            the query, handed to projectQuery
    docMatrix        projected (and possibly hashed) documents, one per row
    randomMatrix     matrix of random vectors used for the projection
    inverted_index   the inverted index (handed to projectQuery)
    docIDs           document ids, in the row order of docMatrix
    amount           maximum number of results to return
    threshold        None to compare by dot product, otherwise the hashing
                     thresholds, in which case Hamming similarity is used

    returns a list of (docID, score) tuples sorted by descending score'''
    if threshold is not None:
        doc_count = len(docIDs)
        sim = np.zeros(doc_count)
        q = projectQuery(query = query, randomMatrix = randomMatrix, inverted_index = inverted_index, threshold = threshold)
        # similarity = 1 - Hamming distance between the bit signatures
        for row in range(doc_count):
            sim[row] = 1 - hammingDist(docMatrix[row], q)
    else:
        q = projectQuery(query = query, randomMatrix = randomMatrix, inverted_index = inverted_index)
        # similarity of all projected documents in one matrix product
        sim = docMatrix.dot(q)
    # map ids to scores (a repeated id keeps its last score), then rank
    results = {docIDs[pos]: score for pos, score in enumerate(sim)}
    ranked = sorted(results.items(), key=lambda pair: pair[1], reverse = True)
    # cut to the requested number of results
    return ranked[0:amount] if len(ranked) > amount else ranked

def retrieveWithRandomWrapper(query, Model, amount = 100):
    '''retrieves with random projections given a Model

    query    query as list of unique terms
    Model    dict holding the state for random projections without using the
             class structure; the keys 'docMatrix', 'randomMatrix',
             'inverted_index', 'docIDs' and 'threshold' are read
    amount   how many results to show

    returns whatever retrieveWithRandom returns for these arguments'''
    doc_matrix = Model['docMatrix']
    random_matrix = Model['randomMatrix']
    index = Model['inverted_index']
    doc_ids = Model['docIDs']
    hash_threshold = Model['threshold']
    return retrieveWithRandom(query, doc_matrix, random_matrix, index, doc_ids,
                              amount=amount, threshold=hash_threshold)


#setup code to project the collection
def randomProjectionSetup(collection: pd.DataFrame,inverted_index,doHash = False, numberOfRandomVectors = 10, distribution = 'achlioptas',
                          load = False, safe = True, path = "randomProjectionModel.psave"):

    '''sets up everything in order to retrieve with random projections. Attention, takes some time to execute.

    collection             data frame of preprocessed documents; the columns 'text' and 'id' are read
    inverted_index         the inverted index
    doHash                 whether to hash the projections as presented in the lecture. Not hashing
                           may be a little slower, but is way more accurate and is the usual
                           approach in the literature
    numberOfRandomVectors  number of random vectors to project onto
    distribution           random vectors are generated from one of three possibilities:
                           'achlioptas': a distribution considered most efficient in the literature, see e.g. https://cs.gmu.edu/~jessica/publications/lsi_sdm_workshop03.pdf
                           'normal': a normal distribution
                           'randomSamples': chooses random documents as random vectors

    load  if truthy, the model is restored from path and all other arguments are ignored
    safe  defaults to true; whether a freshly built model is saved (pickled) to path
    path  file the model is saved to and loaded from

    returns the model as a dict'''
    if load:
        # NOTE(review): unpickling can execute arbitrary code - only load
        # model files from a trusted source.
        with open(path, "rb") as model_file:
            return pickle.load(model_file)

    # split the collection into the document texts and the id column
    texts = list(collection['text'])
    Model = {
        'collection': texts,
        'docIDs': list(collection['id']),
        'inverted_index': inverted_index,
        'doHash': doHash,
        'numberOfRandomVectors': numberOfRandomVectors,
        'distribution': distribution,
    }

    print('create random matrix with ',distribution,' distribution')
    # NOTE(review): a module-level createRandomMatrix is not visible in this
    # part of the file (only the RandomProjection method is) - confirm it exists.
    Model['randomMatrix'] = createRandomMatrix(numberOfRandomVectors, texts, inverted_index, distribution)

    hashing_note = "random vectors and hashing" if doHash else "random vectors without hashing"
    print('project the collection with ',numberOfRandomVectors, hashing_note)
    Model['docMatrix'], Model['threshold'] = projectCollection(
        Model['randomMatrix'], texts, inverted_index, numberOfRandomVectors, doHash)

    if safe:
        with open(path, "wb") as model_file:
            pickle.dump(Model, model_file)
    return Model
#R = createRandomMatrix()
#docMatrixHash, t = projectCollection(randomMatrix= R, doHash = True)
#docMatrix = projectCollection(randomMatrix=R)
#docMatrix = docMatrix[0]