In [6]:
#import stuff
import pickle
import numpy as np
import scipy.sparse as sps
#from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import check_pairwise_arrays
from sklearn.preprocessing import normalize

%run utils.ipynb
%run basicVSM.ipynb

class SparseBasicVSM:
    '''Basic vector space model (VSM) backed by a row-normalised sparse tf-idf matrix.

    Attributes set by the constructor:
    inverted_index           the inverted index passed in; the order of its keys defines the matrix columns
    collectionAsDataframe    the preprocessed collection; the columns 'text' and 'id' are read
    collectionAsList         the 'text' column as a plain list, one entry per document
    collectionAsMatrix       documents x terms matrix, freshly built or restored from path
    '''

    def __init__(self,collectionAsDataframe: pd.DataFrame,inverted_index,load = False, safe = True, path = "sparseVSM.psave"):
        '''sets up everything in order to retrieve with a basic VSM model based on sparse matrices. Attention, building the matrix takes some time.
        collectionAsDataframe    please provide the preprocessed dataframe (needs the columns 'text' and 'id')
        inverted_index           please provide the inverted index
        load                     if False the matrix is built from the collection; if True it is restored from path
        safe                     defaults to True; when the matrix is built (load is False) it is also saved to path
        path                     file the matrix is pickled to / unpickled from. May be changed.'''

        # These three attributes are needed no matter where the matrix comes from.
        self.inverted_index = inverted_index
        self.collectionAsDataframe = collectionAsDataframe
        self.collectionAsList = list(collectionAsDataframe['text'])

        if load:
            # SECURITY: unpickling executes arbitrary code contained in the file,
            # so only load model files that were written by this class.
            with open(path, "rb") as f:
                self.collectionAsMatrix = pickle.load(f)
        else:
            self.collectionAsMatrix = self.createNormalisedCSRMatrix()
            if safe:
                with open(path, "wb") as f:
                    pickle.dump(self.collectionAsMatrix, f)


    def createNormalisedCSRMatrix(self):
        '''
        procedure that takes the collection of docs and outputs the corresponding row normalised CSR matrix

        One row per entry of collectionAsList, one column per key of inverted_index
        (in key order). Each stored value is idf[term] * tf of that term in the document.
        createIDF and termFrequencies are not defined in this cell; presumably they come
        from the notebooks executed via %run above.

        returns    the normalised scipy.sparse CSR matrix
        raises     ValueError if a document contains a term that is not a key of inverted_index
        '''
        data = []
        rows = []
        cols = []

        # Map every term to its column once. Rebuilding the key list and scanning it
        # for each term of each document costs a full pass over the vocabulary per lookup.
        column_of = {term: col for col, term in enumerate(self.inverted_index.keys())}

        idf = createIDF(self.inverted_index, len(self.collectionAsList))
        for row, doc in enumerate(self.collectionAsList):
            # bag of words of the document together with the term frequencies
            terms, tf = termFrequencies(doc)
            for t in range(len(terms)):
                term = terms[t]
                try:
                    pos = column_of[term]
                except KeyError:
                    # Same exception type the former list.index() lookup raised.
                    raise ValueError(f"{term!r} is not in the inverted index") from None
                # tf-idf weight of the term, stored in COO triplet form
                data.append(idf[term] * tf[t])
                rows.append(row)
                cols.append(pos)

        matrixCOO = sps.coo_matrix((data, (rows, cols)), shape=(len(self.collectionAsList), len(self.inverted_index)))
        collectionAsMatrix = matrixCOO.tocsr()
        # copy=False only *tries* to work in place (e.g. integer data is copied),
        # so always continue with the returned matrix.
        return normalize(collectionAsMatrix, copy=False)

        #cosine ranking
    def retrieveCosineSparse(self,query,amount):
        '''
        retrieval for the sparse VSM model

        query                  please provide query as list of unique terms
        amount                 how many results shall be shown?

        returns    list of (document id, similarity) pairs sorted by descending similarity
                   and cut to at most amount entries; each similarity is the corresponding
                   row of the matrix-vector product
        '''
        # query as normalised column vector; createQueryVector is not defined in this
        # cell (presumably provided by the notebooks executed via %run above)
        queryVec = createQueryVector(query, self.inverted_index).reshape(-1, 1)
        # Use the return value: normalisation in place is not guaranteed with copy=False.
        queryVec = normalize(queryVec, axis=0, copy=False)

        # similarity of the query to every document
        sim = self.collectionAsMatrix.dot(queryVec)

        # Ids are read by position so that matrix row i is always paired with the i-th
        # document, even when the dataframe index is not the default 0..n-1 range.
        doc_ids = self.collectionAsDataframe['id'].values

        results = {}
        for i in range(len(sim)):
            results[doc_ids[i]] = sim[i]

        #sort output and cut to amount
        rtn = sorted(results.items(), key=lambda kv: kv[1], reverse=True)
        return rtn[:amount]

In [3]:
##load stuff
#f=open("inverted_index.txt","rb")
#inverted_index = pickle.load(f)
#f.close()
##the name of text_process was changed to docs_preprocessed
#f=open("docs_preprocessed.psave","rb")
#docs_preprocessed = pickle.load(f)
#f.close()

In [36]:
# Two sample queries for manual checks of the retrieval: one built from ordinary
# English words and one built from nonsense strings.
testQuery = ["vascular", "anxious"]
invalidQuery = ["asdfhjahfd", "wieuriowuer"]
# Disabled call from an earlier version; retrieveCosineSparse is now a method of SparseBasicVSM.
#retrieveCosineSparse(testQuery,collection)



1.414213562373095


Unnamed: 0,0
166,0.148150
1738,0.138299
3077,0.130409
412,0.126705
1727,0.124830
2254,0.117029
1726,0.112413
3081,0.111741
975,0.111279
857,0.109711
