In [1]:
#import stuff
import pickle
import numpy as np
import pandas as pd
#from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import check_pairwise_arrays
from sklearn.preprocessing import normalize

%run utils.ipynb

In [3]:
#get the query vector
def createQueryVector(query, inverted_index):
    '''
    creates the binary query vector.

    The vector has one component per key of inverted_index, in the key
    order of the index. A component is 1 if the corresponding term occurs
    in the query and 0 otherwise.

    query: list of unique terms of the query
    inverted_index: the regular inverted index (keyed by term)

    returns: numpy array of length len(inverted_index)
    raises: ValueError if none of the query terms is a key of the index
    '''
    # Resolve term -> vector position with one lookup table built up front,
    # instead of rebuilding and scanning the key list for every query term.
    positions = {term: pos for pos, term in enumerate(inverted_index.keys())}
    rtn = np.zeros(len(inverted_index))
    for term in query:
        try:
            rtn[positions[term]] = 1
        except (KeyError, TypeError):
            # KeyError: the term is not part of the collection.
            # TypeError: the term is unhashable and therefore cannot be a
            # key of the index either. Both kinds of term are ignored.
            pass
    if not rtn.any():
        raise ValueError('None of the search terms is part of the collection.')
    return rtn

In [2]:
#procedure that takes one doc and outputs the corresponding vector.
def createVector(doc,inverted_index,idf):
    '''
    procedure takes a document and outputs the corresponding vector containing tf-idf values. The output is not normalised.

    The vector has one component per key of inverted_index, in the key
    order of the index. Components of terms that do not occur in the
    document stay 0.

    doc: preprocessed document; handed unchanged to termFrequencies
    inverted_index: the inverted index; only its keys (the vocabulary) are used here
    idf: lookup term -> idf value, indexed as idf[term]

    returns: numpy array of length len(inverted_index)
    raises: ValueError if a term of the document is not a key of the index
    '''
    terms, tf = termFrequencies(doc)
    # Build the term -> position table once per document instead of
    # rebuilding and scanning the key list for every term of the document.
    positions = {term: pos for pos, term in enumerate(inverted_index.keys())}
    #create vector
    rtn = np.zeros(len(inverted_index))
    for i, term in enumerate(terms): # tf[i] is the frequency belonging to terms[i]
        try:
            pos = positions[term]
        except (KeyError, TypeError):
            # Report unknown terms as ValueError, the exception type a
            # list.index() based lookup raises for a missing element.
            raise ValueError(repr(term) + ' is not in list') from None
        #insert tf-idf of term
        rtn[pos] = idf[term] * tf[i]
    return rtn

In [5]:

def retrieveBool(query,inverted_index):
    '''
    Boolean OR retrieval: collects every document that contains at least
    one of the query terms.

    query: preprocessed bag of words, given as a list of terms
    inverted_index: lookup term -> posting list; the first element of each
                    posting is taken as the document number

    returns: sorted list of the matching document numbers without duplicates
    '''
    matches = set()
    for term in query:
        try:
            # build the complete list first, so nothing is added for a term
            # whose lookup fails
            found = [posting[0] for posting in inverted_index[term]]
        except KeyError:
            # the term does not occur in the index
            continue
        matches.update(found)
    return sorted(matches)

In [3]:
#cosine ranking
def retrieveCosine(query,inverted_index,idf,collectionAsDataframe: pd.DataFrame,amount=100):
    '''basic slow cosine retrieval

    Candidate documents are selected with retrieveBool. Each candidate is
    scored with the cosine between its vector from createVector and the
    query vector from createQueryVector, and the best hits are returned.

    query    as preprocessed list of unique terms
    inverted_index
    idf      passed on to createVector
    collectionAsDataframe   as preprocessed pandas dataframe with the columns
                            'text' and 'id'; the document numbers found by
                            retrieveBool are used as row positions
    amount   maximum number of results that are returned

    returns: list of (id, cosine) tuples, sorted by descending cosine
    raises: ValueError (from createQueryVector) if none of the query terms
            is part of the index
    '''
    #split collection in id column and collection as list
    collection = list(collectionAsDataframe['text'])
    docIDs = list(collectionAsDataframe['id'])

    #get all relevant docs with boolean retrieval
    boolResult = retrieveBool(query,inverted_index)
    #get query vector (wrapped in a list: the similarity function is called with 2-d input)
    queryVec = [createQueryVector(query,inverted_index)]

    #rank them
    # Every candidate is scored right after its vector is built, so only one
    # dense document vector is alive at a time instead of one per candidate.
    # NOTE(review): cosine_similarity is not imported in the visible cells
    # (the sklearn import is commented out); presumably it is provided by
    # utils.ipynb -- confirm.
    results = {}
    for d in boolResult:
        docVec = [createVector(collection[d],inverted_index,idf)]
        #compute cosine
        cos = cosine_similarity(queryVec,docVec)
        #save
        results[docIDs[d]] = cos[0][0]
    #sort output and cut to amount
    rtn = sorted(results.items(), key=lambda kv: kv[1],reverse = True)
    if(len(rtn) > amount):
        rtn = rtn[0:amount]
    return rtn

In [None]:
# Sample query for trying out the retrieval functions by hand.
testQuery = ['vascular','anxious']
# Made-up terms, presumably not part of the collection (verify against the index).
invalidQuery = ['asdfhjahfd','wieuriowuer']
# Incomplete call: retrieveCosine additionally requires inverted_index, idf and the collection dataframe.
#retrieveCosine(invalidQuery)



In [None]:
#code for Mingying, don't run.

#doc =inverted_index[0]
#doc = np.unique(doc)
#cosineSimilarity(doc)