In [3]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine, cdist
import numpy as np
from ml_metrics import apk, mapk 
from collections import defaultdict

vv = TfidfVectorizer()

docs = [] 
names = [] 
for root, dirs, files in os.walk('docs'):
    for file in sorted(files, key=int):
        with open(os.path.join(root,file), 'r') as f:
            docs.append(f.read())
            names.append(file)

vv.fit(docs)
transformedDocs = vv.transform(docs)
print(transformedDocs.shape) #1400 docs with 20338 unique words
#print(transformedDocs.toarray()) 



(1400, 20338)


In [4]:
queries = [] 
for root, dirs, files in os.walk('queries'):
    for file in sorted(files, key=int):
        with open(os.path.join(root,file), 'r') as f:
            queries.append(f.read())
            

transformedQueries = vv.transform(queries)
print(transformedQueries.shape) #225 file with 20338 unique words
#print(transformedQueries.toarray()) 


(225, 20338)


In [5]:
sims = 1-cdist(transformedQueries.toarray(), transformedDocs.toarray(), metric='cosine')
print('cosine similarity array: \n', sims , '\n')
#print(sims.shape)

cosine similarity array: 
 [[ 0.          0.04416816  0.         ...,  0.00977902  0.          0.        ]
 [ 0.01664458  0.04593605  0.03206653 ...,  0.03456817  0.02332632
   0.03120366]
 [ 0.01932345  0.03254715  0.00646016 ...,  0.01212514  0.02200258
   0.01380319]
 ..., 
 [ 0.01201924  0.07037756  0.13453498 ...,  0.11359977  0.23400063
   0.1720973 ]
 [ 0.03530657  0.10114632  0.02898973 ...,  0.0536287   0.03752077
   0.03581733]
 [ 0.08693186  0.02721953  0.         ...,  0.0158465   0.0026837
   0.02706897]] 



  dm /= _row_norms(XB)


In [7]:
# Returns a list of sorted document ids according to maximum similarity
idsSort=[]
for relDocs in (sims):
    idsSort.append(sorted(range(len(relDocs)), key=lambda ix: relDocs[ix],reverse = True))

#To solve zero indexing issue
ids = np.add(1,idsSort)
print('ranked list of relevant documents for each query: \n', ids , '\n')
#print('ids shape',ids.shape)
#print(ids[0][:20])
# print(ids)
# print(len(ids[0]))
# print(len(ids))

ranked list of relevant documents for each query: 
 [[  13  184   51 ..., 1395 1399 1400]
 [  12   51  792 ...,  769 1048 1045]
 [ 485  181  144 ..., 1249 1210 1266]
 ..., 
 [ 400  484 1399 ..., 1266 1348 1395]
 [ 537  317  329 ..., 1045  963  492]
 [ 226  368  431 ..., 1368 1369 1389]] 



In [8]:
#read cranquel file to get the real relevant docs to each query
relevantDocs = []
with open('cranqrel') as f:
    for line in f.readlines():
        relevantDocs.append(line.split(" "))

#print(relevantDocs)

#initializing a dictionary with key(queries) and values(related documents) 
dic ={}
l =[]
for i in range(1,226):
    l=[]
    for j in relevantDocs:
        if(j[0] == str(i)):
            l.append(j[1])
            dic['{}'.format(i)] = list(map(int,l)) 

#Calculating the Mean Average Precision
numerator = 0
denominator = 0
summation = 0
pr=[]

for i in range(1,len(ids)):
    denominator = 1
    for j in range(1,len(ids[i])):
        if(ids[i-1][j-1] in dic['{}'.format(i)]):
            numerator += 1
            summation += (numerator/denominator)
        denominator += 1
    pr.append(summation/len(dic['{}'.format(i)]))
    summation = 0
    numerator = 0
            
precisionSum = np.sum(pr)
mapValue = precisionSum/len(dic)
print('Mean Average Precision' , mapValue)

        
# forming a list of lists so as to be able to use mapk
maplist =[]
for i in range(1,226):
    maplist.append(dic['{}'.format(i)])


print('Mean Average Precision using mapk',mapk(maplist, ids, max([len(res) for res in ids]))) # set K to maximum length of any of the results


Mean Average Precision 0.218268668715
