# IMPORT

In [1]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer


import numpy as np

To get int representation of right result of queris from reference file

In [2]:
def reference_to_number_array(order_of_file):
    f = open("./r/"+str(order_of_file)+".txt")
    _array = []
    while True:
        line = f.readline()
        if line:
            number = int(line)
            _array.append(number)
        else:
            break
    return _array

# Function to get precision, recall and f1 score 
It take theory from https://en.wikipedia.org/wiki/Precision_and_recall

In [3]:
def get_precision_recall_f1(reference, my_result):
    positive = 0
    false_positive = 0
    false_negative = 0
    
    for x in my_result:
        if x in reference:
            positive += 1
        else:
            false_positive += 1
            
    false_negative = len(np.array([val for val in reference if val not in my_result]))
    
    precision = positive*1.0 / (positive + false_positive)
    recall = positive*1.0 / (positive + false_negative)
    f1 = 0
    if precision != 0 or recall != 0:
        f1 = ( 2 * (precision * recall) ) / (precision + recall)
    
    return precision, recall, f1

In [4]:
corpus = []
for d in range(1,1401,1):
    f = open("./d/"+str(d)+".txt")
    corpus.append(f.read())

queries = []
for d in range(1, 225, 1):
    f = open("./q/"+str(d)+".txt")
    query = f.read()
    queries.append(query)
    corpus.append(query)
    
reference_results = []
for d in range(1, 225, 1):
    reference_results.append(reference_to_number_array(d))

# Init of Vector Space Models

In [5]:
#TfidfVectorizer(input=u'content', encoding=u'utf-8', decode_error=u'strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, analyzer=u'word', stop_words=None, token_pattern=u'(?u)\b\w\w+\b', ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<type 'numpy.int64'>, norm=u'l2', use_idf=True, smooth_idf=True, sublinear_tf=False)

binary_vec = TfidfVectorizer(use_idf=False, norm=False, binary=True)
tf_vec = TfidfVectorizer(use_idf=False)
tfidf_vec = TfidfVectorizer()

# Create matrix with Vector Space Models

In [6]:
binary_matrix = binary_vec.fit_transform(corpus)
tf_matrix = tf_vec.fit_transform(corpus)
tfidf_matrix = tfidf_vec.fit_transform(corpus)

In [7]:
tfidf_sim_cosine = []
tfidf_sim_eucleidan = []

tf_sim_cosine = []
tf_sim_eucleidan = []

binary_sim_cosine = []
binary_sim_eucleidan = []

for q in range(1400, 1400 + len(queries), 1):
    tfidf_sim_cosine.append(np.array(cosine_similarity(tfidf_matrix[q], tf_matrix[0:1400]))[0].argsort()[-10:][::-1]+1)
    tfidf_sim_eucleidan.append(np.array(euclidean_distances(tfidf_matrix[q], tf_matrix[0:1400]))[0].argsort()[-10:][::-1]+1)
    
    tf_sim_cosine.append(np.array(cosine_similarity(tf_matrix[q], tf_matrix[0:1400]))[0].argsort()[-10:][::-1]+1)
    tf_sim_eucleidan.append(np.array(euclidean_distances(tf_matrix[q], tf_matrix[0:1400]))[0].argsort()[-10:][::-1]+1)
    
    binary_sim_cosine.append(np.array(cosine_similarity(binary_matrix[q], binary_matrix[0:1400]))[0].argsort()[-10:][::-1]+1)
    binary_sim_eucleidan.append(np.array(euclidean_distances(binary_matrix[q], binary_matrix[0:1400]))[0].argsort()[-10:][::-1]+1)


In [8]:
result_tfidf_sim_cosine = []
result_tfidf_sim_eucleidan = []

result_tf_sim_cosine = []
result_tf_sim_eucleidan = []

result_binary_sim_cosine = []
result_binary_sim_eucleidan = []

for q in range(0, 224, 1):
    result_tfidf_sim_cosine.append(get_precision_recall_f1(reference_results[q], tfidf_sim_cosine[q]))
    result_tfidf_sim_eucleidan.append(get_precision_recall_f1(reference_results[q], tfidf_sim_eucleidan[q]))
    
    result_tf_sim_cosine.append(get_precision_recall_f1(reference_results[q], tf_sim_cosine[q]))
    result_tf_sim_eucleidan.append(get_precision_recall_f1(reference_results[q], tf_sim_eucleidan[q]))
    
    result_binary_sim_cosine.append(get_precision_recall_f1(reference_results[q], binary_sim_cosine[q]))
    result_binary_sim_eucleidan.append(get_precision_recall_f1(reference_results[q], binary_sim_eucleidan[q]))

In [9]:
result_tfidf_sim_cosine = np.array(result_tfidf_sim_cosine)
result_tfidf_sim_eucleidan = np.array(result_tfidf_sim_eucleidan)

result_tf_sim_cosine = np.array(result_tf_sim_cosine)
result_tf_sim_eucleidan = np.array(result_tf_sim_eucleidan)

result_binary_sim_cosine = np.array(result_binary_sim_cosine)
result_binary_sim_eucleidan = np.array(result_binary_sim_eucleidan)

# Results
First we show table of rows where each row is in format (Precision, Recall, F1)

In [10]:
result_tfidf_sim_cosine

array([[ 0.5       ,  0.17241379,  0.25641026],
       [ 0.3       ,  0.12      ,  0.17142857],
       [ 0.6       ,  0.66666667,  0.63157895],
       [ 0.2       ,  0.66666667,  0.30769231],
       [ 0.1       ,  0.2       ,  0.13333333],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.2       ,  0.33333333,  0.25      ],
       [ 0.3       ,  0.25      ,  0.27272727],
       [ 0.3       ,  0.75      ,  0.42857143],
       [ 0.3       ,  0.33333333,  0.31578947],
       [ 0.1       ,  0.125     ,  0.11111111],
       [ 0.3       ,  0.5       ,  0.375     ],
       [ 0.1       ,  0.2       ,  0.13333333],
       [ 0.1       ,  0.33333333,  0.15384615],
       [ 0.1       ,  0.33333333,  0.15384615],
       [ 0.3       ,  0.75      ,  0.42857143],
       [ 0.1       ,  0.33333333,  0.15384615],
       [ 0.1       ,  0.25      ,  0.14285714],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.5       ,  0.5       ,  0.5       ],
       [ 0.2       ,  0.4       ,  0.266

In [11]:
result_tfidf_sim_eucleidan

array([[ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.   

In [12]:
result_tf_sim_cosine

array([[ 0.5       ,  0.17241379,  0.25641026],
       [ 0.2       ,  0.08      ,  0.11428571],
       [ 0.4       ,  0.44444444,  0.42105263],
       [ 0.1       ,  0.33333333,  0.15384615],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.2       ,  0.33333333,  0.25      ],
       [ 0.3       ,  0.25      ,  0.27272727],
       [ 0.3       ,  0.75      ,  0.42857143],
       [ 0.2       ,  0.22222222,  0.21052632],
       [ 0.2       ,  0.25      ,  0.22222222],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.1       ,  0.33333333,  0.15384615],
       [ 0.1       ,  0.33333333,  0.15384615],
       [ 0.2       ,  0.5       ,  0.28571429],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.1       ,  0.25      ,  0.14285714],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.3       ,  0.3       ,  0.3       ],
       [ 0.2       ,  0.4       ,  0.266

In [13]:
result_tf_sim_eucleidan

array([[ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.   

In [14]:
result_binary_sim_cosine

array([[ 0.3       ,  0.10344828,  0.15384615],
       [ 0.1       ,  0.04      ,  0.05714286],
       [ 0.4       ,  0.44444444,  0.42105263],
       [ 0.1       ,  0.33333333,  0.15384615],
       [ 0.2       ,  0.4       ,  0.26666667],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.1       ,  0.16666667,  0.125     ],
       [ 0.3       ,  0.25      ,  0.27272727],
       [ 0.3       ,  0.75      ,  0.42857143],
       [ 0.3       ,  0.33333333,  0.31578947],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.2       ,  0.33333333,  0.25      ],
       [ 0.1       ,  0.2       ,  0.13333333],
       [ 0.1       ,  0.33333333,  0.15384615],
       [ 0.1       ,  0.33333333,  0.15384615],
       [ 0.2       ,  0.5       ,  0.28571429],
       [ 0.1       ,  0.33333333,  0.15384615],
       [ 0.1       ,  0.25      ,  0.14285714],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.3       ,  0.3       ,  0.3       ],
       [ 0.1       ,  0.2       ,  0.133

In [15]:
result_binary_sim_eucleidan

array([[ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.1       ,  0.125     ,  0.11111111],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.   

And then mean of columns in the same format

In [16]:
print ("result_tfidf_sim_cosine",np.mean(result_tfidf_sim_cosine, axis=0))
print ("result_tfidf_sim_eucleidan",np.mean(result_tfidf_sim_eucleidan, axis=0))
print ("result_tf_sim_cosine",np.mean(result_tf_sim_cosine, axis=0))
print ("result_tf_sim_eucleidan",np.mean(result_tf_sim_eucleidan, axis=0))
print ("result_binary_sim_cosine",np.mean(result_binary_sim_cosine, axis=0))
print ("result_binary_sim_eucleidan",np.mean(result_binary_sim_eucleidan, axis=0))

result_tfidf_sim_cosine [ 0.21696429  0.31091678  0.23721412]
result_tfidf_sim_eucleidan [ 0.00133929  0.00122032  0.00125625]
result_tf_sim_cosine [ 0.12857143  0.17832052  0.13949442]
result_tf_sim_eucleidan [ 0.00044643  0.00034341  0.0003882 ]
result_binary_sim_cosine [ 0.14821429  0.22185451  0.16494985]
result_binary_sim_eucleidan [ 0.00580357  0.0058095   0.0056909 ]


As we can see from results, the cosine distance is much more better for this task than eucleidian distance.