In [1]:
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.neighbors import NearestNeighbors
import pickle
import random
import math
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
random.seed(3214)

In [2]:
with open('../data/pickles/main_dict.pickle', 'rb') as f:
    main_dict = pickle.load(f)

In [3]:
df_train = pd.read_csv('../data/csv/train_2.csv', index_col=0)
df_test = pd.read_csv('../data/csv/test_2.csv', index_col=0)

In [4]:
df_train = df_train.drop(df_train.columns[1:553], axis=1)
df_test = df_test.drop(df_test.columns[1:553], axis=1)

In [5]:
features_train = df_train.drop(['file'], 1)
features_test = df_test.drop(['file'], 1)

In [6]:
scaler = StandardScaler()
scaler.fit(features_train)
X_train = scaler.transform(features_train)
X_test = scaler.transform(features_test)

In [7]:
X_train.shape

(7882, 77)

In [8]:
X_test.shape

(100, 77)

In [13]:
def count_good_recommend(query_idx, answer_list, trhold):
    counter = 0
    query_id = int(df_test.iloc[query_idx].file[:-4])
    relevant = [query_id]
    query_tags = main_dict[query_id]['all_tags']
    for idx in answer_list:
        answer_id = int(df_train.iloc[idx].file[:-4])
        answer_tags = main_dict[answer_id]['all_tags']
        score = 0
        i = 0
        for tag in answer_tags:
            if tag in query_tags:
                i += 1
        if i == 0:
            continue
        
        precision = i / len(answer_tags)
        recall = i / len(query_tags)
        score = 2 * precision * recall / (precision + recall)
        if score >= trhold:        
            counter += 1
            relevant.append(answer_id)
    return counter, relevant

In [17]:
def count_ndcg(neighbors, relevant_list):
    dcg = 0
    idcg = 0
    for i in range(len(neighbors)):
        track = int(df_train.iloc[neighbors[i]].file[:-4])
        if track in relevant_list:
            dcg += 1 / math.log(i + 2, 2)
        idcg += 1 / math.log(i + 2, 2)
    ndcg = dcg / idcg
    return ndcg

In [25]:
neigh = NearestNeighbors(10, metric='mahalanobis', algorithm='brute')
neigh.fit(X_train)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='mahalanobis',
                 metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                 radius=1.0)

In [26]:
mean_ap = 0
ndcg = 0
for i in range(100):
    neighbors = neigh.kneighbors([X_test[i]])[1][0]
    relevant_num, relevant_list = count_good_recommend(i, neighbors, trhold=0.4)
#     print(relevant_list)

    p_at_10 = relevant_num / 10
    mean_ap += p_at_10
    ndcg += count_ndcg(neighbors, relevant_list[1:])
mean_ap /= 100 
mean_ndcg = ndcg / 100
print(neigh.metric, mean_ap, mean_ndcg)

mahalanobis 0.087 0.10417105109157106


In [23]:
for dist in ['euclidean', 'manhattan', 'chebyshev']:
    neigh = NearestNeighbors(10, metric=dist)
    neigh.fit(X_train)
    mean_ap = 0
    ndcg = 0
    for i in range(100):
        neighbors = neigh.kneighbors([X_test[i]])[1][0]
        relevant_num, relevant_list = count_good_recommend(i, neighbors, trhold=0.4)
    #     print(relevant_list)

        p_at_10 = relevant_num / 10
        mean_ap += p_at_10
        ndcg += count_ndcg(neighbors, relevant_list[1:])
    mean_ap /= 100 
    mean_ndcg = ndcg / 100
    print(neigh.metric, mean_ap, mean_ndcg)

euclidean 0.09899999999999995 0.11141956880491329
manhattan 0.10599999999999993 0.12524688935720543
chebyshev 0.04800000000000001 0.053965634893757795


In [27]:
cos_similarity_mtx = cosine_similarity(X_test, X_train)

In [29]:
mean_ap = 0
ndcg = 0
for i in range(100):
    arr = cos_similarity_mtx[i]
    neighbors = arr.argsort()[-10:][::-1]
    relevant_num, relevant_list = count_good_recommend(i, neighbors, trhold=0.4)
#     print(relevant_list)
#     print(relevant_list[0], [df_train.iloc[x].track[:-4] for x in neighbors])
    p_at_10 = relevant_num / 10
    mean_ap += p_at_10
    ndcg += count_ndcg(neighbors, relevant_list[1:])
mean_ap /= 100 
mean_ndcg = ndcg / 100

In [30]:
mean_ap

0.11199999999999993

In [31]:
mean_ndcg

0.12419061246513172

In [108]:
for i in range(100):
    arr = cos_similarity_mtx[i]
    cos_neighbors = arr.argsort()[-1:][::-1]
    euclid_neighbors = neigh.kneighbors([X_test[i]])[1][0]
    test = df_test.iloc[i].track
    euclid_recom = df_train.iloc[euclid_neighbors[0]].track
    cos_recom = df_train.iloc[cos_neighbors[0]].track

    print(test, euclid_recom, cos_recom)

010673.mp3 030682.mp3 028480.mp3
066643.mp3 091468.mp3 091468.mp3
000194.mp3 006331.mp3 000211.mp3
074421.mp3 148214.mp3 148214.mp3
131553.mp3 123835.mp3 029961.mp3
024431.mp3 024422.mp3 036987.mp3
144469.mp3 091934.mp3 095724.mp3
087100.mp3 087106.mp3 087106.mp3
069204.mp3 006439.mp3 035549.mp3
027454.mp3 026655.mp3 026655.mp3
004835.mp3 055430.mp3 059721.mp3
000667.mp3 049401.mp3 049401.mp3
014063.mp3 072072.mp3 149452.mp3
137171.mp3 119828.mp3 119828.mp3
061172.mp3 011868.mp3 097814.mp3
107579.mp3 073170.mp3 073170.mp3
137171.mp3 119828.mp3 119828.mp3
024901.mp3 117057.mp3 012737.mp3
048463.mp3 025378.mp3 045102.mp3
145550.mp3 064249.mp3 091790.mp3
011786.mp3 124917.mp3 124917.mp3
073764.mp3 017499.mp3 070655.mp3
012394.mp3 066539.mp3 084157.mp3
073761.mp3 082917.mp3 082917.mp3
108962.mp3 127299.mp3 127299.mp3
000190.mp3 000621.mp3 038833.mp3
062337.mp3 091851.mp3 004799.mp3
064093.mp3 145750.mp3 139933.mp3
004233.mp3 011682.mp3 011682.mp3
075752.mp3 109670.mp3 110070.mp3
124702.mp3

In [67]:
cos_similarity_mtx = cosine_similarity(X_test, X_train)

In [73]:
arr = cos_similarity_mtx[0]

In [76]:
arr.argsort()[-10:][::-1]

array([ 659, 6443, 2378,   71, 1067, 7035, 2826, 2379, 7715, 7009])

In [84]:
cos_similarity_mtx[0].sort()

In [85]:
cos_similarity_mtx[0][-10:]

array([0.76576922, 0.77147324, 0.78109024, 0.78396155, 0.78587406,
       0.78773973, 0.80250647, 0.80814343, 0.81528035, 0.83231566])

In [87]:
math.cos(0)

1.0

In [22]:
print(f'euclidean meanAP {mean_ap}, meanNDCG {mean_ndcg}')

euclidean meanAP 0.09299999999999997, meanNDCG 0.10454909808110224


In [29]:
print(f'namhattan meanAP {mean_ap}, meanNDCG {mean_ndcg}')

namhattan meanAP 0.06899999999999998, meanNDCG 0.08812867556608356


In [36]:
print(f'chebyshev meanAP {mean_ap}, meanNDCG {mean_ndcg}')

chebyshev meanAP 0.05899999999999998, meanNDCG 0.0732865384316484


In [45]:
print(f'mahalanobis meanAP {mean_ap}, meanNDCG {mean_ndcg}')

mahalanobis meanAP 0.04799999999999999, meanNDCG 0.056724564869462514
