# Collaborative user-user filtering

Imports

In [1]:
import numpy as np
import pandas as pd

The original file contains
- 1,019,318 unique users
- 48,373,586 user-song.play count triplets

A subset of 1000 triplets can be found in triplets_1000.txt, where each line is in the format:
    
    userID \tab songID \tab play_count

Read in the data:

In [2]:
user_profiles = pd.read_csv('triplets_50000.txt', sep='\t', names = ['userID','songID', 'play_count'])

In [9]:
user_profiles_full = pd.read_csv('train_triplets.txt', sep='\t', names = ['userID','songID', 'play_count'])
# max playcount is 9667 so we can safely convert to uint16
data_types_dict = {'play_count': np.uint16}
user_profiles_full = user_profiles_full.astype(data_types_dict)

In [10]:
user_profiles_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48373586 entries, 0 to 48373585
Data columns (total 3 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   userID      object
 1   songID      object
 2   play_count  uint16
dtypes: object(2), uint16(1)
memory usage: 830.4+ MB


In [4]:
user_profiles_full.play_count.max()

9667

In [15]:
# user_profiles_full = user_profiles_full.pivot_table(index="userID", columns="songID", values="play_count", aggfunc=np.sum,fill_value=0)
user_profiles_full.groupby(['userID', 'songID']).play_count.sum().unstack()

print(user_profiles_full.memory_usage().sum() / 1e6)
# convert datafram to sparse dataframe
user_profiles_full = user_profiles_full.astype(pd.SparseDtype("int", 0))
# check memory usage in mb
user_profiles_full.memory_usage().sum() / 1e6

MemoryError: cannot allocate memory for array

The problem: the original dataset of triplets is too large to be converted in this way.
Possible solutions:
1. dtype optimization
2. Split data into chunks

In [3]:
user_profiles

Unnamed: 0,userID,songID,play_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1
...,...,...,...
49995,f0cd8df775b33e171e2f1f5454338e2f82feaa89,SOLFDCL12A8C141E21,2
49996,f0cd8df775b33e171e2f1f5454338e2f82feaa89,SOLZPIJ12A8C13841B,1
49997,f0cd8df775b33e171e2f1f5454338e2f82feaa89,SOMSKHE12AB0181BBC,1
49998,f0cd8df775b33e171e2f1f5454338e2f82feaa89,SOMVLRV12AB01816EB,2


Pivot to tranform the data from long to wide:

In [4]:
user_profiles = user_profiles.pivot(index='userID', columns='songID', values='play_count')

In [5]:
user_profiles

songID,SOAAAGQ12A8C1420C8,SOAACPJ12A81C21360,SOAACTC12AB0186A20,SOAADCB12A81C22AFA,SOAAEJI12AB0188AB5,SOAAEKX12A6D4F7E4E,SOAAFAC12A67ADF7EB,SOAAFYH12A8C13717A,SOAAIJG12AAA15D821,SOAAIWE12A8AE4706B,...,SOZZVFP12A8C140F14,SOZZVMW12AB0183B52,SOZZVNT12AF729EBC9,SOZZVWB12AB0189C30,SOZZWZV12A67AE140F,SOZZXAO12A58A7D379,SOZZYAO12A6701FF36,SOZZYDA12AB01824FB,SOZZYMH12AB0180A51,SOZZZFB12A8AE45CDC
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0007c0e74728ca9ef0fe4eb7f75732e8026a278b,,,,,,,,,,,...,,,,,,,,,,
0039bd8483d578997718cdc0bf6c7c88b679f488,,,,,,,,,,,...,,,,,,,,,,
00498f4bab2bfeb17680113c7d9525ad5b0ad401,,,,,,,,,,,...,,,,,,,,,,
00a443baf550f4bbdd974ba73720abf2759166f3,,,,,,,,,,,...,,,,,,,,,,
0152fcbd02b172a874c75a57a913f0f0109ba272,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fe979a7b199de3ee8a78486c10e5ed13587fc359,,,,,,,,,,,...,,,,,,,,,,
fed37c4c49c9f217b3371c2f2c0e7541656e55cf,,,,,,,,,,,...,,,,,,,,,,
ff18ea9a13583f7f7aaa83719e0b22ce5618e9cf,,,,,,,,,,,...,,,,,,,,,,
ff4322e94814d3c7895d07e6f94139b092862611,,,,,,,,,,,...,,,,,,,,,,


In [6]:
# http://millionsongdataset.com/sites/default/files/AdditionalFiles/unique_tracks.txt
songs = pd.read_csv('unique_tracks.txt' ,sep='<SEP>', names=['track_id',  'song_id',  'artist_name', 'song_title'], engine='python') 
songs.head()

Unnamed: 0,track_id,song_id,artist_name,song_title
0,TRMMMYQ128F932D901,SOQMMHC12AB0180CB8,Faster Pussy cat,Silent Night
1,TRMMMKD128F425225D,SOVFVAK12A8C1350D9,Karkkiautomaatti,Tanssi vaan
2,TRMMMRX128F93187D9,SOGTUKN12AB017F4F1,Hudson Mohawke,No One Could Ever
3,TRMMMCH128F425532C,SOBNYVR12A8C13558C,Yerba Brava,Si Vos Querés
4,TRMMMWA128F426B589,SOHSBXH12A8C13B0DF,Der Mystic,Tangle Of Aspens


In [7]:
from collections import defaultdict
id_to_song_name = defaultdict(lambda : 'NA', zip(songs.song_id, songs.song_title))
unique_songs = user_profiles.columns
names = []
for song in unique_songs:
    names.append(id_to_song_name[song]) 
# check number of missing songs
a = np.array(names)
a[a == 'NA'].size

0

In [8]:
#save it as a csv (do it only once)
#user_profiles.to_csv(path_or_buf= 'data/user_profile_from_1000_triplets.csv')

Drop the columns where all elements are NaN

In [9]:
# this line wont have any effect since each song only exist if at least one user already has listened to it
user_profiles = user_profiles.dropna(axis=1, how='all')
user_profiles

songID,SOAAAGQ12A8C1420C8,SOAACPJ12A81C21360,SOAACTC12AB0186A20,SOAADCB12A81C22AFA,SOAAEJI12AB0188AB5,SOAAEKX12A6D4F7E4E,SOAAFAC12A67ADF7EB,SOAAFYH12A8C13717A,SOAAIJG12AAA15D821,SOAAIWE12A8AE4706B,...,SOZZVFP12A8C140F14,SOZZVMW12AB0183B52,SOZZVNT12AF729EBC9,SOZZVWB12AB0189C30,SOZZWZV12A67AE140F,SOZZXAO12A58A7D379,SOZZYAO12A6701FF36,SOZZYDA12AB01824FB,SOZZYMH12AB0180A51,SOZZZFB12A8AE45CDC
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0007c0e74728ca9ef0fe4eb7f75732e8026a278b,,,,,,,,,,,...,,,,,,,,,,
0039bd8483d578997718cdc0bf6c7c88b679f488,,,,,,,,,,,...,,,,,,,,,,
00498f4bab2bfeb17680113c7d9525ad5b0ad401,,,,,,,,,,,...,,,,,,,,,,
00a443baf550f4bbdd974ba73720abf2759166f3,,,,,,,,,,,...,,,,,,,,,,
0152fcbd02b172a874c75a57a913f0f0109ba272,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fe979a7b199de3ee8a78486c10e5ed13587fc359,,,,,,,,,,,...,,,,,,,,,,
fed37c4c49c9f217b3371c2f2c0e7541656e55cf,,,,,,,,,,,...,,,,,,,,,,
ff18ea9a13583f7f7aaa83719e0b22ce5618e9cf,,,,,,,,,,,...,,,,,,,,,,
ff4322e94814d3c7895d07e6f94139b092862611,,,,,,,,,,,...,,,,,,,,,,


In [10]:
# Replace the NaN with 0s.
user_profiles = user_profiles.fillna(0)

Get **cosine similarity** for play counts between users

In [11]:
# pairwise_distances is the distance between counts, thus 1 - pairwise_distances is the similarity between counts
from sklearn.metrics import pairwise_distances

cosine_sim = 1-pairwise_distances(user_profiles , metric="cosine")

In [88]:
# Calculate the cosine similarity matrix for the users
M_cosine = pd.DataFrame(cosine_sim)
M_cosine

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1000,1001,1002,1003,1004,1005,1006,1007,1008,1009
0,1.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.014222
1,0.000000,1.000000,0.0,0.007411,0.014263,0.0,0.000672,0.000000,0.000000,0.0,...,0.000000,0.013076,0.005531,0.0,0.011609,0.0,0.001853,0.000000,0.0,0.000000
2,0.000000,0.000000,1.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000
3,0.000000,0.007411,0.0,1.000000,0.000000,0.0,0.001498,0.000000,0.010910,0.0,...,0.014027,0.012416,0.000000,0.0,0.013534,0.0,0.000000,0.000854,0.0,0.023939
4,0.000000,0.014263,0.0,0.000000,1.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.008441,0.000000,0.0,0.031711,0.0,0.000000,0.006902,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1005,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,1.0,0.000000,0.000000,0.0,0.000000
1006,0.000000,0.001853,0.0,0.000000,0.000000,0.0,0.001476,0.000000,0.004093,0.0,...,0.000000,0.000000,0.010123,0.0,0.000000,0.0,1.000000,0.000000,0.0,0.000000
1007,0.000000,0.000000,0.0,0.000854,0.006902,0.0,0.000000,0.000675,0.000000,0.0,...,0.000000,0.000000,0.000000,0.0,0.001532,0.0,0.000000,1.000000,0.0,0.009003
1008,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,1.0,0.000000


Get **pearson similarity** for all users

In [70]:
pearson_sim = 1-pairwise_distances(user_profiles, metric="correlation")
M_pearson = pd.DataFrame(pearson_sim)
M_pearson

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1000,1001,1002,1003,1004,1005,1006,1007,1008,1009
0,1.000000,-0.000764,-0.000345,-0.000615,-0.000546,-0.001072,-0.000511,-0.000550,-0.000349,-0.000316,...,-0.000572,-0.001126,-0.000490,-0.000542,-0.001237,-0.000556,-0.000763,-0.000419,-0.000404,0.013652
1,-0.000764,1.000000,-0.000601,0.006349,0.013328,-0.001866,-0.000217,-0.000957,-0.000607,-0.000550,...,-0.000995,0.011144,0.004684,-0.000943,0.009485,-0.000967,0.000528,-0.000729,-0.000704,-0.001007
2,-0.000345,-0.000601,1.000000,-0.000483,-0.000429,-0.000843,-0.000402,-0.000432,-0.000274,-0.000248,...,-0.000449,-0.000884,-0.000385,-0.000425,-0.000972,-0.000437,-0.000600,-0.000329,-0.000318,-0.000454
3,-0.000615,0.006349,-0.000483,1.000000,-0.000763,-0.001500,0.000784,-0.000770,0.010428,-0.000442,...,0.013238,0.010865,-0.000685,-0.000758,0.011833,-0.000777,-0.001068,0.000269,-0.000566,0.023149
4,-0.000546,0.013328,-0.000429,-0.000763,1.000000,-0.001332,-0.000635,-0.000683,-0.000433,-0.000392,...,-0.000710,0.007058,-0.000608,-0.000673,0.030241,-0.000690,-0.000948,0.006385,-0.000502,-0.000718
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1005,-0.000556,-0.000967,-0.000437,-0.000777,-0.000690,-0.001357,-0.000647,-0.000696,-0.000441,-0.000400,...,-0.000723,-0.001424,-0.000620,-0.000685,-0.001564,1.000000,-0.000965,-0.000530,-0.000512,-0.000732
1006,-0.000763,0.000528,-0.000600,-0.001068,-0.000948,-0.001863,0.000589,-0.000956,0.003490,-0.000549,...,-0.000993,-0.001956,0.009282,-0.000941,-0.002148,-0.000965,1.000000,-0.000727,-0.000703,-0.001005
1007,-0.000419,-0.000729,-0.000329,0.000269,0.006385,-0.001022,-0.000487,0.000151,-0.000332,-0.000301,...,-0.000545,-0.001073,-0.000467,-0.000516,0.000357,-0.000530,-0.000727,1.000000,-0.000385,0.008457
1008,-0.000404,-0.000704,-0.000318,-0.000566,-0.000502,-0.000987,-0.000471,-0.000506,-0.000321,-0.000291,...,-0.000527,-0.001036,-0.000451,-0.000499,-0.001138,-0.000512,-0.000703,-0.000385,1.000000,-0.000533


Same for euclidean and hamming :

In [71]:
euclidean_sim = 1-pairwise_distances(user_profiles, metric="euclidean")
M_euclidean = pd.DataFrame(euclidean_sim)

hamming_sim = 1-pairwise_distances(user_profiles, metric="hamming")
M_hamming = pd.DataFrame(hamming_sim)



A function that finds k similar users given userID and the user_profiles matrix

In [75]:
from sklearn.neighbors import NearestNeighbors

def get_similarusers(userID, user_profiles, similarity_metric , k):
    '''Find k most similar users to a given userID'''
    similarity = list()
    neigh_ind = list()
    
    knn = NearestNeighbors(metric = similarity_metric , algorithm = 'brute')
    knn.fit(user_profiles.values) #taking .values to avoid sklearn warning
                                #UserWarning: X does not have valid feature names, but NearestNeighbors was fitted with feature names
    
    neigh_dist, neigh_ind = knn.kneighbors(user_profiles.loc[userID].values.reshape(1,-1), n_neighbors = k+1) #plus one, bcs it includes the user we want to compare against 
    similarity = 1-neigh_dist.flatten()
    print('{} most similar users to user {}, using {} similarity:\n'.format(k, userID, similarity_metric))
    
    for i in range(0,len(neigh_ind.flatten())):
        if user_profiles.index[neigh_ind.flatten()[i]] == userID:
            continue;
        else:
            print('{}: User {}, with similarity of {}'.format(i, user_profiles.index[neigh_ind.flatten()[i]], similarity.flatten()[i]))
            
    return similarity,neigh_ind

In [76]:
similarities,indices = get_similarusers( '5a905f000fc1ff3df7ca807d57edb608863db05d', user_profiles , similarity_metric = 'cosine', k = 4)

4 most similar users to user 5a905f000fc1ff3df7ca807d57edb608863db05d, using cosine similarity:

1: User 5696bf760215a24ba6f381b8e466828131ddf13b, with similarity of 0.24960454854408143
2: User cbf471272db7a183cf596a4cb79aa27e7d72dda1, with similarity of 0.18669122855068843
3: User bd4c6e843f00bd476847fb75c47b4fb430a06856, with similarity of 0.13388321848534124
4: User b98490b4e714cbdf98f4e819c150fc6eff83cb28, with similarity of 0.12962980922945277


In [77]:
similarities,indices = get_similarusers( '5a905f000fc1ff3df7ca807d57edb608863db05d', user_profiles , similarity_metric = 'correlation', k = 4)

4 most similar users to user 5a905f000fc1ff3df7ca807d57edb608863db05d, using correlation similarity:

1: User 5696bf760215a24ba6f381b8e466828131ddf13b, with similarity of 0.24690946133746028
2: User cbf471272db7a183cf596a4cb79aa27e7d72dda1, with similarity of 0.1855051128720917
3: User bd4c6e843f00bd476847fb75c47b4fb430a06856, with similarity of 0.13279425369885756
4: User b98490b4e714cbdf98f4e819c150fc6eff83cb28, with similarity of 0.1290498311209165


# Item-Item collaborative filtering

In [13]:
song_profiles = user_profiles.T
song_profiles = song_profiles.fillna(0)
song_profiles

userID,0007c0e74728ca9ef0fe4eb7f75732e8026a278b,0039bd8483d578997718cdc0bf6c7c88b679f488,00498f4bab2bfeb17680113c7d9525ad5b0ad401,00a443baf550f4bbdd974ba73720abf2759166f3,0152fcbd02b172a874c75a57a913f0f0109ba272,01655ae6bc52e29c9cd100a7dde4e9eeae5e4031,019d0d1c7a01f8736ba59a124160e5fc70666db7,02c2dbc1119bca9513259f1b1903432538037d10,037167e01a2b265b8ee59694db943f9556876be2,03c90bfd09151973863c4cadd5a749cd7982abc0,...,fca15b7964a099b2860dfdb158a2430fb10c4384,fd13b9d49c54e00ff413fe3c095ba581c7fc611e,fd585aef5c32d3943bd6e7f9f39aa216ba659fd0,fdb815231ee1d66f383b80d279bd58769dfe59ff,fe76c9d535c5834e4a9b91c13e29be6460cb79c4,fe979a7b199de3ee8a78486c10e5ed13587fc359,fed37c4c49c9f217b3371c2f2c0e7541656e55cf,ff18ea9a13583f7f7aaa83719e0b22ce5618e9cf,ff4322e94814d3c7895d07e6f94139b092862611,ffadf9297a99945c0513cd87939d91d8b602936b
songID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SOAAAGQ12A8C1420C8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SOAACPJ12A81C21360,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SOAACTC12AB0186A20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SOAADCB12A81C22AFA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SOAAEJI12AB0188AB5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SOZZXAO12A58A7D379,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SOZZYAO12A6701FF36,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SOZZYDA12AB01824FB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SOZZYMH12AB0180A51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
from sklearn.metrics import pairwise_distances_chunked
pearson_sim = 1-next(pairwise_distances_chunked(song_profiles, metric="correlation", n_jobs=4))

In [15]:
# will form 28k x 28k matrix
# matrix is too large, so work wit a chunk of it at a time
M_pearson = pd.DataFrame(pearson_sim)
M_pearson

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27888,27889,27890,27891,27892,27893,27894,27895,27896,27897
0,1.000000,-0.000991,-0.000991,-0.000991,-0.000991,-0.000991,-0.001985,-0.000991,-0.000991,-0.000991,...,-0.000991,-0.000991,-0.000991,-0.001718,-0.001149,-0.000991,-0.000991,-0.000991,-0.000991,-0.000991
1,-0.000991,1.000000,-0.000991,-0.000991,-0.000991,-0.000991,-0.001985,-0.000991,-0.000991,-0.000991,...,-0.000991,-0.000991,-0.000991,-0.001718,-0.001149,-0.000991,-0.000991,-0.000991,-0.000991,-0.000991
2,-0.000991,-0.000991,1.000000,-0.000991,-0.000991,-0.000991,-0.001985,-0.000991,-0.000991,-0.000991,...,-0.000991,-0.000991,-0.000991,-0.001718,-0.001149,-0.000991,-0.000991,-0.000991,-0.000991,-0.000991
3,-0.000991,-0.000991,-0.000991,1.000000,-0.000991,-0.000991,-0.001985,-0.000991,-0.000991,-0.000991,...,-0.000991,-0.000991,-0.000991,-0.001718,-0.001149,-0.000991,-0.000991,-0.000991,-0.000991,-0.000991
4,-0.000991,-0.000991,-0.000991,-0.000991,1.000000,-0.000991,-0.001985,-0.000991,-0.000991,-0.000991,...,-0.000991,-0.000991,-0.000991,-0.001718,-0.001149,-0.000991,-0.000991,-0.000991,-0.000991,-0.000991
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4806,-0.000991,-0.000991,-0.000991,-0.000991,-0.000991,-0.000991,-0.001985,-0.000991,-0.000991,-0.000991,...,1.000000,-0.000991,-0.000991,-0.001718,-0.001149,-0.000991,-0.000991,-0.000991,-0.000991,-0.000991
4807,-0.001121,-0.001121,-0.001121,-0.001121,-0.001121,-0.001121,-0.002246,-0.001121,-0.001121,-0.001121,...,-0.001121,-0.001121,-0.001121,-0.001944,-0.001300,-0.001121,-0.001121,-0.001121,-0.001121,-0.001121
4808,-0.001401,-0.001401,-0.001401,-0.001401,-0.001401,-0.001401,-0.002806,-0.001401,-0.001401,-0.001401,...,-0.001401,-0.001401,-0.001401,-0.002428,-0.001623,-0.001401,-0.001401,-0.001401,-0.001401,-0.001401
4809,-0.001985,0.499256,-0.001985,-0.001985,-0.001985,-0.001985,-0.003976,-0.001985,-0.001985,-0.001985,...,-0.001985,-0.001985,-0.001985,-0.003442,-0.002300,-0.001985,-0.001985,-0.001985,-0.001985,-0.001985


In [27]:
from sklearn.neighbors import NearestNeighbors

def get_similar_songs(songID, song_profiles, similarity_metric, k):
    '''Find k most similar users to a given userID'''
    similarity = list()
    neigh_ind = list()
    
    knn = NearestNeighbors(metric = similarity_metric, algorithm = 'brute')
    knn.fit(song_profiles.values) #taking .values to avoid sklearn warning
                                #UserWarning: X does not have valid feature names, but NearestNeighbors was fitted with feature names
    
    neigh_dist, neigh_ind = knn.kneighbors(song_profiles.loc[songID].values.reshape(1,-1), n_neighbors = k+1) #plus one, bcs it includes the user we want to compare against 
    similarity = 1-neigh_dist.flatten()
    print('{} most similar song to song {}, using {} similarity:\n'.format(k, id_to_song_name[songID], similarity_metric))
    
    for i in range(0,len(neigh_ind.flatten())):
        song_id = song_profiles.index[neigh_ind.flatten()[i]]
        if song_id == songID:
            continue;
        else:
            print('{}: song {}, with similarity of {}'.format(i, id_to_song_name[song_id], similarity.flatten()[i]))
            
    return similarity, neigh_ind

In [31]:
song = 'SOZZXAO12A58A7D379'
print('song name: ', id_to_song_name[song])
similarities,indices = get_similar_songs(song, song_profiles, similarity_metric = 'correlation', k = 4)

song name:  I Swear (LP Version)
4 most similar song to song I Swear (LP Version), using correlation similarity:

0: song Bhangra Fever, with similarity of 1.0
1: song Learning To Live With Me, with similarity of 1.0
2: song Don't Speak, with similarity of 1.0
3: song Je Reviens Te Chercher, with similarity of 1.0
4: song It's Goin' Down 2nite, with similarity of 1.0
