In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

combined_df = pd.read_csv("data/subset100playlists.csv")
test_df = pd.read_csv("data/subset100playlists_test.csv")

In [6]:
combined_df.groupby(['pid'])['index'].count()

pid
6011      102
11502     160
19950     109
28391     108
33261     107
54473     169
57440     132
81229     198
85690     160
86423     207
97137     125
104270    170
107586    136
127235    142
152727    137
167617    233
169189    111
184470    139
219176    136
222526    108
224809    118
232425    233
241591    120
244700    125
248265    127
252316    142
264226    154
264300    178
273607    115
309778    155
         ... 
732561    114
754264    193
765870    199
767620    124
769512    104
775664    106
793615    102
800641    188
805069    170
806989    123
814658    108
820621    150
843266    137
854472    191
855953    112
894859    116
899373    181
907412    106
907923    108
910008    180
915320    113
920984    126
922162    132
927001    101
940882    149
954899    151
961257    103
966424    113
982756    116
992893    170
Name: index, Length: 100, dtype: int64

In [7]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
df_cleaned= combined_df.select_dtypes(include=numerics)

#var_drop = ["index","pid","pos", "count"]
#df_cleaned = df_cleaned.drop(var_drop, axis =1)

df_cleaned = pd.concat([df_cleaned, combined_df['track_uri']],axis=1)
train, test = train_test_split(df_cleaned, test_size=0.2, random_state=209, stratify = combined_df['pid'])

In [8]:
train.describe()

Unnamed: 0,index,pid,pos,count,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
count,10849.0,10849.0,10849.0,10849.0,10849.0,10849.0,10849.0,10849.0,10849.0,10849.0,10849.0,10849.0,10849.0,10849.0,10849.0,10849.0,10849.0
mean,8080.006913,491964.906166,70.895013,8232.196055,0.158974,0.651862,233644.772145,0.685226,0.013163,5.128768,0.18681,-6.428973,0.644668,0.111085,122.082382,3.981381,0.554436
std,4795.423161,295137.962542,45.251782,7411.240935,0.199567,0.147992,50593.161111,0.167972,0.078921,3.67465,0.14447,2.655348,0.478636,0.109864,28.208577,0.254067,0.225426
min,107.0,6011.0,0.0,63.0,6e-06,0.144,19133.0,0.0279,0.0,0.0,0.0157,-34.475,0.0,0.0225,43.509,0.0,0.034
25%,4104.0,244700.0,34.0,2722.0,0.0202,0.549,202667.0,0.574,0.0,1.0,0.0924,-7.678,0.0,0.0386,99.99,4.0,0.386
50%,7565.0,458870.0,68.0,6106.0,0.0736,0.658,225882.0,0.702,0.0,5.0,0.129,-5.9,1.0,0.0604,120.151,4.0,0.559
75%,12571.0,769512.0,102.0,11777.0,0.222,0.757,255773.0,0.813,4.7e-05,8.0,0.25,-4.589,1.0,0.142,139.982,4.0,0.739
max,16248.0,992893.0,232.0,46574.0,0.987,0.98,728413.0,0.996,0.987,11.0,0.971,-0.804,1.0,0.956,210.75,5.0,0.99


In [9]:
train.head()

Unnamed: 0,index,pid,pos,count,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,track_uri
11900,14843,907412,6,2170,0.103,0.537,241985,0.695,0.00315,9,0.134,-5.013,1,0.0371,144.674,4,0.626,spotify:track:2oVyKK5NH2oawyrDZJn581
7619,8761,536000,2,944,0.0446,0.476,201373,0.991,0.00574,10,0.128,-3.458,1,0.052,111.03,4,0.453,spotify:track:5ETe7SElBIgm2NAZY3SpX4
2302,2860,167617,140,1593,0.258,0.39,377413,0.589,3.9e-05,9,0.069,-9.363,1,0.0384,166.093,4,0.599,spotify:track:4NSZz3qzNESLWfCqglnylo
10254,12675,775664,44,9760,0.175,0.953,214648,0.41,0.0,1,0.0768,-6.543,1,0.0705,121.074,4,0.718,spotify:track:1sCxVKWImDZSZKvG0U9B23
10290,12675,775664,80,12997,0.173,0.706,182307,0.751,0.0,9,0.168,-6.323,1,0.0708,91.031,4,0.195,spotify:track:4E5P1XyAFtrjpiIxkydly4


In [10]:
# Function to make predictions based on playlist number
def knn_predict(train, test, playlistid, n_neighbors, n_predictions):
    
    # Create Sparse Matrix
    matrix = pd.crosstab(train.pid, train.track_uri)
    playlist_sparse_matrix = csr_matrix(matrix)
    
    #Create knn model based on sparse matrix from training data
    model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=n_neighbors, n_jobs=-1)
    model_knn.fit(playlist_sparse_matrix)
    
    #Identify the distance and index of songs based on knn model 
    distance, index = model_knn.kneighbors(np.array(matrix.loc[playlistid]).reshape(1, -1), n_neighbors = n_neighbors)
 
    #Calculate number of songs in test set for given playlist
    song_count = len(test[test.pid == playlistid])
    print(song_count)
   
    #Obtain recommended playlists from matrix
    rec_playlists = matrix.index[index[0]]

    #List of predicted songs
    pred_songs = []
    
    for playlist in rec_playlists:
        predicted_songs = playlists_df_subset[playlists_df_subset["pid"] == playlist]["track_uri"]
        for song in predicted_songs:
            if len(pred_songs) == n_predictions:
                break
            if len(pred_songs) < n_predictions:
                if song not in matrix.columns.values[matrix.loc[playlistid] == 1]: #Exclude songs in playlist that were part of the training data
                    pred_songs.append(song)
                    
                
    return pred_songs

In [13]:
print(knn_predict(train, test, 775664, 10, 15))

21
['spotify:track:3m660poUr1chesgkkjQM7P', 'spotify:track:2XMTqoHHSH0lvuXrvIEdco', 'spotify:track:5dNfHmqgr128gMY2tc5CeJ', 'spotify:track:66TRwr5uJwPt15mfFkzhbi', 'spotify:track:1KhljCoJ2TgDEldnnfTHHc', 'spotify:track:7H6ev70Weq6DdpZyyTmUXk', 'spotify:track:6mghCOaaSvrke0z1EUVUIf', 'spotify:track:6QgjcU0zLnzq5OrUoSZ3OK', 'spotify:track:1W7zkKgRv9mrLbfdQ8XyH3', 'spotify:track:0kN8xEmgMW9mh7UmDYHlJP', 'spotify:track:3pLTOP0G0etiWUknFoRpsr', 'spotify:track:27GmP9AWRs744SzKcpJsTZ', 'spotify:track:5lnsCyEKWofnC00U4Ax0ti', 'spotify:track:5GXAXm5YOmYT0kL5jHvYBt', 'spotify:track:0prNGof3XqfTvNDxHonvdK']


In [14]:
#Function to calculate r_precision (https://recsys-challenge.spotify.com/rules)
#R-precision is the number of retrieved relevant tracks divided by the number of known relevant tracks (i.e., the number of withheld tracks) 

def r_precision(preds, known):
    score = np.sum(known.isin(preds))/known.shape[0]
    return score

In [15]:
n_neighbors, n_predictions = 25, 15
r_precision_scores = []

#Loop through all unique playlists in test set to identify predicted songs
for index, pid in enumerate(test['pid'].drop_duplicates()): 
    print (index, pid)
    pred_songs = knn_predict(train, test, pid, n_neighbors, n_predictions) 
    y_test = test[test.pid == pid].track_uri
    r_precision_score = r_precision(pred_songs, y_test) 
    r_precision_scores.append(r_precision_score)

0 244700
25
1 966424
23
2 915320
23
3 232425
47
4 264226
31
5 894859
23
6 632423
22
7 505427
34
8 854472
38
9 219176
27
10 57440
26
11 81229
40
12 727800
23
13 252316
28
14 458870
31
15 940882
30
16 765870
40
17 535606
24
18 443152
27
19 54473
34
20 104270
34
21 169189
22
22 314149
30
23 754264
39
24 349985
27
25 608145
25
26 273607
23
27 377320
28
28 489475
21
29 691591
31
30 405597
26
31 264300
36
32 309778
31
33 922162
26
34 19950
22
35 222526
22
36 428657
27
37 775664
21
38 591476
23
39 820621
30
40 910008
36
41 107586
27
42 954899
30
43 805069
34
44 920984
25
45 800641
38
46 85690
32
47 387732
26
48 317025
21
49 614190
28
50 440370
23
51 335195
23
52 224809
24
53 410524
26
54 559031
26
55 429811
22
56 793615
20
57 127235
28
58 961257
21
59 899373
36
60 33261
21
61 769512
21
62 542450
20
63 814658
22
64 843266
27
65 640244
32
66 579367
22
67 589625
36
68 184470
28
69 28391
22
70 855953
22
71 442634
21
72 614335
34
73 732561
23
74 992893
34
75 462165
27
76 386057
23
77 907412
21
78 

In [16]:
avg_rp = np.mean(r_precision_scores)
print('Avg. R-Precision Score: ', avg_rp)

Avg. R-Precision Score:  0.5398848971307835
