In [136]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from sklearn.utils import shuffle

subset100 = pd.read_csv("../raw_data/track_meta_100subset_new.csv")
subset100 = shuffle(subset100)
# subset100 = subset100.reset_index()

### Train-val-test split

In [137]:
features_drop = ["Playlist","Album", "Track", "Artist", "Trackid", "Artist_Name", "Track_Name", "Album_Name", "Artist_uri", "Track_uri", "Album_uri", "artist_genres", "explicit"]
subset100 = subset100.drop(features_drop, axis =1) # remove playlist related features

In [138]:
subset100.dtypes

Playlistid             int64
Track_Duration         int64
acousticness         float64
artist_popularity      int64
danceability         float64
energy               float64
instrumentalness     float64
key                    int64
liveness             float64
loudness             float64
mode                   int64
speechiness          float64
tempo                float64
time_signature         int64
valence              float64
dtype: object

In [139]:
subset100.head()

Unnamed: 0,Playlistid,Track_Duration,acousticness,artist_popularity,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
542,38828,258342,0.0291,74,0.641,0.922,0.0,2,0.0862,-4.457,1,0.0786,146.078,4,0.847
1142,114695,209440,0.0139,71,0.748,0.534,2e-06,9,0.0987,-7.876,0,0.0959,106.017,4,0.119
1105,97870,230733,0.0077,46,0.888,0.666,0.00156,10,0.0555,-5.547,0,0.163,125.993,4,0.464
452,22204,290906,0.348,81,0.381,0.594,0.0,3,0.0925,-5.97,1,0.0486,143.86,4,0.273
1153,114695,211908,0.0469,73,0.696,0.801,0.0,5,0.502,-5.641,0,0.0952,93.003,4,0.536


In [140]:
# Train-val-test split (20%)
train, test = train_test_split(subset100, test_size=0.2, random_state=42, stratify = subset100['Playlistid'])
train, val = train_test_split(train, test_size=0.2, random_state=42, stratify = train['Playlistid'])

In [141]:
train.head()

Unnamed: 0,Playlistid,Track_Duration,acousticness,artist_popularity,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
810,61388,330769,0.338,39,0.752,0.766,0.0,1,0.0861,-5.11,1,0.251,141.82,4,0.501
665,51590,209640,0.17,70,0.941,0.335,0.0,5,0.262,-10.179,0,0.505,120.041,4,0.707
1863,193450,262521,0.0685,72,0.597,0.531,4e-06,3,0.0617,-6.83,0,0.0295,74.938,4,0.271
2002,198885,204346,0.0524,88,0.779,0.787,0.0,10,0.14,-4.305,0,0.108,124.982,4,0.708
935,73524,263826,0.00381,66,0.866,0.606,6e-06,7,0.0861,-4.762,1,0.04,119.991,4,0.838


In [142]:
test.head()

Unnamed: 0,Playlistid,Track_Duration,acousticness,artist_popularity,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
539,38828,215866,0.0808,87,0.707,0.833,4.4e-05,1,0.0778,-2.553,0,0.0359,145.997,4,0.734
578,38828,219773,0.00112,54,0.57,0.886,0.00739,6,0.594,-4.145,0,0.044,158.035,4,0.805
2455,229646,520786,0.0353,70,0.367,0.452,0.00228,10,0.689,-10.422,1,0.0307,113.066,4,0.189
1755,186672,287733,0.0771,73,0.712,0.967,0.0,7,0.356,-5.206,1,0.047,145.024,4,0.895
510,37634,241202,0.231,70,0.503,0.555,0.000532,0,0.118,-4.886,1,0.227,90.916,4,0.467


In [143]:
train = train.reset_index(drop=True)

In [144]:
train.head()

Unnamed: 0,Playlistid,Track_Duration,acousticness,artist_popularity,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,61388,330769,0.338,39,0.752,0.766,0.0,1,0.0861,-5.11,1,0.251,141.82,4,0.501
1,51590,209640,0.17,70,0.941,0.335,0.0,5,0.262,-10.179,0,0.505,120.041,4,0.707
2,193450,262521,0.0685,72,0.597,0.531,4e-06,3,0.0617,-6.83,0,0.0295,74.938,4,0.271
3,198885,204346,0.0524,88,0.779,0.787,0.0,10,0.14,-4.305,0,0.108,124.982,4,0.708
4,73524,263826,0.00381,66,0.866,0.606,6e-06,7,0.0861,-4.762,1,0.04,119.991,4,0.838


### Content-based Collaborative Filtering

In [145]:
# train_content
from sklearn.metrics.pairwise import cosine_similarity 

In [146]:
train_noid = train.drop(["Playlistid"], axis=1)
test = test.drop(["Playlistid"], axis=1)

In [None]:
# def cos_similar_songs(cos_matrix, target_inx, k):
    

### Create a cosine-similarity matrix

In [147]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(train_noid)
train_scaled = scaler.transform(train_noid)
test_scaled = scaler.transform(test)

In [148]:
train_scaled_cos_matrix = cosine_similarity(train_scaled)
# np.dot(np.array(trial_scaled[0,]), np.array(trial_scaled[5,]))/ (np.linalg.norm(trial_scaled[0,]) * np.linalg.norm(trial_scaled[5,]))


In [149]:
train_scaled_cos_matrix[:5]

array([[1.        , 0.77864901, 0.78557408, ..., 0.87935908, 0.78405341,
        0.7471867 ],
       [0.77864901, 1.        , 0.8633858 , ..., 0.82595018, 0.85509706,
        0.88469696],
       [0.78557408, 0.8633858 , 1.        , ..., 0.81665914, 0.92625893,
        0.87363356],
       [0.75907538, 0.90784503, 0.92596571, ..., 0.88500774, 0.91763266,
        0.93526949],
       [0.91890508, 0.83016886, 0.83211765, ..., 0.96618927, 0.80701853,
        0.79176264]])

In [150]:
train.head()

Unnamed: 0,Playlistid,Track_Duration,acousticness,artist_popularity,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,61388,330769,0.338,39,0.752,0.766,0.0,1,0.0861,-5.11,1,0.251,141.82,4,0.501
1,51590,209640,0.17,70,0.941,0.335,0.0,5,0.262,-10.179,0,0.505,120.041,4,0.707
2,193450,262521,0.0685,72,0.597,0.531,4e-06,3,0.0617,-6.83,0,0.0295,74.938,4,0.271
3,198885,204346,0.0524,88,0.779,0.787,0.0,10,0.14,-4.305,0,0.108,124.982,4,0.708
4,73524,263826,0.00381,66,0.866,0.606,6e-06,7,0.0861,-4.762,1,0.04,119.991,4,0.838


In [None]:
def cos_similar_songs(cos_matrix, orig_df, target_inx, k):
    target_playlistid = train

In [157]:
cos_matrix = train_scaled_cos_matrix

In [151]:
cos_matrix = train_scaled_cos_matrix
target_inx = 0
## Above should be removed
target_playlistid = train.loc[0, "Playlistid"]

target_track_inx = np.where(train["Playlistid"] == target_playlistid)
# remove rows in matrix with songs that live in the same playlist with the target song
candidate_cos_matrix = np.delete(cos_matrix, target_track_inx, 1) # remoe songs that co-exist in the same playlist

candidate_song_rec = candidate_cos_matrix[target_inx, ] # row of target playlist

## Todo: sort the row with highest (recommend most similar tracks)

In [171]:
cos_matrix[target_inx,]

array([1.        , 0.77864901, 0.78557408, ..., 0.87935908, 0.78405341,
       0.7471867 ])

In [164]:
cos_matrix[:5]

array([[1.        , 0.77864901, 0.78557408, ..., 0.87935908, 0.78405341,
        0.7471867 ],
       [0.77864901, 1.        , 0.8633858 , ..., 0.82595018, 0.85509706,
        0.88469696],
       [0.78557408, 0.8633858 , 1.        , ..., 0.81665914, 0.92625893,
        0.87363356],
       [0.75907538, 0.90784503, 0.92596571, ..., 0.88500774, 0.91763266,
        0.93526949],
       [0.91890508, 0.83016886, 0.83211765, ..., 0.96618927, 0.80701853,
        0.79176264]])

In [156]:
target_track_inx

(array([   0,   70,   95,  122,  127,  133,  135,  185,  290,  318,  380,
         440,  486,  488,  499,  524,  537,  542,  636,  652,  706,  733,
         739,  754,  791, 1065, 1073, 1087, 1097, 1139, 1156, 1227, 1241,
        1263, 1266, 1299, 1325, 1358, 1462, 1482, 1484, 1502, 1526, 1561,
        1575, 1579, 1616, 1622, 1623, 1664, 1672, 1673, 1745, 1754, 1763,
        1781, 1790, 1855, 1865, 1923, 1954]),)

In [161]:
candidate_cos_matrix = np.delete(cos_matrix, target_track_inx, 0)

In [162]:
candidate_cos_matrix

(1909, 1970)

In [153]:
candidate_cos_matrix = cos_matrix[]

(array([   0,   70,   95,  122,  127,  133,  135,  185,  290,  318,  380,
         440,  486,  488,  499,  524,  537,  542,  636,  652,  706,  733,
         739,  754,  791, 1065, 1073, 1087, 1097, 1139, 1156, 1227, 1241,
        1263, 1266, 1299, 1325, 1358, 1462, 1482, 1484, 1502, 1526, 1561,
        1575, 1579, 1616, 1622, 1623, 1664, 1672, 1673, 1745, 1754, 1763,
        1781, 1790, 1855, 1865, 1923, 1954]),)

In [117]:
train.head()

Unnamed: 0,Playlistid,Track_Duration,acousticness,artist_popularity,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
763,61388,196600,0.0392,82,0.926,0.336,0.0,0,0.138,-9.326,0,0.594,92.995,4,0.38
647,51590,305840,0.000402,85,0.364,0.841,0.0,8,0.242,-4.831,1,0.119,100.14,5,0.0837
1868,193450,213427,0.00312,77,0.583,0.732,1e-05,11,0.269,-5.728,0,0.0457,145.992,4,0.276
1979,198885,195920,0.0841,83,0.732,0.396,3.6e-05,4,0.105,-9.348,0,0.0286,90.024,4,0.548
934,73524,326280,0.103,81,0.795,0.596,0.0,11,0.164,-3.799,0,0.219,101.985,4,0.357


In [119]:
# target_playlistid
train["Playlistid"]

763      61388
647      51590
1868    193450
1979    198885
934      73524
2378    223534
2589    231023
109       2259
1683    175237
1119    100221
2969    256979
1855    193450
2047    206080
2126    208692
1343    151474
181       2535
1338    151474
892      67941
1153    114695
217       2535
2175    210308
911      68119
2293    221044
533      38828
641      51590
758      59745
3001    256979
1169    117841
1128    107065
1784    190503
         ...  
2900    249966
689      56681
1540    155598
1209    118342
1195    118342
565      38828
541      38828
2407    229646
1179    117841
150       2535
2638    232421
750      59745
2761    241546
2637    232421
792      61388
1155    116737
2379    223534
2520    229646
733      58121
1245    118724
1580    164819
1850    193450
1244    118724
2763    241546
736      58121
1555    155598
630      48330
1736    182533
209       2535
453      22204
Name: Playlistid, Length: 1970, dtype: int64

In [4]:
# Create Binary Sparse Matrix
co_mat = pd.crosstab(train.Playlistid, train.Track_uri)
co_mat = co_mat.clip(upper=1)
assert np.max(co_mat.describe().loc['max']) == 1

co_mat_sparse = csr_matrix(co_mat)

In [5]:
# Train kNN model
col_filter = NearestNeighbors(metric='cosine', algorithm='brute')
col_filter.fit(co_mat_sparse)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

## Making Predictions

In [6]:
def nholdout(playlist_id, df):
    '''Pass in a playlist id to get number of songs held out in val/test set'''
    
    return len(df[df.Playlistid == playlist_id].Track_uri)

In [None]:
def kpredict(knnmodel, playlist_id, df):
    '''for a playlist id, generate list of 15*k predictions where k is num holdouts''' 
    
    k = nholdout(playlist_id, df)*15 # number of holdouts
    ref_songs = co_mat.columns.values[co_mat.loc[playlist_id] == 1] # songs already in playlist
    dist, ind = knnmodel.kneighbors(np.array(co_mat.loc[playlist_id]).reshape(1, -1), n_neighbors = 99)
    rec_ind = co_mat.index[ind[0]] # recommended playlists
    
    n_pred = 0
    pred = []
    for i in rec_ind:
        new_songs = co_mat.columns.values[co_mat.loc[i] == 1] # potential recommendations
        for song in new_songs:
            if song not in ref_songs: # only getting songs not already in target playlist
                pred.append(song)
                n_pred += 1
                if n_pred == k:
                    break
        if n_pred == k:
            break
    
    return pred

In [14]:
### Prediction Example
pi = 430 # target playlist index
kpreds = kpredict(col_filter, pi, val) # list of predictions

In [23]:
val_set = val[val.Playlistid == pi]
val_set = val_set['Track_uri'] # ground truth

## Metrics

In [24]:
def r_precision(prediction, val_set):
# prediction should be a list of predictions
# val_set should be pandas Series of ground truths
    score = np.sum(val_set.isin(prediction))/val_set.shape[0]
    return score

In [25]:
### Example Usage
r_precision(kpreds, val_set)

0.0

In [26]:
### NDCG Code Source: https://gist.github.com/bwhite/3726239
def dcg_at_k(r, k, method=0):
    r = np.asfarray(r)[:k]
    if r.size:
        if method == 0:
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0.


def ndcg_at_k(r, k, method=0):
    dcg_max = dcg_at_k(sorted(r, reverse=True), k, method)
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k, method) / dcg_max

In [28]:
### Example Usage
# Generate binary relevance array
r = np.zeros(len(kpreds))
for i, p in enumerate(kpreds):
    if p in val_set:
        r[i] = 1

ndcg_at_k(r, len(r))

0.0

## Baseline Model Performance

In [57]:
rps = []
ndcgs = []
for pid in co_mat.index:
    ps = kpredict(col_filter, pid, val) # predictions
    vs = val[val.Playlistid == pid].Track_uri # ground truth
    rps.append(r_precision(ps, vs))
    
    r = np.zeros(len(ps))
    for i, p in enumerate(ps):
        if np.any(vs.isin([p])):
            r[i] = 1
    ndcgs.append(ndcg_at_k(r, len(r)))
    

In [58]:
avg_rp = np.mean(rps)
avg_ndcg = np.mean(ndcgs)
print('Avg. R-Precision: ', avg_rp)
print('Avg. NDCG: ', avg_ndcg)
print('Total Sum: ', np.mean([avg_rp, avg_ndcg]))

Avg. R-Precision:  0.07702539127539126
Avg. NDCG:  0.08034624710411524
Total Sum:  0.07868581918975326
