In [31]:
import pandas as pd
import os

## Load DataFrame

In [6]:
# Load the song table from CSV; the path is relative to the current working directory.
df = pd.read_csv("../data/all.csv")

In [7]:
# List the column labels of the loaded frame.
df.columns

Index(['Age', 'Album_Name', 'Artist', 'Year', 'Description', 'Age Group',
       'Album_ID', 'Album_Name_Spotify', 'Artists_Spotify', 'Track_ID', 'ISRC',
       'Track_Name', 'Artists', 'popularity', 'preview_url', 'image_url',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'type', 'uri', 'track_href', 'analysis_url', 'duration_ms',
       'time_signature', 'lyrics'],
      dtype='object')

In [9]:
# Audio-feature field names. NOTE(review): not referenced by the other visible
# cells, and 'id' does not appear in the df.columns listing -- confirm it is needed.
song_features = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'type', 'id', 'uri', 'track_href', 'analysis_url', 'duration_ms', 'time_signature',
]

# Columns that must be non-null for a row to be kept.
columns = [
    'key', 'mode', 'time_signature', 'duration_min', 'popularity',
    'danceability', 'energy', 'loudness', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
]

# Track length converted from milliseconds to minutes.
df['duration_min'] = df['duration_ms'] / 1000 / 60

# Drop rows missing any required column, then store the three categorical
# codes as nullable integers.
df = (
    df.dropna(subset=columns)
      .astype({'key': 'Int64', 'mode': 'Int64', 'time_signature': 'Int64'})
)



In [5]:
# Preview the first three rows.
df.head(3)

Unnamed: 0,Track_ID,Track_Name,preview_url,image_url,Artists,Age,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,2FPQI1LRwWszttbRG8hknk,Games Monsters Play,https://p.scdn.co/mp3-preview/33cc59cc1836954e...,https://i.scdn.co/image/ab67616d0000b273d61faa...,"['Herry Monster', 'Grover']",2,5,0.738,0.544,7,-8.557,1,0.346,0.212,0.0,0.0937,0.961,144.448,204267,4
1,6pOoswwC1lNBI2TapMdaEW,Afraid of the Dark,https://p.scdn.co/mp3-preview/cf340f0b536edadd...,https://i.scdn.co/image/ab67616d0000b273d61faa...,['Telly Monster'],2,5,0.505,0.525,0,-10.897,1,0.109,0.355,0.0,0.1,0.444,127.922,141240,4
2,2EEwSq98rKwlRWT7sNCLRc,Eensy Weensy Spider,https://p.scdn.co/mp3-preview/4cdc12aaeb7da4b7...,https://i.scdn.co/image/ab67616d0000b273d61faa...,"['Count Von Count', 'The Sesame Street Kids']",2,9,0.875,0.338,0,-11.382,1,0.397,0.762,0.0,0.0992,0.962,116.027,94693,4


## Recommend Songs by KNN

### Fit Model

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import OneHotEncoder

from sklearn.neighbors import NearestNeighbors


# Columns that are one-hot encoded before the neighbor search.
categorical_columns = ['key', 'mode', 'time_signature']

# Columns forwarded unchanged by the column transformer.
numeric_columns = [
    'Age', 'duration_min', 'popularity', 'danceability', 'energy', 'loudness',
    'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
]

features = ColumnTransformer(
    transformers=[
        ('categorical', OneHotEncoder(), categorical_columns),
        ('numeric', 'passthrough', numeric_columns),
    ]
)

# Three steps: build the feature matrix, normalize it, index it for 10-NN lookup.
# NOTE(review): Normalizer rescales each sample (row) to unit norm rather than
# scaling columns, and the numeric columns reach it unscaled -- confirm this is
# intended, since wide-ranged columns may dominate the distances.
model = Pipeline(
    steps=[
        ('features', features),
        ('normalize', Normalizer()),
        ('knn', NearestNeighbors(n_neighbors=10)),
    ]
)

In [15]:
# Discard rows with any missing value and exact duplicate rows, then renumber
# the index from 0 so row labels coincide with row positions.
df = (
    df.dropna()
      .drop_duplicates()
      .reset_index(drop=True)
)

# Fit the whole pipeline (feature building, normalization, neighbor index).
model.fit(df)

Pipeline(steps=[('features',
                 ColumnTransformer(transformers=[('categorical',
                                                  OneHotEncoder(),
                                                  ['key', 'mode',
                                                   'time_signature']),
                                                 ('numeric', 'passthrough',
                                                  ['Age', 'duration_min',
                                                   'popularity', 'danceability',
                                                   'energy', 'loudness',
                                                   'speechiness',
                                                   'acousticness',
                                                   'instrumentalness',
                                                   'liveness', 'valence',
                                                   'tempo'])])),
                ('normalize', Normalizer()),
          

In [16]:
# Position 2 of the pipeline is its third step, the 'knn' NearestNeighbors estimator.
model[2]

NearestNeighbors(n_neighbors=10)

In [17]:
from joblib import dump, load
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists before writing.
os.makedirs('../models', exist_ok=True)

# Persist only the fitted 'knn' step (the preprocessing steps are not saved)
# together with the cleaned frame that was used to fit it.
dump(model[2], '../models/knn.joblib')
dump(df, '../models/songs_df.joblib')


['../models/songs_df.joblib']

In [18]:
# Called without a query matrix, kneighbors() returns the neighbors of every
# fitted sample; a sample is not counted as its own neighbor.
distance, indices = model[2].kneighbors()             

In [19]:
indices[0]    # indices[i] holds the row positions of the nearest neighbors of item i

array([ 109,  899, 1613,  391, 1722, 1329, 1601, 1338,    3,  110],
      dtype=int64)

In [20]:
df.iloc[indices[1]][0:3]   # show the first three recommendations for song 1

Unnamed: 0,Track_ID,Track_Name,preview_url,image_url,Artists,Age,popularity,danceability,energy,key,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,duration_min
370,6yAArqwRhIx3aBZLh6fzyL,Everybody Dance,https://p.scdn.co/mp3-preview/51bd21a4ca87e14d...,https://i.scdn.co/image/ab67616d0000b273504563...,['Hap Palmer'],2,4,0.852,0.495,0,...,1,0.045,0.205,9e-06,0.268,0.96,131.009,149773,4,2.496217
116,3hn8UqMoSkT9ULTYt69bDs,(No Matter If I'm) Wet or Dry,https://p.scdn.co/mp3-preview/6f5914e297b55fd8...,https://i.scdn.co/image/ab67616d0000b273e8220a...,"[""Sesame Street's Chrissy And The Alphabeats""]",2,5,0.947,0.814,0,...,1,0.0518,0.353,6e-06,0.206,0.909,113.782,98760,4,1.646
2212,5KYQWgoZpru8taYk4SRZGF,Port Side,https://p.scdn.co/mp3-preview/a91b40ab885d6268...,https://i.scdn.co/image/ab67616d0000b273948e5e...,['Captain Bogg & Salty'],3,4,0.885,0.49,0,...,1,0.053,0.00759,0.0216,0.102,0.771,120.077,108933,4,1.81555


### Example: Pick a song by index and make recommendations

In [21]:
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
knn = load('../models/knn.joblib')   # load the persisted knn model
df = load('../models/songs_df.joblib')   # load the persisted songs DataFrame


In [23]:
# With no query matrix, the reloaded estimator returns the neighbors of every
# sample it was fitted on.
distance, indices = knn.kneighbors()

In [24]:
# Neighbor index matrix; row i is used below as the recommendations for song i.
indices

array([[  109,   899,  1613, ...,  1338,     3,   110],
       [  370,   116,  2212, ...,  1947,  1352,  1610],
       [  688,  2000,  1068, ...,  1902,   281,   816],
       ...,
       [ 8487,  8484,  9148, ...,  5878, 10254,  5879],
       [ 9054,  8796,  7586, ..., 10140,  5987,  7991],
       [ 8136,  8881,  9219, ...,  8013,  4730, 10222]], dtype=int64)

In [25]:
# Row position of the song we want recommendations for, and the row
# positions of its nearest neighbors.
idx = 2
recom_idx = indices[idx]

print("The song picked: ", df['Track_Name'].iloc[idx])
print("\nRecommendations: ")

# Show the first five neighbors as the recommendations.
df.iloc[recom_idx[:5]]

The song picked:  Eensy Weensy Spider

Recommendations: 


Unnamed: 0,Track_ID,Track_Name,preview_url,image_url,Artists,Age,popularity,danceability,energy,key,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,duration_min
688,616Uq1Ft4Rl56FbH1H5XXH,Lady Bug Picnic,https://p.scdn.co/mp3-preview/773afcdea6634010...,https://i.scdn.co/image/ab67616d0000b27338381e...,['Victor Johnson'],2,10,0.723,0.42,0,...,1,0.0908,0.809,3.5e-05,0.0996,0.914,120.471,114573,4,1.90955
2000,3Bob4IPl4maAnBPiTZbuxK,Siyahamba,https://p.scdn.co/mp3-preview/702f6bf219f80c11...,https://i.scdn.co/image/ab67616d0000b273cb6801...,"['Dan Zanes', 'Friends']",3,11,0.499,0.338,0,...,1,0.0308,0.802,7e-06,0.0996,0.559,135.923,179840,4,2.997333
1068,0YawYQdvhpuWNSMXqY51xm,A Cat Had a Birthday,https://p.scdn.co/mp3-preview/74d99be6214ed824...,https://i.scdn.co/image/ab67616d0000b2738cc725...,"[""Sesame Street's David""]",2,9,0.837,0.458,5,...,1,0.172,0.355,0.0,0.117,0.919,123.939,164507,4,2.741783
2003,3Bh0xcxyhMWmzutOr0LJy8,Firefly,https://p.scdn.co/mp3-preview/bd156077b8b5c5fe...,https://i.scdn.co/image/ab67616d0000b273cb6801...,"['Dan Zanes', 'Friends']",3,10,0.731,0.363,0,...,1,0.0319,0.691,0.00717,0.098,0.597,125.942,184960,4,3.082667
2054,6dn1B2XpFwyahRFVd7PHL0,Mariposa Ole,https://p.scdn.co/mp3-preview/153a3d64b2f3b75a...,https://i.scdn.co/image/ab67616d0000b273bb8c48...,"['Dan Zanes', 'Friends', 'Barbara Brousal']",3,10,0.692,0.283,1,...,1,0.0361,0.592,0.000127,0.103,0.865,119.65,154133,4,2.568883


### Wrap it in a Python function

In [26]:
def make_recommendation(idx, num=5):
    """Return up to ``num`` nearest-neighbor songs for the song at row ``idx``.

    Relies on the module-level ``knn`` (fitted NearestNeighbors) and ``df``
    (songs frame) loaded earlier in the notebook.

    Parameters
    ----------
    idx : int
        Row position of the picked song in ``df``.
    num : int, default 5
        Number of recommendations wanted. Capped at 20 and at ``len(df) - 1``
        (assumes ``knn`` was fitted on exactly the rows of ``df``).

    Returns
    -------
    pandas.DataFrame
        The recommended rows of ``df``; an empty frame when ``num <= 0``.
    """
    # Apply the caps first so the neighbor query is never asked for more
    # neighbors than can exist.
    num = min(num, 20, len(df) - 1)
    if num <= 0:
        # Nothing requested (or nothing available): empty frame, same columns.
        return df.iloc[0:0]

    # Request exactly `num` neighbors. Without n_neighbors the estimator falls
    # back to the value it was constructed with, which silently truncated
    # larger requests.
    # NOTE: with no query matrix this computes neighbors for every fitted row.
    _, indices = knn.kneighbors(n_neighbors=num)
    return df.iloc[indices[idx]]

In [30]:
# Recommendations for the song at row 5, with the ID and URL columns hidden for display.
make_recommendation(5).drop(columns=['Track_ID','preview_url','image_url'])

Unnamed: 0,Track_Name,Artists,Age,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,duration_min
1075,Do De Rubber Duck,"['Ernie', 'Kermit the Frog', 'Oscar the Grouch...",2,10,0.801,0.595,0,-8.096,1,0.379,0.18,0.0,0.309,0.89,153.626,174547,4,2.909117
1377,High up on the Trapeze,['The Wiggles'],2,8,0.648,0.553,0,-6.318,1,0.0277,0.658,0.000379,0.29,0.811,112.0,106440,4,1.774
2056,Choo Choo Ch'Boogie,"['Dan Zanes', 'Friends', 'Rankin Don aka Fathe...",3,10,0.645,0.612,7,-8.672,1,0.376,0.788,0.0,0.214,0.854,160.585,162307,4,2.705117
484,The Word is No,"[""Sesame Street's Gina"", ""Sesame Street's Maria""]",2,8,0.929,0.762,7,-6.55,1,0.033,0.422,0.000112,0.154,0.971,125.591,116000,4,1.933333
2252,The Honker Duckie Dinger Jamboree,"['Ernie', 'Dinger', 'The Honkers', 'Rubber Duc...",3,11,0.656,0.592,9,-9.244,1,0.122,0.135,0.000157,0.045,0.82,172.439,97693,4,1.628217
