In [1]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix

In [2]:
song_data = pd.read_csv('https://static.turi.com/datasets/millionsong/song_data.csv')

In [3]:
print(len(song_data))
song_data.head()

1000000


Unnamed: 0,song_id,title,release,artist_name,year
0,SOQMMHC12AB0180CB8,Silent Night,Monster Ballads X-Mas,Faster Pussy cat,2003
1,SOVFVAK12A8C1350D9,Tanssi vaan,Karkuteillä,Karkkiautomaatti,1995
2,SOGTUKN12AB017F4F1,No One Could Ever,Butter,Hudson Mohawke,2006
3,SOBNYVR12A8C13558C,Si Vos Querés,De Culo,Yerba Brava,2003
4,SOHSBXH12A8C13B0DF,Tangle Of Aspens,Rene Ablaze Presents Winter Sessions,Der Mystic,0


In [4]:
file = pd.read_csv('https://static.turi.com/datasets/millionsong/10000.txt',names = ['user_id', 'song_id', 'listen_count'], sep = '\t')

In [5]:
print(len(file))
file.head()

2000000


Unnamed: 0,user_id,song_id,listen_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1


In [6]:
song_df = pd.merge(file, song_data.drop_duplicates(['song_id']), on = 'song_id', how = 'left')
song_df = song_df.head(100000)

In [7]:
len(song_df)

100000

In [8]:
song_df['song'] = song_df['title'] + " - " + song_df['artist_name']
song_df.drop(['title', 'artist_name'], inplace = True, axis = 1)

In [9]:
song_df.head()

Unnamed: 0,user_id,song_id,listen_count,release,year,song
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,Thicker Than Water,0,The Cove - Jack Johnson
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Flamenco Para Niños,1976,Entre Dos Aguas - Paco De Lucia
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Graduation,2007,Stronger - Kanye West
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,In Between Dreams,2005,Constellations - Jack Johnson
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,There Is Nothing Left To Lose,1999,Learn To Fly - Foo Fighters


In [10]:
song_grouped = song_df.groupby(['song', 'user_id']).agg({'listen_count': 'count'}).reset_index()
song_grouped.head()

Unnamed: 0,song,user_id,listen_count
0,#!*@ You Tonight [Featuring R. Kelly] (Explici...,83ec242ad34f201b0cf9a29540e1d85af06cd560,1
1,#!*@ You Tonight [Featuring R. Kelly] (Explici...,9236fe7fcb3905ea0daf49408f15a3c8b085b094,1
2,#40 - DAVE MATTHEWS BAND,18dc95f354220b343e98ebbb7c8564291284ed9f,1
3,#40 - DAVE MATTHEWS BAND,3d7d0bfd9f8e1d177e2f87914c4a807228272a9b,1
4,#40 - DAVE MATTHEWS BAND,3f73f44560e822344b0fb7c6b463869743eb9860,1


In [11]:
users_id = song_grouped['user_id'].unique()
len(users_id)

3863

In [12]:
songs_id = song_df['song'].unique()
len(songs_id)

9891

In [42]:
songs_data = song_grouped.pivot('user_id', 'song', 'listen_count').fillna(0)
songs_data.shape

(3863, 9891)

In [81]:
train_data, test_data = train_test_split(songs_data, test_size = 0.2, random_state = 0)

In [82]:
songs_data_sparse = csr_matrix(train_data.values)
songs_data_sparse.shape

(3090, 9891)

In [84]:
from sklearn.neighbors import NearestNeighbors
model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(train_data)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [85]:
query_index = np.random.choice(100)
distances, indices = model_knn.kneighbors(songs_data_sparse[query_index,:].reshape(1, -1), n_neighbors = 6)

for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for',train_data.columns[query_index], ':\n')
    else:
        print(i,'. ', train_data.columns[indices.flatten()[i]],' with distance of ', distances.flatten()[i])

Recommendations for 16 Candles - The Crests :

1 .  Ddiamondd - Battles  with distance of  0.7817821097640076
2 .  Corazon Espinado - Santana featuring Mana  with distance of  0.7817821097640076
3 .  Dulce Condena - Los Rodriguez  with distance of  0.8110177634953863
4 .  Down On My Knees - Ayo  with distance of  0.8110177634953863
5 .  Cold As You - Taylor Swift  with distance of  0.8110177634953863


In [86]:
songs_data_sparse_test = csr_matrix(test_data.values)
query_index = np.random.choice(100)
distances, indices = model_knn.kneighbors(songs_data_sparse_test[query_index,:].reshape(1, -1), n_neighbors = 6)

for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for',test_data.columns[query_index], ':\n')
    else:
        print(i,'. ', test_data.columns[indices.flatten()[i]],' with distance of ', distances.flatten()[i])

Recommendations for A Decade Under The Influence (Album Version) - Taking Back Sunday :

1 .  Black Betty - Ram Jam  with distance of  0.8453279437775635
2 .  Gangsta Lovin' - Eve / Alicia Keys  with distance of  0.8518264426846728
3 .  Girl Next Door - Musiq / Ayana  with distance of  0.8648593805956661
4 .  Crosshairs - Dangerdoom  with distance of  0.8727272727272727
5 .  Désenchantée - Kate Ryan  with distance of  0.8793954621688945
