# Building a song recommender to users using collaborative filtering and KNN


#Import pandas 

In [1]:
import pandas as pd

#Load music data

In [2]:



usage_data = pd.read_table('https://static.turi.com/datasets/millionsong/10000.txt', header = None)
usage_data.rename(columns={0:'user_id', 1:'song_id', 2:'listen_count'}, inplace = True)

song_data = pd.read_csv('https://static.turi.com/datasets/millionsong/song_data.csv')
song_data.drop_duplicates(['song_id'])

song_data = pd.merge(usage_data, song_data, on="song_id", how="left")
song_data['song'] = song_data['artist_name'].map(str) + " - " + song_data['title']

song_data = song_data.drop(['release','year'],axis=1)

song_data



Unnamed: 0,user_id,song_id,listen_count,title,artist_name,song
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Jack Johnson,Jack Johnson - The Cove
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Paco De Lucia,Paco De Lucia - Entre Dos Aguas
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Kanye West,Kanye West - Stronger
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Kanye West,Kanye West - Stronger
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,Jack Johnson,Jack Johnson - Constellations
...,...,...,...,...,...,...
2086941,d8bfd4ec88f0f3773a9e022e3c1a0f1d3b7b6a92,SOJEYPO12AAA8C6B0E,2,Ignorance (Album Version),Paramore,Paramore - Ignorance (Album Version)
2086942,d8bfd4ec88f0f3773a9e022e3c1a0f1d3b7b6a92,SOJJYDE12AF729FC16,4,Two Is Better Than One,Boys Like Girls featuring Taylor Swift,Boys Like Girls featuring Taylor Swift - Two I...
2086943,d8bfd4ec88f0f3773a9e022e3c1a0f1d3b7b6a92,SOJKQSF12A6D4F5EE9,3,What I've Done (Album Version),Linkin Park,Linkin Park - What I've Done (Album Version)
2086944,d8bfd4ec88f0f3773a9e022e3c1a0f1d3b7b6a92,SOJUXGA12AC961885C,1,Up,Justin Bieber,Justin Bieber - Up


# Explore data

Music data shows how many times a user listened to a song, as well as the details of the song.

In [3]:
song_data.head()

Unnamed: 0,user_id,song_id,listen_count,title,artist_name,song
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Jack Johnson,Jack Johnson - The Cove
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Paco De Lucia,Paco De Lucia - Entre Dos Aguas
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Kanye West,Kanye West - Stronger
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Kanye West,Kanye West - Stronger
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,Jack Johnson,Jack Johnson - Constellations


##Showing the most popular songs in the dataset

In [4]:
song_data['song'].value_counts()

Harmonia - Sehr kosmisch                                   8277
Kings Of Leon - Use Somebody                               7952
Björk - Undo                                               7032
Florence + The Machine - Dog Days Are Over (Radio Edit)    6949
Dwight Yoakam - You're The One                             6412
                                                           ... 
Three Days Grace - Scared                                    51
Ricardo Arjona - Historia Del Portero                        51
Amparanoia - Don´t Leave Me Now                              50
Juanes - No Creo En El Jamas                                 48
Ladytron - Ghosts (Toxic Avenger Mix)                        48
Name: song, Length: 9982, dtype: int64

2086946

In [6]:
s=song_data['song'].value_counts().nlargest(10)
print("songs to recommend based on popularity\n",s)


songs to recommend based on popularity

 Harmonia - Sehr kosmisch                                                                                                                        8277
Kings Of Leon - Use Somebody                                                                                                                    7952
Björk - Undo                                                                                                                                    7032
Florence + The Machine - Dog Days Are Over (Radio Edit)                                                                                         6949
Dwight Yoakam - You're The One                                                                                                                  6412
Kings Of Leon - Revelry                                                                                                                         6145
OneRepublic - Secrets                                            

##Count number of unique users in the dataset

In [7]:
users = song_data['user_id'].unique()

In [8]:
len(users)

76353


# Create a song recommender

# Using Collaborative Filtering

Create a matrix with number of users * number of songs shape.
Each entry in the array represent the number of times songs listened by a particular user

In [9]:
song = song_data.drop_duplicates(['user_id','song','title','artist_name'])
song = song.reset_index(drop=True)

In [10]:
pop_songs=song
#print(pop_songs.head())
pop_songs=pop_songs.sort_values(by='song',ascending=False)
#pop_songs = pop_songs.filter(['listen_count','song_id'], axis=1) 
#pop_songs= pop_songs.drop_duplicates(subset='song_id', keep="first")

result = pop_songs.head(10)
print(result)

                                          user_id             song_id  \
1116748  4f6924c33baef717acc98a07face38d20918e1c0  SOJJJJR12A6D4F9584   
101692   b1eac465eb6838e2a2b86413e93107108bc1bfba  SOJJJJR12A6D4F9584   
358575   798713860923b23c991821cb53e4b1a16e80fe11  SOJJJJR12A6D4F9584   
1232284  be178d1747723e807a35d0a817f710d7552de337  SOJJJJR12A6D4F9584   
1834000  23d6305eb466f7fff2bf840fae90f2bdf6b01151  SOJJJJR12A6D4F9584   
697192   19f1d7c18a777747c7865e8e980c664f63bae687  SOJJJJR12A6D4F9584   
1281140  96dfd6436c2416458cf5a95e16dd2726581a6df9  SOJJJJR12A6D4F9584   
590834   3f27ddaaea5347363f65ea5ee32165579167f1f1  SOJJJJR12A6D4F9584   
1924613  30cdb9e4f95eaea2be1306fc0f557a647a5f8885  SOJJJJR12A6D4F9584   
1447347  66f6867503d85683367f6cf8c71f468f5ca23ed3  SOJJJJR12A6D4F9584   

         listen_count                                              title  \
1116748             1  Lose My Soul feat. Kirk Franklin & Mandisa / A...   
101692              8  Lose My Soul feat. Ki

Use only 50000 items in song_data (jupyter notebook has memory limitatons)

In [11]:
song=song.iloc[0:50000,]
song.shape

(50000, 6)

In [12]:
song.head()

Unnamed: 0,user_id,song_id,listen_count,title,artist_name,song
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Jack Johnson,Jack Johnson - The Cove
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Paco De Lucia,Paco De Lucia - Entre Dos Aguas
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Kanye West,Kanye West - Stronger
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,Jack Johnson,Jack Johnson - Constellations
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,Learn To Fly,Foo Fighters,Foo Fighters - Learn To Fly


This is going to be the filtering table

In [13]:
song_pivot=song.pivot_table(index='user_id',columns='song_id',values='listen_count')

In [14]:
song_pivot.shape

(1869, 9351)

In [15]:
song_pivot=song_pivot.fillna(0)

In [16]:
song_pivot[song_pivot.SOBYHAJ12A6701BF1D==1]

song_id,SOAAAGQ12A8C1420C8,SOAACPJ12A81C21360,SOAAEJI12AB0188AB5,SOAAFAC12A67ADF7EB,SOAAFYH12A8C13717A,SOAAKPM12A58A77210,SOAAOYI12AB01831CE,SOAAROC12A6D4FA420,SOAARXR12A8C133D15,SOAATHE12A8C13ADD6,...,SOZZIOH12A67ADE300,SOZZKPR12A6D4F8147,SOZZLTY12A67AE0AD0,SOZZLZN12A8AE48D6D,SOZZRHE12A6702165F,SOZZTCU12AB0182C58,SOZZTNF12A8C139916,SOZZVWB12AB0189C30,SOZZWZV12A67AE140F,SOZZYAO12A6701FF36
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1645b689f873529ab85e3b72742be44813e82bd3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2a8a8f48fd4eb5ca4b64874162df4fdf584d89c4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3f73f44560e822344b0fb7c6b463869743eb9860,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
41ef7daf3d368922c9676d3f90c8fe931cac1d35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
572da9d6331782b8c48924968f0778a331170c20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6f8453b0d9d2199f98c1992995a8445ad6837fd8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7a4b8e7d2905d13422418b4f48cc85100892e013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
b2cbcf5ea3c6ea3ee41ceac0ef247c2b1ddedbdc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
b80344d063b5ccb3212f76538f3d9e43d87dca9e,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
c24ec42f0e449ff39a95a01f0795f833b898f71b,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Train a Knn model using pivot_table

In [17]:
from sklearn.neighbors import NearestNeighbors
knn=NearestNeighbors(n_neighbors=10,metric='cosine')
Model=knn.fit(song_pivot)

In [18]:
song_pivot[song_pivot.index == 1]

song_id,SOAAAGQ12A8C1420C8,SOAACPJ12A81C21360,SOAAEJI12AB0188AB5,SOAAFAC12A67ADF7EB,SOAAFYH12A8C13717A,SOAAKPM12A58A77210,SOAAOYI12AB01831CE,SOAAROC12A6D4FA420,SOAARXR12A8C133D15,SOAATHE12A8C13ADD6,...,SOZZIOH12A67ADE300,SOZZKPR12A6D4F8147,SOZZLTY12A67AE0AD0,SOZZLZN12A8AE48D6D,SOZZRHE12A6702165F,SOZZTCU12AB0182C58,SOZZTNF12A8C139916,SOZZVWB12AB0189C30,SOZZWZV12A67AE140F,SOZZYAO12A6701FF36
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


# Predict neighbors for a particular user

In [19]:
user = song_pivot.iloc[1,]
user

song_id
SOAAAGQ12A8C1420C8    0.0
SOAACPJ12A81C21360    0.0
SOAAEJI12AB0188AB5    0.0
SOAAFAC12A67ADF7EB    0.0
SOAAFYH12A8C13717A    0.0
                     ... 
SOZZTCU12AB0182C58    0.0
SOZZTNF12A8C139916    0.0
SOZZVWB12AB0189C30    0.0
SOZZWZV12A67AE140F    0.0
SOZZYAO12A6701FF36    0.0
Name: 000ebc858861aca26bac9b49f650ed424cf882fc, Length: 9351, dtype: float64

In [20]:
distances, indices = Model.kneighbors([user])
distances, indices

(array([[1.22124533e-15, 8.92509706e-01, 8.97647465e-01, 9.08646800e-01,
         9.09924530e-01, 9.10371558e-01, 9.28041741e-01, 9.33108996e-01,
         9.34218178e-01, 9.34245232e-01]]),
 array([[   1,  217, 1506,  546,  473, 1256,  786,   80, 1159,  687]]))

The indices contains only the index values of users from the user table. It doesnt show user_id.
We've to find out user_id of neighbors

In [21]:
neighbors=[]
for item in indices[0][1:]:
    neighbors.append(song[song.index == item].user_id.values[0])
neighbors

['17aa9f6dbdf753831da8f38c71b66b64373de613',
 '484b69dd013df1ec0cfd504886d4f647cb32b08f',
 '5a905f000fc1ff3df7ca807d57edb608863db05d',
 '5a905f000fc1ff3df7ca807d57edb608863db05d',
 'a58de017cbeda1763ea002fe027ed41b4ed53109',
 'baf47ed8da24d607e50d8684cde78b923538640f',
 '4bd88bfb25263a75bbdd467e74018f4ae570e5df',
 '12768858f6a825452e412deb1df36d2d1d9c6791',
 '5a905f000fc1ff3df7ca807d57edb608863db05d']

In [22]:
neighbor_songs=pd.DataFrame(columns=['user_id','song_id','listen_count','title','artist_name','song'])
#user_songs=pd.DataFrame(columns=['user_id','song_id','listen_count','title','artist_name','song'])    
for item in neighbors:
    neighbor_songs=neighbor_songs.append(song[song.user_id == item],ignore_index = True)
#user_songs.append(song[song.user_id =='000ebc858861aca26bac9b49f650ed424cf882fc'],ignore_index = True)
#print(len(user_songs))    
neighbor_songs=neighbor_songs.sort_values('listen_count',ascending=False)
neighbor_songs = neighbor_songs.filter(['listen_count','song'], axis=1) 
neighbor_songs= neighbor_songs.drop_duplicates(subset='song', keep="first")

result = neighbor_songs.head(10)
print("Top 10 recommended song for you ")
result

Top 10 recommended song for you 


Unnamed: 0,listen_count,song
748,28,Sigur Rós - Sé Lest
1241,20,Octopus Project - Rorol
1564,19,Octopus Project - The Way Things Go
813,18,Sigur Ros - ný Batterý
317,18,Sigur Rós - Sæglópur
1121,18,Foo Fighters - Over And Out
1216,18,Salt-N-Pepa - Push It
1447,17,Sigur Rós - Gong
1034,17,OneRepublic - Secrets
636,17,Sigur Rós - Andvari


In [23]:
user_songs=pd.DataFrame(columns=['user_id','song_id','listen_count','title','artist_name','song'])    
user_songs=user_songs.append(song[song.user_id =='000ebc858861aca26bac9b49f650ed424cf882fc'],ignore_index = True)
user_songs=user_songs.sort_values('listen_count',ascending=False)
user_songs
user_songs = user_songs.filter(['listen_count','song'], axis=1) 
user_songs= user_songs.drop_duplicates(subset='song', keep="first")
len(user_songs)

96

In [24]:
#df1.where(df1.values==df2.values).notna()

x=set(neighbor_songs['song']).intersection(set(user_songs['song']))
x

{'Down To The Bone featuring Hil St. Soul - Smile To Shine',
 'Future Rock - Gears',
 "Martin O'Donnell And Michael Salvatori - Reclaimer",
 'Michael Jackson - Thriller',
 'Pete Rock & C.L. Smooth - They Reminisce Over You (Single Version)',
 'The Beach Boys - Cotton Fields (The Cotton Song) (Digitally Remastered 01)'}

In [25]:

s=pd.DataFrame(s) 
print("most recommended songs based on popularity\n\n",s)
s

most recommended songs based on popularity

                                                     song
Harmonia - Sehr kosmisch                            8277
Kings Of Leon - Use Somebody                        7952
Björk - Undo                                        7032
Florence + The Machine - Dog Days Are Over (Rad...  6949
Dwight Yoakam - You're The One                      6412
Kings Of Leon - Revelry                             6145
OneRepublic - Secrets                               5841
Coldplay - Yellow                                   5658
Justin Bieber - Somebody To Love                    5546
Barry Tuckwell/Academy of St Martin-in-the-Fiel...  5385


Unnamed: 0,song
Harmonia - Sehr kosmisch,8277
Kings Of Leon - Use Somebody,7952
Björk - Undo,7032
Florence + The Machine - Dog Days Are Over (Radio Edit),6949
Dwight Yoakam - You're The One,6412
Kings Of Leon - Revelry,6145
OneRepublic - Secrets,5841
Coldplay - Yellow,5658
Justin Bieber - Somebody To Love,5546
Barry Tuckwell/Academy of St Martin-in-the-Fields/Sir Neville Marriner - Horn Concerto No. 4 in E flat K495: II. Romance (Andante cantabile),5385


In [26]:
s['song'] = s['song'].apply(str)
d=s.index
print(d)
g=pd.DataFrame(data=d, columns=['song'])
print("popular songs\n\n",g)
x=set(g['song']).intersection(set(user_songs['song']))
print("user songs\n\n",user_songs)
x

Index(['Harmonia - Sehr kosmisch', 'Kings Of Leon - Use Somebody',
       'Björk - Undo',
       'Florence + The Machine - Dog Days Are Over (Radio Edit)',
       'Dwight Yoakam - You're The One', 'Kings Of Leon - Revelry',
       'OneRepublic - Secrets', 'Coldplay - Yellow',
       'Justin Bieber - Somebody To Love',
       'Barry Tuckwell/Academy of St Martin-in-the-Fields/Sir Neville Marriner - Horn Concerto No. 4 in E flat K495: II. Romance (Andante cantabile)'],
      dtype='object')
popular songs

                                                 song
0                           Harmonia - Sehr kosmisch
1                       Kings Of Leon - Use Somebody
2                                       Björk - Undo
3  Florence + The Machine - Dog Days Are Over (Ra...
4                     Dwight Yoakam - You're The One
5                            Kings Of Leon - Revelry
6                              OneRepublic - Secrets
7                                  Coldplay - Yellow
8            

set()