In [1]:
# Importing necessary libraries
import numpy as np
import pandas as pd

In [2]:
# Read the data from csv file into a dataframe
df = pd.read_csv('spotify.csv', index_col=0) #making user column as index column
df.head()

Unnamed: 0,song_1,song_2,song_3,song_4,song_5,song_6,song_7,song_8,song_9,song_10,...,song_4991,song_4992,song_4993,song_4994,song_4995,song_4996,song_4997,song_4998,song_4999,song_5000
user_1,2,2,8,8,13,1,4,9,1,2,...,14,2,14,1,9,0,11,6,14,7
user_2,13,5,5,5,12,8,10,10,2,2,...,10,6,11,1,1,5,12,8,3,0
user_3,3,9,2,8,0,1,11,7,3,7,...,9,5,7,15,12,13,14,5,0,14
user_4,2,6,7,8,14,0,12,7,8,1,...,9,15,9,14,10,6,11,13,6,0
user_5,11,12,8,6,13,7,0,7,3,13,...,12,14,11,11,11,7,3,6,11,7


In [3]:
print("No. of Users: ", len(df))
print("No. of Songs: ", len(df.columns))

No. of Users:  1000
No. of Songs:  5000


In [4]:
# Convert a dataframe to a numpy array
X = df.iloc[:,:].values

In [5]:
# checking shape of the array
X.shape

(1000, 5000)

## NMF

NMF means Non-negative Matrix Factorization.
NMF is used to decompose single matrix into two different matrices.

In [6]:
# Decompose user_song matrix into user matrix and song matrix
from sklearn.decomposition import NMF

# in matrices multiplication (m x n)(n x p) = (m x p) is the result matrix ==> (m x p) = (m x n)(n x p)
nmf = NMF(n_components=100, max_iter=50, random_state=0)

# here, m is users, p is songs, n is n_components i.e., (1000 x 5000) = (1000 x 100)(100 x 5000)
nmf.fit(X)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=50,
    n_components=100, random_state=0, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [7]:
# Print shape of user_matrix
user_matrix = nmf.transform(X)
user_matrix.shape

(1000, 100)

In [8]:
# Print shape of song_matrix
song_matrix = nmf.components_
song_matrix = song_matrix.T
song_matrix.shape

(5000, 100)

## Song Recommendation

In [9]:
# we have user_matrix and song_matrix
# Let's say we want to recommend some songs to user-1
# for this we need to follow the below steps

### Step-1: We need to find the eucledian distance from user-1 to all other users

In [10]:
# Creating a function to find the eucledian distance

def euc_dist(v1,v2):
    return pow(sum([pow(v1[idx] - v2[idx],2) for idx in range(len(v1))]),0.5)

euc_dist([2,1],[1,1])

1.0

user1 = user_matrix[0]
eucledian_distances = []
for idx in range(1000):
    other_user = user_matrix[idx]
    eu = euc_dist(user1, other_user)
    eucledian_distances.append(eu)

In [11]:
import sys
eucledian_distances = []
user1_idx = 0
counter = 0
for other_user in user_matrix[:]:
    if user1_idx == counter:
        # You need not to compare user-1 vector to user-1 vector
        eucledian_distances.append(sys.float_info.max)
    else:
        user1 = user_matrix[0]
        dist_ = euc_dist(user1,other_user)
        eucledian_distances.append(dist_)
    counter+=1

### Step-2: Find 5 closest users

In [12]:
eucledian_distances[0:6]

[1.7976931348623157e+308,
 12.6825469520894,
 12.074322380446397,
 11.129357457060935,
 12.736158783427857,
 10.7919161674346]

In [13]:
# argsort() returns index value of distances in ascending order
# distance between user-1 to user-1 is 0.0 so, it returns index value of user-1 i.e., '0'
candidate_index = np.argsort(eucledian_distances)

# first user who is closer to user-1 is index 0
# second user who is closer to user-1 is index 352
# third user who is closer to user-1 is index 737

In [14]:
# top 5 users who is closer to user-1
user_index_closer_to_user1 = np.argsort(eucledian_distances)[:5]
user_index_closer_to_user1

array([352, 737, 284, 501, 609], dtype=int64)

### Step-3: Find the songs what these 5 closest songs are listening

In [15]:
# List the songs that these 5 closest users are listening
# Recommend these songs to user1

for id_ in user_index_closer_to_user1:
    temp = pd.DataFrame(df.iloc[id_])
    print(temp[temp.values != 0][0:-1].index)

Index(['song_1', 'song_2', 'song_3', 'song_4', 'song_5', 'song_6', 'song_7',
       'song_8', 'song_9', 'song_10',
       ...
       'song_4988', 'song_4989', 'song_4990', 'song_4991', 'song_4992',
       'song_4993', 'song_4994', 'song_4995', 'song_4996', 'song_4997'],
      dtype='object', length=4682)
Index(['song_1', 'song_2', 'song_3', 'song_4', 'song_5', 'song_7', 'song_8',
       'song_9', 'song_10', 'song_11',
       ...
       'song_4989', 'song_4990', 'song_4991', 'song_4992', 'song_4993',
       'song_4994', 'song_4995', 'song_4997', 'song_4998', 'song_4999'],
      dtype='object', length=4699)
Index(['song_1', 'song_2', 'song_3', 'song_4', 'song_5', 'song_6', 'song_7',
       'song_8', 'song_9', 'song_10',
       ...
       'song_4990', 'song_4991', 'song_4992', 'song_4993', 'song_4994',
       'song_4995', 'song_4996', 'song_4997', 'song_4998', 'song_4999'],
      dtype='object', length=4704)
Index(['song_1', 'song_2', 'song_3', 'song_4', 'song_5', 'song_6', 'song_7',
    

## KMeans

In [16]:
song_matrix.shape

(5000, 100)

In [17]:
# First we will make some clusters
from sklearn.cluster import KMeans

k = 10
kmeans = KMeans(n_clusters=k)
kmeans.fit(song_matrix)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=10, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [18]:
# find the vector for song-1
index_in_df = list(df.columns).index('song_1')
print(index_in_df)
song1_vec = song_matrix[index_in_df]

0


In [19]:
# Find the cluster in which song-1 will fall
kmeans.predict([song1_vec])

array([0])

In [24]:
# Find all the songs in cluster = [?]
cluster_id = 0
all_labels = list(kmeans.predict(song_matrix))
zero_cluster_candidate_ids = [idx for idx in range(len(all_labels)) if all_labels[idx]==cluster_id]
zero_cluster_candidates = song_matrix[zero_cluster_candidate_ids]

In [25]:
zero_cluster_candidate_ids # Songs corresponding to cluster : 0

[0,
 4,
 6,
 7,
 12,
 14,
 15,
 26,
 28,
 35,
 43,
 48,
 50,
 76,
 78,
 82,
 89,
 102,
 111,
 127,
 130,
 134,
 165,
 178,
 202,
 203,
 231,
 233,
 259,
 266,
 274,
 298,
 308,
 339,
 355,
 356,
 367,
 368,
 400,
 407,
 413,
 423,
 430,
 469,
 470,
 515,
 518,
 519,
 529,
 533,
 534,
 540,
 544,
 551,
 554,
 555,
 568,
 584,
 588,
 592,
 612,
 618,
 621,
 622,
 623,
 631,
 639,
 647,
 652,
 678,
 686,
 688,
 690,
 713,
 720,
 724,
 735,
 745,
 747,
 749,
 757,
 758,
 763,
 773,
 776,
 779,
 780,
 781,
 783,
 797,
 798,
 813,
 817,
 818,
 821,
 827,
 864,
 868,
 877,
 893,
 903,
 909,
 928,
 930,
 933,
 934,
 938,
 940,
 943,
 970,
 981,
 984,
 988,
 1013,
 1036,
 1047,
 1048,
 1077,
 1100,
 1113,
 1125,
 1129,
 1133,
 1134,
 1139,
 1158,
 1161,
 1170,
 1175,
 1189,
 1194,
 1213,
 1226,
 1242,
 1246,
 1250,
 1255,
 1284,
 1292,
 1297,
 1301,
 1313,
 1315,
 1333,
 1336,
 1345,
 1347,
 1354,
 1376,
 1378,
 1392,
 1406,
 1412,
 1419,
 1424,
 1426,
 1440,
 1444,
 1453,
 1458,
 1460,
 1462,


## KNN

In [26]:
# Find the closest songs using NearestNeighbors
from sklearn.neighbors import NearestNeighbors
knn = NearestNeighbors(n_neighbors=5)
knn.fit(seventh_cluster_candidates)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [27]:
# The closest songs we recommend to the user
knn.kneighbors([song_matrix[index_in_df]])

(array([[1.93680729, 1.9449503 , 1.99463848, 2.00942528, 2.01741234]]),
 array([[ 54, 111, 402, 242,   9]], dtype=int64))