### Importing the necessary libraries

In [1]:
# Data manipulation and handling and visualization libs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#NMF
from sklearn.decomposition import NMF

# clustering library
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors

## Loading the dataset

In [2]:
# NOTE: although the file name ends in '.xls', it is parsed here as CSV text by read_csv.
# index_col = 0 makes the first column of the file the row index of the frame.
df = pd.read_csv('spotify.xls', index_col = 0)

In [3]:
df.head()

Unnamed: 0,song_1,song_2,song_3,song_4,song_5,song_6,song_7,song_8,song_9,song_10,...,song_4991,song_4992,song_4993,song_4994,song_4995,song_4996,song_4997,song_4998,song_4999,song_5000
user_1,2,2,8,8,13,1,4,9,1,2,...,14,2,14,1,9,0,11,6,14,7
user_2,13,5,5,5,12,8,10,10,2,2,...,10,6,11,1,1,5,12,8,3,0
user_3,3,9,2,8,0,1,11,7,3,7,...,9,5,7,15,12,13,14,5,0,14
user_4,2,6,7,8,14,0,12,7,8,1,...,9,15,9,14,10,6,11,13,6,0
user_5,11,12,8,6,13,7,0,7,3,13,...,12,14,11,11,11,7,3,6,11,7


In [4]:
df.shape

(1000, 5000)

## Spotify Recommendation System

In [5]:
X = df.values
X

array([[ 2,  2,  8, ...,  6, 14,  7],
       [13,  5,  5, ...,  8,  3,  0],
       [ 3,  9,  2, ...,  5,  0, 14],
       ...,
       [ 6,  2,  1, ..., 13,  6, 13],
       [ 7,  0,  8, ..., 11,  9, 12],
       [ 6, 15,  0, ..., 15,  4,  5]], dtype=int64)

## Non-Negative Matrix Factorization (NMF)

In [6]:
nmf = NMF(n_components = 100, max_iter = 1000, random_state = 3)

In [None]:
nmf.fit(X)

In [None]:
user_matrix = nmf.transform(X)
song_matrix = nmf.components_.T

In [None]:
user_matrix

In [None]:
user_matrix.shape

In [None]:
song_matrix.shape

## Steps for building a recommendation system - type1(For local library or small dataset)

1. Consider recommending songs to the first user (row position 0 in the user matrix).
2. Find the Euclidean distance between this first user and each of the remaining 999 users.
3. Find the top 10 songs heard by the 5 nearest users.
4. Recommend these 10 songs to the first user.


## Calculating the distance between two users

In [None]:
def col_dist(u1, u2):
    """Return the Euclidean distance between two equally shaped vectors.

    Parameters
    ----------
    u1, u2 : array-like of numbers
        The two vectors to compare. They are converted to float arrays so that
        unsigned-integer inputs cannot wrap around when subtracted.

    Returns
    -------
    numpy.float64
        ``sqrt(sum((u1 - u2) ** 2))``. For 2-D inputs the reduction runs over the
        first axis, giving one distance per column.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape (previously a longer ``u2``
        was silently truncated to the length of ``u1``).
    """
    a = np.asarray(u1, dtype=float)
    b = np.asarray(u2, dtype=float)
    if a.shape != b.shape:
        raise ValueError(
            "col_dist expects inputs of the same shape, got %s and %s" % (a.shape, b.shape)
        )
    # Vectorised replacement for the element-by-element Python loop.
    return np.sqrt(np.sum((a - b) ** 2, axis=0))

In [None]:
col_dist(user_matrix[0], user_matrix[1])

In [None]:
def dist_from_remaining_users(b_user, user_matrix):
    """Distances from the base user to every other row of ``user_matrix``.

    Parameters
    ----------
    b_user : int
        Row position of the base user inside ``user_matrix``.
    user_matrix : indexable sequence of vectors
        One vector per user; rows are accessed as ``user_matrix[i]``.

    Returns
    -------
    list
        ``col_dist`` between the base user and each other row, in row order.
        The base user's own row is skipped, so the list has one entry fewer than
        ``user_matrix``: entry ``j`` belongs to row ``j`` when ``j < b_user`` and
        to row ``j + 1`` otherwise.
    """
    return [
        col_dist(user_matrix[b_user], user_matrix[other])
        for other in range(len(user_matrix))
        if b_user != other
    ]

In [None]:
dist_user1 = dist_from_remaining_users(0, user_matrix)

In [None]:
dist_user1

In [None]:
len(dist_user1)

### Finding Nearest 5 Users

In [None]:
nearest_users = np.argsort(dist_user1)[:5]

In [None]:
# dist_user1 omits the base user (row 0), so position j in it belongs to row j + 1 of df.
# The shifted indices are rebuilt from dist_user1 instead of incrementing nearest_users
# in place: the cell is now idempotent and re-running it no longer shifts the indices a
# second time. (The constant +1 is only valid because the base user is row 0.)
nearest_users = np.argsort(dist_user1)[:5] + 1

In [None]:
nearest_users

In [None]:
# For each neighbouring user (given as a row position in df), list the songs whose
# value in that user's row is non-zero.
for user_pos in nearest_users:
    print("Songs heared by user", user_pos, "are:")
    user_row = df.iloc[user_pos]
    heard_mask = user_row.values != 0
    print(user_row[heard_mask].index)

Picking out the top N songs from the above list of songs, ranked by the highest play count among the nearest users

In [None]:
def topNsongs(nearest_users, df, N):
    """Return the names of the ``N`` top-ranked songs among the given users.

    Parameters
    ----------
    nearest_users : sequence of int
        Row positions (as used by ``df.iloc``) of the users to consider.
    df : pandas.DataFrame
        One row per user and one column per song.
    N : int
        Number of song names to return.

    Returns
    -------
    list
        Column labels of ``df``, ordered by the column-wise maximum over the
        selected rows (largest first); ties are broken by the column label,
        also in descending order.
    """
    peak_counts = df.iloc[nearest_users].max().to_dict()

    # Sort (count, song) pairs from largest to smallest: the count decides the order
    # and the song label only matters between equal counts.
    ranked = sorted(
        ((count, song) for song, count in peak_counts.items()),
        reverse=True,
    )
    return [song for _, song in ranked[:N]]

In [None]:
topNsongs(nearest_users, df, 15)

## Type 2 Recommendation System for larger data set 

- K means clustering

In [None]:
# Elbow method: fit K-Means on the song vectors for k = 1..49 and store the
# within-cluster sum of squares for each k.
# random_state is fixed so that the curve is the same on every
# Restart & Run All; without it each run starts from different random centroids.
WCSS = {}
for k in range(1, 50):
    k_means = KMeans(n_clusters = k, max_iter = 1000, random_state = 3).fit(song_matrix)
    WCSS[k] = k_means.inertia_

    # inertia measures how well a dataset was clustered by kmeans


In [None]:
WCSS

In [None]:
# Elbow curve: WCSS against the number of clusters.
# The dict views are materialised as lists before plotting, and the figure gets a title
# so it can be read on its own; plt.show() keeps the Text(...) repr out of the output.
fig, ax = plt.subplots()
ax.plot(list(WCSS.keys()), list(WCSS.values()))
ax.set_xlabel("Number of clusters")
ax.set_ylabel('WCSS')
ax.set_title("Elbow curve: WCSS vs. number of clusters")
plt.show()

In [None]:
def songs_in_cluster(N_clusters, df, song_matrix, song_name, n_recommendations, random_state=None):
    """Recommend songs that share a K-Means cluster with ``song_name``.

    The song vectors are clustered with K-Means; within the cluster that contains
    ``song_name`` the nearest neighbours of that song's vector are returned.

    Parameters
    ----------
    N_clusters : int
        Number of K-Means clusters to fit on ``song_matrix``.
    df : pandas.DataFrame
        Frame whose columns are the song names; column position ``i`` must
        correspond to row ``i`` of ``song_matrix``.
    song_matrix : numpy.ndarray
        One row vector per song.
    song_name : hashable
        Column label of the query song.
    n_recommendations : int
        Maximum number of songs to return.
    random_state : int or None, optional
        Forwarded to ``KMeans``. The default ``None`` keeps the previous unseeded
        behaviour; pass an integer for reproducible recommendations.

    Returns
    -------
    list
        Up to ``n_recommendations`` column labels, nearest first. The query song
        itself is never returned. Fewer labels (possibly none) come back when the
        cluster holds fewer other songs than requested.

    Raises
    ------
    ValueError
        If ``song_name`` is not a column of ``df``.
    """
    kmeans = KMeans(n_clusters = N_clusters, max_iter = 1000, random_state = random_state).fit(song_matrix)

    index_in_song = df.columns.to_list().index(song_name)
    song_vector = song_matrix[index_in_song]  # latent-factor row of the query song

    # Cluster label of every song; the query song's cluster is looked up once instead of
    # calling kmeans.predict for each of the songs inside a comprehension.
    labels = kmeans.labels_
    cluster_members = np.flatnonzero(labels == labels[index_in_song])

    # A song is always its own nearest neighbour, so drop it from the candidates.
    candidates = cluster_members[cluster_members != index_in_song]

    # NearestNeighbors rejects n_neighbors larger than the number of fitted samples.
    n_neighbors = min(n_recommendations, len(candidates))
    if n_neighbors <= 0:
        return []

    neighbors = NearestNeighbors(n_neighbors = n_neighbors)
    neighbors.fit(song_matrix[candidates])
    _, local_positions = neighbors.kneighbors([song_vector])

    # kneighbors returns positions inside the fitted subset; translate them back to
    # positions in the full song list before looking up the column labels.
    global_positions = candidates[local_positions[0]]

    songs = df.columns
    return [songs[pos] for pos in global_positions]

In [None]:
songs_in_cluster(15, df, song_matrix, 'song_5', 5)