In [136]:
import os
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist

import warnings
warnings.filterwarnings("ignore")

In [137]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import json
credentials = json.load(open('authorization.json'))
client_id = credentials['client_id']
client_secret = credentials['client_secret']
client_credentials_manager = SpotifyClientCredentials(client_id=client_id,client_secret=client_secret)

sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [138]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
cluster_pipeline = Pipeline([('scaler', StandardScaler()), ('kmeans', KMeans(n_clusters=10, n_jobs=-1))])
song_cluster_pipeline = Pipeline([('scaler', StandardScaler()), 
                                  ('kmeans', KMeans(n_clusters=20, 
                                   verbose=False, n_jobs=4))
                                 ], verbose=False)


In [139]:
data=pd.read_csv("playlist_0.csv")

In [140]:
data.drop('num_segments', inplace=True, axis=1)
data.drop('num_sections', inplace=True, axis=1)
data.drop('num_bars', inplace=True, axis=1)
#data.drop('first_artist',inplace=True,axis=1)
data.drop('all_artists',inplace=True,axis=1)
data.drop('id',inplace=True,axis=1)

In [141]:
X = data.select_dtypes(np.number)
number_cols = list(X.columns)
song_cluster_pipeline.fit(X)
song_cluster_labels = song_cluster_pipeline.predict(X)
data['cluster_label'] = song_cluster_labels

In [142]:
data.head()

Unnamed: 0,title,first_artist,danceability,energy,key,loudness,mode,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,cluster_label
0,Teri Isi Ada Pe Sanam,Kumar Sanu,0.478,0.438,6,-13.695,1,0.738,4.2e-05,0.372,0.829,153.06,311798,3,19
1,Kitaben Bahut Si,Asha Bhosle,0.619,0.721,2,-7.53,0,0.423,8.6e-05,0.0587,0.808,97.32,389387,4,18
2,Chand Chhupa Badal Mein,Udit Narayan,0.402,0.61,11,-9.509,0,0.368,0.0,0.109,0.463,86.248,347013,4,4
3,Pehla Pehla Pyar-Spbalasubhramaniam,S. P. Balasubrahmanyam,0.594,0.53,5,-10.226,0,0.443,4e-06,0.421,0.457,101.831,264307,4,4
4,Mere Khwabon Mein,Lata Mangeshkar,0.621,0.512,7,-11.074,0,0.634,0.0,0.0803,0.926,114.802,256440,4,4


In [143]:
def find_song(name):
    song_data = defaultdict()
    results = sp.search(q= 'track: {} '.format(name), limit=1)
    if results['tracks']['items'] == []:
        return None

    results = results['tracks']['items'][0]
    track_id = results['id']
    audio_features = sp.audio_features(track_id)[0]

    song_data['title'] = [name]
    #song_data['year'] = [year]
    song_data['explicit'] = [int(results['explicit'])]
    song_data['duration_ms'] = [results['duration_ms']]
    song_data['popularity'] = [results['popularity']]
    
    for key, value in audio_features.items():
        song_data[key] = value

    return pd.DataFrame(song_data)



In [144]:
from collections import defaultdict
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist
import difflib

number_cols = ['valence', 'acousticness', 'danceability', 'duration_ms', 'energy', 
 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'tempo','time_signature']


def get_song_data(song, spotify_data):
    
    try:
        song_data = spotify_data[(spotify_data['title'] == song['title']) 
                                ].iloc[0]
        return song_data
    
    except IndexError:
        return find_song(song['title'])

In [145]:
def get_mean_vector(song_list, spotify_data):
    
    song_vectors = []
    
    for song in song_list:
        song_data = get_song_data(song, spotify_data)
        if song_data is None:
            print('Warning: {} does not exist in Spotify or in database'.format(song['title']))
            continue
        song_vector = song_data[number_cols].values
        song_vectors.append(song_vector)  
    
    song_matrix = np.array(list(song_vectors))
    return np.mean(song_matrix, axis=0)

In [146]:
def flatten_dict_list(dict_list):
    
    flattened_dict = defaultdict()
    for key in dict_list[0].keys():
        flattened_dict[key] = []
    
    for dictionary in dict_list:
        for key, value in dictionary.items():
            flattened_dict[key].append(value)
            
    return flattened_dict


In [147]:
def recommender( song_list, spotify_data, n_songs=10):
    
    metadata_cols = ['title', 'first_artist']
    song_dict = flatten_dict_list(song_list)
    
    song_center = get_mean_vector(song_list, spotify_data)
    scaler = song_cluster_pipeline.steps[0][1]
    scaled_data = scaler.transform(spotify_data[number_cols])
    scaled_song_center = scaler.transform(song_center.reshape(1, -1))
    distances = cdist(scaled_song_center, scaled_data, 'cosine')
    index = list(np.argsort(distances)[:, :n_songs][0])
    
    rec_songs = spotify_data.iloc[index]
    rec_songs = rec_songs[~rec_songs['title'].isin(song_dict['title'])]
    return rec_songs[metadata_cols].to_dict(orient='records')

In [148]:
recommender([{'title': 'Chhoti Si Aasha'}],  data)

[{'title': 'Chhoti Si Aasha - Version, 1', 'first_artist': 'A.R. Rahman'},
 {'title': 'Tere Dar Par Sanam - Male Version', 'first_artist': 'Kumar Sanu'},
 {'title': 'Albela Sajan', 'first_artist': 'Sultan Khan'},
 {'title': 'Chhupana Bhi Nahin Aata', 'first_artist': 'Pankaj Udhas'},
 {'title': 'Chudiyan Khanak Gayeen', 'first_artist': 'Lata Mangeshkar'},
 {'title': 'Are Yaaro Mere Pyaro', 'first_artist': 'Udit Narayan'},
 {'title': 'Roja - Version, 1', 'first_artist': 'A.R. Rahman'},
 {'title': 'Tumse Milne Ko Dil', 'first_artist': 'Alka Yagnik'},
 {'title': 'Aankhon Ki Gustakhiyan', 'first_artist': 'Kumar Sanu'},
 {'title': 'Jeeye to Jeeye Kaise', 'first_artist': 'Pankaj Udhas'}]

In [10]:
from helper import SongPredcition
nn=SongPredcition()
abc=nn.recommender([{'title': 'Chhoti Si Aasha'}])
for i  in range(len(abc)):
    print(abc[i]['title']+" "+ abc[i]['first_artist'])



test


ConnectionError: HTTPSConnectionPool(host='api.spotify.com', port=443): Max retries exceeded with url: /v1/search?q=track%3A+Chhoti+Si+Aasha+&limit=1&offset=0&type=track (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002516ECDF550>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed',))