In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
import pickle

def load_and_clean():
    """
    spotify, identify = load_and_clean()
    """
    spotify = pd.read_csv('SpotifyFeatures.csv')

    # dataframe that serves to identify songs
    identify = spotify[['artist_name', 'track_id', 'track_name']]

    # dataframe consisting of audio features we want to train on
    spotify = spotify.drop(columns = ['genre',
                                    'mode',
                                    'time_signature',
                                    'key',
                                    'track_id',
                                    'artist_name',
                                    'popularity',
                                    'track_name',
                                    'duration_ms',
                                    'speechiness'])

    return spotify, identify

spotify, identify = load_and_clean()

# spotify.to_csv('spotify.csv', index=False)
# print(spotify.shape)
# print(spotify.head())
# print('-----------------')
# print(identify.shape)
# print(identify.head())

In [2]:
spotify.isnull().sum(0)

acousticness        0
danceability        0
energy              0
instrumentalness    0
liveness            0
loudness            0
tempo               0
valence             0
dtype: int64

In [3]:
spotify.head()

Unnamed: 0,acousticness,danceability,energy,instrumentalness,liveness,loudness,tempo,valence
0,0.611,0.389,0.91,0.0,0.346,-1.828,166.969,0.814
1,0.246,0.59,0.737,0.0,0.151,-5.559,174.003,0.816
2,0.952,0.663,0.131,0.0,0.103,-13.879,99.488,0.368
3,0.703,0.24,0.326,0.0,0.0985,-12.178,171.758,0.227
4,0.95,0.331,0.225,0.123,0.202,-21.15,140.576,0.39


In [7]:
identify[identify['track_name'] == 'Worst Nites']

Unnamed: 0,artist_name,track_id,track_name
77647,Foster The People,7lVoniii4QwhNjCeHij2xZ,Worst Nites
93170,Foster The People,7lVoniii4QwhNjCeHij2xZ,Worst Nites
111450,Foster The People,7lVoniii4QwhNjCeHij2xZ,Worst Nites
166863,Foster The People,7lVoniii4QwhNjCeHij2xZ,Worst Nites


In [10]:
worst_nites = spotify.iloc[77647].tolist()
worst_nites

[0.00834, 0.741, 0.752, 0.00165, 0.0438, -4.968, 114.02, 0.609]

In [9]:
spotify.head()

Unnamed: 0,acousticness,danceability,energy,instrumentalness,liveness,loudness,tempo,valence
0,0.611,0.389,0.91,0.0,0.346,-1.828,166.969,0.814
1,0.246,0.59,0.737,0.0,0.151,-5.559,174.003,0.816
2,0.952,0.663,0.131,0.0,0.103,-13.879,99.488,0.368
3,0.703,0.24,0.326,0.0,0.0985,-12.178,171.758,0.227
4,0.95,0.331,0.225,0.123,0.202,-21.15,140.576,0.39


In [123]:
from sklearn.cluster import KMeans
import json

In [17]:
def knn_predictor(audio_feats, k=20):
    """
    similar_song_ids, visual_df = knn_predictor(audio_features)
    """
    # Scale the data with standard scaler
    scaler = StandardScaler()
    spotify_scaled = scaler.fit_transform(spotify)

    ################################################
    audio_feats_scaled = scaler.transform([audio_feats])

    ## Nearest Neighbors model
    knn = NearestNeighbors(n_neighbors=k, algorithm='kd_tree')
    knn.fit(spotify_scaled)

    # prediction 
    prediction = knn.kneighbors(audio_feats_scaled)

    # # Get the indexes of the list of similar songs
    # if prediction[0][0][0] == 0.0:
    #     similar_songs_index = prediction[1][0][1:].tolist()
    # else:
    similar_songs_index = prediction[1][0][:k].tolist()
    
    # Create an empty list to store simlar song names
    similar_song_ids = []
    similar_song_names = []

    # loop over the indexes and append song names to empty list above
    for i in similar_songs_index:
        song_id = identify['track_id'].iloc[i]
        similar_song_ids.append(song_id)
        song_name = identify['track_name'].iloc[i]
        similar_song_names.append(song_name)

    #################################################

    column_names = spotify.columns.tolist()

    # put scaled audio features into a dataframe
    audio_feats_scaled_df = pd.DataFrame(audio_feats_scaled, columns=column_names)

    # create empty list of similar songs' features
    similar_songs_features = []

    # loop through the indexes of similar songs to get audio features for each
    #. similar song
    for index in similar_songs_index:
        list_of_feats = spotify.iloc[index].tolist()
        similar_songs_features.append(list_of_feats)

    # scale the features and turn them into a dataframe
    similar_feats_scaled = scaler.transform(similar_songs_features)
    similar_feats_scaled_df = pd.DataFrame(similar_feats_scaled, columns=column_names)

    

    # get the % difference between the outputs and input songs
    col_names = similar_feats_scaled_df.columns.to_list()
    diff_df = pd.DataFrame(columns=col_names)
    for i in range(k):
        diff = abs(similar_feats_scaled_df.iloc[i] - audio_feats_scaled_df.iloc[0])
        # print('type: ', type(similar_feats_scaled_df.iloc[i]))
        diff_df.loc[i] = diff
    
    # add sums of differences 
    diff_df['sum'] = diff_df.sum(axis=1)
    diff_df = diff_df.sort_values(by=['sum'])
    diff_df = diff_df.reset_index(drop=True)

    # add track_id to DF
    diff_df['track_id'] = similar_song_ids

    # reorder cols to have track_id as first column
    cols = list(diff_df)
    cols.insert(0, cols.pop(cols.index('track_id')))
    diff_df = diff_df.loc[:, cols]

    # Remove the suggestion of the same song (all 0's)
    diff_df = diff_df[~(diff_df == 0).any(axis=1)]

    # Grab only the unique 10 songs
    diff_df = diff_df.drop_duplicates(subset=['sum'])[:10]

    diff_df = diff_df.reset_index(drop=True)

    print(diff_df)

    return diff_df


worst_nites = spotify.iloc[77647].tolist()

test_audio_features = worst_nites

diff_df = knn_predictor(test_audio_features)

diff_json = diff_df.to_json(orient='records')

print(diff_json)
# print(diff_json)
# print('-----------------')
# print('Recommended song_ids:')
# print(similar_song_ids)
# print('Recommended song_names:')
# print(similar_song_names)
# print('-----------------')
# print(visual_df)

track_id  acousticness  danceability    energy  \
0  4Tfobc8QPPPKVlk7KKJpYZ      0.074866      0.016163  0.011387   
1  66eQL4ghCuYWdcqz50BPXF      0.089523      0.172407  0.106280   
2  6kRBYjaEtuCsPNAqoLCR34      0.051188      0.016163  0.041753   
3  0JfAMd3xTqm7ZYhBmQjYzt      0.074020      0.086203  0.091097   
4  4sJoIeb8zWYCLHSLM0az3b      0.102490      0.043102  0.053140   
5  6aAHtvwGUJIIFcczWauwWc      0.104181      0.118529  0.189786   
6  21RzyxY3EFaxVy6K4RqaU9      0.110664      0.059265  0.045549   
7  79UX8fkSsowWI1HOd8VoYt      0.021786      0.059265  0.098689   
8  7HjK2whApIuUgRVP8akqjk      0.012853      0.210120  0.197377   
9  0HqkqqlMMKeyhC3I7mNV7b      0.006088      0.247834  0.068323   

   instrumentalness  liveness  loudness     tempo   valence       sum  
0          0.005450  0.179551  0.126538  0.032137  0.099975  0.546067  
1          0.005450  0.031270  0.003001  0.129293  0.023071  0.560295  
2          0.008819  0.283449  0.024674  0.097609  0.049988  0.

In [None]:
# # create empty list for averaged features of recommended songs
    # similar_feats_averaged = []

    # # loop through columns of audio features and get average of each column for 5
    # #. recommended songs
    # for col in column_names:
    #     avg = similar_feats_scaled_df[col].mean()
    #     similar_feats_averaged.append(avg)
    
    # # print('Sum of means of all predicted songs: ', sum(similar_feats_averaged))

    # # turn averages into 1 row dataframe
    # similar_feats_averaged_df = pd.DataFrame([similar_feats_averaged], columns=column_names)

    # # concatenate this with input songs audio features to be used for visualizing
    # visual_df = pd.concat([audio_feats_scaled_df, similar_feats_averaged_df], ignore_index=True)

    # genre_map = {'Movie': 0, 'R&B': 1, 'A Capella': 2, 'Alternative': 3, 'Country': 4, 'Dance': 5, 'Electronic': 6, 'Anime': 7, 'Folk': 8, 'Blues': 9, 'Opera': 10, 'Hip-Hop': 11,     "Children's Music": 12, 'Children’s Music': 12, 'Rap': 13, 'Indie': 14, 'Classical': 15, 'Pop': 16, 'Reggae': 17, 'Reggaeton': 18, 'Jazz': 19, 'Rock': 20, 'Ska': 21, 'Comedy':      22, 'Soul': 23, 'Soundtrack': 24, 'World': 25}

    # spotify['genre'] = spotify['genre'].map(genre_map)