In [31]:
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

def load_and_clean(csv_path='SpotifyFeatures.csv'):
    """
    Load the Spotify features CSV and split it into a model frame and a
    lookup frame.

    Usage:
        spotify, identify = load_and_clean()

    Parameters
    ----------
    csv_path : str or path-like, default 'SpotifyFeatures.csv'
        CSV file to read. It must contain the columns 'genre', 'mode',
        'time_signature', 'key', 'track_id', 'artist_name' and 'track_name'.

    Returns
    -------
    spotify : pandas.DataFrame
        The CSV contents minus 'mode', 'time_signature', 'key', 'track_id',
        'artist_name' and 'track_name', with 'genre' replaced by its integer
        code from the genre map below.
    identify : pandas.DataFrame
        The 'artist_name', 'track_id' and 'track_name' columns, sharing the
        same row order as ``spotify``.

    Raises
    ------
    ValueError
        If 'genre' holds a non-null value that has no entry in the genre map.
    """
    raw = pd.read_csv(csv_path)

    # dataframe that serves to identify songs (explicit copy, independent
    # of the raw frame)
    identify = raw[['artist_name', 'track_id', 'track_name']].copy()

    # dataframe consisting of audio features we want to train on
    features = raw.drop(columns=['mode',
                                 'time_signature',
                                 'key',
                                 'track_id',
                                 'artist_name',
                                 'track_name'])

    # Both apostrophe spellings of "Children's Music" share code 12.
    genre_map = {
        'Movie': 0, 'R&B': 1, 'A Capella': 2, 'Alternative': 3,
        'Country': 4, 'Dance': 5, 'Electronic': 6, 'Anime': 7,
        'Folk': 8, 'Blues': 9, 'Opera': 10, 'Hip-Hop': 11,
        "Children's Music": 12, 'Children’s Music': 12,
        'Rap': 13, 'Indie': 14, 'Classical': 15, 'Pop': 16,
        'Reggae': 17, 'Reggaeton': 18, 'Jazz': 19, 'Rock': 20,
        'Ska': 21, 'Comedy': 22, 'Soul': 23, 'Soundtrack': 24,
        'World': 25,
    }

    genre_codes = features['genre'].map(genre_map)

    # Series.map turns unknown keys into NaN without complaint; surface them
    # here instead of letting NaN reach the scaler / neighbour search.
    unmapped = features.loc[genre_codes.isna() & features['genre'].notna(), 'genre']
    if not unmapped.empty:
        unknown = sorted(set(map(str, unmapped)))
        raise ValueError(f'Unmapped genre value(s) in {csv_path!r}: {unknown}')

    features['genre'] = genre_codes

    return features, identify

spotify, identify = load_and_clean()

# Print the shape and first rows of each frame, with a rule between the two.
for shown in (spotify.shape, spotify.head(), '-----------------',
              identify.shape, identify.head()):
    print(shown)

(232725, 12)
   genre  popularity  acousticness  danceability  duration_ms  energy  \
0      0           0         0.611         0.389        99373   0.910   
1      0           1         0.246         0.590       137373   0.737   
2      0           3         0.952         0.663       170267   0.131   
3      0           0         0.703         0.240       152427   0.326   
4      0           4         0.950         0.331        82625   0.225   

   instrumentalness  liveness  loudness  speechiness    tempo  valence  
0             0.000    0.3460    -1.828       0.0525  166.969    0.814  
1             0.000    0.1510    -5.559       0.0868  174.003    0.816  
2             0.000    0.1030   -13.879       0.0362   99.488    0.368  
3             0.000    0.0985   -12.178       0.0395  171.758    0.227  
4             0.123    0.2020   -21.150       0.0456  140.576    0.390  
-----------------
(232725, 3)
         artist_name                track_id                        track_name
0

In [32]:
# Number of missing values in each column of the feature frame.
spotify.isna().sum(axis=0)

genre               0
popularity          0
acousticness        0
danceability        0
duration_ms         0
energy              0
instrumentalness    0
liveness            0
loudness            0
speechiness         0
tempo               0
valence             0
dtype: int64

In [33]:
# Distinct genre codes present after mapping.
pd.unique(spotify['genre'])

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25], dtype=int64)

In [34]:
# Genre name -> integer code. Both apostrophe spellings of
# "Children's Music" map to the same code (12).
genre_map = {
    'Movie': 0,
    'R&B': 1,
    'A Capella': 2,
    'Alternative': 3,
    'Country': 4,
    'Dance': 5,
    'Electronic': 6,
    'Anime': 7,
    'Folk': 8,
    'Blues': 9,
    'Opera': 10,
    'Hip-Hop': 11,
    "Children's Music": 12,
    'Children’s Music': 12,
    'Rap': 13,
    'Indie': 14,
    'Classical': 15,
    'Pop': 16,
    'Reggae': 17,
    'Reggaeton': 18,
    'Jazz': 19,
    'Rock': 20,
    'Ska': 21,
    'Comedy': 22,
    'Soul': 23,
    'Soundtrack': 24,
    'World': 25,
}

In [35]:
# First five rows of the feature frame.
spotify.head(n=5)

Unnamed: 0,genre,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence
0,0,0,0.611,0.389,99373,0.91,0.0,0.346,-1.828,0.0525,166.969,0.814
1,0,1,0.246,0.59,137373,0.737,0.0,0.151,-5.559,0.0868,174.003,0.816
2,0,3,0.952,0.663,170267,0.131,0.0,0.103,-13.879,0.0362,99.488,0.368
3,0,0,0.703,0.24,152427,0.326,0.0,0.0985,-12.178,0.0395,171.758,0.227
4,0,4,0.95,0.331,82625,0.225,0.123,0.202,-21.15,0.0456,140.576,0.39


In [49]:
def knn_predictor(audio_feats, n_recommendations=5):
  """
  Recommend the songs whose scaled audio features lie closest to a query.

  Usage:
      similar_song_ids, similar_song_names, visual_df = knn_predictor(audio_features)

  Reads the module-level frames ``spotify`` (numeric features) and
  ``identify`` (track lookup), which must share the same row order.

  Parameters
  ----------
  audio_feats : sequence of numbers
      One value per column of ``spotify``, in the same column order.
  n_recommendations : int, default 5
      How many similar songs to return.

  Returns
  -------
  similar_song_ids : list
      'track_id' values of the recommended songs, nearest first.
  similar_song_names : list
      'track_name' values of the same songs, in the same order.
  visual_df : pandas.DataFrame
      Two rows over the columns of ``spotify``: row 0 is the scaled query,
      row 1 is the column-wise mean of the scaled recommended songs.

  Raises
  ------
  ValueError
      If ``n_recommendations`` is smaller than 1.
  """
  if n_recommendations < 1:
    raise ValueError('n_recommendations must be at least 1')

  column_names = spotify.columns.tolist()

  # Scale the data with standard scaler.
  # NOTE: the scaler and the neighbour index are rebuilt on every call.
  scaler = StandardScaler()
  spotify_scaled = scaler.fit_transform(spotify)

  # Give the query the training column names so the scaler, which was fit
  # on a DataFrame, is transformed with matching feature names.
  audio_feats_df = pd.DataFrame([list(audio_feats)], columns=column_names)
  audio_feats_scaled = scaler.transform(audio_feats_df)

  ## Nearest Neighbors model
  # One extra neighbour is requested so the query itself can be dropped
  # when it is already a row of the dataset.
  n_neighbors = min(n_recommendations + 1, len(spotify_scaled))
  nn = NearestNeighbors(n_neighbors=n_neighbors, algorithm='kd_tree')
  nn.fit(spotify_scaled)

  # prediction
  distances, indices = nn.kneighbors(audio_feats_scaled)

  # A nearest distance of exactly 0 means the query matches a dataset row;
  # skip that row. Either way, return the same number of recommendations.
  first = 1 if distances[0][0] == 0.0 else 0
  similar_songs_index = indices[0][first:first + n_recommendations].tolist()

  # Look up ids and names of the recommended songs by row position
  recommended = identify.iloc[similar_songs_index]
  similar_song_ids = recommended['track_id'].tolist()
  similar_song_names = recommended['track_name'].tolist()

  # put scaled audio features into a dataframe
  audio_feats_scaled_df = pd.DataFrame(audio_feats_scaled, columns=column_names)

  # The neighbours' scaled features are already rows of spotify_scaled, so
  # average those directly instead of re-scaling each row.
  similar_feats_averaged = spotify_scaled[similar_songs_index].mean(axis=0)

  # turn averages into 1 row dataframe
  similar_feats_averaged_df = pd.DataFrame([similar_feats_averaged], columns=column_names)

  # concatenate this with input songs audio features to be used for visualizing
  visual_df = pd.concat([audio_feats_scaled_df, similar_feats_averaged_df], ignore_index=True)

  return similar_song_ids, similar_song_names, visual_df


# Synthetic query: one value per feature column, in the column order of `spotify`.
test_audio_features = [5, 3, .5, .5, 100000, 0.8, 0.5, 0.25, -100.0, 0.02, 125.0, 100.0]
similar_song_ids, similar_song_names, visual_df = knn_predictor(test_audio_features)

# Emit the report in one call; each item lands on its own line.
print(
    '-----------------',
    'Recommended song_ids:',
    similar_song_ids,
    'Recommended song_names:',
    similar_song_names,
    '-----------------',
    visual_df,
    sep='\n',
)

-----------------
Recommended song_ids:
['4tygYGKESZsvsHDjgHKeCS', '2ENL43xerIA73IypQohcKh', '6molwsVZCgoAUlnSSzdb5b', '5IUdhVJhpNiwr3C5KMrPvn', '6j2fR9yEGXutTaTp6pZLXp']
Recommended song_names:
['Goosey Goosey Gander (Wurlitzer)', "Edward's Harp", 'The Crooked Man', 'Pat a Cake for Bedtime', 'The Crooked Man']
-----------------
      genre  popularity  acousticness  danceability  duration_ms    energy  \
0 -1.114815   -2.096080      0.370495     -0.292900    -1.136096  0.869379   
1 -0.283534   -2.008119      1.492357      0.431208    -1.228306 -0.673958   

   instrumentalness  liveness   loudness  speechiness     tempo     valence  
0          1.161613  0.176478 -15.076232    -0.543155  0.237336  382.770122  
1          2.018375 -0.446302  -1.729674    -0.280431  1.043496    2.089798  
