In [35]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
import pickle

def load_and_clean():
    """
    spotify, identify = load_and_clean()
    """
    spotify = pd.read_csv('https://raw.githubusercontent.com/BW-pilot/MachineLearning/master/CSVs/spotify_final.csv')

    # spotify = spotify.drop(columns=['genre',
    #                                 'mode',
    #                                 'time_signature',
    #                                 'key',
    #                                 'popularity',
    #                                 'duration_ms',
    #                                 'speechiness'])
    # spotify.to_csv('spotify_final.csv')

    # dataframe that serves to identify songs
    identify = spotify[['artist_name', 'track_id', 'track_name']]

    # identify.to_csv('identify_tracks.csv')

    # dataframe consisting of audio features we want to train on
    spotify = spotify.drop(columns = ['track_id',
                                    'artist_name',
                                    'track_name'])

    return spotify, identify

spotify, identify = load_and_clean()

# spotify.to_csv('spotify.csv', index=False)
# print(spotify.shape)
# print(spotify.head())
# print('-----------------')
# print(identify.shape)
# print(identify.head())

In [36]:
spotify.isnull().sum(0)

Unnamed: 0          0
acousticness        0
danceability        0
energy              0
instrumentalness    0
liveness            0
loudness            0
tempo               0
valence             0
dtype: int64

In [37]:
spotify.head()

Unnamed: 0.1,Unnamed: 0,acousticness,danceability,energy,instrumentalness,liveness,loudness,tempo,valence
0,0,0.611,0.389,0.91,0.0,0.346,-1.828,166.969,0.814
1,1,0.246,0.59,0.737,0.0,0.151,-5.559,174.003,0.816
2,2,0.952,0.663,0.131,0.0,0.103,-13.879,99.488,0.368
3,3,0.703,0.24,0.326,0.0,0.0985,-12.178,171.758,0.227
4,4,0.95,0.331,0.225,0.123,0.202,-21.15,140.576,0.39


In [16]:
identify[identify['track_name'] == 'Worst Nites']

Unnamed: 0,artist_name,track_id,track_name
77647,Foster The People,7lVoniii4QwhNjCeHij2xZ,Worst Nites
93170,Foster The People,7lVoniii4QwhNjCeHij2xZ,Worst Nites
111450,Foster The People,7lVoniii4QwhNjCeHij2xZ,Worst Nites
166863,Foster The People,7lVoniii4QwhNjCeHij2xZ,Worst Nites


In [17]:
worst_nites = spotify.iloc[77647].tolist()
worst_nites

[77647.0, 0.00834, 0.741, 0.752, 0.00165, 0.0438, -4.968, 114.02, 0.609]

In [18]:
spotify.head()

Unnamed: 0.1,Unnamed: 0,acousticness,danceability,energy,instrumentalness,liveness,loudness,tempo,valence
0,0,0.611,0.389,0.91,0.0,0.346,-1.828,166.969,0.814
1,1,0.246,0.59,0.737,0.0,0.151,-5.559,174.003,0.816
2,2,0.952,0.663,0.131,0.0,0.103,-13.879,99.488,0.368
3,3,0.703,0.24,0.326,0.0,0.0985,-12.178,171.758,0.227
4,4,0.95,0.331,0.225,0.123,0.202,-21.15,140.576,0.39


In [19]:
print(spotify.shape)

(232725, 9)


In [27]:
def nn_predictor(audio_feats, k=25):
    """
    similar_song_ids, visual_df = knn_predictor(audio_features)
    """
    # Scale the data with standard scaler
    scaler = StandardScaler()
    spotify_scaled = scaler.fit_transform(spotify)

    ################################################
    audio_feats_scaled = scaler.transform([audio_feats])

    ## Nearest Neighbors model
    nn = NearestNeighbors(n_neighbors=k, algorithm='kd_tree')
    nn.fit(spotify_scaled)
    
    # prediction
    prediction = nn.kneighbors(audio_feats_scaled)
    
    
    # # Get the indexes of the list of similar songs
    # if prediction[0][0][0] == 0.0:
    #     similar_songs_index = prediction[1][0][1:].tolist()
    # else:
    similar_songs_index = prediction[1][0][:k].tolist()
    
    # Create an empty list to store simlar song names
    similar_song_ids = []
    similar_song_names = []

    # loop over the indexes and append song names to empty list above
    for i in similar_songs_index:
        song_id = identify['track_id'].iloc[i]
        similar_song_ids.append(song_id)
        song_name = identify['track_name'].iloc[i]
        similar_song_names.append(song_name)

    #################################################

    column_names = spotify.columns.tolist()

    # put scaled audio features into a dataframe
    audio_feats_scaled_df = pd.DataFrame(audio_feats_scaled, columns=column_names)

    # create empty list of similar songs' features
    similar_songs_features = []

    # loop through the indexes of similar songs to get audio features for each
    #. similar song
    for index in similar_songs_index:
        list_of_feats = spotify.iloc[index].tolist()
        similar_songs_features.append(list_of_feats)

    # scale the features and turn them into a dataframe
    similar_feats_scaled = scaler.transform(similar_songs_features)
    similar_feats_scaled_df = pd.DataFrame(similar_feats_scaled, columns=column_names)

    

    # get the % difference between the outputs and input songs
    col_names = similar_feats_scaled_df.columns.to_list()
    diff_df = pd.DataFrame(columns=col_names)
    for i in range(k):
        diff = abs(similar_feats_scaled_df.iloc[i] - audio_feats_scaled_df.iloc[0])
        # print('type: ', type(similar_feats_scaled_df.iloc[i]))
        diff_df.loc[i] = diff
    
    # add sums of differences 
    diff_df['sum'] = diff_df.sum(axis=1)
    diff_df = diff_df.sort_values(by=['sum'])
    diff_df = diff_df.reset_index(drop=True)

    # add track_id to DF
    diff_df['track_id'] = similar_song_ids

    # reorder cols to have track_id as first column
    cols = list(diff_df)
    cols.insert(0, cols.pop(cols.index('track_id')))
    diff_df = diff_df.loc[:, cols]

    # Remove the suggestion of the same song (all 0's)
    diff_df = diff_df[~(diff_df == 0).any(axis=1)]

    # Grab only the unique 10 songs
    diff_df = diff_df.drop_duplicates(subset=['sum'])[:10]

    diff_df = diff_df.reset_index(drop=True)

    # print(diff_df)

    return diff_df


worst_nites = spotify.iloc[77647].tolist()
wn_id = identify.iloc[77647]
worst_nites_json = wn_id.to_json(orient='records')
# print(worst_nites_json)
test_audio_features = worst_nites

diff_df = nn_predictor(test_audio_features)
print(diff_df.columns)
print(diff_df)

diff_json = diff_df.to_json(orient='records')

print(diff_json)
# print(diff_json)
# print('-----------------')
# print('Recommended song_ids:')
# print(similar_song_ids)
# print('Recommended song_names:')
# print(similar_song_names)
# print('-----------------')
# print(visual_df)

Index(['track_id', 'Unnamed: 0', 'acousticness', 'danceability', 'energy',
       'instrumentalness', 'liveness', 'loudness', 'tempo', 'valence', 'sum'],
      dtype='object')
                 track_id  Unnamed: 0  acousticness  danceability    energy  \
0  0HqkqqlMMKeyhC3I7mNV7b    0.126522      0.074866      0.016163  0.011387   
1  6vSq5q5DCs1IvwKIq53hj2    0.252821      0.036813      0.070040  0.049344   
2  6ayTAnEwP3Yo4DOjLsN5r2    0.245215      0.021786      0.059265  0.098689   
3  2BhkQtorT9Vka8GLUzUeH4    0.087479      0.157455      0.037714  0.394754   
4  2GHkACm48ILzrwW6NoWsnL    0.018755      0.020182      0.177794  0.068323   
5  0PGxTaUcMggyJxTyLy0uOx    0.130630      0.039350      0.032326  0.113871   
6  1FK0sZfZzCAIDUn7htCemp    0.176923      0.034840      0.086203  0.155624   
7  7lVoniii4QwhNjCeHij2xZ    0.092689      0.017927      0.167019  0.037957   
8  6vSq5q5DCs1IvwKIq53hj2    0.085321      0.045551      0.290936  0.254313   
9  5KNoyjCDbXVPhzAX3CZsnC    0.248

In [21]:
# # create empty list for averaged features of recommended songs
    # similar_feats_averaged = []

    # # loop through columns of audio features and get average of each column for 5
    # #. recommended songs
    # for col in column_names:
    #     avg = similar_feats_scaled_df[col].mean()
    #     similar_feats_averaged.append(avg)
    
    # # print('Sum of means of all predicted songs: ', sum(similar_feats_averaged))

    # # turn averages into 1 row dataframe
    # similar_feats_averaged_df = pd.DataFrame([similar_feats_averaged], columns=column_names)

    # # concatenate this with input songs audio features to be used for visualizing
    # visual_df = pd.concat([audio_feats_scaled_df, similar_feats_averaged_df], ignore_index=True)

    # genre_map = {'Movie': 0, 'R&B': 1, 'A Capella': 2, 'Alternative': 3, 'Country': 4, 'Dance': 5, 'Electronic': 6, 'Anime': 7, 'Folk': 8, 'Blues': 9, 'Opera': 10, 'Hip-Hop': 11,     "Children's Music": 12, 'Children’s Music': 12, 'Rap': 13, 'Indie': 14, 'Classical': 15, 'Pop': 16, 'Reggae': 17, 'Reggaeton': 18, 'Jazz': 19, 'Rock': 20, 'Ska': 21, 'Comedy':      22, 'Soul': 23, 'Soundtrack': 24, 'World': 25}

    # spotify['genre'] = spotify['genre'].map(genre_map)

In [22]:
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()  # for plot styling

## Using a Neural Network

In [34]:
spotify = pd.read_csv('https://raw.githubusercontent.com/BW-pilot/MachineLearning/master/CSVs/spotify_final.csv')
spotify.head()

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,track_id,acousticness,danceability,energy,instrumentalness,liveness,loudness,tempo,valence
0,0,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0.611,0.389,0.91,0.0,0.346,-1.828,166.969,0.814
1,1,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,0.246,0.59,0.737,0.0,0.151,-5.559,174.003,0.816
2,2,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,0.952,0.663,0.131,0.0,0.103,-13.879,99.488,0.368
3,3,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0.703,0.24,0.326,0.0,0.0985,-12.178,171.758,0.227
4,4,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,0.95,0.331,0.225,0.123,0.202,-21.15,140.576,0.39


In [32]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# load in the data
spotify = pd.read_csv('https://raw.githubusercontent.com/BW-pilot/MachineLearning/master/CSVs/spotify_final.csv')
print(spotify.head())


def normalize(vectors):
    return vectors / np.linalg.norm(vectors, axis=1, keepdims=True)


def predict(model, input_vector):
    return model.predict(input_vector).argsort()


def build_model(weights):
    model = Sequential([
        # Dot product between feature vector and reference vectors
        Dense(input_shape=(weights.shape[1],),
              units=weights.shape[0],
              activation='linear',
              name='dense_1',
              use_bias=False)
    ])
    model.set_weights([weights.T])
    return model


def get_results(input_vector, features, best_match=True, amount=5):
    """
    get_results(input_vector, features, best_match=True, amount=5)
    input_vector: audio features of the song to suggest similar songs to,
    plus track_id
    features: full database to suggest songs from
    best_match=True: True if you want most similar songs, False if least
    similar
    amount=5: amount of results to return.
    returns a list (might be a numpy array?) of indices from the original
    database
    """

    col_names = ['acousticness', 'danceability', 'energy', 'instrumentalness',
                 'key', 'liveness',	'loudness',	'speechiness', 'tempo',
                 'valence', 'id']
    input_vector_df = pd.DataFrame([input_vector], columns=col_names)

    cols_to_drop = ['Unnamed: 0', 'artists', 'duration_ms', 'explicit', 'id',
                    'mode', 'name', 'popularity', 'release_date', 'year']
    
    tr_id = input_vector_df['id'].values[0]
    ids = features['id']
    input_vec = input_vector_df.drop(columns=['id'])
    feats = features.drop(columns=cols_to_drop)
    # norm_vector = normalize(input_vec.values)
    norm_vector = normalize(input_vec)
    norm_features = normalize(feats)
    model = build_model(norm_features)
    prediction = np.array(predict(model, norm_vector).argsort())
    prediction = prediction.reshape(prediction.shape[1])
    feats['id'] = ids

    if best_match:
        if tr_id in ids[prediction[-amount:]]:
            return feats.loc[prediction[-amount-1:-1]]
        return feats.loc[prediction[-amount:]]
    return feats.loc[prediction[:amount]]

    
test_audio_features = [0.5,	0.7, 0.7, 0.0, 3, 0.1, -3, 0.03, 130, 0.9,
                       '6oXghnUUe9u2iIZPNfCxjl']   

results_1 = get_results(spotify.iloc[0], spotify, amount=5)
# print('-------------------------')
# print(results_1)

# results_2 = get_results(test_audio_features, spotify, amount=10)
# print('-------------------------')
# print(results_2)

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

   Unnamed: 0        artist_name                        track_name  \
0           0     Henri Salvador       C'est beau de faire un Show   
1           1  Martin & les fées  Perdu d'avance (par Gad Elmaleh)   
2           2    Joseph Williams    Don't Let Me Be Lonely Tonight   
3           3     Henri Salvador    Dis-moi Monsieur Gordon Cooper   
4           4       Fabien Nataf                         Ouverture   

                 track_id  acousticness  danceability  energy  \
0  0BRjO6ga9RKCKjfDqeFgWV         0.611         0.389   0.910   
1  0BjC1NfoEOOusryehmNudP         0.246         0.590   0.737   
2  0CoSDzoNIKCRs124s9uTVy         0.952         0.663   0.131   
3

TypeError: can only concatenate str (not "list") to str

## Experimenting with connecting to Spotify API

In [24]:
import spotipy
import pandas as pd
from spotipy.oauth2 import SpotifyClientCredentials # To access authorised Spotify data
import os
from dotenv import load_dotenv

load_dotenv()

CLIENT_ID = os.getenv("MY_ID")
CLIENT_SECRET = os.getenv("MY_SECRET")

PLAYLIST_ID = '37i9dQZF1DWYJ5kmTbkZiz' # Spotify playlist id

# API Login
client_credentials_manager = SpotifyClientCredentials(client_id=CLIENT_ID, client_secret=CLIENT_SECRET)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager) #spotify object to access API

# Getting tracks of the playlist
tracks = sp.user_playlist_tracks(user = 'spotify', playlist_id = PLAYLIST_ID)
tracks_uri_list = [x['track']['uri'] for x in tracks['items']]

# Getting features of tracks
features = []
for i in tracks_uri_list:
    features = features + sp.audio_features(i)

# Creating feature dataframe
cols_to_drop = ['id', 'analysis_url', 'key', 'time_signature', 'track_href', 'type', 'uri', 'mode', 'duration_ms']
features_df = pd.DataFrame(features).drop(cols_to_drop, axis=1)
features_df

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0.792,0.625,-5.609,0.0536,0.00776,0.00188,0.329,0.37,103.967
1,0.723,0.809,-3.081,0.0625,0.00346,0.00123,0.565,0.274,98.007
2,0.755,0.522,-4.368,0.0575,0.533,0.0,0.0685,0.925,89.96
3,0.865,0.521,-6.932,0.0371,0.548,0.000115,0.0989,0.748,129.059
4,0.729,0.756,-5.119,0.0294,0.131,0.0,0.0527,0.522,104.945
5,0.641,0.922,-4.457,0.0786,0.0291,0.0,0.0862,0.847,146.078
6,0.778,0.317,-10.732,0.334,0.592,0.0,0.0881,0.327,140.048
7,0.532,0.783,-5.697,0.0523,0.0038,0.0012,0.161,0.643,124.08
8,0.749,0.925,-5.034,0.227,0.241,0.0,0.52,0.641,86.989
9,0.701,0.425,-10.965,0.375,0.328,0.13,0.1,0.562,135.128
