In [1]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [2]:
# Init seaborn
sns.set()

In [3]:
# Read in data
data = pd.read_csv('tracks.csv')
data.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.434,0.177,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.321,0.0946,7,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.402,0.158,3,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4


In [4]:
# Get metrics on the data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 586672 entries, 0 to 586671
Data columns (total 20 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   id                586672 non-null  object 
 1   name              586601 non-null  object 
 2   popularity        586672 non-null  int64  
 3   duration_ms       586672 non-null  int64  
 4   explicit          586672 non-null  int64  
 5   artists           586672 non-null  object 
 6   id_artists        586672 non-null  object 
 7   release_date      586672 non-null  object 
 8   danceability      586672 non-null  float64
 9   energy            586672 non-null  float64
 10  key               586672 non-null  int64  
 11  loudness          586672 non-null  float64
 12  mode              586672 non-null  int64  
 13  speechiness       586672 non-null  float64
 14  acousticness      586672 non-null  float64
 15  instrumentalness  586672 non-null  float64
 16  liveness          58

In [5]:
# Test for empty values
data.isnull().sum()

id                   0
name                71
popularity           0
duration_ms          0
explicit             0
artists              0
id_artists           0
release_date         0
danceability         0
energy               0
key                  0
loudness             0
mode                 0
speechiness          0
acousticness         0
instrumentalness     0
liveness             0
valence              0
tempo                0
time_signature       0
dtype: int64

In [6]:
# Clean the data
data['name'].fillna('Unknown Title', inplace=True)

In [7]:
# Test for empty values again
data.isnull().sum()

id                  0
name                0
popularity          0
duration_ms         0
explicit            0
artists             0
id_artists          0
release_date        0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
dtype: int64

In [8]:
# Extract only the features
features = data.drop(columns=
                    ['id', 'name', 'artists', 'id_artists', 'release_date']
                    )
# Get any correlations
features.corr()

Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
popularity,1.0,0.027681,0.211758,0.187,0.302315,0.015299,0.327028,-0.033655,-0.047357,-0.370882,-0.236487,-0.04874,0.004643,0.071364,0.086759
duration_ms,0.027681,1.0,-0.016747,-0.120371,0.024825,0.004626,0.000337,-0.02761,-0.125771,-0.064434,0.069278,0.002137,-0.163202,-0.00121,0.037552
explicit,0.211758,-0.016747,1.0,0.150229,0.123076,0.010932,0.134603,-0.051754,0.102258,-0.149018,-0.06752,-0.013114,-0.016539,0.005745,0.044447
danceability,0.187,-0.120371,0.150229,1.0,0.241563,0.018824,0.251436,-0.044719,0.19931,-0.242951,-0.2259,-0.106168,0.52815,-0.040783,0.146196
energy,0.302315,0.024825,0.123076,0.241563,1.0,0.036294,0.764735,-0.065246,-0.053506,-0.715412,-0.195839,0.124632,0.372276,0.230099,0.188983
key,0.015299,0.004626,0.010932,0.018824,0.036294,1.0,0.027109,-0.129491,-0.001035,-0.027414,-0.006776,-0.006774,0.019992,0.004904,0.008928
loudness,0.327028,0.000337,0.134603,0.251436,0.764735,0.027109,1.0,-0.040464,-0.167112,-0.519423,-0.329306,0.029529,0.275448,0.189288,0.164377
mode,-0.033655,-0.02761,-0.051754,-0.044719,-0.065246,-0.129491,-0.040464,1.0,-0.017992,0.058918,-0.010008,0.007009,0.011474,0.007871,-0.015892
speechiness,-0.047357,-0.125771,0.102258,0.19931,-0.053506,-0.001035,-0.167112,-0.017992,1.0,0.06906,-0.10244,0.207047,0.046501,-0.086911,-0.114171
acousticness,-0.370882,-0.064434,-0.149018,-0.242951,-0.715412,-0.027414,-0.519423,0.058918,0.06906,1.0,0.204412,-0.004738,-0.180939,-0.195235,-0.173927


In [9]:
# Normalize the data as needed (from https://thecleverprogrammer.com/2021/03/03/spotify-recommendation-system-with-machine-learning/)
from sklearn.preprocessing import MinMaxScaler
datatypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
normarization = data.select_dtypes(include=datatypes)
for col in normarization.columns:
    MinMaxScaler(col)

In [10]:
# Cluster based off features to semi-predict the genre (from https://thecleverprogrammer.com/2021/03/03/spotify-recommendation-system-with-machine-learning/)
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=10)
features = kmeans.fit_predict(normarization)
data['features'] = features
MinMaxScaler(data['features'])

MinMaxScaler(feature_range=0         6
1         6
2         0
3         0
4         0
         ..
586667    2
586668    0
586669    0
586670    0
586671    7
Name: features, Length: 586672, dtype: int32)

In [11]:
class Recommender():
    def __init__(self, data):
        self.data = data
    
    def recommend(self, songs, amount=1):
        distance = []
        song = self.data[(self.data.name.str.lower() == songs.lower())].head(1).values[0]
        rec = self.data[self.data.name.str.lower() != songs.lower()]
        for songs in tqdm(rec.values):
            d = 0
            for column in np.arange(len(rec.columns)):
                if not column in [0, 1, 5, 6, 7]:
                    d = d + np.absolute(float(song[column]) - float(songs[column]))
            distance.append(d)
        rec['distance'] = distance
        rec = rec.sort_values('distance')
        columns = ['artists', 'name']
        return rec[columns][:amount]

In [12]:
recommendations = Recommender(data)

In [14]:
recommendations.recommend("Runaway", 20)

100%|██████████| 586618/586618 [00:21<00:00, 26707.48it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec['distance'] = distance


Unnamed: 0,artists,name
389113,['Royal Republic'],Getting Along
435579,['Matanza'],Meio Psicopata
256580,"['MYA', 'Fer Palacio']",250X
33889,['Connie Francis'],That's Amore
49965,['Cartola'],Minha
416291,['Jakob Ahlbom'],Forget
569690,['White 2115'],Gubię kroki
570167,"['SB Maffija', 'White 2115']",Gubię kroki
258112,['Elvis Presley'],I'm So Lonesome I Could Cry - Live at the Hono...
554567,['Joan Sebastian'],Toro Capirote
