In [29]:
import pandas as pd
import numpy as np
from math import floor

In [30]:
# Show every column whenever a frame is displayed (the table is wide).
pd.options.display.max_columns = None

# Load the raw Spotify tracks table and report its (rows, columns) size.
dataset = pd.read_csv('datasets/spotify_tracks.csv')
dataset.shape

(114000, 21)

In [31]:
# Preview the first five rows (the default for head()) to inspect the columns.
dataset.head()

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


Checking if there are missing values and, consequently, dropping the rows with missing values

In [32]:
# Count the missing entries in each column.
dataset.isna().sum()

Unnamed: 0          0
track_id            0
artists             1
album_name          1
track_name          1
popularity          0
duration_ms         0
explicit            0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
track_genre         0
dtype: int64

In [33]:
# Keep only the rows that have a value in every column.
dataset = dataset.dropna()

Checking the number of distinct values for each column

There are multiple rows with the same **track_id**

In [34]:
# Number of distinct values per column (column-wise is the default axis).
dataset.nunique()

Unnamed: 0          113999
track_id             89740
artists              31437
album_name           46589
track_name           73608
popularity             101
duration_ms          50696
explicit                 2
danceability          1174
energy                2083
key                     12
loudness             19480
mode                     2
speechiness           1489
acousticness          5061
instrumentalness      5346
liveness              1722
valence               1790
tempo                45652
time_signature           5
track_genre            114
dtype: int64

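Rows are almost uniformly distributed across **track_genre**: every genre has 1000 rows, except k-pop, which has 999 after the row with missing values was dropped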

In [35]:
# Rows per genre, smallest count first.
dataset.loc[:, 'track_genre'].value_counts().sort_values(ascending=True)

track_genre
k-pop           999
alternative    1000
ambient        1000
anime          1000
black-metal    1000
               ... 
trip-hop       1000
turkish        1000
sleep          1000
punk           1000
indie          1000
Name: count, Length: 114, dtype: int64

In [36]:
# Names of the per-genre indicator columns, one per distinct genre label,
# in order of first appearance in the data.
genres = ['GENRE_' + label for label in dataset['track_genre'].unique()]

We can drop the column **Unnamed: 0** because it's not useful

Then we can drop some duplicate rows of the dataset

In [37]:
# Remove the 'Unnamed: 0' column, then discard rows that are exact
# duplicates of an earlier row.
dataset = dataset.drop(columns=['Unnamed: 0']).drop_duplicates()

In [38]:
# Grouping keys: every column apart from track_genre ...
columns_except_track_genre = dataset.columns.tolist()
columns_except_track_genre.remove('track_genre')

# ... and the same list with popularity taken out as well.
columns_except_track_genre_and_popularity = columns_except_track_genre.copy()
columns_except_track_genre_and_popularity.remove('popularity')

Dataset now has 113549 rows

There are rows that are completely equal except for their **track_genre** attribute

We aggregate these rows joining their **track_genre** attributes with a semicolon

After this operation the dataset will have 90460 rows, 23089 fewer than before

In [39]:
# Collapse rows that agree on every column except track_genre into one row
# whose track_genre is the ';'-joined list of the group's genres.
dataset = (
    dataset
    .groupby(columns_except_track_genre)['track_genre']
    .apply(lambda labels: ';'.join(labels))
    .reset_index()
)
dataset.shape

(90460, 20)

In [40]:
# Rows per genre combination, rarest combination first.
dataset.loc[:, 'track_genre'].value_counts().sort_values(ascending=True)

track_genre
alt-rock;blues;rock                    1
honky-tonk;rockabilly                  1
indian;indie-pop;indie;malay;rock      1
anime;j-pop;j-rock;punk-rock           1
german;progressive-house               1
                                    ... 
salsa                                989
idm                                  990
sleep                                997
study                                998
tango                                999
Name: count, Length: 1436, dtype: int64

There are still some pairs of rows that share the same **track_id**

In [41]:
# Distinct values per column after merging the genre-only duplicates.
dataset.nunique()

track_id            89740
artists             31437
album_name          46589
track_name          73608
popularity            101
duration_ms         50696
explicit                2
danceability         1174
energy               2083
key                    12
loudness            19480
mode                    2
speechiness          1489
acousticness         5061
instrumentalness     5346
liveness             1722
valence              1790
tempo               45652
time_signature          5
track_genre          1436
dtype: int64

In [42]:
# How many rows each track_id still has (most frequent first).
dataset.loc[:, 'track_id'].value_counts()

track_id
2wSEG8f9WIqyrdlXjdxzGu    2
2WyKvnMEF9H8CKHvmAXSm3    2
14dJexYlvd3t3XAtD1pYW1    2
0vp47IigZ7JVQ89jDetO7H    2
4ElNxglBjcrASiGn58t9Jm    2
                         ..
2bsyecmZCgdlsCZ3sWVZ99    1
2bsehqodemh5uNa72hwWJ7    1
2bsTRbvMKmgyDYr0mK3jTe    1
2bsF8h0aj0bgLSgG2FwdeU    1
7zz7iNGIWhmfFE7zlXkMma    1
Name: count, Length: 89740, dtype: int64

The two rows of each pair are identical except for **popularity** and **track_genre**

In [43]:
# Number of groups when popularity and track_genre are ignored; len() of a
# GroupBy object is the number of groups.
len(dataset.groupby(columns_except_track_genre_and_popularity))

89740

In [44]:
# Look at the rows of one track_id that appears twice.
dataset.loc[dataset['track_id'].eq('2wSEG8f9WIqyrdlXjdxzGu')]

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
34163,2wSEG8f9WIqyrdlXjdxzGu,iamjakehill,Sleeping All Alone,Sleeping All Alone,53,210067,True,0.812,0.38,5,-8.171,1,0.0362,0.712,0.0,0.0994,0.899,115.975,4,sad
34164,2wSEG8f9WIqyrdlXjdxzGu,iamjakehill,Sleeping All Alone,Sleeping All Alone,54,210067,True,0.812,0.38,5,-8.171,1,0.0362,0.712,0.0,0.0994,0.899,115.975,4,emo


We merge each pair into a single row, joining their **track_genre** attributes with a semicolon and taking the mean of their **popularity** attributes, rounded down to an integer

In [45]:
# Merge rows that differ only in popularity / track_genre:
#   popularity  -> mean of the group, rounded down to an integer
#   track_genre -> the group's genre strings joined with ';'
dataset = (
    dataset
    .groupby(columns_except_track_genre_and_popularity, as_index=False)
    .agg({
        'popularity': lambda scores: floor(np.mean(scores)),
        'track_genre': lambda labels: ';'.join(labels),
    })
)
dataset

Unnamed: 0,track_id,artists,album_name,track_name,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,popularity,track_genre
0,0000vdREvCVMxbQTkS888c,Rill,Lolly,Lolly,160725,True,0.910,0.37400,8,-9.844,0,0.1990,0.075700,0.00301,0.1540,0.432,104.042,4,44,german
1,000CC8EParg64OmTxVnZ0p,Glee Cast,Glee Love Songs,It's All Coming Back To Me Now (Glee Cast Vers...,322933,False,0.269,0.51600,0,-7.361,1,0.0366,0.406000,0.00000,0.1170,0.341,178.174,4,47,club
2,000Iz0K615UepwSJ5z2RE5,Paul Kalkbrenner;Pig&Dan,X,Böxig Leise - Pig & Dan Remix,515360,False,0.686,0.56000,5,-13.264,0,0.0462,0.001140,0.18100,0.1110,0.108,119.997,4,22,minimal-techno
3,000RDCYioLteXcutOjeweY,Jordan Sandhu,Teeje Week,Teeje Week,190203,False,0.679,0.77000,0,-3.537,1,0.1900,0.058300,0.00000,0.0825,0.839,161.721,4,62,hip-hop
4,000qpdoc97IMTBvF8gwcpy,Paul Kalkbrenner,Zeit,Tief,331240,False,0.519,0.43100,6,-13.606,0,0.0291,0.000964,0.72000,0.0916,0.234,129.971,4,19,minimal-techno
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89735,7zxHiMmVLt4LGWpOMqOpUh,Haricharan;Gopi Sundar,Bangalore Days,"Aethu Kari Raavilum - From ""Bangalore Days""",325156,False,0.766,0.38200,7,-11.464,0,0.0324,0.698000,0.00143,0.1570,0.672,119.992,4,56,pop-film
89736,7zxpdh3EqMq2JCkOI0EqcG,Piano Genie,Disney Favourites,"Two Worlds (From ""Tarzan"")",109573,False,0.529,0.00879,10,-32.266,1,0.0587,0.996000,0.95900,0.0916,0.510,82.694,4,23,disney
89737,7zyYmIdjqqiX6kLryb7QBx,Eric Chou,學著愛,以後別做朋友,260573,False,0.423,0.36000,3,-9.458,1,0.0372,0.728000,0.00000,0.1050,0.291,130.576,4,61,mandopop
89738,7zybSU9tFO9HNlwmGF7stc,Stereoclip,Echoes,Sunset Drive,234300,False,0.649,0.83400,10,-11.430,0,0.0397,0.268000,0.93200,0.0974,0.150,125.004,4,54,electronic


In [46]:
# Same track as before: it should now be a single, merged row.
dataset.loc[dataset['track_id'].eq('2wSEG8f9WIqyrdlXjdxzGu')]

Unnamed: 0,track_id,artists,album_name,track_name,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,popularity,track_genre
33884,2wSEG8f9WIqyrdlXjdxzGu,iamjakehill,Sleeping All Alone,Sleeping All Alone,210067,True,0.812,0.38,5,-8.171,1,0.0362,0.712,0.0,0.0994,0.899,115.975,4,53,sad;emo


In [47]:
# Rows per genre combination after the second merge, rarest first.
dataset.loc[:, 'track_genre'].value_counts().sort_values(ascending=True)

track_genre
alt-rock;blues;rock                                                                    1
chill;singer-songwriter;songwriter                                                     1
children;disney                                                                        1
blues;british;country;folk;psych-rock                                                  1
blues;country;folk;j-pop;j-rock;power-pop;psych-rock;singer-songwriter;songwriter      1
                                                                                    ... 
comedy                                                                               989
idm                                                                                  990
sleep                                                                                997
study                                                                                998
tango                                                                                999
Name: cou

Finally **track_id** has become a unique identifier of the songs

We can now drop the column **track_id** because it's not useful

In [48]:
# (rows, columns) after merging the rows that shared a track_id.
dataset.shape

(89740, 20)

In [49]:
# Distinct values per column; compare the track_id count with the row count.
dataset.nunique()

track_id            89740
artists             31437
album_name          46589
track_name          73608
duration_ms         50696
explicit                2
danceability         1174
energy               2083
key                    12
loudness            19480
mode                    2
speechiness          1489
acousticness         5061
instrumentalness     5346
liveness             1722
valence              1790
tempo               45652
time_signature          5
popularity            101
track_genre          1584
dtype: int64

In [50]:
# Drop the identifier column; without it a few rows become exact duplicates
# of each other, so those are removed too.
dataset = dataset.drop(columns=['track_id']).drop_duplicates()
dataset.shape

(89665, 19)

We can now turn each genre into a binary (0/1) feature and remove the **track_genre** column

In [51]:
# One all-zero indicator column per genre, indexed like the dataset so the
# column-wise concatenation lines the rows up.
temp = pd.DataFrame(0, index=dataset.index, columns=genres)
dataset = pd.concat([dataset, temp], axis='columns')
dataset.head(10)

Unnamed: 0,artists,album_name,track_name,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,popularity,track_genre,GENRE_acoustic,GENRE_afrobeat,GENRE_alt-rock,GENRE_alternative,GENRE_ambient,GENRE_anime,GENRE_black-metal,GENRE_bluegrass,GENRE_blues,GENRE_brazil,GENRE_breakbeat,GENRE_british,GENRE_cantopop,GENRE_chicago-house,GENRE_children,GENRE_chill,GENRE_classical,GENRE_club,GENRE_comedy,GENRE_country,GENRE_dance,GENRE_dancehall,GENRE_death-metal,GENRE_deep-house,GENRE_detroit-techno,GENRE_disco,GENRE_disney,GENRE_drum-and-bass,GENRE_dub,GENRE_dubstep,GENRE_edm,GENRE_electro,GENRE_electronic,GENRE_emo,GENRE_folk,GENRE_forro,GENRE_french,GENRE_funk,GENRE_garage,GENRE_german,GENRE_gospel,GENRE_goth,GENRE_grindcore,GENRE_groove,GENRE_grunge,GENRE_guitar,GENRE_happy,GENRE_hard-rock,GENRE_hardcore,GENRE_hardstyle,GENRE_heavy-metal,GENRE_hip-hop,GENRE_honky-tonk,GENRE_house,GENRE_idm,GENRE_indian,GENRE_indie-pop,GENRE_indie,GENRE_industrial,GENRE_iranian,GENRE_j-dance,GENRE_j-idol,GENRE_j-pop,GENRE_j-rock,GENRE_jazz,GENRE_k-pop,GENRE_kids,GENRE_latin,GENRE_latino,GENRE_malay,GENRE_mandopop,GENRE_metal,GENRE_metalcore,GENRE_minimal-techno,GENRE_mpb,GENRE_new-age,GENRE_opera,GENRE_pagode,GENRE_party,GENRE_piano,GENRE_pop-film,GENRE_pop,GENRE_power-pop,GENRE_progressive-house,GENRE_psych-rock,GENRE_punk-rock,GENRE_punk,GENRE_r-n-b,GENRE_reggae,GENRE_reggaeton,GENRE_rock-n-roll,GENRE_rock,GENRE_rockabilly,GENRE_romance,GENRE_sad,GENRE_salsa,GENRE_samba,GENRE_sertanejo,GENRE_show-tunes,GENRE_singer-songwriter,GENRE_ska,GENRE_sleep,GENRE_songwriter,GENRE_soul,GENRE_spanish,GENRE_study,GENRE_swedish,GENRE_synth-pop,GENRE_tango,GENRE_techno,GENRE_trance,GENRE_trip-hop,GENRE_turkish,GENRE_world-music
0,Rill,Lolly,Lolly,160725,True,0.91,0.374,8,-9.844,0,0.199,0.0757,0.00301,0.154,0.432,104.042,4,44,german,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Glee Cast,Glee Love Songs,It's All Coming Back To Me Now (Glee Cast Vers...,322933,False,0.269,0.516,0,-7.361,1,0.0366,0.406,0.0,0.117,0.341,178.174,4,47,club,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Paul Kalkbrenner;Pig&Dan,X,Böxig Leise - Pig & Dan Remix,515360,False,0.686,0.56,5,-13.264,0,0.0462,0.00114,0.181,0.111,0.108,119.997,4,22,minimal-techno,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Jordan Sandhu,Teeje Week,Teeje Week,190203,False,0.679,0.77,0,-3.537,1,0.19,0.0583,0.0,0.0825,0.839,161.721,4,62,hip-hop,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Paul Kalkbrenner,Zeit,Tief,331240,False,0.519,0.431,6,-13.606,0,0.0291,0.000964,0.72,0.0916,0.234,129.971,4,19,minimal-techno,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,Chad Daniels,Busy Being Awesome,Thanksgiving Chicken,127040,True,0.536,0.78,5,-9.449,0,0.945,0.792,0.0,0.735,0.452,173.912,3,24,comedy,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,Pink Sweat$;Kirby,New RnB,Better,176320,False,0.613,0.471,1,-6.644,0,0.107,0.316,1e-06,0.117,0.406,143.064,4,0,chill;soul,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,Soda Stereo,Soda Stereo (Remastered),El Tiempo Es Dinero - Remasterizado 2007,177266,False,0.554,0.921,2,-4.589,1,0.0758,0.0194,0.0881,0.329,0.7,183.571,1,38,punk-rock;ska,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,Old Crow Medicine Show,O.C.M.S.,Poor Man,214600,False,0.58,0.29,2,-11.942,1,0.0272,0.261,0.0,0.125,0.497,91.321,4,30,bluegrass,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,Tokyo Ghetto Pussy,Disco 2001,Love Generation,410666,False,0.531,0.95,9,-9.744,0,0.0433,0.00122,0.826,0.0613,0.553,159.974,4,17,happy,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [52]:
def fill_genres(row):
    """Set the GENRE_<name> indicator to 1 for every genre listed in the row.

    Parameters
    ----------
    row : pandas.Series
        One dataset row. Its 'track_genre' entry is a ';'-separated string
        of genre names.

    Returns
    -------
    pandas.Series
        A copy of ``row`` in which 'GENRE_' + name is 1 for each listed genre.
        All other entries are unchanged.
    """
    # Work on a copy: DataFrame.apply does not support functions that mutate
    # the object they are given.
    filled = row.copy()
    for genre_name in filled['track_genre'].split(';'):
        filled['GENRE_' + genre_name] = 1
    return filled

# Vectorised equivalent of applying fill_genres to every row: split each
# ';'-separated track_genre string into 0/1 indicator columns, prefix them
# with 'GENRE_', and write them over the matching dataset columns
# (assignment aligns on the shared row index).
genre_flags = dataset['track_genre'].str.get_dummies(sep=';').add_prefix('GENRE_')
dataset[genre_flags.columns] = genre_flags

In [53]:
# Preview the first ten rows to check that the GENRE_ indicators were filled in.
dataset.head(10)

Unnamed: 0,artists,album_name,track_name,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,popularity,track_genre,GENRE_acoustic,GENRE_afrobeat,GENRE_alt-rock,GENRE_alternative,GENRE_ambient,GENRE_anime,GENRE_black-metal,GENRE_bluegrass,GENRE_blues,GENRE_brazil,GENRE_breakbeat,GENRE_british,GENRE_cantopop,GENRE_chicago-house,GENRE_children,GENRE_chill,GENRE_classical,GENRE_club,GENRE_comedy,GENRE_country,GENRE_dance,GENRE_dancehall,GENRE_death-metal,GENRE_deep-house,GENRE_detroit-techno,GENRE_disco,GENRE_disney,GENRE_drum-and-bass,GENRE_dub,GENRE_dubstep,GENRE_edm,GENRE_electro,GENRE_electronic,GENRE_emo,GENRE_folk,GENRE_forro,GENRE_french,GENRE_funk,GENRE_garage,GENRE_german,GENRE_gospel,GENRE_goth,GENRE_grindcore,GENRE_groove,GENRE_grunge,GENRE_guitar,GENRE_happy,GENRE_hard-rock,GENRE_hardcore,GENRE_hardstyle,GENRE_heavy-metal,GENRE_hip-hop,GENRE_honky-tonk,GENRE_house,GENRE_idm,GENRE_indian,GENRE_indie-pop,GENRE_indie,GENRE_industrial,GENRE_iranian,GENRE_j-dance,GENRE_j-idol,GENRE_j-pop,GENRE_j-rock,GENRE_jazz,GENRE_k-pop,GENRE_kids,GENRE_latin,GENRE_latino,GENRE_malay,GENRE_mandopop,GENRE_metal,GENRE_metalcore,GENRE_minimal-techno,GENRE_mpb,GENRE_new-age,GENRE_opera,GENRE_pagode,GENRE_party,GENRE_piano,GENRE_pop-film,GENRE_pop,GENRE_power-pop,GENRE_progressive-house,GENRE_psych-rock,GENRE_punk-rock,GENRE_punk,GENRE_r-n-b,GENRE_reggae,GENRE_reggaeton,GENRE_rock-n-roll,GENRE_rock,GENRE_rockabilly,GENRE_romance,GENRE_sad,GENRE_salsa,GENRE_samba,GENRE_sertanejo,GENRE_show-tunes,GENRE_singer-songwriter,GENRE_ska,GENRE_sleep,GENRE_songwriter,GENRE_soul,GENRE_spanish,GENRE_study,GENRE_swedish,GENRE_synth-pop,GENRE_tango,GENRE_techno,GENRE_trance,GENRE_trip-hop,GENRE_turkish,GENRE_world-music
0,Rill,Lolly,Lolly,160725,True,0.91,0.374,8,-9.844,0,0.199,0.0757,0.00301,0.154,0.432,104.042,4,44,german,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Glee Cast,Glee Love Songs,It's All Coming Back To Me Now (Glee Cast Vers...,322933,False,0.269,0.516,0,-7.361,1,0.0366,0.406,0.0,0.117,0.341,178.174,4,47,club,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Paul Kalkbrenner;Pig&Dan,X,Böxig Leise - Pig & Dan Remix,515360,False,0.686,0.56,5,-13.264,0,0.0462,0.00114,0.181,0.111,0.108,119.997,4,22,minimal-techno,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Jordan Sandhu,Teeje Week,Teeje Week,190203,False,0.679,0.77,0,-3.537,1,0.19,0.0583,0.0,0.0825,0.839,161.721,4,62,hip-hop,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Paul Kalkbrenner,Zeit,Tief,331240,False,0.519,0.431,6,-13.606,0,0.0291,0.000964,0.72,0.0916,0.234,129.971,4,19,minimal-techno,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,Chad Daniels,Busy Being Awesome,Thanksgiving Chicken,127040,True,0.536,0.78,5,-9.449,0,0.945,0.792,0.0,0.735,0.452,173.912,3,24,comedy,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,Pink Sweat$;Kirby,New RnB,Better,176320,False,0.613,0.471,1,-6.644,0,0.107,0.316,1e-06,0.117,0.406,143.064,4,0,chill;soul,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
7,Soda Stereo,Soda Stereo (Remastered),El Tiempo Es Dinero - Remasterizado 2007,177266,False,0.554,0.921,2,-4.589,1,0.0758,0.0194,0.0881,0.329,0.7,183.571,1,38,punk-rock;ska,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
8,Old Crow Medicine Show,O.C.M.S.,Poor Man,214600,False,0.58,0.29,2,-11.942,1,0.0272,0.261,0.0,0.125,0.497,91.321,4,30,bluegrass,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,Tokyo Ghetto Pussy,Disco 2001,Love Generation,410666,False,0.531,0.95,9,-9.744,0,0.0433,0.00122,0.826,0.0613,0.553,159.974,4,17,happy,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [54]:
# The genre string is now encoded by the GENRE_ columns, so remove it.
dataset = dataset.drop(columns='track_genre')

The dataset is now cleaned and ready to be used for making predictions

In [55]:
# Write the cleaned table to disk; index=False leaves the row index out of the file.
dataset.to_csv('datasets/spotify_tracks_cleaned.csv', index=False)