# Predicting Song Genres

## Initial attempt
* Try to predict the genres using only the numerical features

In [62]:
import pandas as pd
# Show every column when a DataFrame is displayed (None disables the column limit).
pd.set_option("display.max_columns", None)

# Read the Spotify track table; the first CSV column is used as the row index.
songs = pd.read_csv('data/spotify_data.csv', index_col=[0])
# Preview the first rows as the cell output.
songs.head()

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


### Before cleaning the data

In [63]:
# Print the row count plus each column's non-null count and dtype for the raw, uncleaned table.
songs.info()

<class 'pandas.core.frame.DataFrame'>
Index: 114000 entries, 0 to 113999
Data columns (total 20 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   track_id          114000 non-null  object 
 1   artists           113999 non-null  object 
 2   album_name        113999 non-null  object 
 3   track_name        113999 non-null  object 
 4   popularity        114000 non-null  int64  
 5   duration_ms       114000 non-null  int64  
 6   explicit          114000 non-null  bool   
 7   danceability      114000 non-null  float64
 8   energy            114000 non-null  float64
 9   key               114000 non-null  int64  
 10  loudness          114000 non-null  float64
 11  mode              114000 non-null  int64  
 12  speechiness       114000 non-null  float64
 13  acousticness      114000 non-null  float64
 14  instrumentalness  114000 non-null  float64
 15  liveness          114000 non-null  float64
 16  valence           114000 

### After cleaning the data

In [64]:
# Clean in one chain:
#   1. keep only the first row for each (track_name, artists) pair,
#   2. then drop any row that still contains a missing value.
songs = (
    songs
    .drop_duplicates(subset=['track_name','artists'])
    .dropna()
)
# Re-print the summary so it can be compared with the pre-cleaning one.
songs.info()

<class 'pandas.core.frame.DataFrame'>
Index: 81343 entries, 0 to 113999
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          81343 non-null  object 
 1   artists           81343 non-null  object 
 2   album_name        81343 non-null  object 
 3   track_name        81343 non-null  object 
 4   popularity        81343 non-null  int64  
 5   duration_ms       81343 non-null  int64  
 6   explicit          81343 non-null  bool   
 7   danceability      81343 non-null  float64
 8   energy            81343 non-null  float64
 9   key               81343 non-null  int64  
 10  loudness          81343 non-null  float64
 11  mode              81343 non-null  int64  
 12  speechiness       81343 non-null  float64
 13  acousticness      81343 non-null  float64
 14  instrumentalness  81343 non-null  float64
 15  liveness          81343 non-null  float64
 16  valence           81343 non-null  float64
 1

### Fit model

In [87]:
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Features: every column except the identifier / free-text columns and the target itself.
X = songs.drop(columns=["track_id", "artists", "album_name", "track_name", "track_genre"])
y = songs["track_genre"]
# Numerically encode the labels; the fitted encoder is kept so that the
# original genre names can be recovered from `label_encoder.classes_` later.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
# Stratify on the label so every genre keeps the same share in the train and
# test splits (the genres are imbalanced after de-duplication).
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=100
)
# Train model. The tree is seeded as well: scikit-learn permutes the features
# at every split, so without a fixed random_state the fitted tree (and the
# reported scores) can change from run to run even with an identical split.
decision_tree = DecisionTreeClassifier(random_state=100)
decision_tree.fit(X_train, y_train)

### Evaluate performance

In [88]:
# Score the fitted tree on the held-out split.
# `predictions` is kept as a notebook-level name because later cells read it.
predictions = decision_tree.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.24906263445817198


In [89]:
# Get more detailed performance information for each class.
# With average=None, f1_score returns one score per label it finds in y_test or
# predictions. Passing `labels` explicitly guarantees exactly one score per
# encoded genre, in encoder order, so the zip with `classes_` below can never
# silently misalign if some genre is absent from both arrays.
class_ids = label_encoder.transform(label_encoder.classes_)
f1 = f1_score(y_test, predictions, labels=class_ids, average=None)
class_to_f1 = dict(zip(label_encoder.classes_, f1))
# Sort f1-score in descending order
class_to_f1 = dict(sorted(class_to_f1.items(), key=lambda item: item[1], reverse=True))
class_to_f1

{'grindcore': 0.7874015748031497,
 'comedy': 0.7780678851174936,
 'sleep': 0.7440633245382586,
 'honky-tonk': 0.5859872611464969,
 'iranian': 0.578125,
 'romance': 0.56047197640118,
 'study': 0.5558441558441558,
 'kids': 0.5255102040816327,
 'black-metal': 0.5164556962025316,
 'salsa': 0.49582172701949856,
 'drum-and-bass': 0.4728260869565218,
 'classical': 0.4631578947368421,
 'tango': 0.46153846153846156,
 'detroit-techno': 0.44680851063829785,
 'hardstyle': 0.43697478991596644,
 'chicago-house': 0.4266666666666667,
 'new-age': 0.42105263157894735,
 'pagode': 0.41982507288629745,
 'idm': 0.37078651685393255,
 'forro': 0.36118598382749323,
 'piano': 0.3586206896551724,
 'j-dance': 0.34628975265017664,
 'breakbeat': 0.3446475195822454,
 'j-idol': 0.3296089385474861,
 'sertanejo': 0.32903225806451614,
 'minimal-techno': 0.3281733746130031,
 'disney': 0.32642487046632124,
 'gospel': 0.3082437275985663,
 'pop-film': 0.3044776119402985,
 'happy': 0.30352303523035234,
 'opera': 0.3006134969

## Experiment #1: Balancing
* Try to overcome the class imbalance problem by using random undersampling or oversampling

### Undersampling

In [90]:
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Randomly drop samples from the larger classes before fitting the tree.
# Both the sampler and the tree are seeded: which rows get discarded and how
# the tree is grown are otherwise random, so the accuracy below would change
# on every run.
sampler = RandomUnderSampler(random_state=100)
decision_tree = DecisionTreeClassifier(random_state=100)
# imblearn's Pipeline applies the sampler during fit only; predict() runs the
# classifier on the untouched test data.
pipeline = Pipeline([('balancing', sampler), ('classifier', decision_tree)])
pipeline.fit(X_train, y_train)
# Evaluate model
predictions = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy using under sampling: {accuracy}")

Accuracy using under sampling: 0.1793595181019116


### Oversampling

In [91]:
# Randomly duplicate samples from the smaller classes before fitting the tree.
# Both the sampler and the tree are seeded: which rows get duplicated and how
# the tree is grown are otherwise random, so the accuracy below would change
# on every run.
sampler = RandomOverSampler(random_state=100)
decision_tree = DecisionTreeClassifier(random_state=100)
# imblearn's Pipeline applies the sampler during fit only; predict() runs the
# classifier on the untouched test data.
pipeline = Pipeline([('balancing', sampler), ('classifier', decision_tree)])
pipeline.fit(X_train, y_train)
# Evaluate model
predictions = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy using over sampling: {accuracy}")

Accuracy using over sampling: 0.2439609072469113


In [92]:
# Get more detailed performance information for each class.
# With average=None, f1_score returns one score per label it finds in y_test or
# predictions. Passing `labels` explicitly guarantees exactly one score per
# encoded genre, in encoder order, so the zip with `classes_` below can never
# silently misalign if some genre is absent from both arrays.
class_ids = label_encoder.transform(label_encoder.classes_)
f1 = f1_score(y_test, predictions, labels=class_ids, average=None)
class_to_f1 = dict(zip(label_encoder.classes_, f1))
# Sort f1-score in descending order
class_to_f1 = dict(sorted(class_to_f1.items(), key=lambda item: item[1], reverse=True))
class_to_f1

{'grindcore': 0.8359788359788362,
 'comedy': 0.7783505154639175,
 'sleep': 0.7768595041322314,
 'honky-tonk': 0.6198830409356725,
 'study': 0.5714285714285714,
 'romance': 0.526027397260274,
 'kids': 0.5245901639344263,
 'tango': 0.5238095238095238,
 'iranian': 0.5159574468085107,
 'black-metal': 0.4856396866840731,
 'detroit-techno': 0.4444444444444445,
 'drum-and-bass': 0.44141689373297,
 'classical': 0.4295774647887324,
 'chicago-house': 0.42077922077922075,
 'salsa': 0.404692082111437,
 'pagode': 0.39759036144578314,
 'sertanejo': 0.39344262295081966,
 'new-age': 0.3756345177664974,
 'minimal-techno': 0.36990595611285265,
 'idm': 0.35543766578249336,
 'forro': 0.3457446808510638,
 'hardstyle': 0.33810888252148996,
 'j-idol': 0.33149171270718225,
 'breakbeat': 0.3297872340425532,
 'samba': 0.32635983263598334,
 'j-dance': 0.3161764705882353,
 'piano': 0.3021582733812949,
 'gospel': 0.3011583011583012,
 'happy': 0.29353233830845776,
 'party': 0.2915254237288135,
 'disney': 0.29023746

### Experiment \#1 conclusion:
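_Balancing doesn't help: random oversampling leaves accuracy essentially unchanged (about 0.244 vs. 0.249 for the baseline), while random undersampling lowers it (about 0.179)._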