# Import packages

In [1]:
import numpy as np
import pandas as pd

from sklearn.base            import BaseEstimator
from sklearn.metrics         import make_scorer
from sklearn.preprocessing   import MultiLabelBinarizer
from sklearn.preprocessing   import StandardScaler
from sklearn.preprocessing   import OneHotEncoder
from sklearn.impute          import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline        import Pipeline
from sklearn.compose         import ColumnTransformer
from sklearn.multiclass      import OneVsRestClassifier
from sklearn.metrics         import f1_score
from sklearn.metrics         import recall_score
from sklearn.ensemble        import RandomForestClassifier
from sklearn.neighbors       import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.inspection      import permutation_importance


# Helper class

In [2]:
class DummyEstimator(BaseEstimator):
    "Pass through class, methods are present but do nothing."
    def fit(self): pass
    def score(self): pass

# Load data

https://www.kaggle.com/iqbalbasyar/spotify-genre-classification

In [3]:
data = pd.read_csv('https://raw.githubusercontent.com/mmkaddoura/song-genre-prediction/main/SpotifyFeatures.csv')

# Preporcess target
Create a single row for each song with a list of all genres as the target column  

In [4]:
def add_labels(row):
    return genres.loc[row['track_id']]

genres = data.groupby('track_id')['genre'].apply(list)

data['labels'] = data.apply(add_labels, axis=1)

data = data[data.duplicated(subset='track_id', keep='first')]

data = data.drop('genre', axis=1)

# Train test split

Creat train and test sets

In [5]:
target        = data.labels
data_features = data.drop('labels', axis=1)

X_train, X_test, y_train, y_test = train_test_split(data_features, target, test_size=0.2)

# Tranform target

Create binary labels for each class

In [6]:
mlb = MultiLabelBinarizer()

mlb.fit(y_train)

y_train = mlb.transform(y_train)
y_test  = mlb.transform(y_test)

# Preprocess features 
Impute missing values, one hot encode categorical variables, scale the continuous variables, drop the three ID columns

In [7]:
# categorical columns
cat_cols = ['key', 'mode', 'time_signature']

# continuous columns
con_cols = ['duration_ms',
            'energy',
            'loudness',
            'instrumentalness',
            'acousticness',
            'popularity',
            'valence',
            'tempo',
            'danceability',
            'liveness',
            'speechiness']

# categorical pipeline
cat_pipe = Pipeline([('imputer', SimpleImputer(strategy='most_frequent', add_indicator=True)),
                     ('ohe', OneHotEncoder(handle_unknown='ignore')),
                     ])

# continuous pipeline
con_pipe = Pipeline([('imputer', SimpleImputer(strategy='median', add_indicator=True)),
                     ('scaler', StandardScaler()),
                     ])


# preporcessing pipeline
preprocessing = ColumnTransformer(transformers=
                                  [('categorical', cat_pipe, cat_cols),
                                   ('continuous',  con_pipe, con_cols),
                                   ],
                                  remainder='drop')

# Creating metrics

I chose to evaluate models using F1 score and recall. F1 is used to evaluate the importance of both true positives and true negatives. I chose to additionally look at recall because in this domain, it is important to see how many of the applicable genres we can predict correctly. I am not looking at accuracy because accuracy checks to see whether or not the model correctly predicted all labels.

In [8]:
f1_scorer = make_scorer(f1_score, average='weighted', zero_division=1)
recall_scorer = make_scorer(recall_score, average='weighted', zero_division=1)

# Random Search CV for Best Model

Find the best model and its hyperparameters using a randomized search with cross validation.

In [9]:
pipe = Pipeline([('preprocessing', preprocessing), 
                 ('clf',  OneVsRestClassifier(DummyEstimator(), n_jobs=-1))])

candidates = [
    {'clf__estimator': [RandomForestClassifier()],
     'clf__estimator__bootstrap': [True, False],
     'clf__estimator__ccp_alpha': [0.0],
     'clf__estimator__criterion': ['gini', 'entropy'],
     'clf__estimator__max_depth': [1, 10, 100],
     'clf__estimator__max_features': ['sqrt', 'log2'],
     'clf__estimator__max_leaf_nodes': [10, 100, 1000],
     'clf__estimator__max_samples': [None],
     'clf__estimator__min_impurity_decrease': [0.0],
     'clf__estimator__min_impurity_split': [None],
     'clf__estimator__min_samples_leaf': [1, 10, 20],
     'clf__estimator__min_samples_split': [2],
     'clf__estimator__min_weight_fraction_leaf': [0.0],
     'clf__estimator__n_estimators': [10, 100, 1000],
     'clf__estimator__n_jobs': [-1],
     'clf__estimator__oob_score': [False],
     'clf__estimator__random_state': [None],
     'clf__estimator__verbose': [0],
     'clf__estimator__warm_start': [False]
    },
    {'clf__estimator': [KNeighborsClassifier()],
     'clf__estimator__algorithm': ['auto', 'ball_tree', 'kd_tree'],
     'clf__estimator__leaf_size': [10, 30, 50],
     'clf__estimator__metric': ['minkowski'],
     'clf__estimator__metric_params': [None],
     'clf__estimator__n_jobs': [-1],
     'clf__estimator__n_neighbors': [3, 5, 11, 15],
     'clf__estimator__p': [2],
     'clf__estimator__weights': ['uniform', 'distance']
    }
]

scoring = {
    'f1': f1_scorer,
    'recall': recall_scorer
}

rscv = RandomizedSearchCV(estimator=pipe, 
                          param_distributions=candidates,
                          scoring=scoring,
                          refit='f1',
                          n_iter=13,
                          cv=5, 
                          n_jobs=-1,
                          verbose=False)

best_model = rscv.fit(X_train, y_train)

In [13]:
best_model

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('preprocessing',
                                              ColumnTransformer(transformers=[('categorical',
                                                                               Pipeline(steps=[('imputer',
                                                                                                SimpleImputer(add_indicator=True,
                                                                                                              strategy='most_frequent')),
                                                                                               ('ohe',
                                                                                                OneHotEncoder(handle_unknown='ignore'))]),
                                                                               ['key',
                                                                                'mode',
                          

# Best model parameters

In [10]:
best_model.best_params_

{'clf__estimator__warm_start': False,
 'clf__estimator__verbose': 0,
 'clf__estimator__random_state': None,
 'clf__estimator__oob_score': False,
 'clf__estimator__n_jobs': -1,
 'clf__estimator__n_estimators': 100,
 'clf__estimator__min_weight_fraction_leaf': 0.0,
 'clf__estimator__min_samples_split': 2,
 'clf__estimator__min_samples_leaf': 10,
 'clf__estimator__min_impurity_split': None,
 'clf__estimator__min_impurity_decrease': 0.0,
 'clf__estimator__max_samples': None,
 'clf__estimator__max_leaf_nodes': 1000,
 'clf__estimator__max_features': 'sqrt',
 'clf__estimator__max_depth': 100,
 'clf__estimator__criterion': 'gini',
 'clf__estimator__ccp_alpha': 0.0,
 'clf__estimator__bootstrap': False,
 'clf__estimator': RandomForestClassifier(bootstrap=False, max_depth=100, max_features='sqrt',
                        max_leaf_nodes=1000, min_samples_leaf=10, n_jobs=-1)}

# Performance on test set

Applying the best model on the test test to see how well it does on unseen data.

In [11]:
y_pred = best_model.predict(X_test)

f1 = f1_score(y_test, y_pred, average='weighted', zero_division=1)
recall    = recall_score(y_test, y_pred, average='weighted', zero_division=1)

print(f'F1 score: {f1:0.5f}')
print(f'Recall: {recall:0.5f}')

F1 score: 0.66704
Recall: 0.57460


# Feature Importance

Which features does the model see as most important?

In [12]:
r = permutation_importance(best_model, X_test, y_test, n_repeats=15)

for i in r.importances_mean.argsort()[::-1]:
    print(f"{X_train.columns[i]}",
        f"{r.importances_mean[i]:.3f}"
        f" +/- {r.importances_std[i]:.3f}")  

popularity 0.177 +/- 0.001
speechiness 0.177 +/- 0.002
danceability 0.152 +/- 0.002
acousticness 0.127 +/- 0.002
instrumentalness 0.123 +/- 0.002
energy 0.111 +/- 0.002
loudness 0.100 +/- 0.002
valence 0.092 +/- 0.001
duration_ms 0.075 +/- 0.001
tempo 0.069 +/- 0.001
mode 0.058 +/- 0.001
liveness 0.048 +/- 0.001
key 0.038 +/- 0.001
time_signature 0.007 +/- 0.000
track_id 0.000 +/- 0.000
track_name 0.000 +/- 0.000
artist_name 0.000 +/- 0.000


# Conclusion

Using 56,000 unqique songs with 233,000 genre labels, we are able to predict the genres that belong to a song using their audio features with a Random Forest Classifier with an F1 score of 0.67.