# Code Pudding 2024
***

The purpose of this notebook will be to analyse data retrieved from the [Spotify Web API](https://developer.spotify.com/documentation/web-api) in order to train various machine learning models to predict the genre of any given song. Once the models have been trained, validated and tested, a function will be built that feeds the data from the API to the best preforming model, and it's genre will be predicted.

## Initialization

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
import sklearn.metrics as metrics
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier

In [2]:
data = pd.read_csv('spotify_data_more.csv', index_col='track_id')
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9000 entries, 44AyOl4qVkzS48vBsbNXaC to 3tHCG0ISOA0pXscIdNrJml
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   genre             9000 non-null   object 
 1   danceability      9000 non-null   float64
 2   energy            9000 non-null   float64
 3   key               9000 non-null   int64  
 4   loudness          9000 non-null   float64
 5   mode              9000 non-null   int64  
 6   speechiness       9000 non-null   float64
 7   acousticness      9000 non-null   float64
 8   instrumentalness  9000 non-null   float64
 9   liveness          9000 non-null   float64
 10  valence           9000 non-null   float64
 11  tempo             9000 non-null   float64
 12  type              9000 non-null   object 
 13  uri               9000 non-null   object 
 14  track_href        9000 non-null   object 
 15  analysis_url      9000 non-null   object 
 16  duration

In [3]:
data = data.drop(['type', 'uri', 'track_href', 'analysis_url'], axis=1)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9000 entries, 44AyOl4qVkzS48vBsbNXaC to 3tHCG0ISOA0pXscIdNrJml
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   genre             9000 non-null   object 
 1   danceability      9000 non-null   float64
 2   energy            9000 non-null   float64
 3   key               9000 non-null   int64  
 4   loudness          9000 non-null   float64
 5   mode              9000 non-null   int64  
 6   speechiness       9000 non-null   float64
 7   acousticness      9000 non-null   float64
 8   instrumentalness  9000 non-null   float64
 9   liveness          9000 non-null   float64
 10  valence           9000 non-null   float64
 11  tempo             9000 non-null   float64
 12  duration_ms       9000 non-null   int64  
 13  time_signature    9000 non-null   int64  
dtypes: float64(9), int64(4), object(1)
memory usage: 1.0+ MB


## Model Training

In [4]:
train, test = train_test_split(data, test_size=0.1, random_state=42)
print(train.shape)
print(test.shape)

X_train = train.drop(['genre'], axis=1)
y_train = train['genre']
X_test = test.drop(['genre'], axis=1)
y_test = test['genre']

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(8100, 14)
(900, 14)
(8100, 13) (8100,)
(900, 13) (900,)


In [5]:
# Initializing KFold Instance
cross_validator = KFold(n_splits=3, shuffle=True, random_state=42)

In [6]:
# Creating Dummy model param grid
dummy_params = {
    'strategy':['most_frequent', 'prior', 'stratified', 'uniform', 'constant'],
    'constant':['rock']
}

# Initializing DummyClassifier and its GridSearchCV
dummy_model = DummyClassifier(random_state=42)
dummy_grid_roc_auc = GridSearchCV(dummy_model, dummy_params, scoring='roc_auc_ovo_weighted', cv=cross_validator)
dummy_grid_f1 = GridSearchCV(dummy_model, dummy_params, scoring='f1_weighted', cv=cross_validator)

# Training Grid
dummy_grid_roc_auc.fit(X_train, y_train)
dummy_grid_f1.fit(X_train, y_train)
dummy_f1 = cross_val_score(dummy_grid_roc_auc.best_estimator_, X_train, y_train, scoring='f1_weighted', cv=cross_validator).mean()
dummy_roc_auc = cross_val_score(dummy_grid_f1.best_estimator_, X_train, y_train, scoring='roc_auc_ovo_weighted', cv=cross_validator).mean()


print(dummy_grid_roc_auc.best_estimator_, f'ROC_AUC: {dummy_grid_roc_auc.best_score_}')
print('F1 of ROC_AUC model: ', dummy_f1, '\n')
print(dummy_grid_f1.best_estimator_, f'F1: {dummy_grid_f1.best_score_}')
print('ROC_AUC of F1 model: ', dummy_roc_auc)

DummyClassifier(constant='rock', random_state=42, strategy='most_frequent') ROC_AUC: 0.5
F1 of ROC_AUC model:  0.04257496923701276 

DummyClassifier(constant='rock', random_state=42, strategy='uniform') F1: 0.1676583761097187
ROC_AUC of F1 model:  0.5


In [8]:
# Creating DecisionTree model param grid
tree_params = {
    'max_depth':np.arange(3, 11, 1),
    'min_samples_split':[2,4,6]
}

# Initializing DecisionTree and its GridSearchCV
tree_model = DecisionTreeClassifier(random_state=42)
tree_grid_roc_auc = GridSearchCV(tree_model, tree_params, scoring='roc_auc_ovo_weighted', cv=cross_validator)
tree_grid_f1 = GridSearchCV(tree_model, tree_params, scoring='f1_weighted', cv=cross_validator)

# Training Grid
tree_grid_roc_auc.fit(X_train, y_train)
tree_grid_f1.fit(X_train, y_train)
tree_f1 = cross_val_score(tree_grid_roc_auc.best_estimator_, X_train, y_train, scoring='f1_weighted', cv=cross_validator).mean()
tree_roc_auc = cross_val_score(tree_grid_f1.best_estimator_, X_train, y_train, scoring='roc_auc_ovo_weighted', cv=cross_validator).mean()

print(tree_grid_roc_auc.best_estimator_, f'ROC_AUC: {tree_grid_roc_auc.best_score_}')
print('F1 of ROC_AUC model: ', tree_f1, '\n')
print(tree_grid_f1.best_estimator_, f'F1: {tree_grid_f1.best_score_}')
print('ROC_AUC of F1 model: ', tree_roc_auc)

DecisionTreeClassifier(max_depth=7, min_samples_split=6, random_state=42) ROC_AUC: 0.7844794389067227
F1 of ROC_AUC model:  0.48894107684397276 

DecisionTreeClassifier(max_depth=9, min_samples_split=6, random_state=42) F1: 0.49247320622517093
ROC_AUC of F1 model:  0.7702834658469125


In [13]:
# Creating RandomForest model param grid
forest_params = {
    'n_estimators':[500, 1000, 1500],
    'max_depth':np.arange(20, 41, 2),
}

# Initializing RandomForest and its GridSearchCV
forest_model = RandomForestClassifier(random_state=42)
forest_grid_roc_auc = GridSearchCV(forest_model, forest_params, scoring='roc_auc_ovo_weighted', cv=cross_validator)
forest_grid_f1 = GridSearchCV(forest_model, forest_params, scoring='f1_weighted', cv=cross_validator)

# Training Grid
forest_grid_roc_auc.fit(X_train, y_train)
forest_grid_f1.fit(X_train, y_train)
forest_f1 = cross_val_score(forest_grid_roc_auc.best_estimator_, X_train, y_train, scoring='f1_weighted', cv=cross_validator).mean()
forest_roc_auc = cross_val_score(forest_grid_f1.best_estimator_, X_train, y_train, scoring='roc_auc_ovo_weighted', cv=cross_validator).mean()

print(forest_grid_roc_auc.best_estimator_, f'ROC_AUC: {forest_grid_roc_auc.best_score_}')
print('F1 of ROC_AUC model: ', forest_f1, '\n')
print(forest_grid_f1.best_estimator_, f'F1: {forest_grid_f1.best_score_}')
print('ROC_AUC of F1 model: ', forest_roc_auc)

RandomForestClassifier(max_depth=22, n_estimators=1500, random_state=42) ROC_AUC: 0.8656457394367386
F1 of ROC_AUC model:  0.5755083909317781 

RandomForestClassifier(max_depth=22, n_estimators=1500, random_state=42) F1: 0.5755083909317781
ROC_AUC of F1 model:  0.7702834658469125


In [11]:
# Creating LightGBM model param grid
lightgbm_params = {
    'num_leaves':[31, 100, 200],
    'learning_rate':[0.01]
}

# Initializing LightGBM and its GridSearchCV
lightgbm_model = lgb.LGBMClassifier(random_state=42, verbosity=-1)
lightgbm_grid_roc_auc = GridSearchCV(lightgbm_model, lightgbm_params, scoring='roc_auc_ovo_weighted', cv=cross_validator)
lightgbm_grid_f1 = GridSearchCV(lightgbm_model, lightgbm_params, scoring='f1_weighted', cv=cross_validator)

# Training Grid
lightgbm_grid_roc_auc.fit(X_train, y_train)
lightgbm_grid_f1.fit(X_train, y_train)
lightgbm_f1 = cross_val_score(lightgbm_grid_roc_auc.best_estimator_, X_train, y_train, scoring='f1_weighted', cv=cross_validator).mean()
lightgbm_roc_auc = cross_val_score(lightgbm_grid_f1.best_estimator_, X_train, y_train, scoring='roc_auc_ovo_weighted', cv=cross_validator).mean()

print(lightgbm_grid_roc_auc.best_estimator_, f'ROC_AUC: {lightgbm_grid_roc_auc.best_score_}')
print('F1 of ROC_AUC model: ', lightgbm_f1, '\n')
print(lightgbm_grid_f1.best_estimator_, f'F1: {lightgbm_grid_f1.best_score_}')
print('ROC_AUC of F1 model: ', lightgbm_roc_auc)

LGBMClassifier(learning_rate=0.01, num_leaves=100, random_state=42,
               verbosity=-1) ROC_AUC: 0.8431479275343451
F1 of ROC_AUC model:  0.5523200542615104 

LGBMClassifier(learning_rate=0.01, num_leaves=200, random_state=42,
               verbosity=-1) F1: 0.5565323696002716
ROC_AUC of F1 model:  0.8430191651830828


In [12]:
# Creating CatBoost model param grid
catboost_params = {
    'iterations':[1001, 2001],
    'learning_rate':[0.01]
}

# Initializing CatBoost and its GridSearchCV
catboost_model = CatBoostClassifier(random_seed=42, verbose=1000)
catboost_grid_roc_auc = GridSearchCV(catboost_model, catboost_params, scoring='roc_auc_ovo_weighted', cv=cross_validator)
catboost_grid_f1 = GridSearchCV(catboost_model, catboost_params, scoring='f1_weighted', cv=cross_validator)

# Training Grid
catboost_grid_roc_auc.fit(X_train, y_train)
catboost_grid_f1.fit(X_train, y_train)
catboost_f1 = cross_val_score(catboost_grid_roc_auc.best_estimator_, X_train, y_train, scoring='f1_weighted', cv=cross_validator).mean()
catboost_roc_auc = cross_val_score(catboost_grid_f1.best_estimator_, X_train, y_train, scoring='roc_auc_ovo_weighted', cv=cross_validator).mean()


print(catboost_grid_roc_auc.best_estimator_, f'ROC_AUC: {catboost_grid_roc_auc.best_score_}')
print('F1 of ROC_AUC model: ', catboost_f1, '\n')
print(catboost_grid_f1.best_estimator_, f'F1: {catboost_grid_f1.best_score_}')
print('ROC_AUC of F1 model: ', catboost_roc_auc)

0:	learn: 1.7825969	total: 10.6ms	remaining: 10.6s
1000:	learn: 0.9863546	total: 9.21s	remaining: 0us
0:	learn: 1.7827187	total: 44.7ms	remaining: 44.7s
1000:	learn: 0.9912005	total: 9.07s	remaining: 0us
0:	learn: 1.7832833	total: 42.1ms	remaining: 42.1s
1000:	learn: 0.9852248	total: 9.09s	remaining: 0us
0:	learn: 1.7825969	total: 16.8ms	remaining: 33.7s
1000:	learn: 0.9863546	total: 9s	remaining: 8.99s
2000:	learn: 0.8357130	total: 17.9s	remaining: 0us
0:	learn: 1.7827187	total: 10.3ms	remaining: 20.5s
1000:	learn: 0.9912005	total: 9.01s	remaining: 9s
2000:	learn: 0.8355803	total: 18s	remaining: 0us
0:	learn: 1.7832833	total: 33.8ms	remaining: 1m 7s
1000:	learn: 0.9852248	total: 9.13s	remaining: 9.13s
2000:	learn: 0.8346204	total: 19.2s	remaining: 0us
0:	learn: 1.7820866	total: 13.9ms	remaining: 27.8s
1000:	learn: 1.0167922	total: 9.78s	remaining: 9.77s
2000:	learn: 0.8863225	total: 20.9s	remaining: 0us
0:	learn: 1.7825969	total: 50.1ms	remaining: 50.1s
1000:	learn: 0.9863546	total: 9

## Test

In [17]:
test_roc_auc_predict = forest_grid_roc_auc.best_estimator_.predict(X_test)
test_roc_auc_proba = forest_grid_roc_auc.best_estimator_.predict_proba(X_test)
test_f1_predict = forest_grid_f1.best_estimator_.predict(X_test)
test_f1_proba = forest_grid_f1.best_estimator_.predict_proba(X_test)

test_roc_auc = metrics.roc_auc_score(y_test, test_roc_auc_proba, average='weighted', multi_class='ovo')
test_roc_auc_f1 = metrics.f1_score(y_test, test_roc_auc_predict, average='weighted')
test_f1 = metrics.f1_score(y_test, test_f1_predict, average='weighted')
test_f1_roc_auc = metrics.roc_auc_score(y_test, test_f1_proba, average='weighted', multi_class='ovo')
    


print(forest_grid_roc_auc.best_estimator_, f'Test ROC_AUC: {test_roc_auc}')
print('F1 of ROC_AUC model: ', test_roc_auc_f1, '\n')
print(forest_grid_f1.best_estimator_, f'Test F1: {test_f1}')
print('ROC_AUC of F1 model: ', test_f1_roc_auc)

RandomForestClassifier(max_depth=22, n_estimators=1500, random_state=42) Test ROC_AUC: 0.8818380990339227
F1 of ROC_AUC model:  0.6007459537175455 

RandomForestClassifier(max_depth=22, n_estimators=1500, random_state=42) Test F1: 0.6007459537175455
ROC_AUC of F1 model:  0.8818380990339227
