In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")

In [6]:
df = pd.read_csv("../data/features_30_sec.csv")
df.head()

Unnamed: 0,filename,length,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,blues.00000.wav,661794,0.350088,0.088757,0.130228,0.002827,1784.16585,129774.064525,2002.44906,85882.761315,...,52.42091,-1.690215,36.524071,-0.408979,41.597103,-2.303523,55.062923,1.221291,46.936035,blues
1,blues.00001.wav,661794,0.340914,0.09498,0.095948,0.002373,1530.176679,375850.073649,2039.036516,213843.755497,...,55.356403,-0.731125,60.314529,0.295073,48.120598,-0.283518,51.10619,0.531217,45.786282,blues
2,blues.00002.wav,661794,0.363637,0.085275,0.17557,0.002746,1552.811865,156467.643368,1747.702312,76254.192257,...,40.598766,-7.729093,47.639427,-1.816407,52.382141,-3.43972,46.63966,-2.231258,30.573025,blues
3,blues.00003.wav,661794,0.404785,0.093999,0.141093,0.006346,1070.106615,184355.942417,1596.412872,166441.494769,...,44.427753,-3.319597,50.206673,0.636965,37.31913,-0.619121,37.259739,-3.407448,31.949339,blues
4,blues.00004.wav,661794,0.308526,0.087841,0.091529,0.002303,1835.004266,343399.939274,1748.172116,88445.209036,...,86.099236,-5.454034,75.269707,-0.916874,53.613918,-4.404827,62.910812,-11.703234,55.19516,blues


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 60 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   filename                 1000 non-null   object 
 1   length                   1000 non-null   int64  
 2   chroma_stft_mean         1000 non-null   float64
 3   chroma_stft_var          1000 non-null   float64
 4   rms_mean                 1000 non-null   float64
 5   rms_var                  1000 non-null   float64
 6   spectral_centroid_mean   1000 non-null   float64
 7   spectral_centroid_var    1000 non-null   float64
 8   spectral_bandwidth_mean  1000 non-null   float64
 9   spectral_bandwidth_var   1000 non-null   float64
 10  rolloff_mean             1000 non-null   float64
 11  rolloff_var              1000 non-null   float64
 12  zero_crossing_rate_mean  1000 non-null   float64
 13  zero_crossing_rate_var   1000 non-null   float64
 14  harmony_mean             

In [4]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GroupShuffleSplit, train_test_split


le = LabelEncoder()
y_enc = le.fit_transform(df["label"])  # Converts strings to integers

# Features
X = df.drop(columns=["label", "filename", "length"]).values.astype(np.float64)

X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=0.2, random_state=0
)
# gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
# train_idx, test_idx = next(gss.split(X, y_enc, groups=file_id))
# X_train, X_test = X[train_idx], X[test_idx]
# y_train, y_test = y_enc[train_idx], y_enc[test_idx]
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(800, 57)
(200, 57)
(800,)
(200,)


In [5]:
from sklearn.calibration import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Classifiers
classifiers = [
    LogisticRegression(random_state=0),
    SVC(random_state=0),
    RandomForestClassifier(random_state=0, max_depth=6, min_samples_leaf=5),
    HistGradientBoostingClassifier(
        max_depth=3,  # shallower trees
        min_samples_leaf=6,
        learning_rate=0.01,  # slower learning rate
        random_state=0,
        l2_regularization=0.1,
    ),
    KNeighborsClassifier(),
    MLPClassifier(random_state=0, max_iter=100),
    XGBClassifier(
        max_depth=4,  # shallower trees
        learning_rate=0.008,  # smaller step size
        random_state=0,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.01,
        reg_lambda=1.0,
    ),
]

results = []

for clf in classifiers:

    # Add scaling where necessary
    if isinstance(clf, (SVC, MLPClassifier, KNeighborsClassifier, LogisticRegression)):
        pipeline = Pipeline([("scaler", StandardScaler()), ("clf", clf)])
    else:
        pipeline = Pipeline([("clf", clf)])

    # Cross-validate
    scores = cross_validate(
        pipeline,
        X_train,
        y_train,
        scoring={
            "accuracy": "accuracy",
            "f1_macro": "f1_macro",
        },
        return_train_score=True,
    )

    # Save mean scores only
    results.append(
        {
            "Model": clf.__class__.__name__,
            "Train Accuracy": np.mean(scores["train_accuracy"]),
            "Test Accuracy": np.mean(scores["test_accuracy"]),
            "Train F1-macro": np.mean(scores["train_f1_macro"]),
            "Test F1-macro": np.mean(scores["test_f1_macro"]),
            "Fit Time (s)": np.mean(scores["fit_time"]),
            "Accuracy Gap": np.mean(scores["train_accuracy"])
            - np.mean(scores["test_accuracy"]),
            "F1-macro Gap": np.mean(scores["train_f1_macro"])
            - np.mean(scores["test_f1_macro"]),
        }
    )

# Convert to DataFrame
df_results = pd.DataFrame(results)
df_results = df_results.sort_values(
    by=["Test F1-macro", "Test Accuracy", "F1-macro Gap"],
    ascending=[False, False, True],
).reset_index(drop=True)
df_results

Unnamed: 0,Model,Train Accuracy,Test Accuracy,Train F1-macro,Test F1-macro,Fit Time (s),Accuracy Gap,F1-macro Gap
0,MLPClassifier,0.946875,0.725,0.946076,0.716842,0.550435,0.221875,0.229235
1,SVC,0.885,0.72625,0.882864,0.716777,0.039715,0.15875,0.166087
2,LogisticRegression,0.913125,0.7025,0.91104,0.69262,0.084124,0.210625,0.21842
3,KNeighborsClassifier,0.800938,0.68625,0.798946,0.681313,0.002925,0.114688,0.117632
4,XGBClassifier,0.958125,0.67875,0.957063,0.672371,3.765162,0.279375,0.284692
5,RandomForestClassifier,0.883125,0.65375,0.881054,0.640691,0.525871,0.229375,0.240363
6,HistGradientBoostingClassifier,0.936562,0.6475,0.934949,0.639904,2.681405,0.289062,0.295045


In [6]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold

from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

pipe = Pipeline([("scaler", StandardScaler()), ("clf", SVC(random_state=0))])
# Hyperparameter grid
param_grid = {
    "clf__C": [2],  # Regularization strength
    "clf__kernel": ["rbf"],  # Common kernels
    "clf__gamma": [0.01],  # For 'rbf' and 'poly'
    "clf__tol": [1e-3],  # Convergence tolerance
}
scoring = {"f1_macro": "f1_macro", "accuracy": "accuracy"}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
random_search = RandomizedSearchCV(
    pipe,
    param_distributions=param_grid,
    n_iter=30,  # number of random trials
    scoring=scoring,
    refit="f1_macro",
    cv=cv,
    n_jobs=-1,
    verbose=1,
    return_train_score=True,
)

random_search.fit(X_train, y_train)

print("Best SVC Params:", random_search.best_params_)
best_params = random_search.best_params_

best_index = random_search.best_index_

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best SVC Params: {'clf__tol': 0.001, 'clf__kernel': 'rbf', 'clf__gamma': 0.01, 'clf__C': 2}


In [7]:
from sklearn.metrics import (
    accuracy_score,
    f1_score,
)
import time


best_params = random_search.best_params_
best_index = random_search.best_index_
# CV summary from GridSearchCV
best_train_acc = random_search.cv_results_["mean_train_accuracy"][best_index]
best_test_acc = random_search.cv_results_["mean_test_accuracy"][best_index]
best_train_f1 = random_search.cv_results_["mean_train_f1_macro"][best_index]
best_test_f1 = random_search.cv_results_["mean_test_f1_macro"][best_index]
fit_time = random_search.cv_results_["mean_fit_time"][best_index]

print(f"CV Train Accuracy (mean): {best_train_acc:.4f}")
print(f"CV Test Accuracy (mean): {best_test_acc:.4f}")
print(f"CV Train F1 (mean): {best_train_f1:.4f}")
print(f"CV Test F1 (mean): {best_test_f1:.4f}")
print(f"CV Fit Time (mean, s): {fit_time:.4f}")

# Use the best estimator from GridSearchCV
best_model = random_search.best_estimator_

# Evaluate on hold-out test set
start = time.time()
y_pred_test = best_model.predict(X_test)
test_time = time.time() - start

# Compute metrics
test_acc = accuracy_score(y_test, y_pred_test)
test_f1 = f1_score(y_test, y_pred_test, average="macro")

# Print results
print(f"Holdout Accuracy: {test_acc:.4f}")
print(f"Holdout F1 Score (macro): {test_f1:.4f}")
print(f"Holdout Prediction Time (s): {test_time:.4f}")

CV Train Accuracy (mean): 0.8956
CV Test Accuracy (mean): 0.7238
CV Train F1 (mean): 0.8937
CV Test F1 (mean): 0.7155
CV Fit Time (mean, s): 0.0743
Holdout Accuracy: 0.6600
Holdout F1 Score (macro): 0.6728
Holdout Prediction Time (s): 0.0442


In [8]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Pipeline with Logistic Regression
pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(random_state=0, max_iter=500)),
    ]
)

# Hyperparameter grid for Logistic Regression
param_grid = {
    "clf__C": [0.15, 0.2, 0.25],  # Inverse of regularization strength
    "clf__penalty": ["l2"],  # Regularization type
    "clf__solver": [
        "saga",
    ],  # Solvers that support l1/l2
    "clf__class_weight": [None],  # Handle class imbalance
}

scoring = {"f1_macro": "f1_macro", "accuracy": "accuracy"}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

random_search = RandomizedSearchCV(
    pipe,
    param_distributions=param_grid,
    n_iter=40,  # Number of random trials
    scoring=scoring,
    refit="f1_macro",  # Optimize for F1-macro
    cv=cv,
    n_jobs=-1,
    verbose=1,
    return_train_score=True,
)

# Fit on training data
random_search.fit(X_train, y_train)

# Best parameters
print("Best Logistic Regression Params:", random_search.best_params_)
best_params = random_search.best_params_
best_index = random_search.best_index_

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best Logistic Regression Params: {'clf__solver': 'saga', 'clf__penalty': 'l2', 'clf__class_weight': None, 'clf__C': 0.2}


In [9]:
from sklearn.metrics import (
    accuracy_score,
    f1_score,
)
import time


best_params = random_search.best_params_
best_index = random_search.best_index_
# CV summary from GridSearchCV
best_train_acc = random_search.cv_results_["mean_train_accuracy"][best_index]
best_test_acc = random_search.cv_results_["mean_test_accuracy"][best_index]
best_train_f1 = random_search.cv_results_["mean_train_f1_macro"][best_index]
best_test_f1 = random_search.cv_results_["mean_test_f1_macro"][best_index]
fit_time = random_search.cv_results_["mean_fit_time"][best_index]

print(f"CV Train Accuracy (mean): {best_train_acc:.4f}")
print(f"CV Test Accuracy (mean): {best_test_acc:.4f}")
print(f"CV Train F1 (mean): {best_train_f1:.4f}")
print(f"CV Test F1 (mean): {best_test_f1:.4f}")
print(f"CV Fit Time (mean, s): {fit_time:.4f}")

# Use the best estimator from GridSearchCV
best_model = random_search.best_estimator_

# Evaluate on hold-out test set
start = time.time()
y_pred_test = best_model.predict(X_test)
test_time = time.time() - start

# Compute metrics
test_acc = accuracy_score(y_test, y_pred_test)
test_f1 = f1_score(y_test, y_pred_test, average="macro")

# Print results
print(f"Holdout Accuracy: {test_acc:.4f}")
print(f"Holdout F1 Score (macro): {test_f1:.4f}")
print(f"Holdout Prediction Time (s): {test_time:.4f}")

CV Train Accuracy (mean): 0.8591
CV Test Accuracy (mean): 0.7000
CV Train F1 (mean): 0.8551
CV Test F1 (mean): 0.6914
CV Fit Time (mean, s): 2.8393
Holdout Accuracy: 0.7200
Holdout F1 Score (macro): 0.7308
Holdout Prediction Time (s): 0.0013


In [10]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold

from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

pipe = Pipeline([("scaler", StandardScaler()), ("clf", MLPClassifier(random_state=0))])
# Hyperparameter grid
param_grid = {
    "clf__hidden_layer_sizes": [
        (128,),
    ],  # slightly smaller or 2-layer networks
    "clf__activation": ["relu"],  # test smoother gradients with tanh
    "clf__solver": ["adam"],  # keep adam
    "clf__alpha": [2.3],  # slightly stronger regularization to reduce overfit
    "clf__learning_rate": ["constant"],  # adaptive might improve test performance
    "clf__max_iter": [160],  # allow more iterations to ensure convergence
}
scoring = {"f1_macro": "f1_macro", "accuracy": "accuracy"}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
random_search = RandomizedSearchCV(
    pipe,
    param_distributions=param_grid,
    n_iter=30,  # number of random trials
    scoring=scoring,
    refit="f1_macro",
    cv=cv,
    n_jobs=-1,
    verbose=1,
    return_train_score=True,
)

random_search.fit(X_train, y_train)

print("Best MLP Params:", random_search.best_params_)
best_params = random_search.best_params_

best_index = random_search.best_index_

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best MLP Params: {'clf__solver': 'adam', 'clf__max_iter': 160, 'clf__learning_rate': 'constant', 'clf__hidden_layer_sizes': (128,), 'clf__alpha': 2.3, 'clf__activation': 'relu'}


In [11]:
from sklearn.metrics import (
    accuracy_score,
    f1_score,
)
import time


best_params = random_search.best_params_
best_index = random_search.best_index_
# CV summary from GridSearchCV
best_train_acc = random_search.cv_results_["mean_train_accuracy"][best_index]
best_test_acc = random_search.cv_results_["mean_test_accuracy"][best_index]
best_train_f1 = random_search.cv_results_["mean_train_f1_macro"][best_index]
best_test_f1 = random_search.cv_results_["mean_test_f1_macro"][best_index]
fit_time = random_search.cv_results_["mean_fit_time"][best_index]

print(f"CV Train Accuracy (mean): {best_train_acc:.4f}")
print(f"CV Test Accuracy (mean): {best_test_acc:.4f}")
print(f"CV Train F1 (mean): {best_train_f1:.4f}")
print(f"CV Test F1 (mean): {best_test_f1:.4f}")
print(f"CV Fit Time (mean, s): {fit_time:.4f}")

# Use the best estimator from GridSearchCV
best_model = random_search.best_estimator_

# Evaluate on hold-out test set
start = time.time()
y_pred_test = best_model.predict(X_test)
test_time = time.time() - start

# Compute metrics
test_acc = accuracy_score(y_test, y_pred_test)
test_f1 = f1_score(y_test, y_pred_test, average="macro")

# Print results
print(f"Holdout Accuracy: {test_acc:.4f}")
print(f"Holdout F1 Score (macro): {test_f1:.4f}")
print(f"Holdout Prediction Time (s): {test_time:.4f}")

CV Train Accuracy (mean): 0.8906
CV Test Accuracy (mean): 0.7213
CV Train F1 (mean): 0.8881
CV Test F1 (mean): 0.7093
CV Fit Time (mean, s): 1.1256
Holdout Accuracy: 0.7400
Holdout F1 Score (macro): 0.7525
Holdout Prediction Time (s): 0.0019


In [12]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# KNN pipeline
pipe = Pipeline([("scaler", StandardScaler()), ("clf", KNeighborsClassifier())])

# Hyperparameter grid
param_grid = {
    "clf__n_neighbors": [11, 13, 15],  # explore smaller to moderate k
    "clf__weights": ["uniform"],  # uniform or weighted by distance
    "clf__metric": ["manhattan"],  # distance metrics
    "clf__p": [1, 2],  # L1 or L2 for Minkowski
}

# Scoring metrics
scoring = {"f1_macro": "f1_macro", "accuracy": "accuracy"}

# Stratified K-Fold CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Randomized Search
random_search = RandomizedSearchCV(
    pipe,
    param_distributions=param_grid,
    n_iter=50,  # number of random trials
    scoring=scoring,
    refit="f1_macro",  # refit best model by macro F1
    cv=cv,
    n_jobs=-1,
    verbose=1,
    return_train_score=True,
)

# Fit on training data
random_search.fit(X_train, y_train)

# Best parameters
print("Best KNN Params:", random_search.best_params_)
best_params = random_search.best_params_
best_index = random_search.best_index_

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best KNN Params: {'clf__weights': 'uniform', 'clf__p': 1, 'clf__n_neighbors': 13, 'clf__metric': 'manhattan'}


In [13]:
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    roc_auc_score,
    classification_report,
    confusion_matrix,
)
from sklearn.preprocessing import label_binarize
import time
import numpy as np

best_params = random_search.best_params_
best_index = random_search.best_index_
# CV summary from GridSearchCV
best_train_acc = random_search.cv_results_["mean_train_accuracy"][best_index]
best_test_acc = random_search.cv_results_["mean_test_accuracy"][best_index]
best_train_f1 = random_search.cv_results_["mean_train_f1_macro"][best_index]
best_test_f1 = random_search.cv_results_["mean_test_f1_macro"][best_index]
fit_time = random_search.cv_results_["mean_fit_time"][best_index]

print(f"CV Train Accuracy (mean): {best_train_acc:.4f}")
print(f"CV Test Accuracy (mean): {best_test_acc:.4f}")
print(f"CV Train F1 (mean): {best_train_f1:.4f}")
print(f"CV Test F1 (mean): {best_test_f1:.4f}")
print(f"CV Fit Time (mean, s): {fit_time:.4f}")

# Use the best estimator from GridSearchCV
best_model = random_search.best_estimator_

# Evaluate on hold-out test set
start = time.time()
y_pred_test = best_model.predict(X_test)
test_time = time.time() - start

# Compute metrics
test_acc = accuracy_score(y_test, y_pred_test)
test_f1 = f1_score(y_test, y_pred_test, average="macro")

# Print results
print(f"Holdout Accuracy: {test_acc:.4f}")
print(f"Holdout F1 Score (macro): {test_f1:.4f}")
print(f"Holdout Prediction Time (s): {test_time:.4f}")

CV Train Accuracy (mean): 0.7169
CV Test Accuracy (mean): 0.6487
CV Train F1 (mean): 0.7123
CV Test F1 (mean): 0.6430
CV Fit Time (mean, s): 0.0040
Holdout Accuracy: 0.6650
Holdout F1 Score (macro): 0.6656
Holdout Prediction Time (s): 0.0284


In [14]:
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import time

# === Base models with scaling ===
lr_pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        (
            "clf",
            LogisticRegression(
                solver="saga", penalty="l2", C=0.2, max_iter=500, random_state=0
            ),
        ),
    ]
)

mlp_pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        (
            "clf",
            MLPClassifier(
                hidden_layer_sizes=(128,),
                solver="adam",
                learning_rate="constant",
                alpha=2.3,
                max_iter=160,
                random_state=0,
            ),
        ),
    ]
)

knn_pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        (
            "clf",
            KNeighborsClassifier(
                n_neighbors=13, weights="uniform", metric="manhattan", p=1
            ),
        ),
    ]
)

y_true = y_test

lr_pipe.fit(X_train, y_train)
mlp_pipe.fit(X_train, y_train)
knn_pipe.fit(X_train, y_train)

y_pred_mlp = mlp_pipe.predict(X_test)
y_pred_lr = lr_pipe.predict(X_test)
y_pred_knn = knn_pipe.predict(X_test)

errors_mlp = (y_true != y_pred_mlp).astype(int)
errors_lr = (y_true != y_pred_lr).astype(int)
errors_knn = (y_true != y_pred_knn).astype(int)

corr_mlp_lr = np.corrcoef(errors_mlp, errors_lr)[0, 1]
corr_mlp_knn = np.corrcoef(errors_mlp, errors_knn)[0, 1]
corr_lr_knn = np.corrcoef(errors_lr, errors_knn)[0, 1]

print("Error correlation MLP-LR:", corr_mlp_lr)
print("Error correlation MLP-KNN:", corr_mlp_knn)
print("Error correlation LR-KNN:", corr_lr_knn)

Error correlation MLP-LR: 0.7727995396408661
Error correlation MLP-KNN: 0.4970265802013242
Error correlation LR-KNN: 0.4303445905856845


In [15]:
# === Voting ensemble ===
ensemble = VotingClassifier(
    estimators=[("knn", knn_pipe), ("mlp", mlp_pipe)],
    voting="soft",  # uses predicted probabilities
    weights=[1, 3],
)

# === Cross-validation metrics ===
scoring = {"accuracy": "accuracy", "f1_macro": "f1_macro"}
cv_results = cross_validate(
    ensemble,
    X_train,
    y_train,
    cv=5,
    scoring=scoring,
    return_train_score=True,
    n_jobs=-1,
)

print(f"CV Train Accuracy (mean): {cv_results['train_accuracy'].mean():.4f}")
print(f"CV Test Accuracy (mean): {cv_results['test_accuracy'].mean():.4f}")
print(f"CV Train F1 (mean): {cv_results['train_f1_macro'].mean():.4f}")
print(f"CV Test F1 (mean): {cv_results['test_f1_macro'].mean():.4f}")

# === Fit and holdout evaluation ===
start_fit = time.time()
ensemble.fit(X_train, y_train)
fit_time = time.time() - start_fit

start_pred = time.time()
y_pred = ensemble.predict(X_test)
pred_time = time.time() - start_pred

accuracy = accuracy_score(y_test, y_pred)
f1_macro = f1_score(y_test, y_pred, average="macro")

print(f"CV Fit Time (mean, s): {cv_results['fit_time'].mean():.4f}")
print(f"Holdout Accuracy: {accuracy:.4f}")
print(f"Holdout F1 Score (macro): {f1_macro:.4f}")
print(f"Holdout Prediction Time (s): {pred_time:.4f}")
# Detailed evaluation
print("\nClassification Report:\n", classification_report(y_test, y_pred_test))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_test))

CV Train Accuracy (mean): 0.8850
CV Test Accuracy (mean): 0.7350
CV Train F1 (mean): 0.8827
CV Test F1 (mean): 0.7260
CV Fit Time (mean, s): 0.9848
Holdout Accuracy: 0.7350
Holdout F1 Score (macro): 0.7461
Holdout Prediction Time (s): 0.0176

Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.80      0.73        15
           1       0.61      1.00      0.76        11
           2       0.56      0.67      0.61        27
           3       0.52      0.77      0.62        22
           4       0.75      0.52      0.62        23
           5       0.90      0.50      0.64        18
           6       0.89      0.80      0.84        20
           7       0.90      0.75      0.82        24
           8       0.38      0.40      0.39        15
           9       0.74      0.56      0.64        25

    accuracy                           0.67       200
   macro avg       0.69      0.68      0.67       200
weighted avg       0.70      

In [16]:
# Final chosen model - MLP

scoring = {"accuracy": "accuracy", "f1_macro": "f1_macro"}
cv_results = cross_validate(
    ensemble,
    X_train,
    y_train,
    cv=5,
    scoring=scoring,
    return_train_score=True,
    n_jobs=-1,
)

print(f"CV Train Accuracy (mean): {cv_results['train_accuracy'].mean():.4f}")
print(f"CV Test Accuracy (mean): {cv_results['test_accuracy'].mean():.4f}")
print(f"CV Train F1 (mean): {cv_results['train_f1_macro'].mean():.4f}")
print(f"CV Test F1 (mean): {cv_results['test_f1_macro'].mean():.4f}")

# === Fit and holdout evaluation ===
start_fit = time.time()
mlp_pipe.fit(X_train, y_train)
fit_time = time.time() - start_fit

start_pred = time.time()
y_pred = mlp_pipe.predict(X_test)
pred_time = time.time() - start_pred

accuracy = accuracy_score(y_test, y_pred)
f1_macro = f1_score(y_test, y_pred, average="macro")

print(f"CV Fit Time (mean, s): {cv_results['fit_time'].mean():.4f}")
print(f"Holdout Accuracy: {accuracy:.4f}")
print(f"Holdout F1 Score (macro): {f1_macro:.4f}")
print(f"Holdout Prediction Time (s): {pred_time:.4f}")
# Detailed evaluation
print("\nClassification Report:\n", classification_report(y_test, y_pred_test))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_test))

CV Train Accuracy (mean): 0.8850
CV Test Accuracy (mean): 0.7350
CV Train F1 (mean): 0.8827
CV Test F1 (mean): 0.7260
CV Fit Time (mean, s): 1.0526
Holdout Accuracy: 0.7400
Holdout F1 Score (macro): 0.7525
Holdout Prediction Time (s): 0.0013

Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.80      0.73        15
           1       0.61      1.00      0.76        11
           2       0.56      0.67      0.61        27
           3       0.52      0.77      0.62        22
           4       0.75      0.52      0.62        23
           5       0.90      0.50      0.64        18
           6       0.89      0.80      0.84        20
           7       0.90      0.75      0.82        24
           8       0.38      0.40      0.39        15
           9       0.74      0.56      0.64        25

    accuracy                           0.67       200
   macro avg       0.69      0.68      0.67       200
weighted avg       0.70      

In [17]:
import pickle

pickle.dump(mlp_pipe, open("model.pkl", "wb"))