In [1]:
import os
import itertools

import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelBinarizer
# from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

from sklearn.model_selection import KFold
# from keras.wrappers.scikit_learn import KerasClassifier as skKerasClassifier
# from scikeras.wrappers import KerasClassifier

# import keras.api._v2.keras as keras #noqa
from keras import optimizers
from keras.models import Sequential, load_model
from keras.layers import Dense, BatchNormalization, Dropout
from keras import activations
# from keras.losses import CategoricalCrossentropy
from keras.callbacks import History, EarlyStopping, ModelCheckpoint
# from keras import regularizers
# import keras.backend as K

from genreclassification.utils import get_project_root

import dataframe_image as dfi
pd.set_option("display.max_columns", None)

2022-12-17 21:05:44.470185: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# mlnn hyperparameter gridsearch

### potential parameters to explore
* learning rate
* batch size
* kernel initialiser
* maybe activation functions
* model optimiser
* dropout rate
* hidden dimensions

---
## data split

In [2]:
# features extracted from the 30 second clips
# NOTE(review): df_30 is not referenced again in the visible cells - confirm it is still needed
df_30 = pd.read_csv(get_project_root() / "data/features_30_sec.csv")

# features extracted from the 3 second clips; this is the frame the model is built from
df_3 = pd.read_csv(get_project_root() / "data/features_3_sec.csv")

### features / targets:

In [3]:
# get the target labels:
df_3_y = df_3["label"]

# find the training features:
df_3_x = df_3.drop(
    columns=["filename", "length", "label"]
)

### test split before scaling:

In [4]:
# hold out 20% of the rows as the test set; fixed seed so the split is reproducible
x_train, x_test, y_train, y_test = train_test_split(
    df_3_x, df_3_y, test_size=0.2, random_state=42
)

### MinMax scaling for the training set:

In [5]:
# the scaler is fitted on the training features only, mapping each column into [0, 1]
scaler = MinMaxScaler(feature_range=(0, 1))
# scale features and keep hold of the column names
scale_cols = x_train.columns
x_scaled = scaler.fit_transform(x_train.loc[:, scale_cols])
# fit_transform returns a bare array, so rebuild a frame carrying the feature names
# (the new frame gets a fresh 0..n-1 index rather than x_train's original index)
x_train_scaled = pd.DataFrame(data=x_scaled, columns=scale_cols)


---
## mlnn

### split training set into training and validation
### not required, as cross-validation is used for tuning

In [6]:
# x_train, x_val, y_train, y_val = train_test_split(
#     x_train,
#     y_train,
#     test_size=0.2,
#     random_state=42
# )

In [7]:
# for d in [(x_train, "x_train"), (x_val, "  x_val"), (y_train, "y_train"), (y_val, "  y_val")]:
#     print(f"{d[1]}: {d[0].shape}")

### encoding the categorical classes
* the model will produce a probability score for each of the 10 classes, assigning the most likely label

In [8]:
# start by mapping the labels to numerical values:
catno_to_label = {key:value for (key, value) in enumerate(sorted(set(y_train)))}
# catno_to_label

In [9]:
# reverse this:
label_to_catno = {v:k for k,v in catno_to_label.items()}
# label_to_catno

In [10]:
# replace each string label in y_train with its category number.
# NOTE: this rebinds y_train, so the cell is not safe to run twice - on a second run
# the values are already numbers, which are not keys of label_to_catno.
y_train = y_train.map(label_to_catno)
# y_val = y_val.map(label_to_catno)

In [11]:
# one-hot encode the category numbers; the fitted binarizer is kept for later transforms
label_as_binary = LabelBinarizer().fit(y_train)
y_train = label_as_binary.transform(y_train)
# y_val = label_as_binary.fit_transform(y_val)
# y_val = label_as_binary.transform(y_val)

In [12]:
# wrap the encoded targets in a DataFrame so the k-fold loop can slice them with .iloc
y_train = pd.DataFrame(data=y_train)

In [13]:
# sanity check: features and encoded targets must have the same number of rows
# (x_val / y_val are not listed because no separate validation split is made)
for frame, label in (
    (x_train, "x_train"),
    (y_train, "y_train"),
):
    print(f"{label}: {frame.shape}")

x_train: (7992, 57)
y_train: (7992, 10)


---
### define model functions ready for gridsearch

#### build:

In [14]:
def build_mlnn(
    hidden_dim=128,
    activation_fn=activations.selu,
    dropout_rate=0.2
):
    """Build the (uncompiled) multi-layer network used for genre classification.

    Architecture: a 32-unit input Dense layer followed by three hidden Dense
    layers of ``hidden_dim`` units; each of these four layers is followed by
    BatchNormalization and Dropout. The output layer is a 10-unit softmax.

    The input width is read from the notebook-level ``x_train_scaled`` frame,
    so that frame must exist before this function is called.

    Parameters
    ----------
    hidden_dim : int
        Number of units in each of the three hidden layers.
    activation_fn : callable or str
        Activation applied to the input layer and to every hidden layer.
    dropout_rate : float
        Rate passed to every Dropout layer.

    Returns
    -------
    keras.models.Sequential
        The assembled model; it still has to be compiled.
    """
    model = Sequential(name="mlnn_genre_classification")
    # input layer
    model.add(Dense(
        units=32,
        activation=activation_fn,
        # kernel_regularizer=regularizers.l2(0.01),
        input_shape=(x_train_scaled.shape[1],),
        name="input"
    ))
    # batch normalisation, then dropout
    model.add(BatchNormalization())
    model.add(Dropout(dropout_rate))

    # three identical hidden stacks: dense -> batch normalisation -> dropout.
    # all of them use activation_fn; "hidden3" used to be pinned to "selu",
    # which silently ignored a non-default activation_fn for that layer.
    for layer_name in ("hidden1", "hidden2", "hidden3"):
        model.add(Dense(
            units=hidden_dim,
            activation=activation_fn,
            # kernel_regularizer=regularizers.l2(0.01),
            name=layer_name
        ))
        model.add(BatchNormalization())
        model.add(Dropout(dropout_rate))

    # output layer: one probability per class
    model.add(Dense(
        units=10,
        activation="softmax",
        name="output"
    ))

    return model

#### compile:

In [15]:
def mlnn_compile(
    model,
    optimiser=optimizers.Adam,
    learning_rate=0.0001
):
    """Compile ``model`` for multi-class classification and hand it back.

    Parameters
    ----------
    model : keras model
        The model to compile (compiled in place).
    optimiser : callable
        Optimiser class/factory; called as ``optimiser(learning_rate=...)``.
    learning_rate : float
        Learning rate given to the optimiser.

    Returns
    -------
    The same ``model`` object, now compiled with categorical cross-entropy
    loss and accuracy as the tracked metric.
    """
    configured_optimiser = optimiser(learning_rate=learning_rate)
    model.compile(
        loss="categorical_crossentropy",
        optimizer=configured_optimiser,
        metrics=["accuracy"],
    )
    return model

#### fit:

In [16]:
def mlnn_fit(
    model,
    x,
    y,
    val_xy,
    epochs=700,
    batch_size=32,
    patience=None,
):
    """Fit ``model`` silently while monitoring validation loss, and return it.

    Parameters
    ----------
    model : keras model
        A compiled model; it is trained in place.
    x, y :
        Training features and targets, passed straight to ``model.fit``.
    val_xy : tuple
        ``(x_val, y_val)`` passed to ``model.fit`` as ``validation_data``.
    epochs : int
        Maximum number of epochs.
    batch_size : int
        Mini-batch size.
    patience : int, optional
        Number of epochs without ``val_loss`` improvement before training is
        stopped. ``None`` (the default) keeps the previous behaviour of
        ``patience == epochs``.

    Returns
    -------
    The same ``model`` object after training.
    """
    if patience is None:
        # NOTE(review): with patience equal to epochs the stop condition does not
        # look reachable, so every run trains for the full epoch budget. Whether
        # restore_best_weights still restores the best epoch when training ends
        # without an early stop appears to depend on the installed Keras version
        # - confirm, or pass a smaller patience.
        patience = epochs

    early_stopping = EarlyStopping(
        monitor="val_loss",
        min_delta=0,
        restore_best_weights=True,
        patience=patience,
        baseline=None
    )

    # no explicit History callback: Keras attaches one to every fit() call itself,
    # and the previously created local instance was never read or returned.
    model.fit(
        x,
        y,
        validation_data=val_xy,
        epochs=epochs,
        batch_size=batch_size,
        verbose=0,
        callbacks=[early_stopping]
    )

    return model



---
### function for average metrics:

In [17]:
def average_metrics(y_test, y_pred):
    """Score predictions with accuracy plus micro/macro F1, recall and precision.

    Parameters
    ----------
    y_test : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels, in the same order as ``y_test``.

    Returns
    -------
    pandas.DataFrame
        A single row labelled ``"metric score"`` with one numeric column per
        metric: accuracy, f1_micro, f1_macro, recall_micro, recall_macro,
        precision_micro, precision_macro.
    """
    scorers = {
        "accuracy": accuracy_score(y_test, y_pred),
        "f1_micro": f1_score(y_test, y_pred, average="micro"),
        "f1_macro": f1_score(y_test, y_pred, average="macro"),
        "recall_micro": recall_score(y_test, y_pred, average="micro"),
        "recall_macro": recall_score(y_test, y_pred, average="macro"),
        "precision_micro": precision_score(y_test, y_pred, average="micro"),
        "precision_macro": precision_score(y_test, y_pred, average="macro")
    }
    # build the one-row frame in a single step: filling an empty frame cell by
    # cell left every column with object dtype instead of a numeric one.
    return pd.DataFrame(scorers, index=["metric score"])

---
### define parameter grid

In [18]:
# candidate values per hyperparameter; the search visits every combination.
# options not searched for now:
#   "activation": [activations.selu, activations.relu]
#   "kernel_initialiser": ["GlorotUniform"]
#   "optimiser": ["Adam"]
params = dict(
    hidden_dim=[64, 128],
    dropout_rate=[0.1, 0.2],
    learning_rate=[0.001, 0.0005, 0.0001],
    batch_size=[32, 64],
)

In [19]:
# flatten the grid into one dict per combination, which avoids a nested loop per parameter
keys, values = zip(*params.items())
param_combs = []
for combination in itertools.product(*values):
    param_combs.append(dict(zip(keys, combination)))

---
### custom gridsearchCV
* have had to resort to a custom gridsearch loop using 5-fold cross validation due to:
    * there being tunable parameters in the build, compile, and fit functions of the mlnn
    * handling of the 10 class output and the requirement to translate the encoding
* -> split into 5 folds to ensure that there is a good amount of training data in each fold

In [20]:
# display the grid that is about to be searched
params

{'hidden_dim': [64, 128],
 'dropout_rate': [0.1, 0.2],
 'learning_rate': [0.001, 0.0005, 0.0001],
 'batch_size': [32, 64]}

In [22]:
# number of grid points, i.e. combinations of the values listed in params
print(len(param_combs))
# param_combs

24


In [None]:
# same folds for each parameter grid point:
kf = KFold(n_splits=5, shuffle=True, random_state=13)

# store the average results of each grid point; results from an earlier
# (interrupted) run are picked up again if the pickle already exists:
gs_results_pickle = get_project_root() / "output/gs_results.pkl"
if os.path.exists(gs_results_pickle):
    gs_results = pd.read_pickle(gs_results_pickle)
else:
    gs_results = pd.DataFrame(
        columns=[
            'params', 'accuracy', 'f1_micro', 'f1_macro', 'recall_micro', 'recall_macro',
            'precision_micro', 'precision_macro'
        ]
    )

# grid points that already have a saved score. Without this check a restart
# after a dead kernel re-trained every grid point and appended duplicate rows.
completed = set(gs_results["params"])

# for param_comb in param_combs[:2]:
for c, param_comb in enumerate(param_combs):
    if str(param_comb) in completed:
        print(f"search: {c} already scored, skipping\n{param_comb}")
        continue

    print(f"search: {c}\n{param_comb}")
    hd, dr = param_comb["hidden_dim"], param_comb["dropout_rate"]
    lr = param_comb["learning_rate"]
    bs = param_comb["batch_size"]

    # one single-row metrics frame per fold, combined once after the fold loop
    fold_scores = []
    for i, (train, val) in enumerate(kf.split(x_train_scaled)):
        print(f"training fold: {i}")
        x_train_fold, y_train_fold = x_train_scaled.iloc[train], y_train.iloc[train]
        x_val_fold, y_val_fold = x_train_scaled.iloc[val], y_train.iloc[val]

        # train model at grid point for this fold:
        model = build_mlnn(
            hidden_dim=hd,
            dropout_rate=dr
        )
        model = mlnn_compile(
            model,
            learning_rate=lr
        )
        model = mlnn_fit(
            model,
            x_train_fold,
            y_train_fold,
            (x_val_fold, y_val_fold),
            batch_size=bs,
            # reduce num epochs while searching
            epochs=300
        )

        # predict on the validation part of this fold: take the most probable
        # class per row and translate the class number back to its label
        pred = model.predict(
            x_val_fold,
            verbose=0
        )
        pred = pd.Series(np.argmax(pred, axis=1)).map(catno_to_label)

        # undo the one-hot encoding of the true targets in the same way
        y_val_fold = pd.Series(
            np.argmax(np.array(y_val_fold), axis=1)
        ).map(catno_to_label)

        fold_scores.append(average_metrics(y_val_fold, pred))

    # average the scores for these parameters over the folds:
    param_results = pd.DataFrame(pd.concat(fold_scores).mean()).transpose()
    param_results["params"] = str(param_comb)

    gs_results = pd.concat([gs_results, param_results]).reset_index(drop=True)
    # save in case of dead kernel:
    gs_results.to_pickle(gs_results_pickle)


### recall scores from search grid

In [26]:
# load the saved grid-search scores from the dated snapshot and display them
gs_results_snapshot = get_project_root() / "output/gs_results_20221217.pkl"
gs_results = pd.read_pickle(gs_results_snapshot)
gs_results

Unnamed: 0,params,accuracy,f1_micro,f1_macro,recall_micro,recall_macro,precision_micro,precision_macro
0,"{'hidden_dim': 64, 'dropout_rate': 0.1, 'learn...",0.883133,0.883133,0.882759,0.883133,0.883773,0.883133,0.884334
1,"{'hidden_dim': 64, 'dropout_rate': 0.1, 'learn...",0.87763,0.87763,0.877656,0.87763,0.878411,0.87763,0.879543
2,"{'hidden_dim': 64, 'dropout_rate': 0.1, 'learn...",0.87325,0.87325,0.872539,0.87325,0.873486,0.87325,0.875621
3,"{'hidden_dim': 64, 'dropout_rate': 0.1, 'learn...",0.870619,0.870619,0.870545,0.870619,0.87078,0.870619,0.873557
4,"{'hidden_dim': 64, 'dropout_rate': 0.1, 'learn...",0.820946,0.820946,0.819868,0.820946,0.821326,0.820946,0.820212
5,"{'hidden_dim': 64, 'dropout_rate': 0.1, 'learn...",0.80468,0.80468,0.803206,0.80468,0.805368,0.80468,0.803934
6,"{'hidden_dim': 64, 'dropout_rate': 0.2, 'learn...",0.831832,0.831832,0.830589,0.831832,0.832471,0.831832,0.834986
7,"{'hidden_dim': 64, 'dropout_rate': 0.2, 'learn...",0.82595,0.82595,0.824845,0.82595,0.827254,0.82595,0.829002
8,"{'hidden_dim': 64, 'dropout_rate': 0.2, 'learn...",0.817317,0.817317,0.816293,0.817317,0.818775,0.817317,0.820747
9,"{'hidden_dim': 64, 'dropout_rate': 0.2, 'learn...",0.811936,0.811936,0.810923,0.811936,0.812441,0.811936,0.812695


In [27]:
# keep the row(s) whose mean accuracy equals the highest accuracy in the table
top_accuracy = gs_results["accuracy"].max()
best = gs_results[gs_results["accuracy"] == top_accuracy]
best

Unnamed: 0,params,accuracy,f1_micro,f1_macro,recall_micro,recall_macro,precision_micro,precision_macro
12,"{'hidden_dim': 128, 'dropout_rate': 0.1, 'lear...",0.918168,0.918168,0.917783,0.918168,0.918157,0.918168,0.919244


In [28]:
# pull out the params entry of the first best-scoring row.
# NOTE: judging by the quoted output, this value is the *string* form of the
# parameter dict, not a dict - it needs parsing (e.g. ast.literal_eval) before
# it can be unpacked into the model functions.
# NOTE: this rebinds `best` from a DataFrame to a single value, so the cell
# cannot be re-run without first re-running the cell that selects the best row.
best = best["params"].reset_index(drop=True)[0]
best

"{'hidden_dim': 128, 'dropout_rate': 0.1, 'learning_rate': 0.001, 'batch_size': 32}"

* the number of epochs was reduced during the parameter search -> retrain the model with the best params on the full training set and evaluate it on the held-out test set

In [None]:
# model.get_config()