# Predictive analysis of naval incidents in the USA, 2002 - 2015: <br>
## Annex 5.2. Data Model MergedActivity

> Author: [Oscar Anton](https://www.linkedin.com/in/oscanton/) <br>
> Date: 2024 <br>
> License: [CC BY-NC-ND 4.0 DEED](https://creativecommons.org/licenses/by-nc-nd/4.0/) <br>
> Version: 0.9 <br>

# 0. Loadings

### Libraries

In [None]:
# System environment
import os

# Data general management
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Sample balance
import cube                                     # https://github.com/acdmammoths/parallelcubesampling
from imblearn.over_sampling import SMOTENC      # https://github.com/scikit-learn-contrib/imbalanced-learn

# NaN imputation
from sklearn.experimental import enable_iterative_imputer       # Needed for IterativeImputer
from sklearn.impute import IterativeImputer
# Data scaling
from sklearn.preprocessing import StandardScaler, LabelEncoder, label_binarize
# Data splitting
from sklearn.model_selection import train_test_split, KFold, cross_val_score
# Model training
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

from keras.models import Sequential, Model, load_model
from keras.layers import Input, Dense, Dropout, concatenate
from keras.utils import plot_model
from keras.callbacks import EarlyStopping

# h2o framework
import h2o
from h2o.automl import H2OAutoML

# Model metrics
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, f1_score, mean_absolute_error, roc_auc_score, roc_curve, auc, cohen_kappa_score, confusion_matrix, recall_score, precision_score

# Model Export
import joblib

# Model explainers
import dalex as dx

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

### General Variables

In [None]:
# Main data folders
merged_activity_folder = '../3.DataPreprocess/DataMergedActivity'
datasets_folder = 'Datasets'
models_folder = 'Models'

# Toggle: export data to external files (when False, previously exported files are loaded instead)
file_export_enabled = False
# Toggle: train the models (when False, previously saved models are loaded instead)
train_model_enabled = False

# CPU cores used for multiprocessing: all but one, and never fewer than one.
# os.cpu_count() may return None, and a single-core host would otherwise yield 0.
n_jobs = max(1, (os.cpu_count() or 1) - 1)

# Random seed for reproducibility
seed = 42

### Base dataframe

In [None]:
# Read the preprocessed activity dataset from its feather file
merged_activity_file = merged_activity_folder + '/' + 'merged_activity.feather'
merged_activity = pd.read_feather(merged_activity_file)

# Preview the first rows
merged_activity.head(5)

# 1. Dataframe creation

## 1.1. Variable transformation

In [None]:
# Variables the models do not need
columns_to_drop = [
    'vessel_id', 'imo_number', 'vessel_name', 'event_type',
    'build_year', 'wave_hgt', 'visibility', 'casualty',
    'pollution', 'flag_abbr', 'classification_society', 'solas_desc',
]
merged_activity = merged_activity.drop(columns=columns_to_drop)

# Rename columns ('event_class' becomes the target variable 'y')
column_renames = {'length': 'vessel_length', 'event_class': 'y'}
merged_activity = merged_activity.rename(columns=column_renames)

# Date -> day of the year (continuous)
merged_activity['date'] = merged_activity['date'].dt.dayofyear

# Hour text -> decimal hours: first ':'-separated field plus the second field divided by 60
hour_fields = merged_activity['hour'].str.split(':')
whole_hours = pd.to_numeric(hour_fields.str[0])
minutes_as_hours = pd.to_numeric(hour_fields.str[1]) / 60
merged_activity['hour'] = (whole_hours + minutes_as_hours).round(2)

# Function: group minority values
def lump_factorials(column, prop=0.008, other_level="other value"):
    """Collapse the infrequent levels of a Series into one catch-all level.

    Parameters
    ----------
    column : pandas.Series
        Values whose rare levels are to be lumped together.
    prop : float, default 0.008
        Levels whose relative frequency is strictly below this share are replaced.
    other_level : str, default "other value"
        Replacement value written in place of every rare level.

    Returns
    -------
    pandas.Series
        A new Series with the rare levels replaced; the input is left untouched.
    """
    shares = column.value_counts(normalize=True)
    rare_levels = shares[shares < prop].index

    # Work on a copy so the caller's data is never modified as a side effect
    result = column.copy()

    # A categorical column only accepts declared categories: register the new level first
    if isinstance(result.dtype, pd.CategoricalDtype) and other_level not in result.cat.categories:
        result = result.cat.add_categories([other_level])

    result[result.isin(rare_levels)] = other_level
    return result

# Lump the rare vessel classes together to reduce variability in vessel_class
lumped_vessel_class = merged_activity[['vessel_class']].apply(lump_factorials)
merged_activity[['vessel_class']] = lumped_vessel_class

# Cast the listed columns to categorical (factor) dtype
columns_to_factorize = ['region', 'watertype', 'damage_status', 'vessel_class', 'y']
as_categories = merged_activity[columns_to_factorize].astype('category')
merged_activity[columns_to_factorize] = as_categories

# Check structure
merged_activity.dtypes

In [None]:
# Check target variable frequency (number of rows per class)
merged_activity['y'].value_counts()

## 1.2. Target variable balance

### 1.2.1. Subsampling: Cube Method

In [None]:
def cube_subsampling(data, target_size):
    """Subsample `data` with the (parallel) cube method.

    Rows with missing values are discarded first, but only when enough complete
    rows remain to reach `target_size`. The balancing matrix is a constant
    column of ones, the numeric columns and the one-hot encoded non-numeric
    columns (the target 'y' excluded). Every row gets the same inclusion
    probability, `target_size / number of rows`.

    Relies on the module-level `n_jobs` and `seed` settings.

    Returns the rows of `data` that the sampler flags with 1.
    """
    # Prefer complete cases when there are enough of them (keeping best information)
    complete_cases = data.dropna()
    if len(complete_cases) >= target_size:
        data = complete_cases

    # Constant column of "1s"
    ones_column = np.ones(len(data), dtype=int)

    # Numeric variables as they are
    numeric_part = data.select_dtypes(include=['number']).reset_index(drop=True)

    # Non-numeric variables (target excluded) one-hot encoded
    non_numeric_part = data.drop(columns='y').select_dtypes(exclude=['number'])
    dummies_part = pd.get_dummies(non_numeric_part.reset_index(drop=True))

    # Balancing matrix
    balancing_matrix = np.column_stack((ones_column, numeric_part, dummies_part))

    # Constant inclusion probability according to target size
    n_rows = len(balancing_matrix)
    init_probs = np.full(n_rows, target_size / n_rows)

    # Inclusion flags (algorithm by Alexander Lee, Stefan Walzer-Goldfeld, Shukry Zablah, Matteo Riondato, AAAI'22 Student Abstract)
    sample_indexes = cube.sample_cube_parallel(
        balancing_matrix,
        init_probs,
        n_jobs,
        is_pop_size_fixed=True,
        is_sample_size_fixed=True,
        seed=seed,
    )

    # Keep only the flagged rows
    selected_rows = sample_indexes == 1
    return data.loc[selected_rows, :]

#### Critical events: Subsampling to 9000 rows

In [None]:
# Subsample the rows whose target is 'Critical Events'
is_critical_event = merged_activity['y'] == 'Critical Events'
CriticalEvents_cube = cube_subsampling(merged_activity[is_critical_event], 9000)

# Report the size of the subsample
print(f"Critical Events subsample dimensions: {CriticalEvents_cube.shape}")

#### Maritime Accidents: Subsampling to 9000 rows

In [None]:
# Subsample the rows whose target is 'Maritime Accidents'
is_maritime_accident = merged_activity['y'] == 'Maritime Accidents'
MaritimeAccidents_cube = cube_subsampling(merged_activity[is_maritime_accident], 9000)

# Report the size of the subsample
print(f"Maritime Accidents subsample dimensions: {MaritimeAccidents_cube.shape}")

#### Material Issues: Subsampling to 9000 rows

In [None]:
# Subsample the rows whose target is 'Material Issues'
is_material_issue = merged_activity['y'] == 'Material Issues'
MaterialIssues_cube = cube_subsampling(merged_activity[is_material_issue], 9000)

# Report the size of the subsample
print(f"Material Issues subsample dimensions: {MaterialIssues_cube.shape}")

### 1.2.2. NaN imputation

Note: In this case, NaN imputation is performed before oversampling because the oversampling algorithm needs complete cases to work properly.

In [None]:
# Columns that contain NaN values
nan_columns = ['air_temp', 'wind_speed', 'damage_assessment']

# MICE (Multiple Imputation by Chained Equations) imputer, fitted on those columns only
imputer = IterativeImputer(random_state=seed)
columns_with_nan = merged_activity[nan_columns]
imputed_values = imputer.fit_transform(columns_with_nan)

# Write the imputed values into a copy, leaving merged_activity untouched
imputed_data = merged_activity.copy()
imputed_data[nan_columns] = imputed_values

# Remaining NaN count per column
imputed_data.isna().sum()

### 1.2.3. Oversampling: SMOTE (Synthetic Minority Over-sampling Technique)

In [None]:
# Separate the target from the explanatory variables
y = imputed_data['y']
X = imputed_data.drop(columns=['y'])

# Number of rows each oversampled class should reach
target_size = 9000
oversampled_classes = ('Onboard Emergencies', 'Third-party Damages')

# SMOTE-NC (Synthetic Minority Over-sampling Technique for Nominal and Continuous)
# NOTE(review): categorical_features are positional column indexes of X; confirm they
# still point at the categorical columns whenever the column layout changes.
smote = SMOTENC(
    sampling_strategy={event_class: target_size for event_class in oversampled_classes},
    categorical_features=[4, 7, 8, 9],
    random_state=seed,
)

# Fit and resample
X_resampled, y_resampled = smote.fit_resample(X, y)

# Put features and target back together
resampled_data = pd.concat([X_resampled, y_resampled], axis=1)

# Check balance
resampled_data['y'].value_counts()

# Keep the resampled 'Onboard Emergencies' rows and report their size
is_onboard_emergency = resampled_data['y'] == 'Onboard Emergencies'
onboard_emergencies_smote = resampled_data[is_onboard_emergency]
print(f"Onboard Emergencies oversample dimensions: {onboard_emergencies_smote.shape}")

# Keep the resampled 'Third-party Damages' rows and report their size
is_thirdparty_damage = resampled_data['y'] == 'Third-party Damages'
thirdparty_damages_smote = resampled_data[is_thirdparty_damage]
print(f"Third-party Damages oversample dimensions: {thirdparty_damages_smote.shape}")

### 1.2.4. All events data join

In [None]:
# Stack the three subsampled and the two oversampled classes under a fresh 0..n-1 index
sampled_pieces = [
    CriticalEvents_cube,
    MaritimeAccidents_cube,
    MaterialIssues_cube,
    onboard_emergencies_smote,
    thirdparty_damages_smote,
]
merged_activity_general = pd.concat(sampled_pieces, ignore_index=True)

## 1.3. Data consolidation

In [None]:
# Remove the variables found statistically irrelevant in the EDA
irrelevant_columns = ['date', 'latitude', 'damage_assessment']
merged_activity_general = merged_activity_general.drop(columns=irrelevant_columns)

# Consolidated dataset file
general_file = datasets_folder + '/' + 'merged_activity_general.feather'

if not file_export_enabled:
    # Export disabled: replace the dataframe with the previously exported file
    merged_activity_general = pd.read_feather(general_file)
else:
    # Export enabled: write the dataframe (with a clean index) to the file
    merged_activity_general.reset_index(drop=True).to_feather(general_file)

### 1.3.1. Explanatory variables: Scale numerics & OHE not numerics

In [None]:
# Split the explanatory variables by type (the target 'y' is encoded separately)
numeric_part = merged_activity_general.select_dtypes(include=['number'])
non_numeric_part = merged_activity_general.drop(columns=['y']).select_dtypes(exclude=['number'])

# Scale numeric variables, keeping the source row index and column names so that
# the column-wise join below aligns row by row whatever index the dataframe carries
data_scaled = pd.DataFrame(StandardScaler().fit_transform(numeric_part),
                           index=numeric_part.index,
                           columns=numeric_part.columns)

# One hot encoding for not numeric variables
data_ohe = pd.get_dummies(non_numeric_part).astype(int)

# X: Scaled and ohe join. The 'index' column is dropped only if it exists,
# so a dataset saved without that column does not raise a KeyError.
X = pd.concat([data_ohe, data_scaled], axis=1).drop(columns=['index'], errors='ignore')

# Verify variables
for column in X.columns:
    print(f"Name: {column} | Type: {X[column].dtype} | Levels: {X[column].nunique()}")

### 1.3.2. Target variable: Numeric Labeling

In [None]:
# Encode the target classes as integer codes
label_encoder = LabelEncoder()
encoded_target = label_encoder.fit_transform(merged_activity_general['y'])
y = pd.Series(encoded_target)

# Verify the distinct codes
np.unique(y)

## 1.4. Train / Test Split

In [None]:
# Data splitting: hold out 20% of the rows as test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

In [None]:
# Save / Load the four split dataframes in one file (h5 format holds several keys)
splits_file = datasets_folder + '/' + 'datasets_MA_splited.h5'

if file_export_enabled:
    # Export enabled: write every split under its own key
    dfs = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test}
    for key, df in dfs.items():
        df.to_hdf(splits_file, key=key, format='table')
        print(f'{key} {df.shape} saved')
else:
    # Export disabled: read every split back and bind it to its variable explicitly
    # (no eval() / globals() tricks needed)
    dfs = ['X_train', 'X_test', 'y_train', 'y_test']
    loaded_splits = {}
    for key in dfs:
        loaded_splits[key] = pd.read_hdf(splits_file, key=key)
        print(f'{key} {loaded_splits[key].shape} loaded')
    X_train, X_test, y_train, y_test = (loaded_splits[key] for key in dfs)

# 2. Model Training

#### Performance functions (sklearn)

In [None]:
# Function: Table with main metrics data
def ma_model_metrics(model, X, y, styled=False):
    """Build a table with the main performance metrics of a fitted classifier.

    Parameters
    ----------
    model : fitted estimator exposing `predict`.
    X : features to predict on.
    y : true (numerically encoded) labels for `X`.
    styled : bool, default False
        When True, return a pandas Styler whose header carries the model class
        name and whose row index is hidden; otherwise return the plain DataFrame.

    Returns
    -------
    pandas.DataFrame or pandas Styler with the columns 'metric' and 'value'
    (ROC AUC, Accuracy, Kappa, RMSE, MAE, R2 and macro F1, rounded to 4 decimals).

    Notes
    -----
    ROC AUC is computed from the binarized hard predictions, not from class
    probabilities. RMSE, MAE and R2 are computed on the encoded class labels.
    """
    # Predictions (absolute)
    y_pred = model.predict(X)

    # Data binarize for auc calculation
    classes = np.unique(y)
    y_bin = label_binarize(y, classes=classes)
    y_pred_bin = label_binarize(y_pred, classes=classes)

    # Calculate main metrics
    roc_auc = round(roc_auc_score(y_bin, y_pred_bin), 4)
    accuracy = round(accuracy_score(y, y_pred), 4)
    kappa = round(cohen_kappa_score(y, y_pred), 4)
    # RMSE is the square root of the mean squared error
    rmse = round(np.sqrt(mean_squared_error(y, y_pred)), 4)
    mae = round(mean_absolute_error(y, y_pred), 4)
    r2 = round(r2_score(y, y_pred), 4)
    f1 = round(f1_score(y, y_pred, average='macro'), 4)

    # Build metrics table
    df = pd.DataFrame([['ROC AUC:', roc_auc], ['Accuracy:', accuracy], ['Kappa:', kappa],
                       ['RMSE:', rmse], ['MAE:', mae], ['R2:', r2], ['F1:', f1]],
                      columns=('metric', 'value'))

    if styled:
        # Put the model name above both columns and hide the row index
        title = f'{model.__class__.__name__} Training'
        df.columns = pd.MultiIndex.from_tuples([(title, col) for col in df.columns])
        return df.style.hide()
    else:
        return df

In [None]:
# Function: Table for recall & precision, sensitivity & specificity
def sens_spec(model, X, y, styled=False):
    """Build a table with per-class sensitivity and specificity.

    The macro-averaged recall and precision are embedded in the row index.

    Parameters
    ----------
    model : fitted estimator exposing `predict` and `classes_`.
    X : features to predict on.
    y : true (numerically encoded) labels for `X`.
    styled : bool, default False
        When True, return a pandas Styler with the model class name as header.

    Returns
    -------
    pandas.DataFrame or pandas Styler with one column per class (labels decoded
    through the module-level `label_encoder`) and two rows: sensitivity
    TP / (TP + FN) and specificity TN / (TN + FP), rounded to 4 decimals.
    A ratio with an empty denominator is reported as 0.
    """
    # Predictions (absolute)
    y_pred = model.predict(X)

    # Recall & Precision values (macro average over classes)
    recall = round(recall_score(y, y_pred, average='macro'), 4)
    precision = round(precision_score(y, y_pred, average='macro'), 4)

    # Confusion matrix, rows/columns in the order of model.classes_ (same order as the column labels)
    conf_matrix = confusion_matrix(y, y_pred, labels=model.classes_)

    # One-vs-rest counts per class
    true_pos = np.diag(conf_matrix)
    actual_pos = conf_matrix.sum(axis=1)                 # TP + FN
    false_pos = conf_matrix.sum(axis=0) - true_pos       # predicted as the class but belonging to another
    actual_neg = conf_matrix.sum() - actual_pos          # TN + FP
    true_neg = actual_neg - false_pos

    n_classes = len(conf_matrix)
    sensitivity = np.round(np.divide(true_pos, actual_pos,
                                     out=np.zeros(n_classes), where=actual_pos > 0), 4)
    specificity = np.round(np.divide(true_neg, actual_neg,
                                     out=np.zeros(n_classes), where=actual_neg > 0), 4)

    # Labels and indexes
    column_labels = label_encoder.inverse_transform(model.classes_)
    index_1 = ['Recall:', 'Precision:']
    index_2 = [recall, precision]
    index_ = [' - ', ' - ']
    index_3 = ['Sensitivity:', 'Specificity:']

    # Build multiindex table: the four row-index levels show the macro recall/precision
    # next to the row names of the per-class values
    df = pd.DataFrame([sensitivity, specificity])
    df.columns = column_labels
    df.index = [index_1, index_2, index_, index_3]

    # Dataframe style
    if styled:
        title = f'{model.__class__.__name__} Model'
        df.columns = pd.MultiIndex.from_tuples([(title, col) for col in df.columns])
        return df.style.set_table_styles([{'selector': 'th.col_heading', 'props': 'text-align: center;'}], overwrite=False)
    else:
        return df

In [None]:
# Function: Table with Confusion Matrix
def ma_confusion_matrix_table(model, X, y):
    """Return a styled confusion matrix of `model` on (`X`, `y`).

    Rows are the true classes and columns the predicted ones, both labelled with
    the class names decoded through the module-level `label_encoder`.

    Parameters
    ----------
    model : fitted estimator exposing `predict` and `classes_`.
    X : features to predict on.
    y : true (numerically encoded) labels for `X`.

    Returns
    -------
    pandas Styler with centred headers and cells.
    """
    # Predictions (absolute)
    y_pred = model.predict(X)

    # Get labels decoding target variable
    labels = label_encoder.inverse_transform(model.classes_)

    # Pin the matrix to model.classes_ so it always has one row/column per label,
    # even when a class is missing from both y and the predictions
    matrix = confusion_matrix(y, y_pred, labels=model.classes_)

    # Build table
    header = f'{model.__class__.__name__}: Confusion Matrix'
    df = pd.DataFrame(matrix,
                      columns=pd.MultiIndex.from_product([[header], labels]),
                      index=labels)

    # Dataframe style
    styled_df = df.style.set_table_styles([
        {'selector': 'th.col_heading', 'props': 'text-align: center;'},
        {'selector': 'td', 'props': 'text-align: center;'},
    ], overwrite=False)

    return styled_df

In [None]:
# Function: Plot Multiclass ROC Curve
def roc_curve_plot(model, X, y):
    """Plot one-vs-rest ROC curves (one per class plus the micro-average).

    Parameters
    ----------
    model : fitted estimator exposing `predict` and `classes_`.
    X : features to predict on.
    y : true (numerically encoded) labels for `X`.

    Notes
    -----
    The curves are built from the binarized hard predictions, not from class
    probabilities.
    NOTE(review): with only two classes `label_binarize` returns a single
    column, so the per-class loop assumes three or more classes.
    """
    # Predictions (absolute)
    y_pred = model.predict(X)
    classes = model.classes_

    # Data binarize for auc calculation
    y_bin = label_binarize(y, classes=classes)
    y_pred_bin = label_binarize(y_pred, classes=classes)

    # Compute ROC curve and ROC area for each class.
    # Columns of the binarized arrays are addressed by position, so the class
    # labels do not need to be the integers 0..k-1.
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for position, class_label in enumerate(classes):
        fpr[class_label], tpr[class_label], _ = roc_curve(y_bin[:, position], y_pred_bin[:, position])
        roc_auc[class_label] = auc(fpr[class_label], tpr[class_label])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_bin.ravel(), y_pred_bin.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Plot ROC curve for each class (one colour per class, however many there are)
    plt.figure()
    colors = sns.color_palette("hls", len(classes))
    for class_label, color in zip(classes, colors):
        plt.plot(fpr[class_label], tpr[class_label], color=color, lw=2,
                 label='ROC of class {0} (AUC = {1:0.2f})'.format(class_label, roc_auc[class_label]))

    # Plot micro-average ROC curve
    plt.plot(fpr["micro"], tpr["micro"], color='grey', lw=1, linestyle='dashed',
             label='micro-average ROC (AUC = {0:0.2f})'.format(roc_auc["micro"]))

    # Chance diagonal, axes and legend
    plt.plot([0, 1], [0, 1], color='black', lw=1, linestyle='dotted')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.01])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC (Multiclass) for {model.__class__.__name__}')
    plt.legend(loc="lower right", fontsize="8")
    plt.show()

In [None]:
# Function: Feature importances
def sklearn_feature_importances(model, plot=False):
    """Return, or plot, the feature importances of a fitted model.

    Reads `model.feature_names_in_` and `model.feature_importances_` and sorts
    the features by decreasing importance.

    With `plot=False` (default) the sorted DataFrame (columns 'variable_name'
    and 'value') is returned; with `plot=True` a horizontal bar chart is shown
    and nothing is returned.
    """
    table = pd.DataFrame({
        'variable_name': model.feature_names_in_,
        'value': model.feature_importances_,
    })
    importances = table.sort_values(by='value', ascending=False)

    # Without plotting, hand the values back to the caller
    if not plot:
        return importances

    # Horizontal bars, most important feature on top
    plt.figure(figsize=(10, 7))
    plt.barh(importances['variable_name'], importances['value'], color='#00bfc4')
    plt.title(f"Feature importances of {model.__class__.__name__} model")
    plt.xlabel('Relative Feature importance')
    plt.gca().invert_yaxis()
    plt.show()

## 2.1. Bayesian networks

### 2.1.1. Naïve Bayes -Gaussian- (NB)

#### Model Training

In [None]:
# File where the fitted Naive Bayes model is persisted
nb_model_file = models_folder + '/' + 'nb_MA_train.pkl'

if not train_model_enabled:
    # Training disabled: load the model from the external file
    nb_MA_train = joblib.load(nb_model_file)
else:
    # No parameters are set for this model
    params = {}

    # Create and train the model, then save it to the external file
    nb_MA_train = GaussianNB(**params).fit(X_train, y_train)
    joblib.dump(nb_MA_train, nb_model_file)

model = nb_MA_train

# Print the model name followed by one line per parameter
print(f'Model name: {model.__class__.__name__} \nParameters:')
params = model.get_params()
for param_name, param_value in params.items():
    print(f" {param_name}: {param_value}")

#### Model Metrics

In [None]:
# Model performance main metrics
ma_model_metrics(model, X_test, y_test, styled=True)

In [None]:
# recall & precision, sensitivity & specificity
sens_spec(model, X_test, y_test, styled=True)

In [None]:
# Confusion matrix
ma_confusion_matrix_table(model, X_test, y_test)

In [None]:
# Plot Receiver Operating Characteristic (ROC) curves for each class
roc_curve_plot(model, X_test, y_test)

In [None]:
def sklearn_theta_feature_importances(model, plot=False):
    """Return, or plot, feature importances derived from the `theta_` values.

    For every feature the absolute `theta_` values are summed over the classes,
    and the sums are normalized so that they add up to 1.

    With `plot=False` (default) the DataFrame (columns 'variable_name' and
    'value', sorted by decreasing value) is returned; with `plot=True` a
    horizontal bar chart is shown and nothing is returned.
    """
    feature_names = model.feature_names_in_

    # Per feature: sum of |theta| over all classes (theta_ rows are classes, columns features)
    theta_sums = np.abs(model.theta_).sum(axis=0)

    # Normalize importance values
    total_importance = sum(theta_sums)
    relative_importance = theta_sums / total_importance

    # Build importance values dataframe
    importances = pd.DataFrame({'variable_name': feature_names, 'value': relative_importance})
    importances = importances.sort_values(by='value', ascending=False)

    # Without plotting, hand the values back to the caller
    if not plot:
        return importances

    # Horizontal bars, most important feature on top
    plt.figure(figsize=(10, 7))
    plt.barh(importances['variable_name'], importances['value'], color='#00bfc4')
    plt.title(f"Feature importances of {model.__class__.__name__} model")
    plt.xlabel('Relative Feature importance (based on theta values)')
    plt.gca().invert_yaxis()
    plt.show()


# Plot features importances calling above function
sklearn_theta_feature_importances(model, plot=True)

## 2.2. Gradient Boosting Models

### 2.2.1. Gradient Boosting Machine (GBM)

#### Model Training

In [None]:
# File where the fitted Gradient Boosting model is persisted
gbm_model_file = models_folder + '/' + 'GBM_MA_train.pkl'

if not train_model_enabled:
    # Training disabled: load the model from the external file
    GBM_MA_train = joblib.load(gbm_model_file)
else:
    # Model parameters
    params = dict(
        n_estimators=500,       # Number of trees
        learning_rate=0.1,      # Contribution of each tree to the model
        max_depth=3,            # Maximum levels of each tree
        random_state=seed,      # Random seed for reproducibility
    )

    # Create and train the model, then save it to the external file
    GBM_MA_train = GradientBoostingClassifier(**params).fit(X_train, y_train)
    joblib.dump(GBM_MA_train, gbm_model_file)

model = GBM_MA_train

# Print the model name followed by one line per parameter
print(f'Model name: {model.__class__.__name__} \nParameters:')
params = model.get_params()
for param_name, param_value in params.items():
    print(f" {param_name}: {param_value}")

#### Model Metrics

In [None]:
# Model performance main metrics
ma_model_metrics(model, X_test, y_test, styled=True)

In [None]:
# recall & precision, sensitivity & specificity
sens_spec(model, X_test, y_test, styled=True)

In [None]:
# Confusion matrix
ma_confusion_matrix_table(model, X_test, y_test)

In [None]:
# Plot Receiver Operating Characteristic (ROC) curves for each class
roc_curve_plot(model, X_test, y_test)

In [None]:
sklearn_feature_importances(model, plot=True)

## 2.3. More Models

### 2.3.1. Random Forest (RF)

#### Model Training

In [None]:
# File where the fitted Random Forest model is persisted
rf_model_file = models_folder + '/' + 'rf_MA_train.pkl'

if not train_model_enabled:
    # Training disabled: load the model from the external file
    rf_MA_train = joblib.load(rf_model_file)
else:
    # Model parameters
    params = dict(
        n_estimators=100,       # Number of trees in the forest
        max_depth=None,         # Maximum depth of the trees (no restrictions)
        random_state=seed,      # Random seed for reproducibility
    )

    # Create and train the model, then save it to the external file
    rf_MA_train = RandomForestClassifier(**params).fit(X_train, y_train)
    joblib.dump(rf_MA_train, rf_model_file)

model = rf_MA_train

# Print the model name followed by one line per parameter
print(f'Model name: {model.__class__.__name__} \nParameters:')
params = model.get_params()
for param_name, param_value in params.items():
    print(f" {param_name}: {param_value}")

#### Model Metrics

In [None]:
# Model performance main metrics
ma_model_metrics(model, X_test, y_test, styled=True)

In [None]:
# recall & precision, sensitivity & specificity
sens_spec(model, X_test, y_test, styled=True)

In [None]:
# Confusion matrix
ma_confusion_matrix_table(model, X_test, y_test)

In [None]:
# Plot Receiver Operating Characteristic (ROC) curves for each class
roc_curve_plot(model, X_test, y_test)

In [None]:
sklearn_feature_importances(model, plot=True)

### 2.3.2. Multilayer Perceptron (MLP)

#### Model Training

In [None]:
# File where the fitted Multilayer Perceptron model is persisted
nnet_model_file = models_folder + '/' + 'nnet_MA_train.pkl'

if not train_model_enabled:
    # Training disabled: load the model from the external file
    nnet_MA_train = joblib.load(nnet_model_file)
else:
    # Model parameters
    params = dict(
        hidden_layer_sizes=(100, 50),   # Two hidden layers with 100 and 50 neurons respectively
        activation='relu',              # Activation function for the hidden layers: Rectified Linear Unit
        solver='adam',                  # Optimization algorithm
        random_state=seed,              # Random seed for reproducibility
    )

    # Create and train the model, then save it to the external file
    nnet_MA_train = MLPClassifier(**params).fit(X_train, y_train)
    joblib.dump(nnet_MA_train, nnet_model_file)

model = nnet_MA_train

# Print the model name followed by one line per parameter
print(f'Model name: {model.__class__.__name__} \nParameters:')
params = model.get_params()
for param_name, param_value in params.items():
    print(f" {param_name}: {param_value}")

#### Model Metrics

In [None]:
# Model performance main metrics
ma_model_metrics(model, X_test, y_test, styled=True)

In [None]:
# recall & precision, sensitivity & specificity
sens_spec(model, X_test, y_test, styled=True)

In [None]:
# Confusion matrix
ma_confusion_matrix_table(model, X_test, y_test)

In [None]:
# Plot Receiver Operating Characteristic (ROC) curves for each class
roc_curve_plot(model, X_test, y_test)

In [None]:
def sklearn_coef_feature_importances(model, plot=False):
    """Return, or plot, feature importances derived from the first-layer weights.

    For every input feature the absolute values of `model.coefs_[0]` are
    averaged, the features are sorted by decreasing value and the values are
    normalized so that they add up to 1.

    With `plot=False` (default) the DataFrame (columns 'variable_name' and
    'value') is returned; with `plot=True` a horizontal bar chart is shown and
    nothing is returned.
    """
    # Mean absolute first-layer weight of each input feature
    first_layer_weights = model.coefs_[0]
    mean_abs_weights = np.mean(np.abs(first_layer_weights.T), axis=0)

    # Build coefs dataframe, largest value first
    importances = pd.DataFrame({'variable_name': model.feature_names_in_,
                                'value': mean_abs_weights})
    importances = importances.sort_values(by='value', ascending=False)

    # Normalize values
    importances['value'] = importances['value'] / importances['value'].sum()

    # Without plotting, hand the values back to the caller
    if not plot:
        return importances

    # Horizontal bars, most important feature on top
    plt.figure(figsize=(10, 7))
    plt.barh(importances['variable_name'], importances['value'], color='#00bfc4')
    plt.title(f"Feature importances of {model.__class__.__name__} model")
    plt.xlabel('Relative Feature importance (based on  first layer coef values)')
    plt.gca().invert_yaxis()
    plt.show()
    
# Call above function
sklearn_coef_feature_importances(model, plot=True)

### 2.3.3. Classification and Regression Trees (CART)

#### Model Training

In [None]:
# File where the fitted decision tree model is persisted
cart_model_file = models_folder + '/' + 'cart_MA_train.pkl'

if not train_model_enabled:
    # Training disabled: load the model from the external file
    cart_MA_train = joblib.load(cart_model_file)
else:
    # Model parameters
    params = dict(
        criterion='gini',       # Split criterion
        random_state=seed,      # Random seed for reproducibility
    )

    # Create and train the model, then save it to the external file
    cart_MA_train = DecisionTreeClassifier(**params).fit(X_train, y_train)
    joblib.dump(cart_MA_train, cart_model_file)

model = cart_MA_train

# Print the model name followed by one line per parameter
print(f'Model name: {model.__class__.__name__} \nParameters:')
params = model.get_params()
for param_name, param_value in params.items():
    print(f" {param_name}: {param_value}")

#### Model Metrics

In [None]:
# Model performance main metrics
ma_model_metrics(model, X_test, y_test, styled=True)

In [None]:
# recall & precision, sensitivity & specificity
sens_spec(model, X_test, y_test, styled=True)

In [None]:
# Confusion matrix
ma_confusion_matrix_table(model, X_test, y_test)

In [None]:
# Plot Receiver Operating Characteristic (ROC) curves for each class
roc_curve_plot(model, X_test, y_test)

In [None]:
sklearn_feature_importances(model, plot=True)

## 2.4. Keras API with TensorFlow

#### Performance functions (keras)

In [None]:
# Function: Plot loss / accuracy train evolution
def keras_train_plot(data):
    """Plot the accuracy and loss evolution of a training run.

    `data` is a mapping with the per-epoch series 'accuracy', 'val_accuracy',
    'loss' and 'val_loss'. Accuracy curves go in the upper panel and loss
    curves in the lower one.
    """
    # Per-epoch values plus an explicit epoch counter starting at 0
    df_train = pd.DataFrame(data)
    df_train['epochs'] = list(range(0, len(data['accuracy'])))

    fig, panels = plt.subplots(2, 1, figsize=(10, 6))
    fig.suptitle('Train process', fontsize=12)

    # One panel per metric, each with its train and validation curve
    for panel, metric in zip(panels, ('accuracy', 'loss')):
        panel.plot(df_train['epochs'], df_train[metric], label=f'train_{metric}')
        panel.plot(df_train['epochs'], df_train[f'val_{metric}'], label=f'val_{metric}')

    for panel in panels:
        panel.legend(loc='best')
    plt.show()

In [None]:
# Function: Table with main metrics data
def keras_model_metrics(model, X, y_ohe, styled=False):
    """Build a table with the main performance metrics of a keras classifier.

    Parameters
    ----------
    model : keras model whose `predict` returns one score column per class.
    X : features to predict on.
    y_ohe : pandas.DataFrame
        One-hot encoded true labels. Its columns are assumed to be in the same
        order as the model outputs (the order the model was trained with).
    styled : bool, default False
        When True, return a pandas Styler whose header carries the model class
        name and whose row index is hidden; otherwise return the plain DataFrame.

    Returns
    -------
    pandas.DataFrame or pandas Styler with the columns 'metric' and 'value'.
    ROC AUC, RMSE, MAE and R2 compare the one-hot labels with the predicted
    scores; Accuracy, Kappa and F1 (macro) compare the most likely class.
    """
    y_pred_ohe = pd.DataFrame(model.predict(X))

    # Most likely class as column position, for predictions and truth alike.
    # Positions are used directly, so the shared label encoder is not re-fitted.
    y_pred_serie = y_pred_ohe.idxmax(axis=1)
    y_serie = pd.Series(np.argmax(y_ohe.to_numpy(), axis=1))

    roc_auc = round(roc_auc_score(y_ohe, y_pred_ohe), 4)
    accuracy = round(accuracy_score(y_serie, y_pred_serie), 4)
    kappa = round(cohen_kappa_score(y_serie, y_pred_serie), 4)
    # RMSE is the square root of the mean squared error
    rmse = round(np.sqrt(mean_squared_error(y_ohe, y_pred_ohe)), 4)
    mae = round(mean_absolute_error(y_ohe, y_pred_ohe), 4)
    r2 = round(r2_score(y_ohe, y_pred_ohe), 4)
    f1 = round(f1_score(y_serie, y_pred_serie, average='macro'), 4)

    # Build metrics table
    df = pd.DataFrame([['ROC AUC:', roc_auc], ['Accuracy:', accuracy], ['Kappa:', kappa],
                       ['RMSE:', rmse], ['MAE:', mae], ['R2:', r2], ['F1:', f1]],
                      columns=('metric', 'value'))

    if styled:
        # Put the model name above both columns and hide the row index
        title = f'{model.__class__.__name__} Training'
        df.columns = pd.MultiIndex.from_tuples([(title, col) for col in df.columns])
        return df.style.hide()
    else:
        return df

In [None]:
# Function: Table for recall & precision, sensitivity & specificity
def keras_sens_spec(model, X, y, styled=False):
    """Build a table with per-class sensitivity and specificity for a keras model.

    The macro-averaged recall and precision are embedded in the row index.

    Parameters
    ----------
    model : keras model whose `predict` returns one score column per class.
    X : features to predict on.
    y : pandas.DataFrame
        One-hot encoded true labels. Its columns provide the class names and
        are assumed to be in the same order as the model outputs.
    styled : bool, default False
        When True, return a pandas Styler with the model class name as header.

    Returns
    -------
    pandas.DataFrame or pandas Styler with one column per class and two rows:
    sensitivity TP / (TP + FN) and specificity TN / (TN + FP), rounded to
    4 decimals. A ratio with an empty denominator is reported as 0.
    """
    # Most likely class as column position, for predictions and truth alike.
    # Positions are used directly, so the shared label encoder is not re-fitted.
    y_pred_ohe = pd.DataFrame(model.predict(X))
    y_pred_serie = y_pred_ohe.idxmax(axis=1)
    y_serie = pd.Series(np.argmax(y.to_numpy(), axis=1))

    # Recall & Precision values (macro average over classes)
    recall = round(recall_score(y_serie, y_pred_serie, average='macro'), 4)
    precision = round(precision_score(y_serie, y_pred_serie, average='macro'), 4)

    # Confusion matrix with exactly one row/column per class column of y
    n_classes = y.shape[1]
    conf_matrix = confusion_matrix(y_serie, y_pred_serie, labels=np.arange(n_classes))

    # One-vs-rest counts per class
    true_pos = np.diag(conf_matrix)
    actual_pos = conf_matrix.sum(axis=1)                 # TP + FN
    false_pos = conf_matrix.sum(axis=0) - true_pos       # predicted as the class but belonging to another
    actual_neg = conf_matrix.sum() - actual_pos          # TN + FP
    true_neg = actual_neg - false_pos

    sensitivity = np.round(np.divide(true_pos, actual_pos,
                                     out=np.zeros(n_classes), where=actual_pos > 0), 4)
    specificity = np.round(np.divide(true_neg, actual_neg,
                                     out=np.zeros(n_classes), where=actual_neg > 0), 4)

    # Labels and indexes
    column_labels = y.columns
    index_1 = ['Recall:', 'Precision:']
    index_2 = [recall, precision]
    index_ = [' - ', ' - ']
    index_3 = ['Sensitivity:', 'Specificity:']

    # Build multiindex table: the four row-index levels show the macro recall/precision
    # next to the row names of the per-class values
    df = pd.DataFrame([sensitivity, specificity])
    df.columns = column_labels
    df.index = [index_1, index_2, index_, index_3]

    # Dataframe style
    if styled:
        title = f'{model.__class__.__name__} Model'
        df.columns = pd.MultiIndex.from_tuples([(title, col) for col in df.columns])
        return df.style.set_table_styles([{'selector': 'th.col_heading', 'props': 'text-align: center;'}], overwrite=False)
    else:
        return df

In [None]:
# Function: Table with Confusion Matrix
def keras_confusion_matrix_table(model, X, y):
    """Return a styled confusion matrix of a keras model on (`X`, `y`).

    Parameters
    ----------
    model : keras model whose `predict` returns one score column per class.
    X : features to predict on.
    y : pandas.DataFrame
        One-hot encoded true labels. Its columns label the rows (true class)
        and columns (predicted class) of the table and are assumed to be in the
        same order as the model outputs.

    Returns
    -------
    pandas Styler with centred headers and cells.
    """
    # Most likely class as column position, for predictions and truth alike
    y_pred_max = np.argmax(model.predict(X), axis=1)
    y_max = np.argmax(y.to_numpy(), axis=1)

    # Pin the matrix to one row/column per class column of y, so it always
    # matches the labels even when a class is missing from this data
    matrix = confusion_matrix(y_max, y_pred_max, labels=np.arange(y.shape[1]))

    # Build table
    df = pd.DataFrame(matrix,
                      columns=pd.MultiIndex.from_product([[f'{model.name}: Confusion Matrix'], y.columns]),
                      index=y.columns)

    # Dataframe style
    styled_df = df.style.set_table_styles([
        {'selector': 'th.col_heading', 'props': 'text-align: center;'},
        {'selector': 'td', 'props': 'text-align: center;'},
    ], overwrite=False)

    return styled_df

In [None]:
# Function: Plot Multiclass ROC Curve
def keras_roc_curve_plot(model, X, y):
    """Plot one-vs-rest ROC curves (one per class plus the micro-average) for a keras model.

    Parameters
    ----------
    model : keras model whose `predict` returns one score column per class.
    X : features to predict on.
    y : pandas.DataFrame
        One-hot encoded true labels, one column per class name.

    Notes
    -----
    The predicted score columns are named by decoding their positions through
    the module-level `label_encoder`, and are then matched with the columns of
    `y` by name.
    """
    # Predicted scores, columns renamed from positions to class labels
    y_pred = pd.DataFrame(model.predict(X))
    y_pred.columns = label_encoder.inverse_transform(y_pred.columns)

    # Compute ROC curve and ROC area for each class
    fpr, tpr, roc_auc = {}, {}, {}
    for class_label in y.columns:
        fpr[class_label], tpr[class_label], _ = roc_curve(y[class_label], y_pred[class_label])
        roc_auc[class_label] = auc(fpr[class_label], tpr[class_label])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y.values.ravel(), y_pred.values.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Plot ROC curve for each class (one colour per class, however many there are)
    plt.figure()
    colors = sns.color_palette("hls", len(y.columns))
    for class_label, color in zip(y.columns, colors):
        plt.plot(fpr[class_label], tpr[class_label], color=color, lw=2,
                 label='ROC of class {0} (AUC = {1:0.2f})'.format(class_label, roc_auc[class_label]))

    # Plot micro-average ROC curve
    plt.plot(fpr["micro"], tpr["micro"], color='grey', lw=1, linestyle='dashed',
             label='micro-average ROC (AUC = {0:0.2f})'.format(roc_auc["micro"]))

    # Chance diagonal, axes and legend
    plt.plot([0, 1], [0, 1], color='black', lw=1, linestyle='dotted')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.01])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC (Multiclass) for {model.name}')
    plt.legend(loc="lower right", fontsize="8")
    plt.show()

### 2.4.1. Sequential: Densely-connected Model (Simple)

#### Model Design

In [None]:
# One-hot encode the target variable (one 0/1 column per encoded class),
# then rename the columns from the encoded values back to the original class labels
y_ohe = pd.get_dummies(y).astype(int)
y_ohe.columns = label_encoder.inverse_transform(y_ohe.columns)

In [None]:
# Define the neural network model as an ordered stack of layers
keras_model_1 = Sequential([
    Input(shape=(X.shape[1],)),                      # Input: one value per feature column of X
    Dense(128, activation='relu'),                   # Hidden dense layer, relu activation
    Dropout(0.5),                                    # Dropout
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(y_ohe.shape[1], activation='softmax'),     # Output: one unit per one-hot target column
])

# `model` is the name the following cells work with
model = keras_model_1

# Check structure
model.summary()
plot_model(model, show_shapes=True, show_layer_names=True, dpi=70)

#### Model Training

In [None]:
if train_model_enabled :
    # Compile model
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # Train model: 15% of the data is held out for validation and training stops
    # once val_loss has not improved for 10 epochs (best weights are restored).
    # Keras documents `callbacks` as a list, so the callback is wrapped in one.
    early_stopping = EarlyStopping(monitor = 'val_loss', patience = 10, restore_best_weights = True)
    history = model.fit(X, y_ohe,
            epochs=50,
            batch_size = 32,
            callbacks = [early_stopping],
            validation_split=0.15)

    # Save model and history to external files
    model.save(models_folder + '/' + 'keras_model_1.keras')
    joblib.dump(history.history, models_folder + '/' + 'keras_model_1_evolution.pkl')

else:
    from types import SimpleNamespace

    # Load model and history from external files
    model = load_model(models_folder + '/' + 'keras_model_1.keras')
    # Only the `.history` attribute of the Keras History object is used afterwards,
    # so a plain namespace exposing that attribute stands in for it
    history = SimpleNamespace(history=joblib.load(models_folder + '/' + 'keras_model_1_evolution.pkl'))
    print(f'Model <{model.name}> and train history loaded from {models_folder} folder')


#### Model Metrics

In [None]:
# Accuracy / loss Train evolution
keras_train_plot(history.history)

# Accuracy / loss values.
# Model.evaluate returns the loss first, followed by the compiled metrics
# (here only 'accuracy'), so the values must be unpacked in that order.
# NOTE(review): X / y_ohe are the same objects this notebook passes to model.fit,
# so these are not hold-out figures.
loss, accuracy = model.evaluate(X, y_ohe)
print("accuracy = {:.4f}, loss = {:.4f}".format(accuracy, loss))

In [None]:
# Main performance metrics table for the sequential model (styled for display)
keras_model_metrics(model, X, y_ohe, styled=True)

In [None]:
# Recall & precision / sensitivity & specificity table for the sequential model
keras_sens_spec(model, X, y_ohe, styled=True)

In [None]:
# Confusion matrix table for the sequential model
keras_confusion_matrix_table(model, X, y_ohe)

In [None]:
# Plot ROC curves (per class and micro-average) for the sequential model
keras_roc_curve_plot(model, X, y_ohe)

In [None]:
# Function: Plot feature importances
def keras_sec_importances(model, X=X, plot=False):
    """Rank the input features by the mean absolute weight of the first layer.

    Args:
        model: Model whose `layers[0]` exposes its weights through `get_weights()`.
        X: DataFrame whose column names label the rows of the first weight array
            (defaults to the module-level X bound when this function is defined).
        plot: When True, draw a horizontal bar chart and return None;
            otherwise return the ranking DataFrame.

    Returns:
        DataFrame with columns 'variable_name' and 'value', sorted by
        descending 'value', or None when `plot` is True.
    """
    # First weight array of the first layer: one row per input feature
    first_layer_weights = model.layers[0].get_weights()[0]
    mean_abs_weight = np.abs(first_layer_weights).mean(axis=1)

    # Pair each feature name with its score and sort from highest to lowest
    ranking = pd.DataFrame({'variable_name': X.columns, 'value': mean_abs_weight})
    ranking = ranking.sort_values(by='value', ascending=False)

    # Without plotting, the ranking itself is the result
    if not plot:
        return ranking

    plt.figure(figsize=(10, 7))
    plt.barh(ranking['variable_name'], ranking['value'], color='#00bfc4')
    plt.title(f"Feature importances in Keras {model.__class__.__name__} model")
    plt.xlabel('Relative Feature importance (based on first layer weights)')
    plt.gca().invert_yaxis()     # Highest value at the top
    plt.show()


# Plot features importances calling above function
keras_sec_importances(model, plot=True)

### 2.4.2. Functional API Model (Multi-input)

#### Model Design

In [None]:
# Rename similar column names (vessel_...) so that the first three letters
# identify the variable each column belongs to
X_renamed = X.rename(columns=lambda x: 'class' if x.startswith('vessel_class') else ('length' if x.startswith('vessel_length') else x))

# Group the columns by the first three letters of their name.
# DataFrame.groupby(axis=1) is deprecated in recent pandas, so the columns are
# selected with a boolean prefix mask instead (also safe with the duplicated
# 'class' / 'length' labels produced by the rename above).
column_prefixes = X_renamed.columns.str[:3]

# One DataFrame per prefix, keys in sorted order (the order groupby produced),
# because X_joined below relies on that order
X_splited = {}
for name in sorted(column_prefixes.unique()):
    X_splited[name] = X_renamed.loc[:, column_prefixes == name]

# Verify all splited variable names
print(f'X_splited keys: {X_splited.keys()}')

# Convert to list for ease of use
X_joined = list(X_splited.values())

In [None]:
def functional_model():
    """Build the multi-input Keras functional model.

    One Input is created per key of the module-level `X_splited` dict, sized to
    that group's number of columns. Every input feeds a 128-unit Dense layer
    with an explicit name; the branches are then merged through successive
    concatenate + Dense(64, relu) stages into a softmax output with
    `y_ohe.shape[1]` units.

    Returns:
        keras Model with twelve inputs (see the `inputs` list at the end for
        their order) and one softmax output. The model is not compiled here.
    """
    # Numerical Input layers: Dense + dropout layers
    input_act = Input(shape=(X_splited['act'].shape[1],))
    dense_act = Dense(128, activation='relu', name='activity_id')(input_act)
    dropout_act = Dropout(0.1)(dense_act)

    input_age = Input(shape=(X_splited['age'].shape[1],))
    dense_age = Dense(128, activation='relu', name='age')(input_age)
    dropout_age = Dropout(0.1)(dense_age)

    input_air = Input(shape=(X_splited['air'].shape[1],))
    dense_air = Dense(128, activation='relu', name='air_temp')(input_air)
    dropout_air = Dropout(0.1)(dense_air)

    input_gro = Input(shape=(X_splited['gro'].shape[1],))
    dense_gro = Dense(128, activation='relu', name='gross_ton')(input_gro)
    dropout_gro = Dropout(0.1)(dense_gro)

    input_hou = Input(shape=(X_splited['hou'].shape[1],))
    dense_hou = Dense(128, activation='relu', name='hour')(input_hou)
    dropout_hou = Dropout(0.1)(dense_hou)

    input_len = Input(shape=(X_splited['len'].shape[1],))
    dense_len = Dense(128, activation='relu', name='vessel_length')(input_len)
    dropout_len = Dropout(0.1)(dense_len)

    input_lon = Input(shape=(X_splited['lon'].shape[1],))
    dense_lon = Dense(128, activation='relu', name='longitude')(input_lon)
    dropout_lon = Dropout(0.1)(dense_lon)

    input_win = Input(shape=(X_splited['win'].shape[1],))
    dense_win = Dense(128, activation='relu', name='wind_speed')(input_win)
    dropout_win = Dropout(0.1)(dense_win)

    # Categorical Input layers (one-hot): Dense layers (linear activation, no dropout)
    input_dam = Input(shape=(X_splited['dam'].shape[1],))
    dense_dam = Dense(128, activation='linear', name='damage_status')(input_dam)

    input_cla = Input(shape=(X_splited['cla'].shape[1],))
    dense_cla = Dense(128, activation='linear', name='vessel_class')(input_cla)

    input_wat = Input(shape=(X_splited['wat'].shape[1],))
    dense_wat = Dense(128, activation='linear', name='water_type')(input_wat)

    input_reg = Input(shape=(X_splited['reg'].shape[1],))
    dense_reg = Dense(128, activation='linear', name='region')(input_reg)


    # First level joins: (gross_ton + vessel_length), (longitude + region), (air_temp + wind_speed)
    input_gro_len = concatenate([dropout_gro, dropout_len])
    dense_gro_len = Dense(64, activation='relu')(input_gro_len)

    input_lon_reg = concatenate([dropout_lon, dense_reg])
    dense_lon_reg = Dense(64, activation='relu')(input_lon_reg)

    input_air_win = concatenate([dropout_air, dropout_win])
    dense_air_win = Dense(64, activation='relu')(input_air_win)

    # Second level joins: add water_type to the (longitude + region) branch
    input_wat_lon_reg = concatenate([dense_lon_reg, dense_wat])
    dense_wat_lon_reg = Dense(64, activation='relu')(input_wat_lon_reg)

    # Third level joins: add age and vessel_class to the (gross_ton + vessel_length) branch
    input_gro_len_age_cla = concatenate([dense_gro_len, dropout_age, dense_cla])
    dense_gro_len_age_cla = Dense(64, activation='relu')(input_gro_len_age_cla)

    # Fourth level joins: merge the water/longitude/region branch with the air/wind branch
    input_wat_lon_reg_air_win = concatenate([dense_wat_lon_reg, dense_air_win])
    dense_wat_lon_reg_air_win = Dense(64, activation='relu')(input_wat_lon_reg_air_win)

    # Fifth level joins: gather every remaining branch into a single tensor
    input_gather = concatenate([dropout_act, dense_dam, dense_gro_len_age_cla, dropout_hou, dense_wat_lon_reg_air_win])
    dense_gather = Dense(64, activation='relu')(input_gather)


    # Output layer: one softmax unit per one-hot target column
    output = Dense(y_ohe.shape[1], activation='softmax')(dense_gather)


    # Model definition.
    # Inputs are listed in alphabetical order of their X_splited key; the data
    # passed to the model must be supplied in this same order.
    model = Model(inputs=[input_act, input_age, input_air, input_cla,
                        input_dam, input_gro, input_hou, input_len,
                        input_lon, input_reg, input_wat, input_win], outputs=output)
    
    return model

# Build the model once and keep it under its own name
keras_model_2 = functional_model()

# `model` is the name the following cells work with
model = keras_model_2

In [None]:
# Check structure: text summary plus a diagram of the layer graph
model.summary()
plot_model(model, show_shapes=True, show_layer_names=True, dpi=80)

#### Model Training

In [None]:
if train_model_enabled :
    # Compile model
    model.compile(optimizer='adam',
                loss='categorical_crossentropy',
                metrics=['accuracy'])

    # Train model: inputs are passed in the same order as the model's `inputs` list.
    # 15% of the data is held out for validation and training stops once val_loss
    # has not improved for 15 epochs (best weights are restored).
    # Keras documents `callbacks` as a list, so the callback is wrapped in one.
    early_stopping = EarlyStopping(monitor = 'val_loss', patience = 15, restore_best_weights = True)
    history = model.fit([X_splited['act'], X_splited['age'], X_splited['air'], X_splited['cla'],
                        X_splited['dam'], X_splited['gro'], X_splited['hou'], X_splited['len'],
                        X_splited['lon'], X_splited['reg'], X_splited['wat'], X_splited['win']], y_ohe,
            epochs=50,
            batch_size = 32,
            callbacks = [early_stopping],
            validation_split=0.15)

    # Save model and history to external files
    model.save(models_folder + '/' + 'keras_model_2.keras')
    joblib.dump(history.history, models_folder + '/' + 'keras_model_2_evolution.pkl')

else:
    from types import SimpleNamespace

    # Load model and history from external files
    model = load_model(models_folder + '/' + 'keras_model_2.keras')
    # Only the `.history` attribute of the Keras History object is used afterwards,
    # so a plain namespace exposing that attribute stands in for it
    history = SimpleNamespace(history=joblib.load(models_folder + '/' + 'keras_model_2_evolution.pkl'))
    print(f'Model <{model.name}> and train history loaded from {models_folder} folder')

#### Model Metrics

In [None]:
# Accuracy / loss Train evolution
keras_train_plot(history.history)

# Accuracy / loss values.
# Model.evaluate returns the loss first, followed by the compiled metrics
# (here only 'accuracy'), so the values must be unpacked in that order.
loss, accuracy = model.evaluate(X_joined, y_ohe)

print("accuracy = {:.4f}, loss = {:.4f}".format(accuracy, loss))

In [None]:
# Main performance metrics table for the functional model (styled for display)
keras_model_metrics(model, X_joined, y_ohe, styled=True)

In [None]:
# Recall & precision / sensitivity & specificity table for the functional model
keras_sens_spec(model, X_joined, y_ohe, styled=True)

In [None]:
# Confusion matrix table for the functional model
keras_confusion_matrix_table(model, X_joined, y_ohe)

In [None]:
# Plot ROC curves (per class and micro-average) for the functional model
keras_roc_curve_plot(model, X_joined, y_ohe)

In [None]:
# Function: Plot feature importances
def keras_func_importances(model, plot = False):
    """Rank the inputs of the explicitly named Dense layers by mean absolute weight.

    A layer is considered when its class name contains 'Dense' and its own name
    does not contain 'dense_' (i.e. it was given an explicit name rather than an
    auto-numbered one). Each input position of such a layer becomes one row
    labelled "<layer name>_<position>".

    Args:
        model: Model exposing `layers`; each layer needs `name` and `get_weights()`.
        plot: When True, draw a horizontal bar chart and return None;
            otherwise return the ranking DataFrame.

    Returns:
        DataFrame with columns 'variable_name' and 'value', sorted by
        descending 'value', or None when `plot` is True.
    """
    # Entrance layers: Dense-type layers carrying an explicit (non 'dense_') name
    entrance_layers = [
        lyr for lyr in model.layers
        if 'Dense' in lyr.__class__.__name__ and 'dense_' not in lyr.name
    ]

    # One row per input position: label + mean absolute weight across the layer's units
    rows = [
        [f"{lyr.name}_{position}", mean_abs_weight]
        for lyr in entrance_layers
        for position, mean_abs_weight in enumerate(np.mean(np.abs(lyr.get_weights()[0]), axis=1))
    ]

    # Build dataframe, highest value first
    ranking = pd.DataFrame(rows, columns=['variable_name', 'value'])
    ranking = ranking.sort_values(by='value', ascending=False)

    # Without plotting, the ranking itself is the result
    if not plot:
        return ranking

    plt.figure(figsize=(10, 7))
    plt.barh(ranking['variable_name'], ranking['value'], color='#00bfc4')
    plt.title(f"Feature importances in Keras {model.__class__.__name__} model")
    plt.xlabel('Relative Feature importance (based on first layer weights)')
    plt.gca().invert_yaxis()     # Highest value at the top
    plt.show()


# Plot features importances calling above function
keras_func_importances(model, plot=True)

## 2.5. H2o AutoML

In [None]:
# More than 1 attempt to init h2o cluster (to avoid problems)
def start_h2o_cluster():
    """Try once to start the H2O cluster.

    Returns:
        bool: True when `h2o.init()` completed, False when it raised.
    """
    try:
        h2o.init()  # Attempt to start the H2O cluster
    except Exception as e:
        # Best-effort start: report the reason instead of discarding it,
        # so that a failed attempt can be diagnosed
        print(f'H2O cluster start attempt failed: {e}')
        return False
    return True


attempts = 0
max_attempts = 3  # Maximum number of attempts

# Retry until the cluster starts or the attempts are exhausted
while attempts < max_attempts and not start_h2o_cluster():
    attempts += 1

if attempts == max_attempts:
    print("Failed to start H2O cluster after", max_attempts, "attempts.")

In [None]:
# Convert dataframe to h2o format
h2o_data = h2o.H2OFrame(merged_activity_general)

# Train / Test split (85% / 15%).
# The split is seeded so that the same rows land in the test frame on every run;
# otherwise a model reloaded from disk would be scored on a different random
# split from the one it was trained against.
h2o_train, h2o_test = h2o_data.split_frame(ratios=[0.85], seed=seed)

#### Model Training

In [None]:
if train_model_enabled :
    # AutoMachineLearning Model: at most 4 models, stacked ensembles excluded
    aml = H2OAutoML(seed=seed, max_models=4, keep_cross_validation_predictions=True, exclude_algos=['StackedEnsemble'],verbosity="info")     # Alternative: max_runtime_secs = 2000

    # Train against target column 'y'; no predictor list is given
    # (presumably all remaining columns are used as predictors; TODO confirm)
    h2o_model = aml.train(y='y', training_frame=h2o_train)

    # Save to external file (force=True overwrites an existing file)
    h2o.save_model(model=h2o_model, path=models_folder, filename = 'mod_aml.h2o', force=True)

else:
    # Load model from external file
    h2o_model = h2o.load_model(models_folder + '/' + 'mod_aml.h2o')
    print(f'Model {h2o_model.key} loaded from {models_folder} folder')

#### Model Metrics

In [None]:
# Best model details (displayed as the cell output)
h2o_model

In [None]:
# Function: Table with main metrics data
def h2o_model_metrics(h2o_model, h2o_test, styled=False):
    """Build a table with the main performance metrics of an H2O model.

    Args:
        h2o_model: Trained H2O model; `predict()` must return a frame with a
            'predict' column.
        h2o_test: H2O frame to score; must contain the true labels in column 'y'.
        styled: When True return a pandas Styler titled with the model key,
            otherwise the plain DataFrame.

    Returns:
        DataFrame with columns 'metric' and 'value' (or its Styler).
    """
    predicted_labels = h2o_model.predict(h2o_test)['predict'].as_data_frame().iloc[:, 0]
    true_labels = h2o_test['y'].as_data_frame().iloc[:, 0]

    # Encode predictions and truth with ONE local encoder fitted on both.
    # Fitting separately assigns different integer codes whenever a class is
    # missing from the predictions, and refitting the module-level
    # label_encoder would change it for every other function that uses it.
    encoder = LabelEncoder().fit(pd.concat([true_labels, predicted_labels], ignore_index=True))
    h2o_predict = pd.Series(encoder.transform(predicted_labels))
    h2o_y = pd.Series(encoder.transform(true_labels))

    # Indicator (one column per class) versions for the metrics that need them
    h2o_y_bin = label_binarize(h2o_y , classes=np.unique(h2o_y))
    h2o_predict_bin = label_binarize(h2o_predict , classes=np.unique(h2o_y))

    # Calculate main metrics
    roc_auc = round(roc_auc_score(h2o_y_bin, h2o_predict_bin), 4)
    accuracy = round(accuracy_score(h2o_y_bin, h2o_predict_bin), 4)
    kappa = round(cohen_kappa_score(h2o_y, h2o_predict), 4)
    # NOTE(review): this is the mean squared error without a square root although
    # the row is labelled 'RMSE:'. Left unchanged so the row stays comparable with
    # the other metric tables it is combined with; confirm and fix all together.
    rmse = round(mean_squared_error(h2o_y_bin, h2o_predict_bin), 4)
    mae = round(mean_absolute_error(h2o_y_bin, h2o_predict_bin), 4)
    r2 = round(r2_score(h2o_y_bin, h2o_predict_bin), 4)
    f1 = round(f1_score(h2o_y_bin, h2o_predict_bin, average='macro'), 4)

    # Build metric / value table
    df = pd.DataFrame([['ROC AUC:', roc_auc],['Accuracy:', accuracy], ['Kappa:', kappa],
                        ['RMSE:', rmse], ['MAE:', mae], ['R2:', r2], ['F1:', f1]],
                        columns=('metric', 'value'))

    if styled:
        # Add the model key as a title level above the column names
        title = f'{h2o_model.key} Training'
        df.columns = pd.MultiIndex.from_tuples([(title, col) for col in df.columns])
        return df.style.hide()
    else:
        return df

# Call above function
h2o_model_metrics(h2o_model, h2o_test, styled=True)

In [None]:
# Function: Table for recall & precision, sensitivity & specificity
def h2o_sens_spec(h2o_model, h2o_test, styled=False):
    """Build a recall / precision table with per-class rows for an H2O model.

    Args:
        h2o_model: Trained H2O model; `predict()` must return a frame with a
            'predict' column.
        h2o_test: H2O frame to score; must contain the true labels in column 'y'.
        styled: When True return a pandas Styler titled with the model key,
            otherwise the plain DataFrame.

    Returns:
        DataFrame with one column per class and two rows ('Sensitivity:' and
        'Specificity:'); the macro recall and precision are carried in the
        first two levels of the row MultiIndex. Or its Styler.
    """
    # Predictions and true labels as plain pandas Series
    predicted_labels = h2o_model.predict(h2o_test)['predict'].as_data_frame().iloc[:, 0]
    true_labels = h2o_test['y'].as_data_frame().iloc[:, 0]

    # Encode both with ONE local encoder fitted on their union: separate fits give
    # inconsistent integer codes when a class is missing from the predictions, and
    # refitting the module-level label_encoder would alter it for other functions.
    encoder = LabelEncoder().fit(pd.concat([true_labels, predicted_labels], ignore_index=True))
    h2o_pred = pd.Series(encoder.transform(predicted_labels))
    h2o_y = pd.Series(encoder.transform(true_labels))

    # Recall & Precision values (macro-averaged over classes)
    recall = round(recall_score(h2o_y, h2o_pred, average='macro'), 4)
    precision = round(precision_score(h2o_y, h2o_pred, average='macro'), 4)

    # Confusion matrix (rows: true class, columns: predicted class)
    conf_matrix = confusion_matrix(h2o_y, h2o_pred)

    # Per-class values.
    # NOTE(review): the second value is diagonal / column total of the confusion
    # matrix, i.e. per-class precision, although it is labelled 'Specificity:'.
    # Kept unchanged here; confirm the intended definition.
    sensitivity, specificity = zip(*[(round(recall_score(h2o_y, h2o_pred, labels=[i], average='macro'), 4),
                                    round(conf_matrix[i, i] / sum(conf_matrix[:, i]), 4))
                                    for i in range(len(conf_matrix))])

    # Labels and indexes: the encoder's classes match the confusion-matrix order and size
    column_labels = encoder.classes_
    index_1 = ['Recall:', 'Precision:']
    index_2 = [recall, precision]
    index_ = [' - ',' - ']
    index_3 = ['Sensitivity:', 'Specificity:']

    # Build multiindex table
    df = pd.DataFrame([sensitivity, specificity])
    df.columns = column_labels
    df.index = [index_1, index_2, index_, index_3]

    # Dataframe style
    if styled:
        title = f'{h2o_model.key} Model'
        df.columns = pd.MultiIndex.from_tuples([(title, col) for col in df.columns])
        return df.style.set_table_styles([{'selector': 'th.col_heading', 'props': 'text-align: center;'}], overwrite=False)
    else:
        return df

# Call above function
h2o_sens_spec(h2o_model, h2o_test, styled=True)

In [None]:
# Explanation report for the H2O model on the test frame (partial dependence plots excluded)
h2o_model.explain(h2o_test, exclude_explanations='pdp')

# All the output shown below this cell is generated by the explain() call above

# 3. Model Comparison

#### Load saved models

In [None]:
# Model lists by training framework (each entry is both a file stem and a variable name)
sklearn_models = ['nb_MA_train', 'GBM_MA_train', 'rf_MA_train', 'nnet_MA_train', 'cart_MA_train']
keras_models = ['keras_model_1', 'keras_model_2']
h2o_models = ['mod_aml']

# Load models by training framework.
# Each model is bound to a module-level variable named after its list entry.
# NOTE: the loop variable `model` is itself a module-level name, so after this
# cell `model` holds the last string iterated, not a model object.
for model in sklearn_models:
    globals()[model] = joblib.load(models_folder + '/' + model + '.pkl')
    print(f'{model} loaded')
for model in keras_models:
    globals()[model] = load_model(models_folder + '/' + model + '.keras')
    print(f'{model} loaded')
for model in h2o_models:
    globals()[model] = h2o.load_model(models_folder + '/' + model + '.h2o')
    print(f'{model} loaded')

## 3.1. Features importance

#### Feature importance comparison chart

In [None]:
# Build a dataframe with the feature ranking of every model side by side
# (each column lists variable names ordered from most to least important)
importance_comparison = pd.DataFrame({
    'NB': sklearn_theta_feature_importances(nb_MA_train).reset_index()['variable_name'],
    'GBM': sklearn_feature_importances(GBM_MA_train).reset_index()['variable_name'],
    'RF': sklearn_feature_importances(rf_MA_train).reset_index()['variable_name'],
    'NNET': sklearn_coef_feature_importances(nnet_MA_train).reset_index()['variable_name'],
    'CART': sklearn_feature_importances(cart_MA_train).reset_index()['variable_name'],

    'keras Sec': keras_sec_importances(keras_model_1).reset_index()['variable_name'],
    'keras Func *': keras_func_importances(keras_model_2).reset_index()['variable_name'],

    'h2o *': pd.concat([h2o_model.varimp(use_pandas=True), pd.Series([np.nan] * 19)]).reset_index()['variable']
    }).head(10)

# Keep only the last 15 characters of each name so the table stays narrow.
# Non-string cells (NaN padding of shorter rankings) are passed through untouched
# instead of raising on the slice; a column-wise Series.map is used in place of
# the deprecated DataFrame.applymap.
importance_comparison = importance_comparison.apply(
    lambda column: column.map(lambda value: value[-15:] if isinstance(value, str) else value))

# Shift the index so that the ranking starts at 1
importance_comparison.index = importance_comparison.index + 1
importance_comparison.index.name = 'ranking'
# Show table
importance_comparison

> * Remarks:
> * In the Keras Functional model, the inputs of each variable's entrance layer are numbered (_0, _1, _2, etc.).
> * In the best h2o model, the levels of the categorical variables are not distinguished.
>

## 3.2. Performance of the models

#### Main metrics comparison chart

In [None]:
# Build a dataframe with the main metrics of every model side by side.
# The NB metrics are computed once and reused both for the 'NB' column and for
# the metric labels of the index (previously a second evaluation on X, y was
# run only to obtain those labels).
nb_metrics = ma_model_metrics(nb_MA_train, X_test, y_test)

# NOTE(review): this reuses the name `importance_comparison` and therefore
# replaces the feature-importance table built earlier in the notebook.
# NOTE(review): the keras columns are scored on X / y_ohe while the sklearn
# columns use X_test / y_test; confirm that this comparison is intended.
importance_comparison = pd.DataFrame({
    'NB': nb_metrics['value'],
    'GBM': ma_model_metrics(GBM_MA_train, X_test, y_test)['value'],
    'RF': ma_model_metrics(rf_MA_train, X_test, y_test)['value'],
    'NNET': ma_model_metrics(nnet_MA_train, X_test, y_test)['value'],
    'CART': ma_model_metrics(cart_MA_train, X_test, y_test)['value'],

    'keras Sec': keras_model_metrics(keras_model_1, X, y_ohe)['value'],
    'keras Func': keras_model_metrics(keras_model_2, X_joined, y_ohe)['value'],

    'h2o': h2o_model_metrics(h2o_model, h2o_test)['value']
    }).set_index(nb_metrics['metric'])

# Show table
importance_comparison

#### Recall & Precision comparison chart (Sensitivity & Specificity)

In [None]:
# Build a dataframe with the recall / precision values of every model side by side.
# Each sens_spec-style table carries its labels in index level 0 and the macro
# recall / precision values in index level 1.
# The NB table is computed once and reused for both its values and the row labels.
nb_sens_spec = sens_spec(nb_MA_train, X_test, y_test)

recprec_comparison = pd.DataFrame({
    'NB': nb_sens_spec.index.get_level_values(1),
    'GBM':sens_spec(GBM_MA_train, X_test, y_test).index.get_level_values(1),
    'RF': sens_spec(rf_MA_train, X_test, y_test).index.get_level_values(1),
    'NNET': sens_spec(nnet_MA_train, X_test, y_test).index.get_level_values(1),
    'CART': sens_spec(cart_MA_train, X_test, y_test).index.get_level_values(1),

    'keras Sec': keras_sens_spec(keras_model_1, X, y_ohe).index.get_level_values(1),
    'keras Func': keras_sens_spec(keras_model_2, X_joined, y_ohe).index.get_level_values(1),

    'h2o': h2o_sens_spec(h2o_model, h2o_test).index.get_level_values(1)
    }).set_index(nb_sens_spec.index.get_level_values(0))

# Show table
recprec_comparison

#### Accuracy for Cross Validation (10 splits)

In [None]:
if file_export_enabled :
    # Calculate scores for accuracy for sklearn models (10-fold, shuffled, seeded)
    kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
    accuracy_scores = []
    for model in sklearn_models:
        # Models are stored as module-level variables named after the list entries;
        # look them up directly instead of evaluating the name as code
        accuracy_scores.append(cross_val_score(globals()[model], X_train, y_train, scoring='accuracy', cv=kfold, n_jobs=n_jobs))

    # Store scores in a pandas dataframe (one row per model, one column per fold)
    # and export to external file. Feather needs string column names; the exported
    # copy gets an extra 'index' column from reset_index().
    accuracy_scores = pd.DataFrame(accuracy_scores)
    accuracy_scores.columns = accuracy_scores.columns.astype(str)
    accuracy_scores.reset_index().to_feather(datasets_folder + '/' + 'accuracy_scores_MA.feather')

else:
    # Load this dataframe otherwise (includes the 'index' column written on export)
    accuracy_scores = pd.read_feather(datasets_folder + '/' + 'accuracy_scores_MA.feather')

In [None]:
# Boxplots of accuracy values in cross validation
plt.style.use('ggplot')
fig = plt.figure(figsize=(12,5))
fig.suptitle('Cross Validation Accuracy')
ax = fig.add_subplot(111)

# The 'index' column only exists when the scores were reloaded from the feather
# file. Drop it by name when present: dropping the first row positionally would
# discard a real fold whenever the scores were computed in this session.
fold_scores = accuracy_scores.drop(columns='index', errors='ignore')

# Rows are reversed and transposed so each model becomes one box,
# in the same (reversed) order as the tick labels below
plt.boxplot(fold_scores[::-1].T.reset_index(drop=True), vert=False)
ax.set_yticklabels([model.split('_')[0].upper() for model in sklearn_models][::-1])
plt.show()

In [None]:
from scipy.stats import ttest_rel

# Convert score data to list (one list of fold scores per model).
# The 'index' column only exists when the scores were reloaded from the feather
# file; it is not a score and must not enter the t-test.
score_list = accuracy_scores.drop(columns='index', errors='ignore').values.tolist()
n_models = len(score_list)

# Initiate table: zeros everywhere, empty string on the whole diagonal
table = [['' if i == j else 0 for j in range(n_models)] for i in range(n_models)]

# Populate values with a paired t-test for every pair of models
for i in range(n_models):
    for j in range(i + 1, n_models):
        stat, p = ttest_rel(score_list[i], score_list[j])

        table[i][j] = np.round(p, 3)      # upper diagonal: p-value
        table[j][i] = np.round(stat, 3)   # lower diagonal: test statistic

print('The following table shows the comparison of the statistics and the p-values of the models')
print('The value of the statistic on the lower diagonal and the p-value on the upper diagonal')

# Show table, including model names
model_labels = [model.split('_')[0].upper() for model in sklearn_models]
table = pd.DataFrame(table).set_index(pd.Index(model_labels), inplace=False)
table.columns = model_labels
table

# 4. Interpretability

In [None]:
# Models selected for the interpretability analysis (module-level variable names)
selected_models = ['rf_MA_train', 'GBM_MA_train', 'keras_model_1']

# Create dalex explainers dict for selected models, keyed by model name.
# Models are looked up by name in the module namespace instead of evaluating
# the name as code.
explainers = {}
for model in selected_models:
    explainers[model] = dx.Explainer(globals()[model], X_train, y_train, label=model)

## 4.1. Residual Diagnostics

In [None]:
# Plot Residual Diagnostics, one figure per explainer
for explainer in explainers.values():
    explainer.model_diagnostics().plot()

In [None]:
# Plot Residual Diagnostics (Absolute): absolute residuals against observation ids
for explainer in explainers.values():
    explainer.model_diagnostics().plot(variable = "ids", yvariable = "abs_residuals")

## 4.2. Global Explainability - Variable importances

In [None]:
# Plot variable importances for selected models, one figure per explainer
for explainer in explainers.values():
    explainer.model_parts().plot()

## 4.3. Partial Dependence Profile (PDP)

In [None]:
# Plot PDP for selected models, restricted to two variables
# (assumes both column names exist in X_train; TODO confirm)
for explainer in explainers.values():
    explainer.model_profile(variables = ["vessel_class_Recreational", "watertype_river"] , type = "partial").plot()

## 4.4. Break Down profiles

In [None]:
# Select a sample observation (one seeded random row of X_train)
vessel_sample = X_train.sample(n=1, random_state=seed)

# Show this observation
vessel_sample

In [None]:
# Plot break-down profiles for the selected observation, one figure per explainer
for explainer in explainers.values():
    explainer.predict_parts(new_observation = vessel_sample, type = "break_down").plot()

## 4.5. SHapley Additive exPlanations (SHAP)

In [None]:
# SHAP attributions for the sample observation (time-consuming process):
# reuse the stored results unless training is enabled
if not train_model_enabled :
    shaps = joblib.load(f'{models_folder}/MA_shaps.pkl')
else:
    # One attribution result per explainer, keyed by model name
    shaps = {
        name: explainers[name].predict_parts(new_observation = vessel_sample, type = "shap")
        for name in explainers
    }
    joblib.dump(shaps, f'{models_folder}/MA_shaps.pkl')

# Plot shapley values
for model in explainers:
    shaps[model].plot()

## 4.6. Ceteris Paribus profiles

In [None]:
# Plot ceteris paribus profiles for the selected observation, restricted to two variables
for explainer in explainers.values():
    explainer.predict_profile(new_observation = vessel_sample).plot(variables = ["vessel_class_Recreational", "watertype_river"])

# Session info

In [None]:
# Python version
import importlib
import sys
print(f'Python version: {sys.version}')

# Collect the version of each package.
# The modules are resolved by name with importlib (several of them are only
# partially imported in this notebook) instead of building import statements
# as strings and running them through exec.
pkgs_dict = {}
for pkg in ['pandas', 'numpy', 'sklearn', 'keras', 'tensorflow', 'h2o', 'dalex']:
    pkgs_dict[pkg] = importlib.import_module(pkg).__version__

# Show as table
pd.DataFrame(pkgs_dict, index=["Package version"])