# Artificial Neural Network training notebook

Although the raw data has now been cleaned, each AI/ML model we train first needs the data formatted slightly differently.
For this artificial neural network, we impute missing numerical values with a neutral constant (-1) and missing categorical values with a neutral category ("missing"). The categorical features are then one-hot encoded and all features are standardised before training.

In [None]:
# Must pip install tensorflow in Conda environment

import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, hamming_loss, multilabel_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.metrics import AUC
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

In [None]:
# Read the merged dataset into a DataFrame; the 'id' column is dropped because it is not needed for training
csvpath = '../data/processed/merged_data_base.csv'
df = pd.read_csv(csvpath).drop(columns='id')

In [None]:
# Split the frame into label columns (targets) and feature columns (inputs)
label_cols = ['B36', 'B41', 'B43', 'B54A', 'B54B', 'B54C', 'B55A', 'B55B', 'B56']  # all target labels
features = [col for col in df.columns if col not in label_cols]  # every column that is not a label

Y = df[label_cols].copy()  # label frame
X = df[features].copy()  # feature frame

# Identify which feature columns are categorical (text) and which are numerical.
# 'string' is selected alongside 'object' because pandas can store text columns in a dedicated
# string dtype that include='object' alone would miss; 'number' covers every numeric width
# rather than only int64/float64, so no numeric column silently skips the imputation step.
# NOTE: despite its name, ordinal_cols holds the text/categorical columns that get one-hot encoded.
ordinal_cols = X.select_dtypes(include=['object', 'string']).columns.to_list()
num_cols = X.select_dtypes(include='number').columns.to_list()

In [None]:
# Fill missing values: numeric NaNs become -1, categorical NaNs become the 'missing' category
for cols, fill_value in ((num_cols, -1), (ordinal_cols, 'missing')):
    X[cols] = X[cols].fillna(fill_value)

# One-hot encode the categorical columns, keeping every level (get_dummies keeps all by default)
X = pd.get_dummies(X, columns=ordinal_cols)

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit the scaler on the training split only, then apply the same transform to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# ANN configuration: two hidden ReLU layers (each followed by dropout) and a sigmoid output
# layer with one node per label.
model = Sequential([
    Input(shape=(X_train_scaled.shape[1],)),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    # Output width follows label_cols instead of a hard-coded 9, so it cannot fall out of step
    # with the label list. Tried softmax and sigmoid, sigmoid was superior by 0.8
    Dense(len(label_cols), activation='sigmoid'),
])

In [None]:
# Configure training: Adam optimiser, binary cross-entropy loss and AUC as the tracked metric
optimizer = Adam(learning_rate=0.001)  # arbitrary learning optimiser settings
auc_metric = AUC(name="AUC")  # AUC chosen due to MLC context
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',  # binary due to MLC task context
    metrics=[auc_metric],
)

In [None]:
# Train for 30 epochs in batches of 32, holding back 10% of the training data for validation
history = model.fit(
    X_train_scaled,
    Y_train,
    epochs=30,
    batch_size=32,
    validation_split=0.1,
    verbose=1,
)

In [None]:
# Score the trained model on the held-out test split and print the returned values
results = model.evaluate(x=X_test_scaled, y=Y_test, verbose=1)
print(results)

In [None]:
# Display the architecture summary of the trained model
model.summary()

In [None]:
# Turn the predicted probabilities into 0/1 label decisions and print per-label metrics.
# classification_report is already imported in the imports cell, so it is not re-imported here.
y_prediciton = model.predict(X_test_scaled)
y_prediciton_binary = (y_prediciton > 0.5).astype(int)  # 0.5 decision threshold on each output
print(classification_report(Y_test, y_prediciton_binary, target_names=label_cols, zero_division=0))

## Model Accuracy Eval.:  
- AUC: 0.996
- Average F1-score: 0.99
- Lowest precision: 0.98 (B41)
- Lowest recall: 0.7 (B55B)
- Lowest F1-score: 0.82 (B55B)

## Model Inference Eval.

In [None]:
# Estimate the average per-sample inference latency on a small batch of test rows.
sample = X_test_scaled[:10]

# perf_counter is a high-resolution clock intended for measuring short durations
start_time = time.perf_counter()
model.predict(sample)
elapsed = time.perf_counter() - start_time

# Divide by the number of rows actually predicted: the slice holds fewer than 10 rows when
# the test set is smaller than that, so a hard-coded 10 would understate the latency.
inf_time = elapsed / max(len(sample), 1)
print(inf_time)

In [None]:
# Hamming loss: fraction of labels incorrectly classified.
print(hamming_loss(Y_test, y_prediciton_binary))


In [None]:
# Exact match ratio (accuracy): a sample counts as correct only when all of its labels match.
print(accuracy_score(Y_test, y_prediciton_binary))

In [None]:
# One 2x2 confusion matrix per label, computed from the thresholded predictions
conf = multilabel_confusion_matrix(Y_test, y_prediciton_binary)

# Reuse the label list defined alongside the features so the plot titles cannot drift out of
# sync with the columns that were actually trained on.
labels = list(label_cols)

for label, mtx in zip(labels, conf):
    fig, ax = plt.subplots()
    sns.heatmap(mtx, annot=True, fmt='d', cmap="Blues", cbar=False, ax=ax)
    ax.set_title(f"confusion matrix for {label}")
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('Actual label')
    plt.show()
    plt.close(fig)  # release the figure so the loop does not accumulate open figures