# Random forest model training notebook

Although the raw data has already been cleaned, each AI/ML model we train still needs the data formatted slightly differently before it can be used.
For Random Forest models, missing numerical values are imputed with a neutral constant such as -1, and missing categorical values are imputed with a neutral category such as "missing".

In [None]:
# Uses Python 3.11 in a Conda environment.
# Create the environment (data_analysis_env) from env.yml before running.
# Import data-handling and machine-learning libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report

In [None]:
# Read the processed, merged dataset from disk into a DataFrame
csv_path = "../data/processed/merged_data_base.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)

In [None]:
# Drop the 'id' column so it is not picked up as a feature further down
# (every non-label column of df becomes a feature).
# errors='ignore' keeps this cell idempotent: re-running it after 'id' has
# already been removed no longer raises a KeyError.
df = df.drop(columns='id', errors='ignore')


In [None]:
# Split the frame column-wise: the label columns become the targets (Y),
# every remaining column becomes an input feature (X).
label_cols = ['B36', 'B41', 'B43', 'B54A', 'B54B', 'B54C', 'B55A', 'B55B', 'B56']
is_label = df.columns.isin(label_cols)
features = df.columns[~is_label].tolist()  # all non-label columns, original order kept

X = df.loc[:, features].copy()    # independent copy of the feature columns
Y = df.loc[:, label_cols].copy()  # independent copy of the label columns

# Bucket the feature columns by dtype: object columns are treated as
# categorical, int64/float64 columns as numerical.
ordinal_cols = X.select_dtypes(include=['object']).columns.tolist()
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [None]:
# Fill missing values with a neutral sentinel per column group:
# -1 for numerical columns, the literal category 'missing' for categorical ones.
for column_group, sentinel in ((num_cols, -1), (ordinal_cols, 'missing')):
    X[column_group] = X[column_group].fillna(sentinel)

# Expand each categorical column into one indicator column per category
# (drop_first=False keeps every category, including 'missing').
X = pd.get_dummies(data=X, columns=ordinal_cols, drop_first=False)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
split_parts = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_test, Y_train, Y_test = split_parts

In [None]:
# Base estimator: a 100-tree random forest using the entropy split criterion,
# seeded so that repeated runs build the same trees.
base_forest = RandomForestClassifier(n_estimators=100,
                                     criterion='entropy',
                                     random_state=42)

# Wrap the forest for multi-label output, fit on the training split,
# then predict every label for the held-out rows.
random_forest_model = MultiOutputClassifier(base_forest)
random_forest_model.fit(X_train, Y_train)
y_prediction = random_forest_model.predict(X_test)

In [None]:
# Per-label precision / recall / F1 on the held-out split
# (zero_division=0 reports 0 for metrics that would otherwise be undefined).
report_text = classification_report(Y_test, y_prediction, target_names=label_cols, zero_division=0)
print(report_text)

In [None]:
# Sanity check against domain knowledge: print the ten most important
# features for every label.
feature_names = X_train.columns

# estimators_ holds one fitted estimator per label column (not one per tree),
# paired here with the label names taken from Y_train's columns.
for label, forest in zip(Y_train.columns, random_forest_model.estimators_):
    importances = forest.feature_importances_
    top_ten = np.argsort(importances)[::-1][:10]  # indices of the 10 largest importances, descending
    print(f"\ntop features for the label '{label}':")
    for idx in top_ten:
        print(f"{feature_names[idx]} -> {importances[idx]}")

### Oversampling of under-represented labels

In [None]:
# Sum each label column to see how often each label occurs,
# ordered from rarest to most common.
label_totals = df.loc[:, label_cols].sum()
label_counts = label_totals.sort_values()
print(f"Occurrence of each label (sorted):\n{label_counts}")

In [None]:
def _positives_and_copies(frame, label, copies=3):
    """Return (rows of `frame` where `label` == 1, those rows stacked `copies` times)."""
    positives = frame[frame[label] == 1]
    stacked = pd.concat([positives] * copies, ignore_index=True)
    return positives, stacked


# Labels scoring under 0.98 F1 were chosen for oversampling
B55B_pos, B55B_oversample = _positives_and_copies(df, 'B55B')
B54B_pos, B54B_oversample = _positives_and_copies(df, 'B54B')
B54A_pos, B54A_oversample = _positives_and_copies(df, 'B54A')
B41_pos, B41_oversample = _positives_and_copies(df, 'B41')

In [None]:
# Retrain with the under-represented labels oversampled.
#
# The train/test split is made BEFORE any rows are duplicated, and only
# training rows are duplicated. Duplicating rows first and splitting afterwards
# lets copies of the same row land on both sides of the split, so the model is
# scored on rows it was fitted on and the reported metrics are inflated.
label_cols = ['B36', 'B41', 'B43', 'B54A', 'B54B', 'B54C', 'B55A', 'B55B', 'B56']
features = [col for col in df.columns if col not in label_cols]

Y = df[label_cols].copy()
X = df[features].copy()

# Identify which columns are categorical (object dtype) and which are numerical
ordinal_cols = X.select_dtypes(include='object').columns.to_list()
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.to_list()

# Missing value imputing: -1 for numerical columns, 'missing' for categorical ones
X[num_cols] = X[num_cols].fillna(-1)
X[ordinal_cols] = X[ordinal_cols].fillna('missing')

# One-hot encode the categorical columns. This is done on the whole frame so
# that the train and test splits share exactly the same set of columns.
X = pd.get_dummies(X, columns=ordinal_cols, drop_first=False)

# Split first: the test rows below are never duplicated or seen during fitting
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Oversample the training rows only. Every training row that is positive for
# one of these labels is appended `extra_copies` more times for each such label.
oversampled_labels = ['B55B', 'B54B', 'B54A', 'B41']
extra_copies = 3
row_positions = np.arange(len(Y_train))
extra_positions = [
    row_positions[(Y_train[label] == 1).to_numpy()]
    for label in oversampled_labels
] * extra_copies
take = np.concatenate([row_positions, *extra_positions])

# Oversampled view of the original (un-encoded) training rows, kept for inspection.
# Relies on df having a unique index so .loc returns exactly one row per label.
df_oversample = df.loc[X_train.index].iloc[take].reset_index(drop=True)

X_train = X_train.iloc[take].reset_index(drop=True)
Y_train = Y_train.iloc[take].reset_index(drop=True)

random_forest_model = MultiOutputClassifier(RandomForestClassifier(n_estimators=100,
                                                                   criterion='entropy',
                                                                   random_state=42))
random_forest_model.fit(X_train, Y_train)
Y_prediction = random_forest_model.predict(X_test)

print(classification_report(Y_test, Y_prediction, target_names=label_cols, zero_division=0))

## Resultant model evaluation

- Average F1-score: 1.00
- Lowest precision: 0.98 on B43
- Lowest recall: 0.99 on B36
- Lowest F1-score: 0.99 on B36

## Model inference Eval:

In [None]:
import time

# Mean inference time per instance, in seconds, measured over one predict()
# call on a small batch (up to the first 10 test rows).
batch = X_test[:10]

# perf_counter is a monotonic, high-resolution clock intended for timing short intervals
start_time = time.perf_counter()
random_forest_model.predict(batch)
elapsed = time.perf_counter() - start_time

# Divide by the real batch size so the figure stays correct if fewer than 10 rows exist
inf_time = elapsed / len(batch)
print(inf_time)

In [None]:
from sklearn.metrics import hamming_loss

# Hamming loss: fraction of individual labels that were incorrectly classified
hamming = hamming_loss(Y_test, Y_prediction)
print(hamming)


In [None]:
from sklearn.metrics import accuracy_score

# Exact match ratio: share of test rows whose full set of labels was predicted correctly
exact_match_ratio = accuracy_score(Y_test, Y_prediction)
print(exact_match_ratio)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import multilabel_confusion_matrix

# One confusion matrix per label, computed from the test labels and predictions
conf = multilabel_confusion_matrix(Y_test, Y_prediction)

labels = ['B36', 'B41', 'B43', 'B54A', 'B54B', 'B54C', 'B55A', 'B55B', 'B56']

# Draw each label's matrix as an annotated heatmap on its own figure
for label_name, mtx in zip(labels, conf):
    fig, ax = plt.subplots()
    sns.heatmap(mtx, annot=True, fmt='d', cmap="Blues", cbar=False, ax=ax)
    ax.set_title(f"confusion matrix for {label_name}")
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('Actual label')
    plt.show()