# LightGBM model training notebook

In [None]:
import pandas as pd
import lightgbm as lgb 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [None]:
# Relative path to the merged, pre-processed dataset.
csv_path = "../data/processed/merged_data_base.csv"

# Read the CSV and discard the 'id' column in a single chained step
# (presumably a row identifier rather than a feature -- confirm against the data).
df = pd.read_csv(csv_path).drop(columns=["id"])

In [None]:
# Separate the frame into features (X) and the multi-label targets (Y).
# The target column names are listed once so X and Y cannot drift apart.
target_cols = ['B36', 'B41', 'B43', 'B54A', 'B54B', 'B54C', 'B55A', 'B55B', 'B56']
X = df.drop(columns=target_cols)
Y = df[target_cols]

In [None]:
# One-hot encode every object-dtype feature column, keeping all dummy levels
# (drop_first=False). NOTE: despite the variable name, this is one-hot
# encoding, not ordinal encoding.
ordinal_cols = list(X.select_dtypes(include='object').columns)
X = pd.get_dummies(X, columns=ordinal_cols, drop_first=False)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Binary relevance: train one independent LightGBM classifier per target
# column and keep them in a dict keyed by the target's column name.
# `model` stays bound to the last classifier trained once the loop finishes.
LGBM_models = {}
for label in Y_train.columns:
    model = lgb.LGBMClassifier(n_estimators=100, random_state=42)
    # fit() returns the fitted estimator itself, so it can be stored directly.
    LGBM_models[label] = model.fit(X_train, Y_train[label])

In [None]:
# Gather each per-label model's test-set predictions into one frame with one
# column per target. Rows are positional (default integer index), in the same
# order as the rows of X_test.
predictions = pd.DataFrame(
    {col: clf.predict(X_test) for col, clf in LGBM_models.items()}
)

In [None]:
# Per-label precision / recall / F1 on the held-out split. zero_division=0
# reports 0 (instead of warning) for metrics whose denominator is zero.
report = classification_report(
    Y_test,
    predictions,
    target_names=Y.columns,
    zero_division=0,
)
print(report)

## Resulting model evaluation
Average F1-Score: 0.99
Lowest precision = 0.85 on B54B
Lowest Recall = 0.73 on B54B
Lowest F1-score = 0.85 on B54B

## Model inference evaluation

In [None]:
import time

# Average per-instance inference latency, in seconds, measured on a batch of
# (up to) 10 test rows.
#
# A complete multi-label prediction needs every per-label classifier, so all
# models in LGBM_models are timed. Timing only the `model` variable left over
# from the training loop would measure a single label's classifier and
# under-report the real cost.
batch = X_test.iloc[:10]

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
for clf in LGBM_models.values():
    clf.predict(batch)
elapsed = time.perf_counter() - start_time

# Divide by the actual batch size so the figure stays a per-instance average
# even if the test set has fewer than 10 rows.
inf_time = elapsed / len(batch)
print(inf_time)

In [None]:
from sklearn.metrics import hamming_loss

# Hamming loss: the fraction of individual labels that are incorrectly
# classified.
label_error_rate = hamming_loss(Y_test, predictions)
print(label_error_rate)


In [None]:
from sklearn.metrics import accuracy_score

# For multi-label targets accuracy_score is the exact-match ratio: a row only
# counts as correct when every one of its labels is predicted correctly.
exact_match_ratio = accuracy_score(Y_test, predictions)
print(exact_match_ratio)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import multilabel_confusion_matrix

# One 2x2 confusion matrix per target label, in the column order of Y_test
# (rows are the actual class, columns the predicted class).
conf = multilabel_confusion_matrix(Y_test, predictions)

# Take the label names from the evaluated data instead of a hand-maintained
# list, so plot titles cannot go out of sync with the target columns.
labels = list(Y_test.columns)

# Draw each label's matrix as its own annotated heatmap figure.
for label, mtx in zip(labels, conf):
    plt.figure()
    sns.heatmap(mtx, annot=True, fmt='d', cmap="Blues", cbar=False)
    plt.title(f"confusion matrix for {label}")
    plt.xlabel('Predicted label')
    plt.ylabel('Actual label')
    plt.show()