# Modelling

# 1. Load Required Libraries

In [64]:
import src.util as utils
import pandas as pd
import matplotlib.pyplot as plt
import copy
import hashlib

from sklearn.metrics import classification_report, ConfusionMatrixDisplay, roc_curve, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# 2. Load Configuration File

In [2]:
# Load the project configuration; the dataset loaders below read their file paths from it.
config = utils.load_config()

# 3. Load Model-Ready datasets

In [3]:
def load_train(config: dict) -> tuple:
    """Load the model-ready training split.

    Args:
        config: Configuration mapping. ``config["train_set_modelready_path"]``
            must be indexable; item 0 is passed to ``utils.pickle_load`` to
            obtain the features and item 1 to obtain the target.

    Returns:
        A ``(x_train, y_train)`` tuple with the two loaded objects.
    """
    # Both paths live under the same configuration key: [features, target]
    paths = config["train_set_modelready_path"]

    x_train = utils.pickle_load(paths[0])
    y_train = utils.pickle_load(paths[1])

    return x_train, y_train

def load_valid(config: dict) -> tuple:
    """Load the model-ready validation split.

    Args:
        config: Configuration mapping. ``config["valid_set_modelready_path"]``
            must be indexable; item 0 is passed to ``utils.pickle_load`` to
            obtain the features and item 1 to obtain the target.

    Returns:
        A ``(x_valid, y_valid)`` tuple with the two loaded objects.
    """
    # Both paths live under the same configuration key: [features, target]
    paths = config["valid_set_modelready_path"]

    x_valid = utils.pickle_load(paths[0])
    y_valid = utils.pickle_load(paths[1])

    return x_valid, y_valid

def load_test(config: dict) -> tuple:
    """Load the model-ready test split.

    Args:
        config: Configuration mapping. ``config["test_set_modelready_path"]``
            must be indexable; item 0 is passed to ``utils.pickle_load`` to
            obtain the features and item 1 to obtain the target.

    Returns:
        A ``(x_test, y_test)`` tuple with the two loaded objects.
    """
    # Both paths live under the same configuration key: [features, target]
    paths = config["test_set_modelready_path"]

    x_test = utils.pickle_load(paths[0])
    y_test = utils.pickle_load(paths[1])

    return x_test, y_test

In [4]:
# Load the three model-ready splits; each loader returns a (features, target) pair.
x_train, y_train = load_train(config)
x_valid, y_valid = load_valid(config)
x_test, y_test = load_test(config)

# 4. Training Model

From our EDA, we know that our target leans toward the red fighter (58.365%). This means that if we predict red for every fight, we should get roughly 58% accuracy.

We will set this to be our baseline.

In [104]:
# Because we want to keep the model interpretable, start with a decision tree as the base model.
# Splits are chosen by entropy, and min_samples_split is set to 10.
dtc = DecisionTreeClassifier(criterion="entropy", min_samples_split=10)

In [105]:
# Fit the base decision tree on the training split.
dtc.fit(x_train, y_train)

In [106]:
# Predict the target for the validation split with the fitted tree.
y_pred = dtc.predict(x_valid)

In [107]:
# Compare the validation predictions with the true labels (precision, recall, F1 per class).
print(classification_report(y_valid, y_pred))

              precision    recall  f1-score   support

           0       0.43      0.51      0.47       305
           1       0.60      0.52      0.56       429

    accuracy                           0.52       734
   macro avg       0.52      0.52      0.51       734
weighted avg       0.53      0.52      0.52       734



As we can see here, our first model did not perform well. It even performed worse than our baseline. This was already suggested by our EDA.

Let's try some more models, now wrapped in functions so we can reuse them easily.

In [58]:
def create_model_object() -> list:
    """Instantiate the candidate classifiers with their default settings.

    Returns:
        A list with one dict per classifier, in the order LogisticRegression,
        DecisionTreeClassifier, RandomForestClassifier, KNeighborsClassifier,
        XGBClassifier. Each dict holds the class name under ``"model_name"``,
        the unfitted estimator under ``"model_object"`` and empty-string
        placeholders for ``"model_uid"``, ``"performance"`` and
        ``"f1_score_avg"``.
    """
    utils.print_debug("Creating model objects.")

    # Estimators are created in the same order in which they are reported
    estimators = (
        LogisticRegression(),
        DecisionTreeClassifier(),
        RandomForestClassifier(),
        KNeighborsClassifier(),
        XGBClassifier(),
    )

    # Describe every estimator with the same record layout
    model_records = []
    for estimator in estimators:
        record = {
            "model_name": estimator.__class__.__name__,
            "model_object": estimator,
            "model_uid": "",
            "performance": "",
            "f1_score_avg": "",
        }
        model_records.append(record)

    utils.print_debug("Model objects created.")

    return model_records

In [92]:
def create_dist_params(model_name: str) -> dict:
    """Return the hyperparameter search space of one model.

    Args:
        model_name: Class name of the estimator, one of "XGBClassifier",
            "DecisionTreeClassifier", "KNeighborsClassifier",
            "LogisticRegression" or "RandomForestClassifier".

    Returns:
        A newly built dict mapping each hyperparameter name to the list of
        candidate values.

    Raises:
        KeyError: If ``model_name`` is not one of the names listed above.
    """
    # One search space per estimator class name, built fresh on every call
    search_spaces = {
        "XGBClassifier": {
            "n_estimators": [50, 100, 200, 300, 400, 500],
        },
        "DecisionTreeClassifier": {
            "criterion": ["gini", "entropy", "log_loss"],
            "min_samples_split": [2, 4, 6, 10, 15, 20, 25],
            "min_samples_leaf": [1, 2, 4, 6, 10, 15, 20, 25],
        },
        "KNeighborsClassifier": {
            "algorithm": ["ball_tree", "kd_tree", "brute"],
            "n_neighbors": [2, 3, 4, 5, 6, 10, 15, 20, 25],
            "leaf_size": [2, 3, 4, 5, 6, 10, 15, 20, 25],
        },
        "LogisticRegression": {
            "penalty": ["l1", "l2", "none"],
            "C": [0.01, 0.05, 0.10, 0.15, 0.20, 0.30, 0.60, 0.90, 0.99],
            "solver": ["saga"],
            "max_iter": [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
        },
        "RandomForestClassifier": {
            "criterion": ["gini", "entropy", "log_loss"],
            "n_estimators": [50, 100, 200, 300, 400, 500],
            "min_samples_split": [2, 4, 6, 10, 15, 20, 25],
            "min_samples_leaf": [1, 2, 4, 6, 10, 15, 20, 25],
        },
    }

    # Unknown names surface as KeyError
    return search_spaces[model_name]

In [80]:
def get_hyperparams_tuning(list_of_model: list, *, scoring=None, random_state=None) -> list:
    """Wrap every model in a ``RandomizedSearchCV`` over its search space.

    Args:
        list_of_model: List of model dicts as produced by
            ``create_model_object`` (each needs ``"model_name"`` and
            ``"model_object"``). The list is deep-copied, so the caller's
            estimators are not the ones wrapped by the search objects.
        scoring: Forwarded to ``RandomizedSearchCV``. The default ``None``
            keeps the previous behaviour (the estimator's own scorer).
        random_state: Forwarded to ``RandomizedSearchCV``. The default
            ``None`` keeps the previous, non-seeded behaviour.

    Returns:
        A new list of model dicts with the same keys, where
        ``"model_object"`` is the unfitted ``RandomizedSearchCV`` and the
        result fields are empty-string placeholders.

    Raises:
        KeyError: If a model name has no search space in
            ``create_dist_params``.
    """
    # NOTE(review): with the default scoring the search optimises the
    # estimator's default score, while the trained models are later ranked by
    # macro F1 — pass scoring="f1_macro" if both should agree.
    base_models = copy.deepcopy(list_of_model)
    list_of_rsc = []

    for base_model in base_models:
        # Search space registered for this estimator class
        dist_params = create_dist_params(base_model["model_name"])

        # Search object that tunes the estimator, using all available cores
        model_rsc = RandomizedSearchCV(
            base_model["model_object"],
            dist_params,
            scoring=scoring,
            random_state=random_state,
            n_jobs=-1,
        )

        list_of_rsc.append({
            "model_name": base_model["model_name"],
            "model_object": model_rsc,
            "model_uid": "",
            "performance": "",
            "f1_score_avg": "",
        })

    return list_of_rsc

In [82]:
def train_eval(
        x_train: pd.DataFrame,
        y_train: pd.DataFrame,
        x_valid: pd.DataFrame,
        y_valid: pd.DataFrame,
        data_configuration: str,
        hyperparams_tuning: bool = False
    ) -> list:
    """Train every candidate model and evaluate it on the validation split.

    Args:
        x_train: Training features.
        y_train: Training target.
        x_valid: Validation features.
        y_valid: Validation target.
        data_configuration: Label of the data configuration; it is used in the
            debug messages and as part of each model UID.
        hyperparams_tuning: When True, every model is wrapped by
            ``get_hyperparams_tuning`` before fitting; otherwise the plain
            estimators from ``create_model_object`` are fitted.

    Returns:
        A list with one dict per model. On top of ``"model_name"`` and the
        fitted ``"model_object"``, each dict holds the classification report
        as a dict under ``"performance"``, its macro-average F1 under
        ``"f1_score_avg"`` and an MD5 hex digest under ``"model_uid"``.
    """
    utils.print_debug("Training model based on configuration data: {}".format(data_configuration))

    # Create model objects
    if hyperparams_tuning:
        list_of_model = get_hyperparams_tuning(create_model_object())
    else:
        list_of_model = create_model_object()

    # The model dicts were created just above and are referenced nowhere else,
    # so they are filled in place and returned without copying.
    trained_model = []

    for model in list_of_model:
        model_name = model["model_name"]
        utils.print_debug("Training model: {}".format(model_name))

        # Training, timed from just before fit() to just after it
        started_at = utils.time_stamp()
        model["model_object"].fit(x_train, y_train)
        training_time = (utils.time_stamp() - started_at).total_seconds()

        utils.print_debug("Evaluating model: {}".format(model_name))

        # Evaluation on the validation split
        y_predict = model["model_object"].predict(x_valid)
        performance = classification_report(y_valid, y_predict, output_dict=True)

        model["performance"] = performance
        model["f1_score_avg"] = performance["macro avg"]["f1-score"]

        # Hashing the duration alone gives two models with the same measured
        # duration the same UID; the name, configuration and start time keep
        # the UIDs of one run distinct.
        uid_seed = "{}|{}|{}|{}".format(model_name, data_configuration, started_at, training_time)
        model["model_uid"] = hashlib.md5(uid_seed.encode()).hexdigest()

        trained_model.append(model)

        utils.print_debug("Model {} has been trained for configuration data {}.".format(model_name, data_configuration))

    utils.print_debug("All combination models and configuration data has been trained.")

    return trained_model

In [76]:
def get_best_model(list_of_model: list) -> dict:
    """Return the model dict with the highest macro-average F1 score.

    Args:
        list_of_model: List of trained model dicts; each needs the keys
            ``"model_uid"``, ``"f1_score_avg"`` and ``"model_name"``.

    Returns:
        The element of ``list_of_model`` (the same object, not a copy) whose
        ``"f1_score_avg"`` is the largest.

    Raises:
        RuntimeError: If ``list_of_model`` is empty, so no best model exists.
    """
    utils.print_debug("Making training log containing model UID and f1 macro avg.")

    # One row per model, in the same order as list_of_model, so the row label
    # of the default index is also the position of the model in the list.
    training_log = pd.DataFrame({
        "model_uid": [model["model_uid"] for model in list_of_model],
        "f1_score_avg": [model["f1_score_avg"] for model in list_of_model],
    })

    # Nothing to choose from
    if training_log.empty:
        raise RuntimeError("The best model not found in your list of model.")

    utils.print_debug("Sorting training log by f1 macro avg.")

    # Row label of the best score after sorting by macro-average F1
    best_position = training_log.sort_values("f1_score_avg", ascending=False).index[0]

    utils.print_debug("Searching model data based on sorted training log.")

    # Select by position rather than by UID so that duplicated UIDs can never
    # return a different model than the one that was ranked first.
    best_model = list_of_model[best_position]

    utils.print_debug("Model chosen.")
    utils.print_debug("Model name: {}".format(best_model["model_name"]))

    return best_model
    


In [93]:
# Train every candidate model with hyperparameter search enabled and evaluate it on the validation split.
# "undersampling" is the data-configuration label that shows up in the debug messages.
trained_models = train_eval(x_train, y_train, x_valid, y_valid, "undersampling", hyperparams_tuning=True)

2022-12-05 20:20:05.303169 Training model based on configuration data: undersampling
2022-12-05 20:20:05.303169 Creating model objects.
2022-12-05 20:20:05.303169 Model objects created.
2022-12-05 20:20:05.304176 Training model: LogisticRegression
2022-12-05 20:20:09.737159 Evaluating model: LogisticRegression
2022-12-05 20:20:09.746159 Model LogisticRegression has been trained for configuration data undersampling.
2022-12-05 20:20:09.746159 Training model: DecisionTreeClassifier
2022-12-05 20:20:10.074167 Evaluating model: DecisionTreeClassifier
2022-12-05 20:20:10.085168 Model DecisionTreeClassifier has been trained for configuration data undersampling.
2022-12-05 20:20:10.086163 Training model: RandomForestClassifier
2022-12-05 20:20:22.482263 Evaluating model: RandomForestClassifier
2022-12-05 20:20:22.562342 Model RandomForestClassifier has been trained for configuration data undersampling.
2022-12-05 20:20:22.566325 Training model: KNeighborsClassifier
2022-12-05 20:20:24.677240 



2022-12-05 20:20:31.077233 Evaluating model: XGBClassifier
2022-12-05 20:20:31.120235 Model XGBClassifier has been trained for configuration data undersampling.
2022-12-05 20:20:32.294256 All combination models and configuration data has been trained.


In [96]:
# Pick the trained model with the highest macro-average F1 on the validation split
# and display its classification report as a table.
best_model = get_best_model(trained_models)
performance = pd.DataFrame(best_model["performance"])
performance


2022-12-05 20:21:06.045662 Making training log containing model UID and f1 macro avg.
2022-12-05 20:21:06.046678 Sorting training log by f1 macro avg.
2022-12-05 20:21:06.047680 Searching model data based on sorted training log.
2022-12-05 20:21:06.048669 Model chosen.
2022-12-05 20:21:06.048669 Model name: LogisticRegression


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.512048,0.664179,0.595368,0.588114,0.600964
recall,0.557377,0.622378,0.595368,0.589877,0.595368
f1-score,0.533752,0.642599,0.595368,0.588176,0.59737
support,305.0,429.0,0.595368,734.0,734.0


In [103]:
# Hyperparameters picked by the randomized search; the attribute exists because
# the models were trained with hyperparams_tuning=True.
best_model["model_object"].best_params_

{'solver': 'saga', 'penalty': 'l1', 'max_iter': 600, 'C': 0.2}

Even after hyperparameter tuning, we still cannot find a model that performs significantly better than our baseline.

Based on what our EDA suggests, we should look for other features that were not included here but are available in our source dataset. Hopefully, they will let us build a better model.