# I. Project Team Members

| Prepared by | Email | Prepared for |
| :-: | :-: | :-: |
| **_Your Name_** | _Your Email_ | **_Project Name_** |

# II. Notebook Target Definition

_Insert Text Here_

# III. Notebook Setup

## III.A. Import Libraries

In [None]:
from copy import deepcopy
from datetime import datetime
from hyperopt import fmin, tpe, space_eval, Trials, STATUS_OK
from interpret import set_visualize_provider, show
from interpret.glassbox import ExplainableBoostingClassifier
from interpret.provider import InlineProvider
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_score, GridSearchCV, learning_curve, StratifiedKFold
from tqdm import tqdm
from xgboost import XGBClassifier
import hashlib
import hyperopt
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle

# Lift pandas' display limits so DataFrames are rendered with every column
# and every row instead of being truncated.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Register InterpretML's InlineProvider as the visualisation provider used by
# `show`.
set_visualize_provider(InlineProvider())

## III.B. Import Data

In [None]:
# Load the pickled train/test features and targets from the processed-data
# folder (the `_woe` suffix presumably means WoE-encoded features — confirm).
# NOTE(review): unpickling executes arbitrary code; only load pickles that
# come from a trusted source.
X_train = pd.read_pickle('../../data/processed/X_train_woe.pkl')
X_test = pd.read_pickle('../../data/processed/X_test_woe.pkl')
y_train = pd.read_pickle('../../data/processed/y_train.pkl')
y_test = pd.read_pickle('../../data/processed/y_test.pkl')

In [None]:
X_train.head()

In [None]:
X_test.head()

In [None]:
y_train.head()

In [None]:
y_test.head()

# IV. Models Training and Evaluation

## IV.A. Data Shape Inspection

In [None]:
X_train.shape, X_test.shape

In [None]:
y_train.shape, y_test.shape

## IV.B. Data Information Inspection

In [None]:
X_train.info()

In [None]:
X_test.info()

In [None]:
y_train.info()

In [None]:
y_test.info()

## IV.C. Training Log

In [None]:
def time_stamp():
    """Return the current local date and time as a naive ``datetime``."""
    current_moment = datetime.now()
    return current_moment


def create_logger():
    """Return a fresh, empty training log.

    The log is a dict of column name -> list; every training run appends one
    value per column, so all lists stay the same length.
    """
    log_columns = (
        "model_name",
        "model_uid",
        "training_time",
        "training_date",
        "performance",
        "f1_score_avg",
        "auc_roc",
        "gini",
        "data_configurations",
    )
    # A new list per column so no two columns (or two loggers) share storage.
    return {column: [] for column in log_columns}


def training_log_updater(current_log, log_path):
    """Append ``current_log`` to the JSON log history stored at ``log_path``.

    Args:
        current_log: JSON-serializable log entry (the dict built by
            ``create_logger`` and filled during training).
        log_path: Path of the JSON file holding a list of log entries. A
            missing file is treated as an empty history.

    Returns:
        The full history (a list) including ``current_log``.

    Raises:
        TypeError: If the history cannot be serialized to JSON. The file on
            disk is left untouched in that case.
        FileNotFoundError: If the parent directory of ``log_path`` does not
            exist.
    """
    try:
        with open(log_path, 'r', encoding='utf-8') as file:
            last_log = json.load(file)
    except FileNotFoundError:
        # First run: there is no history yet, so start from an empty list
        # instead of writing "[]" to disk and reading it straight back.
        last_log = []
    last_log.append(current_log)
    # Serialize before opening the file for writing: opening with 'w'
    # truncates it, so a serialization failure afterwards would wipe the
    # existing history.
    serialized_log = json.dumps(last_log)
    with open(log_path, 'w', encoding='utf-8') as file:
        file.write(serialized_log)
    return last_log


def model_training_and_evaluation(model_factory, model_prefix, X_train, y_train, X_test, y_test, data_configuration, log_path):
    """Fit each candidate model on the train split, score it on the test split and log the run.

    Args:
        model_factory: Either a zero-argument callable returning the model
            entries, or the iterable of entries itself. Each entry is a dict
            with the keys "model_name", "model_object" and "model_uid"; the
            entry's "model_uid" is overwritten in place with the generated id.
        model_prefix: Text prepended (with "-") to every model name.
        X_train, y_train: Data passed to ``fit``.
        X_test, y_test: Data used for ``predict`` / ``predict_proba`` scoring.
        data_configuration: Label stored with every log row.
        log_path: JSON file the run is appended to.

    Returns:
        Tuple ``(training_log, current_training_models)``: the full log
        history returned by ``training_log_updater`` and a list of dicts
        holding the name, a deep copy of the fitted model and its uid.
    """
    def check_log_length(log_path):
        # Number of runs currently stored in the log file (0 if it is missing).
        try:
            with open(log_path, 'r') as file:
                logs = json.load(file)
                return len(logs)
        except FileNotFoundError:
            return 0
    before_training_len = check_log_length(log_path)
    logger = create_logger()
    current_training_models = []
    # Accept a ready-made list of model entries as well as a factory, so
    # callers holding already-built (e.g. tuned) models can pass them directly.
    model_entries = model_factory() if callable(model_factory) else model_factory
    for model in tqdm(model_entries):
        model_name = model_prefix + "-" + model["model_name"]
        start_time = time_stamp()
        model["model_object"].fit(X_train, y_train)
        finished_time = time_stamp()
        elapsed_time = (finished_time - start_time).total_seconds()
        y_prediction = model["model_object"].predict(X_test)
        performance = classification_report(
            y_test, y_prediction, output_dict=True)
        # Column 1 of predict_proba is used as the positive-class score.
        y_probs = model["model_object"].predict_proba(X_test)[:, 1]
        auc_roc = roc_auc_score(y_test, y_probs)
        gini = 2 * auc_roc - 1
        # The uid is the MD5 hex digest of the fit start and end timestamps.
        original_id = str(start_time) + str(finished_time)
        hashed_id = hashlib.md5(original_id.encode()).hexdigest()
        model["model_uid"] = hashed_id
        logger["model_name"].append(model_name)
        logger["model_uid"].append(hashed_id)
        logger["training_time"].append(elapsed_time)
        logger["training_date"].append(str(start_time))
        logger["performance"].append(performance)
        logger["f1_score_avg"].append(performance["macro avg"]["f1-score"])
        logger["auc_roc"].append(auc_roc)
        logger["gini"].append(gini)
        logger["data_configurations"].append(data_configuration)
        current_training_models.append({
            "model_name": model_name,
            "model_object": deepcopy(model["model_object"]),
            "model_uid": model["model_uid"]
        })
    training_log = training_log_updater(logger, log_path)
    after_training_len = check_log_length(log_path)
    print(f"Logs Before Training: {before_training_len}")
    print(f"Logs After Training: {after_training_len}")
    print(f"Added {after_training_len - before_training_len} new logs.")
    return training_log, current_training_models


def model_training_and_evaluation_skf(model_factory, model_prefix, X_train, y_train, X_test, y_test, data_configuration, log_path, n_splits=5, datasets_per_fold=None):
    """Train and score each candidate model on every fold of a stratified k-fold split.

    The train and test inputs are concatenated first and the folds are drawn
    from the pooled data, so each model is fitted ``n_splits`` times.

    Args:
        model_factory: Either a zero-argument callable returning the model
            entries, or the iterable of entries itself. Each entry is a dict
            with the keys "model_name", "model_object" and "model_uid"; the
            entry's "model_uid" is overwritten in place on every fold.
        model_prefix: Text prepended to every model name.
        X_train, y_train, X_test, y_test: Data pooled before splitting.
        data_configuration: Label stored with every log row.
        log_path: JSON file the run is appended to.
        n_splits: Number of stratified folds.
        datasets_per_fold: Optional list that the per-fold datasets are
            appended to IN PLACE; a new list is created when omitted.

    Returns:
        Tuple ``(training_log, current_training_models, datasets_per_fold)``.
    """
    def check_log_length(log_path):
        # Number of runs currently stored in the log file (0 if it is missing).
        try:
            with open(log_path, 'r') as file:
                logs = json.load(file)
                return len(logs)
        except FileNotFoundError:
            return 0
    before_training_len = check_log_length(log_path)
    logger = create_logger()
    X = pd.concat([X_train, X_test], axis=0)
    y = pd.concat([y_train, y_test], axis=0)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=777)
    current_training_models = []
    if datasets_per_fold is None:
        datasets_per_fold = []
    # Accept a ready-made list of model entries as well as a factory.
    model_entries = model_factory() if callable(model_factory) else model_factory
    for model in tqdm(model_entries):
        for fold, (train_index, val_index) in enumerate(skf.split(X, y)):
            model_name = f"{model_prefix}-{model['model_name']}-fold_{fold+1}"
            X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
            y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]
            # Keep the exact data of this fold so it can be looked up later
            # by model name.
            datasets_per_fold.append({
                "model_name": model_name,
                "n_fold": f"fold_{fold+1}",
                "X_train": pd.DataFrame(X_train_fold),
                "y_train": pd.Series(y_train_fold),
                "X_test": pd.DataFrame(X_val_fold),
                "y_test": pd.Series(y_val_fold)
            })
            # Start the clock right before fit so training_time measures the
            # fit only, not the fold slicing and bookkeeping above.
            start_time = time_stamp()
            model["model_object"].fit(X_train_fold, y_train_fold)
            finished_time = time_stamp()
            elapsed_time = (finished_time - start_time).total_seconds()
            y_prediction = model["model_object"].predict(X_val_fold)
            performance = classification_report(
                y_val_fold, y_prediction, output_dict=True)
            # Column 1 of predict_proba is used as the positive-class score.
            y_probs = model["model_object"].predict_proba(X_val_fold)[:, 1]
            auc_roc = roc_auc_score(y_val_fold, y_probs)
            gini = 2 * auc_roc - 1
            # The uid is the MD5 hex digest of the fit start and end timestamps.
            original_id = str(start_time) + str(finished_time)
            hashed_id = hashlib.md5(original_id.encode()).hexdigest()
            model["model_uid"] = hashed_id
            logger["model_name"].append(model_name)
            logger["model_uid"].append(hashed_id)
            logger["training_time"].append(elapsed_time)
            logger["training_date"].append(str(start_time))
            logger["performance"].append(performance)
            logger["f1_score_avg"].append(performance["macro avg"]["f1-score"])
            logger["auc_roc"].append(auc_roc)
            logger["gini"].append(gini)
            logger["data_configurations"].append(data_configuration)
            current_training_models.append({
                "model_name": model_name,
                "model_object": deepcopy(model["model_object"]),
                "model_uid": model["model_uid"]
            })
    training_log = training_log_updater(logger, log_path)
    after_training_len = check_log_length(log_path)
    print(f"Logs Before Training: {before_training_len}")
    print(f"Logs After Training: {after_training_len}")
    print(f"Added {after_training_len - before_training_len} new logs.")
    return training_log, current_training_models, datasets_per_fold


def training_log_to_df_converter(training_log):
    """Flatten the training log history into one DataFrame sorted best-first.

    Args:
        training_log: List of log entries (dicts of equal-length lists, as
            produced by ``create_logger``), one entry per training call.

    Returns:
        DataFrame with one row per trained model. The nested "performance"
        report is expanded into flat columns. Rows are ordered by
        ``f1_score_avg`` (descending), ``auc_roc`` (descending) and
        ``training_time`` (ascending). An empty ``training_log`` yields an
        empty DataFrame.
    """
    per_run_frames = []
    for log in tqdm(training_log):
        individual_log_df = pd.DataFrame(log)
        performance_df = pd.json_normalize(individual_log_df["performance"])
        per_run_frames.append(pd.concat([individual_log_df.drop(
            "performance", axis=1), performance_df], axis=1))
    if not per_run_frames:
        # Nothing logged yet: there are no columns to sort on.
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop, which
    # re-copies all previous rows on every iteration.
    all_training_logs_df = pd.concat(per_run_frames)
    all_training_logs_df.sort_values(["f1_score_avg", "auc_roc", "training_time"], ascending=[
                                     False, False, True], inplace=True)
    all_training_logs_df.reset_index(inplace=True, drop=True)
    return all_training_logs_df


def best_model_finder(all_training_logs_df, models_list):
    """Return the trained model object that matches the top row of the log table.

    Args:
        all_training_logs_df: Log DataFrame whose first row is the best model
            (as sorted by ``training_log_to_df_converter``).
        models_list: Dict mapping a configuration name to a list of model
            entries, each a dict with "model_uid" and "model_object".

    Returns:
        The "model_object" of the first entry whose "model_uid" equals the
        uid in the first row of ``all_training_logs_df``.

    Raises:
        RuntimeError: If no entry with that uid and a non-None model object
            exists in ``models_list``.
    """
    best_model_uid = all_training_logs_df.iloc[0]["model_uid"]
    for configuration_models in models_list.values():
        for model_data in configuration_models:
            if model_data["model_uid"] != best_model_uid:
                continue
            model_object = model_data["model_object"]
            # Identity check: `== None` would be routed through the model's
            # own __eq__, which may not return a plain boolean.
            if model_object is not None:
                return model_object
    raise RuntimeError("The best model not found in your list of model.")

def tuned_model_finder(models_list_tuned, tuning_method):
    """Return the first tuned model whose name contains ``tuning_method``.

    Prints a notice and returns ``None`` when no model name matches.
    """
    matching_entries = (
        entry for entry in models_list_tuned
        if tuning_method in entry["model_name"]
    )
    first_match = next(matching_entries, None)
    if first_match is not None:
        return first_match["model_object"]
    print(f"No model found that was tuned with {tuning_method}")
    return None

## IV.D. Baseline Models

In [None]:
def create_models(prefix):
    """Build one fresh, unfitted entry per baseline algorithm.

    Every entry is a dict with "model_name" (``prefix`` + "_" + class name),
    "model_object" (a new estimator seeded with ``random_state=777``) and an
    empty "model_uid" to be filled in during training.
    """
    baseline_estimators = (
        DecisionTreeClassifier,
        ExplainableBoostingClassifier,
        LogisticRegression,
        RandomForestClassifier,
        XGBClassifier,
    )
    return [
        {
            "model_name": prefix + "_" + estimator_class.__name__,
            "model_object": estimator_class(random_state=777),
            "model_uid": "",
        }
        for estimator_class in baseline_estimators
    ]

In [None]:
# Map each data configuration to a zero-argument factory. Calling a factory
# runs create_models again, so every training call starts from newly
# constructed estimators rather than reusing fitted ones.
models_factory = {
    "vanilla": lambda: create_models("vanilla"),
    "sampling": lambda: create_models("sampling")
}

In [None]:
models_factory

### IV.D.1. Vanilla Models

In [None]:
training_log, models_list_vanilla = model_training_and_evaluation(
    models_factory["vanilla"],
    "vanilla",
    X_train,
    y_train,
    X_test,
    y_test,
    "vanilla",
    '../../models/logs/training_log.json'
)

In [None]:
training_log, models_list_vanilla_skf, datasets_per_fold_vanilla = model_training_and_evaluation_skf(
    models_factory["vanilla"],
    "vanilla",
    X_train,
    y_train,
    X_test,
    y_test,
    "vanilla",
    '../../models/logs/training_log.json'
)
# Keep the models from the plain train/test run alongside the per-fold ones:
# their uids are already in the training log, so dropping them would make
# best_model_finder fail whenever one of them ranks first.
models_list_vanilla = models_list_vanilla + models_list_vanilla_skf

In [None]:
models_list_vanilla

### IV.D.2. Sampling Models

In [None]:
training_log, models_list_sampling = model_training_and_evaluation(
    models_factory["sampling"],
    "sampling",
    X_train,
    y_train,
    X_test,
    y_test,
    "sampling",
    '../../models/logs/training_log.json'
)

In [None]:
# The fold datasets are appended to the list passed in, so `datasets_per_fold`
# ends up holding the vanilla folds followed by the sampling folds.
training_log, models_list_sampling_skf, datasets_per_fold = model_training_and_evaluation_skf(
    models_factory["sampling"],
    "sampling",
    X_train,
    y_train,
    X_test,
    y_test,
    "sampling",
    '../../models/logs/training_log.json',
    datasets_per_fold=datasets_per_fold_vanilla
)
# Keep the models from the plain train/test run alongside the per-fold ones:
# their uids are already in the training log, so dropping them would make
# best_model_finder fail whenever one of them ranks first.
models_list_sampling = models_list_sampling + models_list_sampling_skf

In [None]:
models_list_sampling

In [None]:
models_list = {
    "vanilla": models_list_vanilla,
    "sampling": models_list_sampling
}
models_list

## IV.E. Models Selection

### IV.E.1. Benchmark Performance Review

In [None]:
# Accuracy a model would achieve if it always predicted the most common label.
# Take the largest class share rather than indexing with [0]: [0] looks up the
# label 0, which is only the benchmark when class 0 happens to be the majority.
benchmark = y_train.value_counts(normalize=True).max()
benchmark

### IV.E.2. Baseline Base Model Performance Review

In [None]:
all_training_logs_df = training_log_to_df_converter(training_log)
all_training_logs_df

In [None]:
all_training_logs_df.to_csv('../../reports/baseline_model.csv', index = False)

In [None]:
baseline_best_model = best_model_finder(all_training_logs_df, models_list)
baseline_best_model

In [None]:
best_model_info = all_training_logs_df.iloc[0]
print("Best model configuration:", best_model_info["data_configurations"])

In [None]:
def filter_by_algorithm(all_training_logs_df, algorithm_name):
    """Return the log rows whose "model_name" contains ``algorithm_name``.

    ``algorithm_name`` is handed to ``Series.str.contains`` unchanged, so it
    is matched with that method's default (pattern) semantics.
    """
    name_matches = all_training_logs_df["model_name"].str.contains(
        algorithm_name)
    return all_training_logs_df.loc[name_matches]


def average_performance_metrics(filtered_df, f1_score_column="f1_score_avg", roc_auc_column="auc_roc"):
    """Return ``(mean F1, mean AUC-ROC)`` over the rows of ``filtered_df``.

    The two column names can be overridden through ``f1_score_column`` and
    ``roc_auc_column``.
    """
    metric_columns = (f1_score_column, roc_auc_column)
    return tuple(filtered_df[column].mean() for column in metric_columns)


def get_best_model_and_dataset(model_name, models_list, datasets_per_fold):
    """Look up a trained model and its fold dataset by model name.

    Returns a tuple ``(model_instance, model_data)``; either element is
    ``None`` when nothing with that name is found.
    """
    # For every configuration take the first entry carrying this name ...
    first_hit_per_configuration = (
        next(
            (entry["model_object"] for entry in models_list[configuration]
             if entry["model_name"] == model_name),
            None,
        )
        for configuration in models_list
    )
    # ... and keep the first configuration that actually yields an object.
    model_instance = next(
        (candidate for candidate in first_hit_per_configuration
         if candidate is not None),
        None,
    )
    model_data = next(
        (fold_data for fold_data in datasets_per_fold
         if fold_data["model_name"] == model_name),
        None,
    )
    return model_instance, model_data


def get_metrics_dataframe(model, X_train, y_train, X_test, y_test):
    """Score a fitted model on the train and test data and tabulate the result.

    Returns a two-row DataFrame (first "Train", then "Test") with the
    weighted-average precision, recall and F1, the accuracy, AUC-ROC, Gini
    and a "dataset" column naming the split.
    """
    predicted_train = model.predict(X_train)
    predicted_test = model.predict(X_test)
    # Column 1 of predict_proba is used as the positive-class score.
    scores_train = model.predict_proba(X_train)[:, 1]
    scores_test = model.predict_proba(X_test)[:, 1]

    def summarise_split(split_label, y_true, y_pred, y_probs):
        weighted = classification_report(
            y_true, y_pred, output_dict=True)["weighted avg"]
        accuracy = accuracy_score(y_true, y_pred)
        auc_roc = roc_auc_score(y_true, y_probs)
        return {
            "precision": weighted["precision"],
            "recall": weighted["recall"],
            "f1-score": weighted["f1-score"],
            "accuracy": accuracy,
            "auc_roc": auc_roc,
            "gini": 2 * auc_roc - 1,
            "dataset": split_label,
        }

    rows = [
        summarise_split("Train", y_train, predicted_train, scores_train),
        summarise_split("Test", y_test, predicted_test, scores_test),
    ]
    return pd.DataFrame(rows)


def display_confusion_matrix(model, X_train, y_train, X_test, y_test):
    """Draw the train and test confusion matrices of ``model`` side by side."""
    panels = (
        ("Train Confusion Matrix", y_train, model.predict(X_train)),
        ("Test Confusion Matrix", y_test, model.predict(X_test)),
    )
    _, axes = plt.subplots(1, 2, figsize=(12, 6))
    for axis, (panel_title, y_true, y_pred) in zip(axes, panels):
        ConfusionMatrixDisplay.from_predictions(y_true, y_pred, ax=axis)
        axis.set_title(panel_title)
    plt.show()


def plot_train_vs_test_error(model, X_train, y_train, X_test, y_test):
    """Plot the train and test error rates (1 - accuracy) of ``model`` as bars.

    Each bar is annotated with its height rounded to two decimals.
    """
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)
    error_rates = {
        "Train Error": 1 - accuracy_score(y_train, train_predictions),
        "Test Error": 1 - accuracy_score(y_test, test_predictions),
    }
    bars = plt.bar(list(error_rates.keys()), list(error_rates.values()))
    plt.ylabel("Error Rate")
    plt.title("Train vs Test Error")
    for bar in bars:
        bar_height = bar.get_height()
        bar_center = bar.get_x() + bar.get_width()/2
        # Place the value label slightly above the top of the bar.
        plt.text(bar_center, bar_height + 0.0005,
                 round(bar_height, 2), ha='center', va='bottom')
    plt.show()


def plot_roc_curve(model, X_train, y_train, X_test, y_test):
    """Plot the train and test ROC curves of ``model`` with their AUC in the legend.

    A dashed diagonal from (0, 0) to (1, 1) is drawn for reference.
    """
    # Column 1 of predict_proba is used as the positive-class score.
    scored_splits = (
        ("Train", y_train, model.predict_proba(X_train)[:, 1]),
        ("Test", y_test, model.predict_proba(X_test)[:, 1]),
    )
    # roc_curve returns (fpr, tpr, thresholds); only the first two are plotted.
    curves = [roc_curve(y_true, y_prob)[:2]
              for _, y_true, y_prob in scored_splits]
    plt.figure(figsize=(12, 6))
    for (split_name, y_true, y_prob), (fpr, tpr) in zip(scored_splits, curves):
        plt.plot(fpr, tpr,
                 label=f"{split_name} AUC: {roc_auc_score(y_true, y_prob):.2f}")
    plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.legend(loc='best')
    plt.show()


def plot_model_learning_curve(model, X, y, cv=50):
    """Plot mean training and cross-validation scores against training-set size.

    ``learning_curve`` is evaluated at five sizes spaced evenly between 10%
    and 100% of the data, using ``cv`` for cross-validation and all available
    cores (``n_jobs=-1``).
    """
    size_fractions = np.linspace(.1, 1.0, 5)
    train_sizes, train_scores, test_scores = learning_curve(
        model, X, y, cv=cv, train_sizes=size_fractions, n_jobs=-1)
    # Average over the cross-validation folds (axis 1) for each training size.
    mean_train_scores = np.mean(train_scores, axis=1)
    mean_cv_scores = np.mean(test_scores, axis=1)
    plt.figure(figsize=(12, 6))
    plt.grid()
    series_to_plot = (
        (mean_train_scores, "r", "Training score"),
        (mean_cv_scores, "g", "Cross-validation score"),
    )
    for mean_scores, line_color, line_label in series_to_plot:
        plt.plot(train_sizes, mean_scores, 'o-',
                 color=line_color, label=line_label)
    plt.xlabel("Training Examples")
    plt.ylabel("Score")
    plt.title("Learning Curves")
    plt.legend(loc='best')
    plt.show()

In [None]:
algorithm_name = "Algorithm"
algo_baseline_df = filter_by_algorithm(all_training_logs_df, algorithm_name)
algo_baseline_df

In [None]:
algo_avg_f1_score, algo_avg_auc_roc = average_performance_metrics(
    algo_baseline_df)
print(f"Average F1 Score for {algorithm_name}: {algo_avg_f1_score:.4f}")
print(f"Average AUC-ROC for {algorithm_name}: {algo_avg_auc_roc:.4f}")

In [None]:
algo_best_model_name = algo_baseline_df.iloc[0]["model_name"]
algo_best_model, algo_best_model_data = get_best_model_and_dataset(
    algo_best_model_name, models_list, datasets_per_fold)
algo_best_model

In [None]:
X_train_algo_best_baseline = algo_best_model_data["X_train"]
X_test_algo_best_baseline = algo_best_model_data["X_test"]
y_train_algo_best_baseline = algo_best_model_data["y_train"]
y_test_algo_best_baseline = algo_best_model_data["y_test"]

In [None]:
metrics_df = get_metrics_dataframe(
    baseline_best_model, X_train, y_train, X_test, y_test)
metrics_df

In [None]:
display_confusion_matrix(
    baseline_best_model, X_train, y_train, X_test, y_test)

In [None]:
plot_train_vs_test_error(
    baseline_best_model, X_train, y_train, X_test, y_test)

In [None]:
plot_roc_curve(baseline_best_model, X_train,
               y_train, X_test, y_test)

In [None]:
plot_model_learning_curve(baseline_best_model, X_train, y_train)

In [None]:
algo_baseline_df.to_csv('../../reports/algo_baseline_model.csv', index = False)

### IV.E.3. Export Baseline Best Model

In [None]:
with open('../../models/baseline_best_model.pkl', 'wb') as file:
    pickle.dump(baseline_best_model, file)

## IV.F. Hyperparameters Tuning

### IV.F.1. Hyperparameters List

#### IV.F.1.A. Grid Search

In [None]:
log_reg_hyperparams = {
    'penalty': ['l1', 'l2'],
    'C': np.logspace(-4, 4, 20),
    'solver': ['liblinear']
}

In [None]:
log_reg_grid_search = GridSearchCV(
    LogisticRegression(random_state=777),
    log_reg_hyperparams,
    n_jobs=-1,
    verbose=420,
    scoring='f1_macro'
)

In [None]:
log_reg_grid_search.fit(X_train, y_train)

In [None]:
best_estimator_from_grid = log_reg_grid_search.best_estimator_

In [None]:
models_list["fine-tuned"] = [{"model_name": "GridSearchBest-LogisticRegression",
                              "model_object": best_estimator_from_grid, "model_uid": ""}]

#### IV.F.1.B. Bayesian Search

In [None]:
log_reg_space = {
    'penalty': hyperopt.hp.choice('penalty', ['l1', 'l2']),
    'C': hyperopt.hp.loguniform('C', np.log(1e-4), np.log(1e4)),
    'solver': 'liblinear'
}

In [None]:
def objective(params):
    """Hyperopt objective: negated mean 5-fold macro-F1 of a logistic regression.

    ``params`` are passed to ``LogisticRegression`` as keyword arguments and
    the model is cross-validated on the notebook-level ``X_train`` /
    ``y_train``. The score is negated because hyperopt minimises the loss.
    """
    candidate = LogisticRegression(random_state=777, **params)
    fold_scores = cross_val_score(
        candidate, X_train, y_train, cv=5, scoring='f1_macro')
    return {'loss': -fold_scores.mean(), 'status': STATUS_OK}

In [None]:
# Minimise `objective` over `log_reg_space` with the TPE algorithm for 100
# evaluations; every evaluation is recorded in `trials`.
# NOTE(review): no `rstate` is passed to fmin, so the search is presumably not
# reproducible from run to run, unlike the seeded models above — confirm.
trials = Trials()
best = fmin(fn=objective,
            space=log_reg_space,
            algo=tpe.suggest,
            max_evals=100,
            trials=trials)
# Resolve fmin's result against the search space to obtain the parameter
# values that can be passed to LogisticRegression.
best_params = space_eval(log_reg_space, best)

In [None]:
print("The best parameters are: ", best_params)

In [None]:
optimal_log_reg = LogisticRegression(**best_params, random_state=777)

In [None]:
models_list["fine-tuned"].append({"model_name": "BayesOpt-LogisticRegression",
                                  "model_object": optimal_log_reg, "model_uid": ""})

### IV.F.2. Best Model Hyperparameter Retraining

In [None]:
# model_training_and_evaluation calls its first argument to obtain the model
# entries, so the fine-tuned list is wrapped in a zero-argument factory
# (passing the list itself raises "TypeError: 'list' object is not callable").
training_log, models_list_tuned = model_training_and_evaluation(
    lambda: models_list["fine-tuned"],
    "tuned_model",
    X_train,
    y_train,
    X_test,
    y_test,
    "tuned",
    '../../models/logs/training_log.json'
)

In [None]:
models_list

### IV.F.3. Hyperparameter-tuned Model Performance Review

In [None]:
all_training_logs_df_tuned = training_log_to_df_converter(training_log)
all_training_logs_df_tuned

#### IV.F.3.A. Grid Searched Model Performance Review

In [None]:
models_dict_tuned = {"fine-tuned": models_list_tuned}
tuned_best_model = tuned_model_finder(
    models_dict_tuned["fine-tuned"], "GridSearchBest")
tuned_best_model

In [None]:
metrics_df = get_metrics_dataframe(
    tuned_best_model, X_train, y_train, X_test, y_test)
metrics_df

In [None]:
display_confusion_matrix(
    tuned_best_model, X_train, y_train, X_test, y_test)

In [None]:
plot_train_vs_test_error(
    tuned_best_model, X_train, y_train, X_test, y_test)

In [None]:
plot_roc_curve(tuned_best_model, X_train,
               y_train, X_test, y_test)

In [None]:
plot_model_learning_curve(tuned_best_model, X_train, y_train)

#### IV.F.3.B. Bayesian Searched Model Performance Review

In [None]:
models_dict_tuned = {"fine-tuned": models_list_tuned}
tuned_best_model = tuned_model_finder(
    models_dict_tuned["fine-tuned"], "BayesOpt")
tuned_best_model

In [None]:
metrics_df = get_metrics_dataframe(
    tuned_best_model, X_train, y_train, X_test, y_test)
metrics_df

In [None]:
display_confusion_matrix(
    tuned_best_model, X_train, y_train, X_test, y_test)

In [None]:
plot_train_vs_test_error(
    tuned_best_model, X_train, y_train, X_test, y_test)

In [None]:
plot_roc_curve(tuned_best_model, X_train,
               y_train, X_test, y_test)

In [None]:
plot_model_learning_curve(tuned_best_model, X_train, y_train)

### IV.F.4. Export Hyperparameter-tuned Best Model

In [None]:
# NOTE(review): `tuned_best_model` is whatever the most recent
# tuned_model_finder call returned (the "BayesOpt" lookup when the cells above
# run in order). No grid-vs-Bayesian comparison selects it — confirm that this
# is the model that should be exported.
with open('../../models/tuned_best_model.pkl', 'wb') as file:
    pickle.dump(tuned_best_model, file)