# GRADIENT BOOSTING CLASSIFIER ---------------------------------------------------------------

### Get Train, Valid, Test data

In [None]:
import pandas as pd
from my_libs import lib_tools as pt

# --- Run configuration -------------------------------------------------------
# run_type is handed to the data loader and is also embedded in the pickle /
# model file names used by later cells. Alternative value: 'dev'.
run_type = 'prd'
# gen_sample: True -> regenerate and save the resampled training set,
#             False -> load the previously saved pickles.
gen_sample = False
# find_best_params: True -> run the Optuna hyperparameter search.
find_best_params = False

(
    X_train, y_train,
    X_valid, y_valid,
    X_test, y_test,
) = pt.get_train_valid_test_data(run_type)
print("Train, valid and Test data loaded")

### Resample data with SMOTEN()

In [None]:
# Either reuse the resampled training set saved by an earlier run, or build it
# now and persist it so later runs can skip the resampling step.
if not gen_sample:
    # Reload the pickles written by a previous run with gen_sample=True
    X_train = pd.read_pickle(f'./pickles/X_train_smote_{run_type}.pkl')
    y_train = pd.read_pickle(f'./pickles/y_train_smote_{run_type}.pkl')
else:
    X_train, y_train = pt.get_data_resampled(X=X_train, y=y_train, verbose=1)
    # Persist the resampled data, keyed by run_type
    X_train.to_pickle(f'./pickles/X_train_smote_{run_type}.pkl')
    y_train.to_pickle(f'./pickles/y_train_smote_{run_type}.pkl')

### Encode categorical data (target and one hot encoding)

In [None]:
from my_libs.encoder_custom import  EncoderCustom

# 'dep' is target-encoded; every other column of X_train is one-hot encoded.
cols_target_encoded = ['dep']
cols_onehot_encoded = X_train.columns.drop(cols_target_encoded)

encoder = EncoderCustom(
    cols_target_encoded=cols_target_encoded,
    cols_onehot_encoded=cols_onehot_encoded,
)

# The training split is passed with datatype='Train'; validation and test are
# both passed with datatype='Test'.
# NOTE(review): presumably 'Train' fits the encoders and 'Test' only applies
# them - confirm against EncoderCustom.transform.
X_train, y_train = encoder.transform(X=X_train, y=y_train, datatype='Train')
X_valid, y_valid = encoder.transform(X=X_valid, y=y_valid, datatype='Test')
X_test, y_test = encoder.transform(X=X_test, y=y_test, datatype='Test')

### Find best hyperparameters for model with Optuna

In [None]:
import time
import optuna
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

if find_best_params:

    n_trials = 10
    start_time = time.time()

    def objective(trial):
        """Return the mean 5-fold cross-validated F1 score for one Optuna trial.

        The trial samples `learning_rate` (log scale, 0.001 to 0.5) and
        `n_estimators` (integer, 50 to 150), builds a
        GradientBoostingClassifier with them and scores it on the training
        data with `cross_val_score`.
        """
        learning_rate = trial.suggest_float('learning_rate', 0.001, 0.5, log=True)
        n_estimators = trial.suggest_int('n_estimators', 50, 150)
        classifier = GradientBoostingClassifier(learning_rate=learning_rate, n_estimators=n_estimators)

        # scoring="f1": the value being maximized is F1, not accuracy
        fold_scores = cross_val_score(classifier, X_train, y_train, cv=5, scoring="f1", verbose=1)
        return fold_scores.mean()

    # Create a study object and maximize the mean F1 returned by the objective.
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials)

    elapsed_seconds = time.time() - start_time
    print(f"\n--- Gradient Boosting Classifier - Optimization with Optuna performed in {elapsed_seconds} seconds ---")
    print(f"Best params : {study.best_params}")

In [None]:
if find_best_params:
    from optuna.visualization import plot_optimization_history

    # A notebook only auto-displays a bare expression that is the last
    # top-level statement of a cell; inside this `if` the returned figure
    # would be silently discarded, so it is shown explicitly.
    history_figure = plot_optimization_history(study)
    history_figure.show()

### Fit best model - Plot Train and Validation learning curves

In [None]:
import numpy as np
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
%matplotlib inline

# Train the best model.
# `study` only exists when the Optuna search cell actually ran; otherwise fall
# back to the estimator's default hyperparameters instead of a NameError.
if find_best_params:
    params = study.best_params
else:
    params = {}
    print("find_best_params is False: fitting GradientBoostingClassifier with its default hyperparameters")
model = GradientBoostingClassifier(**params)
model.fit(X_train, y_train)

# Compute the learning curve (5-fold cross-validation, scored with ROC AUC)
train_sizes, train_scores, valid_scores = learning_curve(model, X_train, y_train, cv=5, scoring='roc_auc', n_jobs=-1)

# Mean and standard deviation across the CV folds (axis=1) for each training size
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std  = np.std(train_scores, axis=1)
valid_scores_mean = np.mean(valid_scores, axis=1)
valid_scores_std  = np.std(valid_scores, axis=1)

# Plot the learning curve: shaded band = mean +/- one standard deviation
plt.figure(figsize=(8, 6))
plt.title('Gradient Boosting Classifier Learning Curve')
plt.xlabel('Training Set Size')
plt.ylabel('ROC AUC Score')  # matches scoring='roc_auc' above
plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color='r')
plt.fill_between(train_sizes, valid_scores_mean - valid_scores_std, valid_scores_mean + valid_scores_std, alpha=0.1, color='g')
plt.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training Score')
plt.plot(train_sizes, valid_scores_mean, 'o-', color='g', label='Validation Score')
plt.legend(loc='best')
plt.show()

### Confusion Matrix

In [None]:
## Confusion Matrix
from my_libs.model_evaluator import ModelEvaluator

# The evaluation uses the Optuna result, so the whole cell is skipped when no
# hyperparameter search was run.
if find_best_params:
    params = study.best_params

    evaluator = ModelEvaluator(
        model_type='GradientBoostingClassifier',
        params=params,
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
    )
    # `model` is rebound to whatever evaluate() returns
    model = evaluator.evaluate()

### Save model to h5

In [None]:
from joblib import dump
# Serialize the fitted model with joblib. The target keeps the historical
# 'model_xgb_*.h5' name, but the file is written by joblib.dump.
dump(value=model, filename=f'h5_models/model_xgb_{run_type}.h5')

In [None]:
from my_libs.model_evaluator import ModelEvaluator

# `study` is only defined when the Optuna search ran; guard the evaluation the
# same way as the confusion-matrix cell so a run with find_best_params=False
# does not stop on a NameError.
if find_best_params:
    params = study.best_params

    evaluator = ModelEvaluator(model_type='GradientBoostingClassifier', params=params, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
    # `model` is rebound to whatever evaluate() returns
    model = evaluator.evaluate()