<div class="alert alert-info">This is an end-to-end tutorial showing how to preprocess the data and use data-science plots to evaluate the model.
    </div>

In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each cell runs,
# so edits to imported project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier

In [None]:
from john_toolbox.utils.logger_config import setup_log_config

# Configure the toolbox logging once, up front, with level "INFO".
# NOTE(review): what `is_dev=True` changes is not visible from this notebook — confirm in logger_config.
setup_log_config(is_dev=True, level="INFO")

# Load data

In [None]:
# Directory holding the Titanic CSV files; change this single constant to relocate the data.
DATA_DIR = "/work/data/titanic"

# `train` holds the labelled rows used for fitting/validation, `test` the rows to predict on.
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")

In [None]:
# Preview the first rows of the raw training data.
train.head()

# Data Viz

In [None]:
# from dataprep.eda import create_report

# create_report(train).show()

In [None]:
# List the column names available in the raw training data.
train.columns

In [None]:
# from dataprep.eda import plot, plot_correlation, plot_missing

# plot(train, "Pclass", "Survived")

In [None]:
# plot(train,"Survived", "Pclass")

In [None]:
# plot(train,"Survived", "Pclass")

# Train test split

In [None]:
# Name of the label column; reused by extract_X_y and by both PandasPipeline instances below.
target_name = "Survived"

In [None]:
from john_toolbox.tutorial.binary.xgboost.preprocessing import extract_X_y
from sklearn.model_selection import train_test_split

In [None]:
def extract_X_y(df, target_name):
    """Split a DataFrame into its feature columns and its target column.

    NOTE(review): this definition shadows the `extract_X_y` imported from
    john_toolbox.tutorial.binary.xgboost.preprocessing in the previous cell.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the features and the target column.
    target_name : str
        Label of the target column.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``X`` with every column except ``target_name`` (original order kept) and
        ``y`` as a single-column DataFrame (not a Series).

    Raises
    ------
    KeyError
        If ``target_name`` is not a column of ``df``.
    """
    feature_columns = []
    for column in df.columns:
        if column != target_name:
            feature_columns.append(column)
    features = df[feature_columns]
    # Double brackets keep the target two-dimensional, as callers expect a DataFrame.
    target = df[[target_name]]
    return features, target

In [None]:
# Separate the features from the `target_name` column.
X, y = extract_X_y(train, target_name)

# Hold out 20% of the rows for validation. Stratifying on `y` keeps the class proportions
# alike in both parts, and the fixed seed makes the split reproducible.
split_kwargs = {"test_size": 0.2, "random_state": 42, "shuffle": True, "stratify": y}
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_kwargs)

In [None]:
# Preview the training features after the split (still untransformed).
X_train.head()

# Boosting

## XGBoost

<div class="alert alert-info">
XGBoost became, in 2017-2018, a widely used and very popular tool among Kaggle competitors and data scientists in industry, as it has been battle-tested in production on large-scale problems. It is a highly flexible and versatile open-source tool that can handle most regression, classification and ranking problems, as well as user-defined objective functions.
    
https://www.kdnuggets.com/2017/10/xgboost-top-machine-learning-method-kaggle-explained.html
    
    
However, it can suffer from overfitting on small datasets.
    
Please refer to these links : 
- https://machinelearningmastery.com/gentle-introduction-xgboost-applied-machine-learning/
- https://towardsdatascience.com/xgboost-mathematics-explained-58262530904a
</div>

### Preprocessing

In [None]:
from john_toolbox.preprocessing.pandas_pipeline import PandasPipeline
from john_toolbox.tutorial.binary.xgboost.preprocessing import (
    conformity_column_list,
    data_cleaning_list,
    encoder_list
)

#### Define PandasPipeline

In [None]:
# Chain the toolbox step lists in order: column conformity, data cleaning, then encoders.
# NOTE(review): the content of each list lives in john_toolbox.tutorial.binary.xgboost.preprocessing
# and is not visible from this notebook.
step_list = conformity_column_list + data_cleaning_list + encoder_list

# The pipeline is told which column is the target through `target_name`.
pipeline_xgb = PandasPipeline(
    steps=step_list, target_name=target_name, verbose=True)

#### Fit transform on training set

In [None]:
# Fit the pipeline on the training rows only (the validation rows are not passed in here).
# Features and target are concatenated column-wise because the pipeline takes one DataFrame.
train_transformed = pipeline_xgb.fit_transform(
    df=pd.concat([X_train.copy(), y_train.copy()], axis=1))

# Split the transformed frame back into model inputs and target.
X_train_transformed, y_train_transformed = extract_X_y(train_transformed, target_name)

In [None]:
# Preview the features produced by the pipeline.
X_train_transformed.head()

#### Transform on valid set

In [None]:
# Apply the already-fitted pipeline to the validation rows (`transform`, not `fit_transform`).
valid_transformed = pipeline_xgb.transform(
    df=pd.concat([X_valid.copy(), y_valid.copy()], axis=1))

# Split the transformed frame back into model inputs and target.
X_valid_transformed, y_valid_transformed = extract_X_y(valid_transformed, target_name)

### TRAIN with XGB

In [None]:
# https://www.kaggle.com/prashant111/a-guide-on-xgboost-hyperparameters-tuning
# https://xgboost.readthedocs.io/en/latest/parameter.html

In [None]:
# Proportion of each target class in the transformed training set.
y_train_transformed.value_counts(normalize=True)

In [None]:
from xgboost import XGBClassifier

# Metrics tracked on every evaluation set during training.
eval_metric = ["logloss", "auc"]
# Human-readable labels for the evaluation sets; the order matches the `eval_set` list
# passed to `xgb.fit` (training set first, validation set second).
eval_names = ["train", "valid"]

params = {
    "booster": "gbtree", # default
    "n_estimators": 1000,  # upper bound on boosting rounds; fit() also uses early stopping
    "max_depth": 6,  # maximum depth of each tree
    "min_child_weight": 1,  # minimum sum of instance weights required in a child node
    "eta":0.1,  # learning rate (XGBoost alias of `learning_rate`)
    "scale_pos_weight": 1,  # weight ratio applied to the positive class
    "subsample": 0.8,  # fraction of training rows sampled for each tree
    "colsample_bytree": 1,  # fraction of columns sampled for each tree
    "random_state": 42,  # fixed seed for reproducible row subsampling
    "n_jobs": -1,  # presumably "use all available cores" — confirm for the installed version
    "missing": -1,  # feature value XGBoost treats as missing; NOTE(review): confirm the pipeline encodes missing values as -1
}
xgb = XGBClassifier(**params)

In [None]:
# Flatten the single-column target frames into 1-D label arrays once.
y_train_labels = y_train_transformed.to_numpy().reshape(-1)
y_valid_labels = y_valid_transformed.to_numpy().reshape(-1)

# Evaluation sets in the same order as `eval_names`: training set first, validation set second.
# Per the XGBoost docs, early stopping watches the last set and the last metric listed.
monitored_sets = [
    (X_train_transformed, y_train_labels),
    (X_valid_transformed, y_valid_labels),
]

# NOTE(review): passing `early_stopping_rounds` / `eval_metric` to fit() is deprecated since
# XGBoost 1.6 and removed in 2.0; on recent versions they belong in the XGBClassifier constructor.
xgb.fit(
    X_train_transformed,
    y_train_labels,
    eval_set=monitored_sets,
    eval_metric=eval_metric,
    early_stopping_rounds=30,
    verbose=True,
)

In [None]:
# Per-evaluation-set metric history recorded during fit.
results = xgb.evals_result()
# Boosting round selected by early stopping.
best_iteration = xgb.best_iteration
print(f"Best Iteration: {best_iteration}")

# For every evaluation set, keep each metric's value at the best iteration only.
best_scores = {}
for eval_name, metric_history in results.items():
    best_scores[eval_name] = {
        metric: history[best_iteration] for metric, history in metric_history.items()
    }
best_scores

### k-Fold Cross-Validation

In [None]:
# # scikit-learn k-fold cross-validation
# from numpy import array
# from sklearn.model_selection import KFold

# # data sample
# data = array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6])
# # prepare cross validation
# kfold = KFold(n_splits=6, shuffle=True, random_state=1)
# # enumerate splits
# for train, test in kfold.split(data):
#     print('train: %s, test: %s' % (data[train], data[test]))

## Catboost

In [None]:
from john_toolbox.preprocessing.pandas_pipeline import PandasPipeline
from john_toolbox.tutorial.binary.xgboost.preprocessing import (
    conformity_column_list,
    data_cleaning_list,
    encoder_list
)
from catboost import CatBoostClassifier

CatBoost is newer and has the advantage of natively handling categorical columns, so there is no need for one-hot encoding. It reduces overfitting by building the models with a novel gradient-boosting scheme.
It can also be used in GPU mode.
- https://towardsdatascience.com/catboost-vs-light-gbm-vs-xgboost-5f93620723db
- https://catboost.ai/#benchmark
- https://github.com/catboost/tutorials
- https://www.kaggle.com/mitribunskiy/tutorial-catboost-overview
- https://towardsdatascience.com/categorical-features-parameters-in-catboost-4ebd1326bee5

In [None]:
# Re-read the same CSV files as in the "Load data" section, so the CatBoost section
# starts again from the raw frames.
train = pd.read_csv("/work/data/titanic/train.csv")
test = pd.read_csv("/work/data/titanic/test.csv")

### Preprocessing

In [None]:
# Same split call as in the XGBoost section (same arguments and seed). This overwrites
# X_train / X_valid / y_train / y_valid, which the Evaluation section reads later.
X, y = extract_X_y(train, target_name)

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,  test_size=0.2, random_state=42, shuffle=True, stratify=y)

In [None]:
# Preview only the first rows instead of rendering the whole frame,
# consistent with the other preview cells in this notebook.
X_train.head()

In [None]:
# encoder_list (which contains the one-hot encoding) is deliberately left out here,
# because CatBoost handles categorical columns natively.
step_list = conformity_column_list + data_cleaning_list

# Separate pipeline instance for CatBoost, so the fitted XGBoost pipeline is left untouched.
pipeline_ctb = PandasPipeline(
    steps=step_list, target_name=target_name, verbose=True)

In [None]:
# Fit the CatBoost pipeline on the training rows only; features and target are
# concatenated column-wise because the pipeline takes one DataFrame.
train_transformed_ctb = pipeline_ctb.fit_transform(
    df=pd.concat([X_train, y_train], axis=1))

# Split the transformed frame back into model inputs and target.
X_train_transformed_ctb, y_train_transformed_ctb = extract_X_y(train_transformed_ctb, target_name)

In [None]:
# Preview only the first rows instead of rendering the whole frame,
# consistent with the other preview cells in this notebook.
X_train_transformed_ctb.head()

In [None]:
# Apply the already-fitted CatBoost pipeline to the validation rows (`transform`, not `fit_transform`).
valid_transformed_ctb = pipeline_ctb.transform(
    df=pd.concat([X_valid, y_valid], axis=1))

# Split the transformed frame back into model inputs and target.
X_valid_transformed_ctb, y_valid_transformed_ctb = extract_X_y(valid_transformed_ctb, target_name)

#### get categorical columns

In [None]:
from john_toolbox.preprocessing.utils import get_idx_cat_columns


# Describe the categorical columns of the transformed training features.
# The returned mapping is read below through its "idx_cols" and "cat_cols" keys.
idx_cols_mapping = get_idx_cat_columns(X_train_transformed_ctb)

In [None]:
# Column names of the CatBoost training features, for cross-checking the categorical mapping.
X_train_transformed_ctb.columns

In [None]:
# cat_features_names = X.columns # here we specify names of categorical features
# cat_features = [X.columns.get_loc(col) for col in cat_features_names]
# print(cat_features)

In [None]:
# `idx_cat_cols` is what gets passed to CatBoost as `cat_features`.
# NOTE(review): presumably positional indices of the categorical columns, with `cat_cols`
# holding their names — confirm against get_idx_cat_columns.
idx_cat_cols = idx_cols_mapping["idx_cols"]
cat_cols = idx_cols_mapping["cat_cols"]

In [None]:
# CatBoost hyper-parameters. Note that this rebinds `params`, previously the XGBoost dict.
params = {
    "iterations": 1000,  # maximum number of boosting iterations
    'verbose': 200,  # log every 200 iterations
    'random_seed': 42,
    # NOTE(review): `od_wait` is set without `od_type`; confirm the overfitting detector is
    # actually active with this combination (`early_stopping_rounds` is the documented shortcut).
    "od_wait": 100, # 'early_stopping_rounds': 200,
    "learning_rate": 0.01,
    # "task_type": "GPU",
    'loss_function': 'Logloss',
    # NOTE(review): per the CatBoost docs, categorical features with at most this many distinct
    # values are one-hot encoded internally — confirm 1000 is intended and accepted.
    "one_hot_max_size": 1000,
    'custom_metric': ["Logloss", "AUC", "PRAUC"],  # extra metrics reported during training
}

cbc = CatBoostClassifier(**params)


# Train with the validation set as eval_set; `use_best_model=True` keeps the iteration that
# scored best on it, and `plot=True` renders the interactive training chart.
cbc.fit(X_train_transformed_ctb, y_train_transformed_ctb,
        eval_set=(X_valid_transformed_ctb, y_valid_transformed_ctb),
        cat_features=idx_cat_cols,
        use_best_model=True,
        plot=True
        );

In [None]:
# Best metric values recorded by CatBoost during training.
cbc.get_best_score()

In [None]:
# Column names available for the feature-statistics call in the next cell.
X_train_transformed_ctb.columns

In [None]:
# Per-feature statistics computed on the training data for a hand-picked list of features;
# the trailing `;` suppresses the returned value so only the plot is shown.
cbc.calc_feature_statistics(
    X_train_transformed_ctb,
    y_train_transformed_ctb,
    feature=['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', "Age"],
    plot=True,
);

https://towardsdatascience.com/smarter-ways-to-encode-categorical-data-for-machine-learning-part-1-of-3-6dca2f71b159

# Evaluation

In [None]:
from john_toolbox.evaluation.visualisation import (
    plot_auc_curves,
    plot_classification_report,
    compare_eval_result_xgb,
    plot_cm,
)

## Analyse training eval_metric

In [None]:
# `eval_names` and `eval_metric` are the lists defined in the XGBoost training section.
print(f"eval_names : {eval_names}, eval_metric :  {eval_metric}")

# Plot the recorded training history of the fitted XGBoost model for each metric.
compare_eval_result_xgb(
    xgb,
    eval_names=eval_names,
    eval_metrics=eval_metric,
    size=(10, 4)
)

## Evaluate valid set

In [None]:
# (rows, columns) of the untransformed validation features.
X_valid.shape

In [None]:
# Predicted probability of the positive class (column 1 of predict_proba) for each validation row.
y_pred_valid_prob = xgb.predict_proba(X_valid_transformed)[:, 1]

### Get best threshold

#### Optimize with the default value roc_curve or prauc_curve

In [None]:
from john_toolbox.evaluation.metrics import get_optimal_threshold

# Decision thresholds derived from the validation predictions, one per built-in strategy.
# NOTE(review): the truth passed is `y_valid` (raw split) while the predictions come from
# `X_valid_transformed`; this assumes the pipeline keeps rows and target unchanged — confirm.
roc_curve_tresh = get_optimal_threshold(y_valid, y_pred_valid_prob, "roc_curve")
prauc_curve_tresh = get_optimal_threshold(y_valid, y_pred_valid_prob, "prauc_curve")

#### Optimize with custom metric function

In [None]:
from sklearn.metrics import f1_score

# Same search, driven by a user-supplied metric: here the threshold that maximises F1.
custom_tresh = get_optimal_threshold(y_valid, y_pred_valid_prob, metric_func=f1_score, is_maximize=True)

In [None]:
from john_toolbox.evaluation.metrics import to_labels

# Turn probabilities into class labels with the chosen threshold.
# Alternatives computed above: prauc_curve_tresh, custom_tresh.
y_pred_valid = to_labels(y_pred_valid_prob, roc_curve_tresh) # roc_curve_tresh, prauc_curve_tresh, custom_tresh

In [None]:
# Confusion matrix of the thresholded predictions on the validation set.
plot_cm(y_valid, y_pred_valid, figsize=(5, 5))

In [None]:
# Classification report of the thresholded predictions on the validation set.
plot_classification_report(y_valid, y_pred_valid, size=(5, 5))

In [None]:
# ROC / precision-recall curves are traced by sweeping a threshold over continuous scores,
# so pass the predicted probabilities; already-thresholded 0/1 labels would collapse each
# curve to a single operating point.
# NOTE(review): assumes plot_auc_curves uses its second argument as the score — confirm in john_toolbox.
plot_auc_curves(y_valid, y_pred_valid_prob)

## Evaluate train set

In [None]:
# Predicted probability of the positive class for each training row.
y_pred_train_prob = xgb.predict_proba(X_train_transformed)[:, 1]
# Reuse the threshold selected on the validation set, so train and valid labels are comparable.
y_pred_train = to_labels(y_pred_train_prob, roc_curve_tresh) # roc_curve_tresh, prauc_curve_tresh, custom_tresh

### Confusion matrix

In [None]:
# Confusion matrix of the thresholded predictions on the training set.
# Plot styles reference: https://www.kaggle.com/agungor2/various-confusion-matrix-plots
plot_cm(y_train, y_pred_train, figsize=(5, 5))

### classification_report

In [None]:
# Classification report of the thresholded predictions on the training set.
plot_classification_report(y_train, y_pred_train, size=(5, 5))

In [None]:
# ROC / precision-recall curves are traced by sweeping a threshold over continuous scores,
# so pass the predicted probabilities; already-thresholded 0/1 labels would collapse each
# curve to a single operating point.
# NOTE(review): assumes plot_auc_curves uses its second argument as the score — confirm in john_toolbox.
plot_auc_curves(y_train, y_pred_train_prob)

# Prediction

In [None]:
# Apply the XGBoost pipeline fitted on the training rows to the `test` frame (transform only).
test_transformed = pipeline_xgb.transform(test)

In [None]:
# Select the training feature columns, in training order, so the model sees the same layout.
# The target is excluded through `target_name` rather than a repeated "Survived" literal,
# keeping this cell consistent with the rest of the notebook.
feature_columns = [col for col in train_transformed.columns if col != target_name]
X_test = test_transformed[feature_columns]

In [None]:
# `test_transformed` is already computed in the first cell of this section and `X_test` is
# derived from it; running the pipeline a second time was redundant, so only preview it.
test_transformed.head()

In [None]:
# Predict positive-class probabilities on the test features and turn them into labels with the
# threshold selected on the validation set (alternatives: prauc_curve_tresh, custom_tresh).
y_preds = to_labels(xgb.predict_proba(X_test)[:, 1], roc_curve_tresh) # roc_curve_tresh, prauc_curve_tresh, custom_tresh

In [None]:
# Build the submission table: one row per passenger with its id and predicted label.
# NOTE(review): PassengerId is read from X_test, i.e. it is one of the columns fed to the model;
# confirm that using an id as a feature is intended and that the pipeline leaves ids unchanged.
submission_columns = {'PassengerId': X_test["PassengerId"], 'Survived': y_preds}
output = pd.DataFrame(submission_columns)

# Write without the index so the CSV contains only the two submission columns.
output.to_csv('my_submission.csv', index=False)