In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier

%load_ext autoreload
%autoreload 2

# Load data

In [None]:
# Read the training set from the CSV stored under data/.
train = pd.read_csv("data/train.csv")

# Data Viz

In [None]:
# from dataprep.eda import create_report

# create_report(train).show()

In [None]:
# Display the column index of the raw training frame.
train.columns

In [None]:
# from dataprep.eda import plot, plot_correlation, plot_missing

# plot(train, "Pclass", "Survived")

In [None]:
# plot(train,"Survived", "Pclass")

In [None]:
# plot(train,"Survived", "Pclass")

# Train test split

In [None]:
# Name of the label column; reused below for the X/y split and by PandasPipeline.
target_name = "Survived"

In [None]:
def extract_X_y(df, target_name):
    """Split a frame into its feature columns and its target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the target column alongside the features.
    target_name : str
        Label of the target column.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` holds every column except ``target_name`` in
        their original order, and ``y`` is a single-column DataFrame (not a
        Series) holding ``target_name``.
    """
    feature_columns = list(filter(lambda name: name != target_name, df.columns))
    features = df[feature_columns]
    # Double brackets keep the target two-dimensional.
    target = df[[target_name]]
    return features, target

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate the features from the label before splitting.
X, y = extract_X_y(train, target_name)

# Hold out 20% of the rows for validation. The split is shuffled, seeded for
# reproducibility, and stratified on the label.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=y,
)

In [None]:
# Preview the first rows of the raw (untransformed) training frame.
train.head()

# Preprocessing

In [None]:
from john_toolbox.preprocessing.pandas_pipeline import PandasPipeline
from src.preprocessing import (
    conformity_column_list,
    data_cleaning_list,
    encoder_list
)

## Define PandasPipeline

In [None]:
# Concatenate the three step groups imported from src.preprocessing, in order:
# column conformity, then data cleaning, then encoding.
step_list = conformity_column_list + data_cleaning_list + encoder_list

pipeline = PandasPipeline(
    steps=step_list,
    target_name=target_name,
    verbose=True,
)

## Fit transform

In [None]:
# Fit the pipeline on the training split (features and label re-joined into one
# frame). The transformed frame is kept because its columns are reused later to
# select the test-set features.
train_transformed = pipeline.fit_transform(
    df=pd.concat([X_train, y_train], axis=1))

# NOTE: X_train / y_train are rebound to the transformed data here, so re-running
# this cell without first re-running the split cell feeds already-transformed
# data back into fit_transform.
X_train, y_train = extract_X_y(train_transformed, target_name)

In [None]:
# Preview the transformed training features.
X_train.head()

In [None]:
# Apply the pipeline to the validation split with transform() only, i.e. without
# re-fitting it on validation data.
valid_transformed = pipeline.transform(
    df=pd.concat([X_valid, y_valid], axis=1))

# NOTE: X_valid / y_valid are rebound to the transformed data here; re-running
# this cell alone would transform them a second time.
X_valid, y_valid = extract_X_y(valid_transformed, target_name)

# Train with XGBoost

In [None]:
# https://www.kaggle.com/prashant111/a-guide-on-xgboost-hyperparameters-tuning
# https://xgboost.readthedocs.io/en/latest/parameter.html

In [None]:
from xgboost import XGBClassifier

# Hyper-parameters forwarded as keyword arguments to the XGBClassifier constructor.
params = dict(
    booster="gbtree",  # default
    n_estimators=900,
    max_depth=6,
    min_child_weight=1,
    eta=0.1,
    scale_pos_weight=1,
    subsample=0.8,
    colsample_bytree=1,
    random_state=42,
    n_jobs=-1,
    missing=-1,  # the value -1 in the data is treated as "missing" by XGBoost
)
xgb = XGBClassifier(**params)

In [None]:
# Metrics tracked on every eval set during boosting.
eval_metric = ["logloss", "auc"]
# Display names for the two eval_set entries below; used later by the
# eval-result comparison cell, not by fit() itself.
eval_names = ["train", "valid"]

# NOTE(review): per the XGBoost docs, early stopping monitors the last metric
# ("auc") on the last eval_set entry (the validation split).
# NOTE(review): recent XGBoost releases expect `early_stopping_rounds` and
# `eval_metric` on the estimator rather than as fit() arguments — confirm
# against the installed version.
xgb.fit(
    X_train,
    y_train,
    early_stopping_rounds=30,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    eval_metric=eval_metric,
    verbose=True
)

In [None]:
# Metric history recorded during fit for the eval sets.
results = xgb.evals_result()
# The keys are the eval-set names assigned by XGBoost (eval_names was not passed to fit()).
print(results.keys())

# Evaluation

In [None]:
from john_toolbox.evaluation.visualisation import (
    plot_curves,
    plot_classification_report,
    compare_eval_result_xgb,
    plot_cm,
)

## Analyse training eval_metric

In [None]:
# Compare the recorded training metrics across the eval sets.
# NOTE(review): presumably eval_names relabels XGBoost's eval-set keys in list
# order (train first, valid second) — confirm in john_toolbox.
compare_eval_result_xgb(
    xgb,
    eval_names=eval_names,
    eval_metrics=eval_metric,
    size=(10, 4)
)

## Get best threshold

### Optimize with a built-in method: `roc_curve` or `prauc_curve`

In [None]:
from john_toolbox.evaluation.metrics import get_optimal_threshold

In [None]:
# Score the validation split before searching for a threshold. The searches
# below need these probabilities, which the notebook otherwise only computes in
# the later "Evaluate valid set" section, so a fresh Restart & Run All raised a
# NameError here.
y_pred_valid_prob = xgb.predict_proba(X_valid)[:, 1]

# Decision thresholds chosen on the validation split with the "roc_curve" and
# "prauc_curve" methods of get_optimal_threshold.
roc_curve_tresh = get_optimal_threshold(y_valid, y_pred_valid_prob, "roc_curve")
prauc_curve_tresh = get_optimal_threshold(y_valid, y_pred_valid_prob, "prauc_curve")

### Optimize with custom metric function

In [None]:
from sklearn.metrics import f1_score

# Threshold chosen by maximising a user-supplied metric (here the F1 score)
# on the validation split.
custom_tresh = get_optimal_threshold(
    y_valid,
    y_pred_valid_prob,
    metric_func=f1_score,
    is_maximize=True,
)

## Evaluate train set

In [None]:
from john_toolbox.evaluation.metrics import to_labels

In [None]:
# Probability of the second class (column 1 of predict_proba) for each training
# row; for a 0/1 label this is the positive class.
y_pred_train_prob = xgb.predict_proba(X_train)[:, 1]
# Turn probabilities into labels with the selected threshold
# (presumably a simple cut-off — see john_toolbox.evaluation.metrics.to_labels).
y_pred_train = to_labels(y_pred_train_prob, roc_curve_tresh)  # alternatives: prauc_curve_tresh, custom_tresh

### Confusion matrix

In [None]:
# Confusion matrix of thresholded predictions vs. true labels on the training split.
# Plot style reference: https://www.kaggle.com/agungor2/various-confusion-matrix-plots
plot_cm(y_train, y_pred_train, figsize=(5, 5))

### Classification report

In [None]:
# Classification report for the thresholded training predictions.
plot_classification_report(y_train, y_pred_train, size=(5, 5))

In [None]:
# NOTE(review): this passes the thresholded labels, not y_pred_train_prob. If
# plot_curves draws ROC / precision-recall curves, probabilities are likely the
# intended input — confirm against john_toolbox.
plot_curves(y_train, y_pred_train)

## Evaluate valid set

In [None]:
# Probability of the second class (column 1 of predict_proba) for each
# validation row; for a 0/1 label this is the positive class.
y_pred_valid_prob = xgb.predict_proba(X_valid)[:, 1]
# Turn probabilities into labels with the same threshold used for the training split.
y_pred_valid = to_labels(y_pred_valid_prob, roc_curve_tresh)  # alternatives: prauc_curve_tresh, custom_tresh

In [None]:
# Confusion matrix of thresholded predictions vs. true labels on the validation split.
plot_cm(y_valid, y_pred_valid, figsize=(5, 5))

In [None]:
# Classification report for the thresholded validation predictions.
plot_classification_report(y_valid, y_pred_valid, size=(5, 5))

In [None]:
# NOTE(review): this passes the thresholded labels, not y_pred_valid_prob. If
# plot_curves draws ROC / precision-recall curves, probabilities are likely the
# intended input — confirm against john_toolbox.
plot_curves(y_valid, y_pred_valid)

# Prediction

In [None]:
# Read the test set from the CSV stored under data/.
test = pd.read_csv("data/test.csv")

In [None]:
# Apply the pipeline fitted on the training split to the test set (transform
# only, no re-fitting). The `df=` keyword matches the other pipeline calls.
# NOTE(review): test.csv presumably has no label column — confirm the pipeline
# tolerates its absence.
test_transformed = pipeline.transform(df=test)
# Keep exactly the training feature columns, in training order, excluding the
# label. `target_name` replaces a hard-coded "Survived" so the label is defined
# in one place only.
X_test = test_transformed[[col for col in train_transformed.columns if col != target_name]]

In [None]:
# Label the test rows using column 1 of predict_proba and the same threshold
# applied to the train/valid evaluation.
y_preds = to_labels(xgb.predict_proba(X_test)[:, 1], roc_curve_tresh)  # alternatives: prauc_curve_tresh, custom_tresh

In [None]:
# Build the two-column submission frame (passenger id + predicted label) and
# write it to disk without the index column.
output = pd.DataFrame(
    {
        "PassengerId": X_test["PassengerId"],
        "Survived": y_preds,
    }
)
output.to_csv("my_submission.csv", index=False)