In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# default_exp Kaggle_TPS_Challenge_Nov_2021_XGB

# Kaggle TPS Challenge (Nov 2021)

> A Tutorial to showcase usage of Tabular_ML_Toolkit library on Kaggle TPS Challenge Nov 2021

In [None]:
#hide
from nbdev.showdoc import *
from nbdev import *

In [None]:
import os

import pandas as pd
from sklearn.metrics import roc_auc_score,accuracy_score

from tabular_ml_toolkit.MLPipeline import *

#### Build MLPipeline Class with Kaggle TPS Challenge data

*You can use MLPipeline to quickly train any model which supports scikit-learn fit and transform methods.*

*For example, here we are using XGBClassifier from XGBoost on [Kaggle TPS Challenge (Nov 2021) data](https://www.kaggle.com/c/tabular-playground-series-nov-2021/data).*

In [None]:
# Dataset directory and file names.
# The directory defaults to the original author's local path; set the
# TPS_NOV_2021_DIR environment variable to point at your own copy of the
# Kaggle TPS Nov 2021 data so the notebook runs on other machines.
# An unset or empty variable falls back to the default.
# os.path.join(..., "") guarantees a trailing separator, because later cells
# build file paths with plain string concatenation (DIRECTORY_PATH + TRAIN_FILE).
DIRECTORY_PATH = os.path.join(
    os.environ.get("TPS_NOV_2021_DIR")
    or "/Users/pankajmathur/kaggle_datasets/tps_nov_2021/",
    "",
)
TRAIN_FILE = "train.csv"
TEST_FILE = "test.csv"
SAMPLE_SUB_FILE = "sample_submission.csv"

#### Let's Use XGBoost on MLPipeline

*You can also use MLPipeline with an XGBoost model. Make sure to install XGBoost in the way appropriate for your OS.*

*For example, here we are using XGBClassifier from XGBoost on [Kaggle TPS Challenge (Nov 2021) data](https://www.kaggle.com/c/tabular-playground-series-nov-2021/data).*

In [None]:
# The best way to install xgboost on a macOS or Windows machine is with conda:
# !conda install -c conda-forge xgboost

In [None]:
# Hyper-parameters handed to XGBClassifier as keyword arguments.
xgb_params = dict(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=9,
    booster='gbtree',
    eval_metric='auc',
    # GPU settings, left disabled:
    # tree_method='gpu_hist',
    # predictor='gpu_predictor',
    use_label_encoder=False,
    random_state=42,
)

In [None]:
from xgboost import XGBClassifier

# Instantiate the classifier, unpacking every entry of xgb_params
# as a keyword argument.
xgb_model = XGBClassifier(
    **xgb_params
)

In [None]:
# Create the ML pipeline for the XGB model: the train/test CSV paths, index
# and target columns, the model and the validation split size are all passed
# to prepare_data_for_training.
xgb_ml_pl = MLPipeline().prepare_data_for_training(
    train_file_path=f"{DIRECTORY_PATH}{TRAIN_FILE}",
    test_file_path=f"{DIRECTORY_PATH}{TEST_FILE}",
    idx_col="id",
    target="target",
    model=xgb_model,
    random_state=42,
    valid_size=0.2,
)

In [None]:
# Sanity check: report the shape of each train/validation split.
df_loader = xgb_ml_pl.dataframeloader
for split_name in ("X_train", "X_valid", "y_train", "y_valid"):
    print(f"{split_name} shape is {getattr(df_loader, split_name).shape}")

In [None]:
# Train on the training split, then predict on the validation split.
fitted_pipeline = xgb_ml_pl.scikit_pipeline
split_loader = xgb_ml_pl.dataframeloader

fitted_pipeline.fit(split_loader.X_train, split_loader.y_train)

# Hard class labels.
preds = fitted_pipeline.predict(split_loader.X_valid)
# Keep only the second column (index 1) of the predicted probabilities.
preds_probs = fitted_pipeline.predict_proba(split_loader.X_valid)[:, 1]

In [None]:
# Sanity check: label and probability arrays should have matching lengths.
for label, values in (("preds", preds), ("preds_probs", preds_probs)):
    print(f"{label} shape is {values.shape}")

In [None]:
# Validation metrics: AUC is computed from the predicted probabilities,
# accuracy from the hard class labels.
y_valid = xgb_ml_pl.dataframeloader.y_valid
auc = roc_auc_score(y_true=y_valid, y_score=preds_probs)
acc = accuracy_score(y_true=y_valid, y_pred=preds)

print("AUC is : {} while Accuracy is : {} ".format(auc, acc))

#### Let's do Cross Validation for XGB Model on our MLPipeline

In [None]:
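# # NOTE: this disabled cell still points at input/home_data with target "SalePrice" and
# # MAE scoring; adapt the paths, idx_col, target and scoring before enabling it here.
# # create ml pipeline for xgb model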
# xgb_ml_pl = MLPipeline().prepare_data_for_cv(train_file_path= "input/home_data/train.csv",
#                                              test_file_path= "input/home_data/test.csv",
#                                              idx_col="Id", target="SalePrice",
#                                              model=xgb_model,random_state=42,
#                                              cv_cols_type = "all") #cv_cols_type = all|num|cat
# # Now fit and predict
# scores = xgb_ml_pl.cross_validation(estimator=xgb_ml_pl.scikit_pipeline, cv=5,
#                                     scoring='neg_mean_absolute_error')
# print("scores:", scores)
# print("Average MAE score:", scores.mean())

In [None]:
# # hide
# # run the script to build 

# from nbdev.export import notebook2script; notebook2script()

#### Create Kaggle Predictions

In [None]:
# Score the Kaggle test set with the XGB pipeline fitted above.
# The original cell referenced `sci_ml_pl`, which is never defined in this
# notebook and raised NameError on a fresh kernel; the pipeline object created
# and fitted here is `xgb_ml_pl`.
# As for the validation predictions, keep only the second column (index 1)
# of predict_proba.
test_preds_probs = xgb_ml_pl.scikit_pipeline.predict_proba(xgb_ml_pl.dataframeloader.X_test)[:, 1]
# The label now says X_test, matching the frame whose shape is printed.
print(f"X_test shape is {xgb_ml_pl.dataframeloader.X_test.shape}")
print(f"test_preds_probs shape is {test_preds_probs.shape}")

In [None]:
# Build the submission: read the sample submission file, set its 'target'
# column to the test-set probabilities, and write the result without the
# index column.
sub = pd.read_csv(f"{DIRECTORY_PATH}{SAMPLE_SUB_FILE}").assign(target=test_preds_probs)
sub.to_csv('submission.csv', index=False)