# Imports

In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import sys

import datetime

from typing import Union

import numpy as np
import pandas as pd

import seaborn as sns

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion, FunctionTransformer
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Binarizer, OneHotEncoder, OrdinalEncoder, KBinsDiscretizer
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, VotingClassifier

import catboost
import xgboost
import lightgbm

In [None]:
import sklearn
from sklearn import set_config

In [None]:
print(f"scikit-learn version is {sklearn.__version__}")

In [None]:
# Only the scikit-learn versions listed here are accepted; any other version
# triggers an install of the pinned 1.2.0 release.
if sklearn.__version__ not in ["1.2.0", "1.2.2", "1.4.1.post1"]:
    # Run pip through the interpreter backing this kernel (`sys.executable`) so
    # the package lands in the environment the notebook is actually using; a
    # bare `pip` is resolved through PATH and may belong to another environment.
    # The `sklearn` module imported above is not reloaded by this call, so the
    # kernel must be restarted for the pinned version to take effect.
    os.system(f'"{sys.executable}" -m pip install scikit-learn==1.2.0')

# Global Configuration

In [None]:
set_config(transform_output="pandas")

In [None]:
# Working directory of the kernel; the local input, `src` and output paths
# used in this notebook are all built from it.
CWD = os.getcwd()
# Environment switch: True when the working-directory path contains "kaggle".
KAGGLE = "kaggle" in CWD

In [None]:
print(CWD)

In [None]:
# Resolve the train/test CSV locations for the current environment.
if KAGGLE:
    # On Kaggle the competition files are addressed by fixed absolute paths.
    PATH_INPUT_TRAIN = "/kaggle/input/spaceship-titanic/train.csv"
    PATH_INPUT_TEST = "/kaggle/input/spaceship-titanic/test.csv"

else:
    # Locally the CSVs are expected in an `input` folder under the working directory.
    PATH_INPUT = os.path.join(CWD, "input")
    PATH_INPUT_TRAIN, PATH_INPUT_TEST = (
        os.path.join(PATH_INPUT, file_name) for file_name in ("train.csv", "test.csv")
    )

    # List the folder so its contents are visible in the cell output.
    print(os.listdir(PATH_INPUT))

In [None]:
sorted(os.listdir(CWD))

In [None]:
sys.path

In [None]:
# Put the local `src` directory first on the import path.
# NOTE(review): the import that follows is `from src import ...`, which
# resolves `src` through a sys.path entry that *contains* it; presumably this
# extra entry lets modules inside `src` import each other by bare name — confirm.
sys.path.insert(0, os.path.join(CWD, "src"))

In [None]:
from src import (
    SurnameExtractor,
    CabinLetterExtractor,
    CustomOrdinalEncoder
)

# Helper functions

In [None]:
def load_data():
    """Read the train and test CSV files, each indexed by ``PassengerId``.

    The file locations are taken from the module-level ``PATH_INPUT_TRAIN``
    and ``PATH_INPUT_TEST`` constants.

    Returns:
        tuple: ``(X_train, X_test)`` DataFrames, in that order.
    """
    frames = [
        pd.read_csv(csv_path).set_index("PassengerId")
        for csv_path in (PATH_INPUT_TRAIN, PATH_INPUT_TEST)
    ]

    return tuple(frames)

# Data Loading

In [None]:
X_train, X_test = load_data()

In [None]:
# Histogram (100 bins) of the "Spa" values that are greater than 100.
X_train.loc[X_train["Spa"] > 100, "Spa"].plot.hist(bins = 100)

In [None]:
# Split off the target. `pop` removes "Transported" from X_train in place and
# returns it, so re-running this cell without reloading the data raises KeyError.
y = X_train.pop("Transported")

# Impute data with Pipelines

In [None]:
# Imputation stage: each group of columns gets its own fill strategy.
# Later steps address the outputs as "<transformer name>__<column>" (for
# example "impute_cabin__Cabin"), so the transformer names below are relied on
# elsewhere in the notebook.
ct_impute = ColumnTransformer(
    transformers = [
        # Numeric columns: missing values are replaced by the column mean.
        ("impute_numerical", SimpleImputer(strategy="mean"), make_column_selector(dtype_include=np.number)),
        # Cabin: missing values become the literal "NA", which the cabin-number
        # extractor checks for explicitly.
        ("impute_cabin", SimpleImputer(strategy="constant", fill_value="NA"), ["Cabin"]),
        # Remaining listed columns: missing values are replaced by the most frequent value.
        (
            "impute_categorical",
            SimpleImputer(strategy="most_frequent"),
            ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Name']
        )
    ],
    # Columns not named above are discarded.
    remainder = 'drop'
)

In [None]:
ct_impute

# Preprocess Data Post Imputation

In [None]:
class MiddleLetterExtractor(BaseEstimator, TransformerMixin):
    """Extract the middle ``/``-separated field of a cabin string as a number.

    Despite the class name, the extracted value is not a letter: the second
    field of each cabin string (``cabin.split("/")[1]``) is converted with
    ``float``. Rows holding the missing-value token are mapped to a fixed
    placeholder instead.

    Parameters
    ----------
    column : str, default="impute_cabin__Cabin"
        Name of the column in ``X`` that holds the cabin strings.
    na_token : str, default="NA"
        Cell value that marks a missing cabin.
    na_value : int or float, default=2000
        Number emitted for rows whose cabin equals ``na_token``.
    """

    def __init__(self, column = "impute_cabin__Cabin", na_token = "NA", na_value = 2000):
        # scikit-learn convention: store constructor arguments unmodified under
        # the same names so `get_params` / `clone` keep working.
        self.column = column
        self.na_token = na_token
        self.na_value = na_value

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, ``self`` is returned."""
        return self

    def transform(self, X, y=None):
        """Return a single-column DataFrame with the numeric cabin field.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing ``self.column``.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            One column, named like ``self.column``, indexed like ``X``.
        """
        return X[self.column].apply(
            lambda cabin: self.na_value if cabin == self.na_token else float(cabin.split("/")[1])
        ).to_frame()

In [None]:
mle = MiddleLetterExtractor()

In [None]:
# Cabin-number pipeline: extract the numeric cabin field, then discretise it
# into uniform-width ordinal bins at three resolutions, side by side.
pipe_mle = Pipeline(
    steps = [
        ("mle", MiddleLetterExtractor()),
        (
            "fu_mle",
            FeatureUnion(
                transformer_list = [
                    (
                        f"kbins_{n_bins}",
                        KBinsDiscretizer(n_bins = n_bins, strategy = "uniform", encode = "ordinal"),
                    )
                    for n_bins in (20, 50, 100)
                ]
            ),
        ),
    ]
)

TODO: define an EmbeddingTransformer compatible with scikit-learn

https://medium.com/@micahmelling/categorical-embeddings-in-scikit-learn-pipeline-c81071be5140

In [None]:
# Age features, concatenated side by side: two threshold flags (12 and 18),
# an 8-bucket uniform-width ordinal binning, and the untouched value.
fu_age = FeatureUnion(
    transformer_list = [
        ("fe_age_12", Binarizer(threshold=12)),
        ("fe_age_18", Binarizer(threshold=18)),
        ("fe_age_buckets", KBinsDiscretizer(encode = "ordinal", strategy = "uniform", n_bins = 8)),
        # Keeps the original (imputed) age next to the derived features.
        ("fe_age_pass", "passthrough")       
    ]
)

In [None]:
# Two cabin-derived features produced by the project-local CabinLetterExtractor,
# configured once with letter="first" and once with letter="last".
# NOTE(review): presumably the leading and trailing characters of the cabin
# string — confirm against the implementation in `src`.
fu_cabin = FeatureUnion(
    transformer_list = [
        ("extract_first_letter", CabinLetterExtractor(letter="first")),
        ("extract_last_letter", CabinLetterExtractor(letter="last"))
    ]
)

In [None]:
# Cabin letter features: extract the letters, then one-hot encode them as a
# dense array.
fe_cabin = Pipeline(
    steps = [
        ("extract_letters_from_cabin", fu_cabin),
        # Despite the step name, the encoder receives everything `fu_cabin`
        # emits, i.e. both the first-letter and the last-letter outputs.
        ("ohe_first_letter", OneHotEncoder(sparse_output=False)),
    ]
)

In [None]:
# We collect all the surnames that exist in train and test
# because our CustomOrdinalEncoder can fail when doing CV
# or when predicting on test
surnames_ = pd.concat([X_train[["Name"]], X_test[["Name"]]]).dropna()

In [None]:
SURNAMES = SurnameExtractor().fit_transform(surnames_).values

In [None]:
# Surname feature: extract the surname with the project-local SurnameExtractor,
# then encode it with CustomOrdinalEncoder, which is handed SURNAMES (the
# surnames gathered from both the train and the test set).
fe_surname = Pipeline(
    steps = [
        ("extract_surname", SurnameExtractor()),
        ("custom_ordinal_encoder", CustomOrdinalEncoder(list_surnames=SURNAMES))
    ]
)

In [None]:
# Feature-engineering stage. It runs on the output of `ct_impute`, which is why
# the column names carry the "<imputer name>__" prefixes.
ct_fe = ColumnTransformer(
    transformers = [
        # Age: threshold flags, uniform buckets and the raw value (see `fu_age`).
        ("fu_age", fu_age, ["impute_numerical__Age"]),
        # Positional selection on the imputed frame, forwarded unchanged.
        # NOTE(review): presumably the five numeric columns that follow Age —
        # confirm the column order produced by `ct_impute`.
        ("pass", "passthrough", [1, 2, 3, 4, 5]),
        # Cabin is consumed twice: once for its letters, once for its number.
        ("fe_cabin", fe_cabin, ["impute_cabin__Cabin"]),
        ("mle", pipe_mle, ["impute_cabin__Cabin"]),
        # One-hot encoding (dense output) for three of the categorical columns.
        (
            "ohe",
            OneHotEncoder(sparse_output=False),
            ["impute_categorical__HomePlanet", "impute_categorical__CryoSleep", "impute_categorical__VIP"]
        ),
        # Destination is ordinal-encoded instead of one-hot encoded.
        ("oe", OrdinalEncoder(), ["impute_categorical__Destination"]),
        ("surname", fe_surname, ["impute_categorical__Name"])
    ],
    # Any column not selected by a transformer above is forwarded unchanged.
    remainder = "passthrough"
)

In [None]:
def drop_duplicated_columns(X):
    """Keep only the first column for every repeated column label.

    Args:
        X: pandas DataFrame whose column labels may contain repeats.

    Returns:
        DataFrame with the same rows, in which each label appears once; of
        several columns sharing a label, the left-most one is retained.

    NOTE(review): columns are compared by label only, never by content, so
    distinct features that happen to share a label are dropped as well.
    """
    is_repeat = X.columns.duplicated()
    first_occurrences = ~is_repeat

    return X.iloc[:, first_occurrences]

In [None]:
def get_pipe_without_model():
    """Build the preprocessing-only pipeline.

    The steps are, in order: imputation (``ct_impute``), feature engineering
    (``ct_fe``) and removal of repeated column labels.

    Returns:
        Pipeline: an unfitted pipeline without a final estimator. It wraps the
        module-level ``ct_impute`` / ``ct_fe`` objects rather than copies.
    """
    preprocessing_steps = [
        ("impute", ct_impute),
        ("fe", ct_fe),
        ("drop_duplicated_columns", FunctionTransformer(drop_duplicated_columns)),
    ]

    return Pipeline(steps = preprocessing_steps)

In [None]:
def get_pipe(model = None):
    """Build the preprocessing pipeline, optionally ending in an estimator.

    Args:
        model: estimator to append as the final ``"model"`` step. When
            ``None`` (default) the preprocessing-only pipeline is returned.

    Returns:
        Pipeline: the steps of ``get_pipe_without_model()``, followed by
        ``("model", model)`` when a model is given.
    """
    pipe = get_pipe_without_model()

    if model is None:
        return pipe

    # Extend the single preprocessing definition instead of restating its
    # steps, so the two pipeline variants cannot drift apart.
    return Pipeline(steps = pipe.steps + [("model", model)])

In [None]:
# Candidate models. Every assignment below rebinds `model`, so only the last
# one (the XGBClassifier) is in effect once this cell has run; the earlier
# objects are created and then discarded.
model = VotingClassifier(
    estimators = [
        ("xgboost", xgboost.XGBClassifier()),
        ("catboost", catboost.CatBoostClassifier()),
        ("hgb", HistGradientBoostingClassifier()),
        ("rf", RandomForestClassifier())
    ]
)

model = HistGradientBoostingClassifier()
model = catboost.CatBoostClassifier()
model = xgboost.XGBClassifier()

In [None]:
pipe = get_pipe(model = model)

In [None]:
pipe

In [None]:
# cv_scores = cross_val_score(estimator=pipe, X=X_train, y=y, cv=10, n_jobs = -1)

# cv_scores

# np.mean(cv_scores)

# np.std(cv_scores)

# Model Explainability

In [None]:
def get_fi(model):
    """Return the per-feature importance scores of a fitted model.

    Args:
        model: fitted estimator.

    Returns:
        The result of ``get_feature_importance()`` for a CatBoostClassifier,
        the ``feature_importances_`` attribute for an XGBClassifier or any
        other estimator that exposes it, and ``None`` when the model offers
        neither (for instance a VotingClassifier).
    """
    if isinstance(model, catboost.CatBoostClassifier):
        return model.get_feature_importance()

    if isinstance(model, xgboost.XGBClassifier):
        return model.feature_importances_

    # Any other estimator following the `feature_importances_` convention
    # (e.g. tree ensembles) is supported too; otherwise fall back to None.
    return getattr(model, "feature_importances_", None)

In [None]:
def create_fi_df(model, X):
    """Build a feature-importance table sorted from most to least important.

    Args:
        model: fitted estimator understood by ``get_fi``.
        X: DataFrame whose columns are the features the model was trained on.

    Returns:
        DataFrame with the columns ``FeatureName`` and ``FeatureImportance``,
        ordered by descending importance. The importance column keeps its
        numeric dtype.

    Raises:
        TypeError: if ``get_fi`` cannot provide importances for ``model``.
        ValueError: if the number of importances differs from the number of
            columns in ``X``.
    """
    fi = get_fi(model = model)

    if fi is None:
        # Fail with a clear message instead of an opaque constructor error.
        raise TypeError(f"{type(model).__name__} does not expose feature importances")

    # Building the frame column-wise keeps the importances numeric; stacking
    # names and scores as two rows and transposing would yield object columns.
    fidf = pd.DataFrame(
        data = {
            "FeatureName": X.columns,
            "FeatureImportance": fi
        }
    )

    return fidf.sort_values("FeatureImportance", ascending = False)

In [None]:
def run_experiment(model, X_train, y, cv = 10, scoring = "accuracy", n_jobs = -1):
    """Cross-validate a model inside the full pipeline, then fit it on all data.

    Args:
        model: estimator placed as the last pipeline step.
        X_train: training features (raw, before imputation).
        y: training target.
        cv: cross-validation setting forwarded to ``cross_val_score``.
        scoring: metric forwarded to ``cross_val_score``.
        n_jobs: parallelism forwarded to ``cross_val_score``.

    Returns:
        tuple: ``(pipe, cv_scores, Xt, model_fitted, fidf)`` where ``pipe`` is
        the pipeline fitted on all of ``X_train``, ``cv_scores`` the per-fold
        scores, ``Xt`` the preprocessed training frame, ``model_fitted`` the
        final pipeline step and ``fidf`` the feature-importance table, or
        ``None`` when importances could not be produced.
    """
    pipe = get_pipe(model = model)

    cv_scores = cross_val_score(
        estimator = pipe,
        X = X_train,
        y = y,
        cv = cv,
        n_jobs = n_jobs,
        scoring = scoring
    )

    print(str(model))
    print(cv_scores)
    print(f"Mean: {np.mean(cv_scores)}")
    print(f"Std: {np.std(cv_scores)}")

    # The preprocessing steps are fitted here to obtain the transformed frame
    # and then fitted again as part of the full pipeline fit below.
    Xt = pipe[:-1].fit_transform(X_train)
    pipe.fit(X_train, y)
    model_fitted = pipe[-1]

    # Feature importances are best effort: not every model provides them.
    # Catch `Exception` only, so KeyboardInterrupt / SystemExit still propagate.
    try:
        fidf = create_fi_df(model = model_fitted, X = Xt)
    except Exception as error:
        print(f"Feature importances unavailable: {error}")
        fidf = None

    return pipe, cv_scores, Xt, model_fitted, fidf

In [None]:
pipe_xgb, cv_scores_xgb, Xt_xgb, model_fitted_xgb, fidf_xgb = run_experiment(
    model = xgboost.XGBClassifier(),
    X_train = X_train,
    y = y
)

In [None]:
fidf_xgb

In [None]:
pipe_cat, cv_scores_cat, Xt_cat, model_fitted_cat, fidf_cat = run_experiment(
    model = catboost.CatBoostClassifier(verbose = 0),
    X_train = X_train,
    y = y
)

In [None]:
fidf_cat

In [None]:
# Ensemble of four classifiers combined with VotingClassifier's default
# voting settings.
model = VotingClassifier(
    estimators = [
        # XGBoost's constructor-level logging switch is `verbosity`; `verbose`
        # is not a constructor parameter and would only be forwarded as an
        # unknown booster argument.
        ("xgboost", xgboost.XGBClassifier(verbosity = 0)),
        ("catboost", catboost.CatBoostClassifier(verbose = 0)),
        ("hgb", HistGradientBoostingClassifier()),
        ("rf", RandomForestClassifier())
    ]
)

In [None]:
model

In [None]:
pipe_vc, cv_scores_vc, Xt_vc, model_fitted_vc, fidf_vc = run_experiment(
    model = model,
    X_train = X_train,
    y = y
)

# Submission to Kaggle

In [None]:
pipe = pipe_vc

In [None]:
y_pred = pipe.predict(X_test)

In [None]:
timestamp = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
timestamp

In [None]:
X_test_copy = X_test.copy(deep = True)

In [None]:
# Attach the predictions as plain Python booleans.
X_test_copy["Transported"] = [bool(prediction) for prediction in y_pred]

In [None]:
# Decide where the submission file is written.
if not KAGGLE:
    # Locally, submissions go to an `outputs` folder under the working
    # directory; create it on demand so the export cannot fail on a fresh clone.
    dir_export = os.path.join(CWD, "outputs")
    os.makedirs(dir_export, exist_ok = True)
    path_export = os.path.join(dir_export, f"submission_{timestamp}.csv")
else:
    # On Kaggle the file is written to the current working directory.
    path_export = f"submission_{timestamp}.csv"

In [None]:
# Write the two-column submission file: PassengerId (restored from the index)
# and the predicted Transported flag, without the positional index.
X_test_copy.reset_index().loc[:, ["PassengerId", "Transported"]].to_csv(path_export, index = False)

In [None]:
# Benchmark -                        0.7234581961033293
# New Model with OHE -               0.7155214740155814
# New Model with 4 depth -           0.764529582159438
# Model with 4 depth and CabinFe -   0.7709713900242054
# HistGradient default params and -  0.7998441860772721
# HistGradient default params and -  0.8028344113328835  # Leaderboard - 1359
# default pipe + OE of Destination

# HistGradient default params and -  0.8036411253521687  # Leaderboard - 1359 - 0.79331
# default pipe + OE of Destination
# First and Last letter of Cabin

# HistGradient default params and -  0.8043334259222517 # Leaderboard -  1093 - 0.79682
# default pipe + OE of Destination
# First and Last letter of Cabin
# Surname CustomOrdinalEncoder

# CatBoost default params and -      0.799844450617039 # Leaderboard -  985 - 0.79822
# default pipe + OE of Destination
# First and Last letter of Cabin
# Surname CustomOrdinalEncoder

# CatBoost default params and -      0.8023753025673586 # Leaderboard -  965 - 0.79869
# default pipe + OE of Destination
# First and Last letter of Cabin
# Surname CustomOrdinalEncoder
# KBinsDiscretizer + Age

# Pipe with VotingClassifier        0.800306601589884   # Leaderboard - 966 - 0.79518
# 4 models: cat, xgb, hgb, rf

# Pipe with Catboost
# 20, 50, 100 KBinsDiscretizer Cabin 0.7833970874171661 # Leaderboard - 744 - 0.80149

# Pipe with VotingClassifier (cat, xgb, rf, hgb)
# 20, 50, 100 KBinsDiscretizer Cabin 0.7830545084189782 # Leaderboard - 672 - 0.80243

# The End