In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, RocCurveDisplay

import optuna

from catboost import CatBoostClassifier

import matplotlib.pyplot as plt

In [2]:
# Load the training CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer="../data/Spaceship Titanic/train.csv")

In [3]:
# Remove the PassengerId and Name columns from the feature table.
# Rebinding `df` (instead of `inplace=True`) together with errors="ignore"
# keeps this cell safe to re-run; an in-place drop raises KeyError the second
# time because the columns are already gone.
df = df.drop(columns=["PassengerId", "Name"], errors="ignore")

In [4]:
# Preview the first five rows after dropping the unused columns.
df.head(n=5)

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True


In [5]:
# Number of distinct raw Cabin values, counting NaN as one value.
df["Cabin"].nunique(dropna=False)

6561

In [6]:
# Reduce each cabin string (e.g. "B/0/P") to its first character.
# Missing cabins are deliberately left as NaN: filling them with a non-string
# placeholder such as 0 before `.str[0]` has no effect, because the `.str`
# accessor maps non-string entries straight back to NaN. The NaN values are
# handled later by the categorical imputer in the preprocessing pipeline.
df["Cabin"] = df["Cabin"].str[0]

In [7]:
# Number of distinct Cabin values after the reduction, counting NaN as one value.
df["Cabin"].nunique(dropna=False)

9

In [8]:
# Fraction of missing values in every column.
df.isnull().mean()

HomePlanet      0.023122
CryoSleep       0.024963
Cabin           0.022892
Destination     0.020936
Age             0.020591
VIP             0.023352
RoomService     0.020821
FoodCourt       0.021051
ShoppingMall    0.023927
Spa             0.021051
VRDeck          0.021627
Transported     0.000000
dtype: float64

In [9]:
# Show the dtype of every column; the dtypes drive the categorical/numeric
# column split used by the preprocessing pipeline.
df.dtypes

HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Transported        bool
dtype: object

In [10]:
# Cast the two object-typed flag columns to pandas' nullable "boolean" dtype
# in a single call; missing entries stay missing.
df = df.astype({"CryoSleep": "boolean", "VIP": "boolean"})

In [11]:
# Split the columns by dtype: non-numeric columns are treated as categorical,
# numeric ones (except the target "Transported") as numeric features.
# NOTE(review): pandas counts bool/boolean dtypes as numeric here, so CryoSleep
# and VIP presumably end up in `num_labels` — confirm that is intended.
cat_labels = [
    col
    for col, dtype in df.dtypes.items()
    if not pd.api.types.is_numeric_dtype(dtype)
]
num_labels = [
    col
    for col, dtype in df.dtypes.items()
    if col != "Transported" and pd.api.types.is_numeric_dtype(dtype)
]

In [12]:
# Categorical branch: replace missing values with the literal "missing", then
# one-hot encode. Categories unseen during fit are ignored at transform time.
# Step names are referenced by the grid-search parameter keys; keep them stable.
cat_imputer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [13]:
# Numeric branch: fill missing values with the column median.
# The step name is referenced by the grid-search parameter keys; keep it stable.
num_imputer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
    ]
)

In [14]:
# Route categorical and numeric columns to their respective branches; any
# column not listed in either group is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat_imputer", cat_imputer, cat_labels),
        ("num_imputer", num_imputer, num_labels),
    ],
    remainder="passthrough",
)

In [15]:
# End-to-end estimator: preprocessing followed by a CatBoost classifier with
# training output silenced.
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", CatBoostClassifier(verbose=False)),
    ]
)

In [16]:
# Separate the target column from the feature columns.
y = df["Transported"]
X = df.drop(columns=["Transported"])

In [17]:
# Hold out 30% of the rows for validation. A fixed seed makes the split (and
# every score derived from it) reproducible across kernel restarts, and
# stratifying on the target keeps the class balance equal in both parts.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [18]:
# Fit the preprocessing + CatBoost pipeline on the training split.
# The trailing ';' suppresses the estimator repr in the cell output.
model.fit(X_train, y_train);

In [19]:
# Score the fitted pipeline on the validation split.
# `task_type` and `devices` are CatBoostClassifier constructor options, not
# scoring arguments, so they must not be passed to `score`.
model.score(X_valid, y_valid)

0.817101226993865

In [20]:
# Search space for the pipeline: imputation strategy of each branch, with the
# CatBoost step pinned to GPU device "0". Keys follow sklearn's
# "<step>__<param>" naming, which makes them valid keyword names.
grid_params = dict(
    preprocessor__cat_imputer__imputer__strategy=["constant", "most_frequent"],
    preprocessor__num_imputer__imputer__strategy=["mean", "median"],
    model__task_type=["GPU"],
    model__devices=["0"],
)

In [21]:
# 5-fold grid search over `grid_params`, scored by accuracy.
# The grid pins every fit to GPU device "0", so fits run one at a time
# (n_jobs=1): parallel workers would all compete for the same device and its
# memory instead of speeding the search up.
gs_model = GridSearchCV(model, grid_params, cv=5, n_jobs=1, scoring="accuracy")

In [None]:
# Run the grid search on the training split; ';' hides the estimator repr.
gs_model.fit(X_train, y_train);

In [None]:
# Score the grid search result on the held-out validation split.
gs_model.score(X_valid, y_valid)

In [None]:
# Parameter combination selected by the grid search.
gs_model.best_params_

In [None]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformation to the validation split, so no validation statistics leak
# into the imputers or the encoder.
# NOTE(review): this uses the `preprocessor` object as configured above, not
# the imputation strategies selected by the grid search — confirm intended.
X_train_encoded = preprocessor.fit_transform(X_train)
X_valid_encoded = preprocessor.transform(X_valid)

In [None]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a CatBoost model.

    Samples `iterations`, `learning_rate`, `depth` and `l2_leaf_reg` from the
    trial, trains a GPU CatBoost classifier on the encoded training split and
    evaluates it on the encoded validation split.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        ROC AUC on the validation split, computed from predicted
        probabilities of the positive class.
    """
    params = {
        "iterations": trial.suggest_int("iterations", 100, 1000),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 2, 8),
        "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 2, 8),
        "task_type": "GPU",
    }

    model = CatBoostClassifier(**params, verbose=False, early_stopping_rounds=100)
    # Early stopping only takes effect when an evaluation set is supplied.
    model.fit(X_train_encoded, y_train, eval_set=(X_valid_encoded, y_valid))
    # ROC AUC is a ranking metric: score it with the probability of the
    # positive class (second column), not with hard 0/1 predictions.
    y_scores = model.predict_proba(X_valid_encoded)[:, 1]
    return roc_auc_score(y_valid, y_scores)

In [None]:
# Create a study that maximizes the objective's return value (ROC AUC).
study = optuna.create_study(direction="maximize");

In [None]:
# Run 300 trials sequentially. The objective trains on a single GPU, so
# running trials in parallel (n_jobs=-1) would make them all compete for that
# one device instead of finishing sooner.
study.optimize(objective, n_trials=300, n_jobs=1, show_progress_bar=True);

In [None]:
# Hyperparameters of the best trial found by the study.
study.best_params

In [None]:
# Plot how the objective value evolved over the trials of the study.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Final classifier with fixed hyperparameters.
# NOTE(review): the values were presumably copied from an earlier
# `study.best_params` — confirm they still match the current study.
# This rebinds `model`, which earlier in the notebook named the sklearn pipeline.
model = CatBoostClassifier(
    iterations=713,
    learning_rate=0.06461623500905105,
    depth=4,
    l2_leaf_reg=2,
    verbose=False,
)

In [None]:
# Train the final classifier on the encoded training split, using the encoded
# validation split as evaluation set and showing CatBoost's training plot.
model.fit(X_train_encoded, y_train, plot=True, eval_set=(X_valid_encoded, y_valid));

In [None]:
# Score the final classifier on the encoded validation split.
model.score(X_valid_encoded, y_valid)

In [None]:
# Draw the ROC curve of the final classifier on the validation split.
# pyplot has no `figsize` function; the figure size is set when the figure is
# created, and the axes are handed to the display so it draws on that figure.
fig, ax = plt.subplots(figsize=(14, 8))
RocCurveDisplay.from_estimator(model, X_valid_encoded, y_valid, ax=ax);