In [39]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score

import optuna

from catboost import CatBoostClassifier

In [2]:
# Read the training CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer="../data/Spaceship Titanic/train.csv")

In [3]:
# PassengerId and Name are removed before modelling.
# Reassign instead of mutating in place: `inplace=True` brings no performance
# benefit and hides state changes from anyone reading the notebook top-down.
df = df.drop(columns=["PassengerId", "Name"])

In [4]:
# Preview the first five rows after dropping the identifier columns.
df.head(n=5)

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True


In [5]:
# Number of distinct raw Cabin values; NaN is counted as its own value.
df["Cabin"].nunique(dropna=False)

6561

In [6]:
# Reduce Cabin to its first character (e.g. "B/0/P" -> "B") to cut cardinality.
# Missing cabins stay NaN and are handled later by the categorical imputer.
# Note: filling with a non-string placeholder such as 0 beforehand does not
# help, because `.str[0]` returns NaN for every non-string value.
df["Cabin"] = df["Cabin"].str[0]

In [7]:
# Distinct Cabin values after truncation; NaN is counted as its own value.
df["Cabin"].nunique(dropna=False)

9

In [8]:
# Fraction of missing values per column.
df.isnull().mean()

HomePlanet      0.023122
CryoSleep       0.024963
Cabin           0.022892
Destination     0.020936
Age             0.020591
VIP             0.023352
RoomService     0.020821
FoodCourt       0.021051
ShoppingMall    0.023927
Spa             0.021051
VRDeck          0.021627
Transported     0.000000
dtype: float64

In [9]:
# Inspect the column dtypes to decide which columns need casting before
# they are split into categorical and numeric feature lists.
df.dtypes

HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Transported        bool
dtype: object

In [12]:
# Cast the two True/False/NaN object columns to pandas' nullable "boolean"
# dtype in a single call.
df = df.astype({"CryoSleep": "boolean", "VIP": "boolean"})

In [13]:
# Classify every column once, then derive both feature lists from the result.
# The target "Transported" is bool and must be excluded from the numeric list
# explicitly, which implies pandas treats bool as numeric here.
# NOTE(review): CryoSleep / VIP (nullable boolean) presumably end up in
# num_labels as well, i.e. they are median-imputed rather than one-hot
# encoded - confirm this is intended.
numeric_flags = {col: pd.api.types.is_numeric_dtype(df[col]) for col in df.columns}
cat_labels = [col for col, is_num in numeric_flags.items() if not is_num]
num_labels = [
    col for col, is_num in numeric_flags.items()
    if is_num and col != "Transported"
]

In [14]:
# Categorical branch: fill gaps with the literal "missing", then one-hot encode.
# Categories unseen during fit are ignored at transform time.
cat_steps = [
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
]
cat_imputer = Pipeline(steps=cat_steps)

In [15]:
# Numeric branch: fill gaps with the column median.
num_steps = [("imputer", SimpleImputer(strategy="median"))]
num_imputer = Pipeline(steps=num_steps)

In [16]:
# Route each column group through its own imputation branch; any column not
# listed in either group is passed through unchanged.
column_branches = [
    ("cat_imputer", cat_imputer, cat_labels),
    ("num_imputer", num_imputer, num_labels),
]
preprocessor = ColumnTransformer(
    transformers=column_branches,
    remainder="passthrough",
)

In [17]:
# End-to-end baseline: preprocessing followed by a CatBoost classifier with
# training output silenced.
catboost_clf = CatBoostClassifier(verbose=False)
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", catboost_clf),
])

In [18]:
# Separate the feature matrix from the target column.
target_col = "Transported"
X, y = df.drop(columns=[target_col]), df[target_col]

In [19]:
# Hold out 30% of the rows for validation.
# A fixed random_state makes the split - and therefore every score computed
# below - reproducible across kernel restarts.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [20]:
# Fit the baseline pipeline on the training split.
model.fit(X=X_train, y=y_train)

In [21]:
# Score the fitted baseline on the held-out validation split.
model.score(X=X_valid, y=y_valid)

0.8048312883435583

In [33]:
# Search space: imputation strategy for each preprocessing branch.
# Keys follow scikit-learn's "<step>__<substep>__<param>" addressing.
cat_strategies = ["constant", "most_frequent"]
num_strategies = ["mean", "median"]
grid_params = {
    "preprocessor__cat_imputer__imputer__strategy": cat_strategies,
    "preprocessor__num_imputer__imputer__strategy": num_strategies,
}

In [34]:
# 5-fold grid search over the imputation strategies, ranked by accuracy and
# run on all available cores.
gs_model = GridSearchCV(
    estimator=model,
    param_grid=grid_params,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)

In [35]:
# Run the grid search on the training split.
gs_model.fit(X=X_train, y=y_train)

In [40]:
# Score the grid-search result on the held-out validation split.
gs_model.score(X=X_valid, y=y_valid)

0.8048312883435583

Tuning the imputation strategies didn't make a difference. Let's try Optuna.

In [41]:
# Materialise the preprocessed matrices once: fit on the training split only,
# then apply the fitted transformer to the validation split.
X_train_encoded = preprocessor.fit_transform(X=X_train)
X_valid_encoded = preprocessor.transform(X=X_valid)

In [42]:
# NOTE: this rebinds `model`, which earlier in the notebook referred to the
# full preprocessing + CatBoost Pipeline, to a bare, unfitted classifier.
# Re-running earlier cells that use `model` after this one will therefore
# operate on a different kind of object.
model = CatBoostClassifier(verbose=False)

In [None]:
def objective(trial):
    """Objective function taking an Optuna-style `trial` (work in progress).

    As written here it only samples an `iterations` value between 100 and
    1000 into `params`.
    NOTE(review): no model is trained and nothing is returned in the visible
    body - presumably unfinished; confirm before passing it to a study.
    """
    params = {
        "iterations": trial.suggest_int("iterations", 100, 1000),
        
    }