In [58]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.pipeline import Pipeline

from catboost import CatBoostClassifier
import time

In [59]:
train_data = pd.read_csv('../pre_data/pre_training.csv')
train_data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,cabin_deck,cabin_num,cabin_side
0,0001_01,Europa,0,TRAPPIST-1e,39.0,0.0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0,B,0.0,P
1,0002_01,Earth,0,TRAPPIST-1e,24.0,0.0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1,F,0.0,S
2,0003_01,Europa,0,TRAPPIST-1e,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0,A,0.0,S
3,0003_02,Europa,0,TRAPPIST-1e,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0,A,0.0,S
4,0004_01,Earth,0,TRAPPIST-1e,16.0,0.0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1,F,1.0,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,0,55 Cancri e,41.0,1.0,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,0,A,98.0,P
8689,9278_01,Earth,1,PSO J318.5-22,18.0,0.0,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,0,G,1499.0,S
8690,9279_01,Earth,0,TRAPPIST-1e,26.0,0.0,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,1,G,1500.0,S
8691,9280_01,Europa,0,55 Cancri e,32.0,0.0,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,0,E,608.0,S


In [60]:
test_data = pd.read_csv('../pre_data/pre_test.csv')
test_data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,cabin_deck,cabin_num,cabin_side
0,0013_01,Earth,1,TRAPPIST-1e,27.0,0.0,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,G,3.0,S
1,0018_01,Earth,0,TRAPPIST-1e,19.0,0.0,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers,F,4.0,S
2,0019_01,Europa,1,55 Cancri e,31.0,0.0,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus,C,0.0,S
3,0021_01,Europa,0,TRAPPIST-1e,38.0,0.0,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter,C,1.0,S
4,0023_01,Earth,0,TRAPPIST-1e,20.0,0.0,10.0,0.0,635.0,0.0,0.0,Brence Harperez,F,5.0,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,1,TRAPPIST-1e,34.0,0.0,0.0,0.0,0.0,0.0,0.0,Jeron Peter,G,1496.0,S
4273,9269_01,Earth,0,TRAPPIST-1e,42.0,0.0,0.0,847.0,17.0,10.0,144.0,Matty Scheron,G,1044.0,S
4274,9271_01,Mars,1,55 Cancri e,26.0,0.0,0.0,0.0,0.0,0.0,0.0,Jayrin Pore,D,296.0,P
4275,9273_01,Europa,0,55 Cancri e,26.0,0.0,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale,D,297.0,P


In [61]:
for df in [train_data, test_data]:
    df["VIP"] = df["VIP"].astype(str).str.lower().map({
        "true": 1.0,
        "false": 0.0,
        "1": 1.0,
        "0": 0.0,
        "1.0": 1.0,
        "0.0": 0.0
    }).fillna(0.0).astype(float)

train_data = train_data.drop(columns='Name')
test_data = test_data.drop(columns='Name')

In [62]:
categorical_cols = ['HomePlanet', 'Destination', 'cabin_deck', 'cabin_side']

In [63]:
all_data = pd.concat([train_data, test_data], axis=0)

all_data = pd.get_dummies(all_data, columns=categorical_cols, drop_first=False)

dummy_cols = [col for col in all_data.columns if any(cat in col for cat in categorical_cols)]

all_data[dummy_cols] = all_data[dummy_cols].astype(float)


train_data = all_data.iloc[:len(train_data), :].copy()
test_data  = all_data.iloc[len(train_data):, :].copy()

In [64]:
from numpy import log1p

# ---------- TOTAL SPENDING ----------
train_data["TotalSpent"] = (
    train_data["RoomService"] +
    train_data["FoodCourt"] +
    train_data["ShoppingMall"] +
    train_data["Spa"] +
    train_data["VRDeck"]
)

test_data["TotalSpent"] = (
    test_data["RoomService"] +
    test_data["FoodCourt"] +
    test_data["ShoppingMall"] +
    test_data["Spa"] +
    test_data["VRDeck"]
)

# ---------- TOTAL SPENDING ----------
train_data["TotalSpent_log1p"] = log1p(
    (
    train_data["RoomService"] +
    train_data["FoodCourt"] +
    train_data["ShoppingMall"] +
    train_data["Spa"] +
    train_data["VRDeck"]
)
)

test_data["TotalSpent_log1p"] = log1p(
    (
    test_data["RoomService"] +
    test_data["FoodCourt"] +
    test_data["ShoppingMall"] +
    test_data["Spa"] +
    test_data["VRDeck"]
)
)


#--------------------------------------------------------------------------------------#
train_data['spa_log1p'] = log1p(train_data['Spa'] + train_data['VRDeck'])
test_data['spa_log1p'] = log1p(test_data['Spa'] + test_data['VRDeck'])

train_data['cabin_log'] = log1p(train_data['cabin_num'] + train_data['Spa'])
test_data['cabin_log'] = log1p(test_data['cabin_num'] + test_data['Spa'])

train_data['cabin_log_1'] = log1p(train_data['RoomService'] + train_data['FoodCourt'])
test_data['cabin_log_1'] = log1p(test_data['RoomService'] + test_data['FoodCourt'])

train_data['food_log_1'] = log1p(train_data['FoodCourt'] + train_data['ShoppingMall'])
test_data['food_log_1'] = log1p(test_data['FoodCourt'] + test_data['ShoppingMall'])
#---------------------------------------------------------------------------------------#
train_data = train_data.drop(columns=['VIP', 'CryoSleep', 'TotalSpent', 'Age'])
test_data = test_data.drop(columns=['VIP', 'CryoSleep', 'TotalSpent', 'Age'])

In [65]:
psg_id = train_data['PassengerId']
train_data = train_data.drop(columns='PassengerId')

In [66]:
train_data

Unnamed: 0,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,cabin_num,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,...,cabin_deck_F,cabin_deck_G,cabin_deck_T,cabin_side_P,cabin_side_S,TotalSpent_log1p,spa_log1p,cabin_log,cabin_log_1,food_log_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
1,109.0,9.0,25.0,549.0,44.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,6.602588,6.386879,6.309918,4.779123,3.555348
2,43.0,3576.0,0.0,6715.0,49.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,9.248021,8.819518,8.812248,8.194229,8.182280
3,0.0,1283.0,371.0,3329.0,193.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,8.551981,8.167068,8.110728,7.157735,7.411556
4,303.0,70.0,151.0,565.0,2.0,1.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,6.995766,6.342121,6.340359,5.924256,5.402677
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,0.0,6819.0,0.0,1643.0,74.0,0.0,98.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,9.052165,7.448916,7.462789,8.827615,8.827615
8689,0.0,0.0,0.0,0.0,0.0,0.0,1499.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.000000,0.000000,7.313220,0.000000,0.000000
8690,0.0,0.0,1872.0,1.0,0.0,1.0,1500.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,7.535830,0.693147,7.314553,0.000000,7.535297
8691,0.0,1049.0,0.0,353.0,3235.0,0.0,608.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,8.442039,8.185629,6.869014,6.956545,6.956545


In [67]:
test_data

Unnamed: 0,PassengerId,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,cabin_num,HomePlanet_Earth,HomePlanet_Europa,...,cabin_deck_F,cabin_deck_G,cabin_deck_T,cabin_side_P,cabin_side_S,TotalSpent_log1p,spa_log1p,cabin_log,cabin_log_1,food_log_1
0,0013_01,0.0,0.0,0.0,0.0,0.0,,3.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.000000,0.000000,1.386294,0.000000,0.000000
1,0018_01,0.0,9.0,0.0,2823.0,0.0,,4.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,7.949091,7.945910,7.947325,2.302585,2.302585
2,0019_01,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.000000,0.000000,0.000000,0.000000,0.000000
3,0021_01,0.0,6652.0,0.0,181.0,585.0,,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,8.911800,6.642487,5.209486,8.802823,8.802823
4,0023_01,10.0,0.0,635.0,0.0,0.0,,5.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,6.470800,0.000000,1.791759,2.397895,6.455199
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,0.0,0.0,0.0,0.0,0.0,,1496.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.000000,0.000000,7.311218,0.000000,0.000000
4273,9269_01,0.0,847.0,17.0,10.0,144.0,,1044.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,6.926577,5.043425,6.961296,6.742881,6.762730
4274,9271_01,0.0,0.0,0.0,0.0,0.0,,296.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.000000,0.000000,5.693732,0.000000,0.000000
4275,9273_01,0.0,2680.0,0.0,0.0,523.0,,297.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,8.072155,6.261492,5.697093,7.893945,7.893945


In [68]:
test_data = test_data.drop(columns='Transported', errors='ignore')

In [69]:
X = train_data.drop("Transported", axis=1)
y = train_data["Transported"]

In [70]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.08, random_state=42)

model = CatBoostClassifier(
    iterations=2300,
    learning_rate=0.018,
    depth=6,
    random_state=42,
    auto_class_weights='Balanced',
    l2_leaf_reg=3,
    bagging_temperature=1,
    border_count=254,
    loss_function='Logloss',
    eval_metric='AUC',
)

model.fit(X_train, y_train,)

0:	total: 2.78ms	remaining: 6.4s
1:	total: 5.07ms	remaining: 5.82s
2:	total: 7.54ms	remaining: 5.77s
3:	total: 9.88ms	remaining: 5.67s
4:	total: 12.1ms	remaining: 5.56s
5:	total: 14.6ms	remaining: 5.57s
6:	total: 17ms	remaining: 5.58s
7:	total: 19.4ms	remaining: 5.55s
8:	total: 21.8ms	remaining: 5.54s
9:	total: 24.3ms	remaining: 5.57s
10:	total: 26.7ms	remaining: 5.55s
11:	total: 29.4ms	remaining: 5.61s
12:	total: 31.7ms	remaining: 5.58s
13:	total: 34ms	remaining: 5.54s
14:	total: 36.2ms	remaining: 5.52s
15:	total: 39.3ms	remaining: 5.61s
16:	total: 41.7ms	remaining: 5.6s
17:	total: 44.3ms	remaining: 5.62s
18:	total: 46.7ms	remaining: 5.61s
19:	total: 49.1ms	remaining: 5.6s
20:	total: 51.5ms	remaining: 5.59s
21:	total: 54ms	remaining: 5.59s
22:	total: 56.4ms	remaining: 5.58s
23:	total: 58.9ms	remaining: 5.58s
24:	total: 61.2ms	remaining: 5.57s
25:	total: 63.6ms	remaining: 5.56s
26:	total: 65.9ms	remaining: 5.55s
27:	total: 68.4ms	remaining: 5.55s
28:	total: 70.9ms	remaining: 5.55s
29:	

<catboost.core.CatBoostClassifier at 0x203fe4a07a0>

In [71]:
model.score(X_test, y_test)

np.float64(0.8218390804597702)

In [72]:
psg_id_test = test_data["PassengerId"]

In [73]:
test_data = test_data.drop(columns="PassengerId", errors="ignore")

In [74]:
pred = model.predict(test_data)

In [75]:
submission = pd.DataFrame({
    "PassengerId": psg_id_test,
    "Transported": pred.astype(bool)
})

submission.to_csv("submission.csv", index=False)

In [76]:
imp = model.get_feature_importance()

f_imp = pd.DataFrame({
    'Feature': X.columns,
    'Importance': imp
}).sort_values('Importance', ascending=False)

f_imp.head(15)

Unnamed: 0,Feature,Importance
22,TotalSpent_log1p,12.022833
23,spa_log1p,11.736998
5,cabin_num,9.952694
26,food_log_1,7.385847
24,cabin_log,7.230698
0,RoomService,6.059265
6,HomePlanet_Earth,4.472644
4,VRDeck,3.811728
25,cabin_log_1,3.776195
1,FoodCourt,3.548262
