# üõ∞Ô∏è Spaceship Titanic ‚Äì CatBoost Solution Notebook

This notebook builds a complete Kaggle pipeline for the **Spaceship Titanic** competition:

1. Load data
2. Feature engineering (Cabin, Passenger groups, spending)
3. Missing value handling
4. Encoding
5. Model training with **CatBoost** + cross-validation
6. Train on full data and generate `submission.csv`


In [1]:

import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score

from catboost import CatBoostClassifier

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


## 1. Load data

In [2]:
#load data files

train = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")
test = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")

print(train.shape, test.shape)
train.head()


(8693, 14) (4277, 13)


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


## 2. Quick missing value overview

In [3]:

train.isna().sum()


PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

## 3. Feature engineering

We engineer features from `Cabin`, `PassengerId`, and spending columns.

In [4]:

# Make copies to work on
train_fe = train.copy()
test_fe = test.copy()

# 3.1 Split Cabin into Deck / CabinNum / Side
for df in [train_fe, test_fe]:
    df[['Deck', 'CabinNum', 'Side']] = df['Cabin'].str.split('/', expand=True)
    df['CabinNum'] = pd.to_numeric(df['CabinNum'], errors='coerce')

# 3.2 Passenger group features from PassengerId
for df in [train_fe, test_fe]:
    df['Group'] = df['PassengerId'].str.split('_').str[0]
    df['GroupPos'] = df['PassengerId'].str.split('_').str[1].astype(int)
    df['GroupSize'] = df.groupby('Group')['Group'].transform('count')

# 3.3 Spending features
spending_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

for df in [train_fe, test_fe]:
    # Total spent
    df['TotalSpent'] = df[spending_cols].sum(axis=1)
    # Indicator: spent nothing
    df['SpentZero'] = (df[spending_cols].sum(axis=1) == 0).astype(int)
    # Log-transformed spends
    for col in spending_cols:
        df[col + '_log'] = np.log(df[col].fillna(0) + 1)

# 3.4 Cabin number bucket
bins = [-1, 50, 150, 300, 600, 1000]
labels = ['very_low', 'low', 'mid', 'high', 'very_high']

for df in [train_fe, test_fe]:
    df['CabinNumBucket'] = pd.cut(df['CabinNum'], bins=bins, labels=labels)


## 4. Missing value handling

In [5]:

# 4.1 Fill spending with 0
for df in [train_fe, test_fe]:
    for col in spending_cols:
        df[col] = df[col].fillna(0)

# 4.2 Age with median
for df in [train_fe, test_fe]:
    df['Age'] = df['Age'].fillna(df['Age'].median())

# 4.3 VIP: missing -> False (0)
for df in [train_fe, test_fe]:
    df['VIP'] = df['VIP'].fillna(False).astype(int)

# 4.4 CryoSleep: if all spending = 0 and CryoSleep is NaN -> True; then fill remaining with False
for df in [train_fe, test_fe]:
    df['CryoSleep'] = df['CryoSleep'].map({True: 1, False: 0})
    df.loc[(df[spending_cols].sum(axis=1) == 0) & (df['CryoSleep'].isna()), 'CryoSleep'] = 1
    df['CryoSleep'] = df['CryoSleep'].fillna(0).astype(int)

# 4.5 Deck, Side, CabinNum: fill by mode/median
for df in [train_fe, test_fe]:
    for col in ['Deck', 'Side']:
        df[col] = df[col].fillna(df[col].mode()[0])
    df['CabinNum'] = df['CabinNum'].fillna(df['CabinNum'].median())
    df['CabinNumBucket'] = df['CabinNumBucket'].cat.add_categories('unknown').fillna('unknown')

# 4.6 HomePlanet, Destination: fill with mode
for df in [train_fe, test_fe]:
    for col in ['HomePlanet', 'Destination']:
        df[col] = df[col].fillna(df[col].mode()[0])


  df['VIP'] = df['VIP'].fillna(False).astype(int)
  df['VIP'] = df['VIP'].fillna(False).astype(int)


## 5. Encoding and dataset preparation

In [6]:
# Target
y = train_fe['Transported'].map({True: 1, False: 0})

# Columns to drop before encoding
drop_cols = ['Transported', 'Name', 'Cabin']

# Categorical columns to one-hot encode
cat_cols = ['HomePlanet', 'Destination', 'Deck', 'Side', 'CabinNumBucket']

# Build combined frame for consistent get_dummies
full = pd.concat(
    [
        train_fe.drop(columns=drop_cols),
        test_fe
    ],
    axis=0,
    ignore_index=True
)

full_encoded = pd.get_dummies(full, columns=cat_cols)

# Split back
X = full_encoded.iloc[:len(train_fe), :].copy()
X_test = full_encoded.iloc[len(train_fe):, :].copy()

# Drop ID-like/string columns from features
cols_to_drop = ['PassengerId', 'Group']
X = X.drop(columns=cols_to_drop, errors='ignore')
X_test = X_test.drop(columns=cols_to_drop, errors='ignore')

# Sanity check
print(X.shape, X_test.shape)
X.head()



(8693, 42) (4277, 42)


Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,CabinNum,GroupPos,...,Deck_G,Deck_T,Side_P,Side_S,CabinNumBucket_very_low,CabinNumBucket_low,CabinNumBucket_mid,CabinNumBucket_high,CabinNumBucket_very_high,CabinNumBucket_unknown
0,0,39.0,0,0.0,0.0,0.0,0.0,0.0,0.0,1,...,False,False,True,False,True,False,False,False,False,False
1,0,24.0,0,109.0,9.0,25.0,549.0,44.0,0.0,1,...,False,False,False,True,True,False,False,False,False,False
2,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0.0,1,...,False,False,False,True,True,False,False,False,False,False
3,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0.0,2,...,False,False,False,True,True,False,False,False,False,False
4,0,16.0,0,303.0,70.0,151.0,565.0,2.0,1.0,1,...,False,False,False,True,True,False,False,False,False,False


## 6. CatBoost model with cross-validation

In [7]:

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cat = CatBoostClassifier(
    depth=6,
    learning_rate=0.05,
    iterations=800,
    loss_function='Logloss',
    eval_metric='Accuracy',
    random_state=42,
    verbose=False
)

scores = cross_val_score(cat, X, y, cv=cv, scoring='accuracy')
print("CatBoost fold scores:", scores)
print("CatBoost mean CV accuracy:", scores.mean())


CatBoost fold scores: [0.81713629 0.80736055 0.80908568 0.82220944 0.80264672]
CatBoost mean CV accuracy: 0.8116877350381255


In [8]:
cols_to_drop = ['Cabin', 'Name']  # add others if needed

X = X.drop(columns=cols_to_drop, errors='ignore')
X_test = X_test.drop(columns=cols_to_drop, errors='ignore')

print(X.select_dtypes(include='object').columns)
print(X_test.select_dtypes(include='object').columns)



Index([], dtype='object')
Index([], dtype='object')


## 7. Train final CatBoost on all data and create submission

In [9]:
# Train on all data


cat_final = CatBoostClassifier(
    depth=6,             # tree depth (controls complexity)
    learning_rate=0.05,  # smaller LR + more iterations = better generalization
    iterations=1200,     # more trees
    l2_leaf_reg=3,       # L2 regularization to reduce overfitting
    loss_function='Logloss',
    eval_metric='Accuracy',
    random_state=42,
    verbose=100          # prints progress every 100 iterations
)

cat_final.fit(X, y)

# Predict on test
y_test_pred = cat_final.predict(X_test)
y_test_pred_bool = y_test_pred.astype(bool)

# Build submission
submission = pd.DataFrame({
    'PassengerId': test_fe['PassengerId'],
    'Transported': y_test_pred_bool
})

submission.head()


0:	learn: 0.7524445	total: 7.05ms	remaining: 8.45s
100:	learn: 0.8207753	total: 672ms	remaining: 7.31s
200:	learn: 0.8389509	total: 1.35s	remaining: 6.69s
300:	learn: 0.8573565	total: 2.02s	remaining: 6.03s
400:	learn: 0.8720810	total: 2.7s	remaining: 5.37s
500:	learn: 0.8848499	total: 3.36s	remaining: 4.68s
600:	learn: 0.8934775	total: 4.01s	remaining: 4s
700:	learn: 0.9021051	total: 4.68s	remaining: 3.33s
800:	learn: 0.9079719	total: 5.35s	remaining: 2.67s
900:	learn: 0.9155642	total: 6.03s	remaining: 2s
1000:	learn: 0.9232716	total: 6.72s	remaining: 1.34s
1100:	learn: 0.9300587	total: 7.44s	remaining: 669ms
1199:	learn: 0.9365006	total: 8.11s	remaining: 0us


Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False


In [10]:

submission.to_csv('submission_catboost_final.csv', index=False)
print("Saved submission_catboost_final.csv")


Saved submission_catboost_final.csv
