In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


In [2]:
#load data files

train = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")
test = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")

train.head()


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [3]:
#Check missing values

train.isna().sum()


PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

## Feature Engineering

In [4]:
# Split the cabin into 3 columns

train[['Deck','CabinNum','Side']] = train['Cabin'].str.split('/', expand=True)
test[['Deck','CabinNum','Side']] = test['Cabin'].str.split('/', expand=True)

# Convert CabinNum to numeric
train['CabinNum'] = pd.to_numeric(train['CabinNum'], errors='coerce')
test['CabinNum'] = pd.to_numeric(test['CabinNum'], errors='coerce')


In [5]:
# Extract Group information from Cabin
# groupid = first part of cabin number
# groupSize = number of people in the group

train['Group'] = train['PassengerId'].str.split('_').str[0]
train['GroupSize'] = train.groupby('Group')['Group'].transform('count')
test['Group'] = test['PassengerId'].str.split('_').str[0]
test['GroupSize'] = test.groupby('Group')['Group'].transform('count')



In [6]:
# Create total spending

spending = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
train['TotalSpent'] = train[spending].sum(axis=1)
test['TotalSpent'] = test[spending].sum(axis=1)



In [7]:
# converts booleans to integer

train['CryoSleep'] = train['CryoSleep'].astype(float)
train['VIP'] = train['VIP'].astype(float)
test['CryoSleep'] = test['CryoSleep'].astype(float)
test['VIP'] = test['VIP'].astype(float)


In [8]:
# Drop unused columns: Name, Cabin

train['CryoSleep'] = train['CryoSleep'].astype(float)
train['VIP'] = train['VIP'].astype(float)
test['CryoSleep'] = test['CryoSleep'].astype(float)
test['VIP'] = test['VIP'].astype(float)


In [9]:
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Deck,CabinNum,Side,Group,GroupSize,TotalSpent
0,0001_01,Europa,0.0,B/0/P,TRAPPIST-1e,39.0,0.0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,B,0.0,P,1,1,0.0
1,0002_01,Earth,0.0,F/0/S,TRAPPIST-1e,24.0,0.0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,F,0.0,S,2,1,736.0
2,0003_01,Europa,0.0,A/0/S,TRAPPIST-1e,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,A,0.0,S,3,2,10383.0
3,0003_02,Europa,0.0,A/0/S,TRAPPIST-1e,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,A,0.0,S,3,2,5176.0
4,0004_01,Earth,0.0,F/1/S,TRAPPIST-1e,16.0,0.0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,F,1.0,S,4,1,1091.0


## Strategy of Handling Missing values

| Feature                                           | Type        | Imputation Strategy                                  |
| ------------------------------------------------- | ----------- | ---------------------------------------------------- |
| HomePlanet                                        | categorical | mode                                                 |
| Destination                                       | categorical | mode                                                 |
| CryoSleep                                         | boolean     | If spending=0 → True; else mode                      |
| VIP                                               | boolean     | False                                                |
| Cabin                                             | composite   | split then fill deck/side with mode, num with median |
| Age                                               | numeric     | median                                               |
| RoomService, FoodCourt, ShoppingMall, Spa, VRDeck | numeric     | 0                                                    |
| Name                                              | unused      | can drop                                             |


In [10]:
# fill spending variables with 0

spending = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']

for col in spending:
    train[col] = train[col].fillna(0)
    test[col] = test[col].fillna(0)


In [11]:
# fill age with median

train['Age'] = train['Age'].fillna(train['Age'].median())
test['Age'] = test['Age'].fillna(test['Age'].median())


In [12]:
# fill VIP with false 

train['VIP'] = train['VIP'].fillna(False).astype(int)
test['VIP'] = test['VIP'].fillna(False).astype(int)


In [13]:
# fill cryosleep as true when spending = 0

train['CryoSleep'] = train['CryoSleep'].map({True: 1, False: 0})
test['CryoSleep'] = test['CryoSleep'].map({True: 1, False: 0})

train.loc[(train[spending].sum(axis=1) == 0) & (train['CryoSleep'].isna()), 'CryoSleep'] = 1
test.loc[(test[spending].sum(axis=1) == 0) & (test['CryoSleep'].isna()), 'CryoSleep'] = 1

train['CryoSleep'] = train['CryoSleep'].fillna(0).astype(int)
test['CryoSleep'] = test['CryoSleep'].fillna(0).astype(int)


In [14]:
# fill cabin split items

for col in ['Deck','Side']:
    train[col] = train[col].fillna(train[col].mode()[0])
    test[col] = test[col].fillna(train[col].mode()[0])

train['CabinNum'] = train['CabinNum'].fillna(train['CabinNum'].median())
test['CabinNum'] = test['CabinNum'].fillna(test['CabinNum'].median())


## Encoding and Split file

In [15]:
y = train['Transported'].map({True: 1, False: 0})


In [16]:
cat_cols = ['HomePlanet', 'Destination', 'Deck', 'Side']


In [17]:
# Keep PassengerId for later
train_ids = train['PassengerId']
test_ids = test['PassengerId']

# Combine train and test (so get_dummies makes same columns)
full = pd.concat([train.drop(['Transported'], axis=1),
                  test],
                 axis=0,
                 ignore_index=True)

cat_cols = ['HomePlanet', 'Destination', 'Deck', 'Side']

full_encoded = pd.get_dummies(full, columns=cat_cols)

# Split back to train/test
X = full_encoded.iloc[:len(train), :]
X_test = full_encoded.iloc[len(train):, :]


In [18]:
print(X.shape)
print(X_test.shape)


(8693, 31)
(4277, 31)


In [19]:
# Define y from Transported
y = train['Transported'].map({True: 1, False: 0})

# Start from full_encoded again if needed, or just drop bad columns from X / X_test
cols_to_drop = ['PassengerId', 'Cabin', 'Name', 'Group']

X = X.drop(columns=cols_to_drop, errors='ignore')
X_test = X_test.drop(columns=cols_to_drop, errors='ignore')

# Check that no object columns remain
print(X.select_dtypes(include='object').columns)



Index([], dtype='object')


## Build Model with CATBoost Model

In [20]:
from catboost import CatBoostClassifier

cat_final = CatBoostClassifier(
    depth=6,             # tree depth (controls complexity)
    learning_rate=0.05,  # smaller LR + more iterations = better generalization
    iterations=1200,     # more trees
    l2_leaf_reg=3,       # L2 regularization to reduce overfitting
    loss_function='Logloss',
    eval_metric='Accuracy',
    random_state=42,
    verbose=100          # prints progress every 100 iterations
)


In [21]:
cat_final.fit(X, y)


0:	learn: 0.7470378	total: 60.1ms	remaining: 1m 12s
100:	learn: 0.8161739	total: 611ms	remaining: 6.65s
200:	learn: 0.8402163	total: 1.16s	remaining: 5.76s
300:	learn: 0.8578166	total: 1.72s	remaining: 5.13s
400:	learn: 0.8720810	total: 2.27s	remaining: 4.53s
500:	learn: 0.8809387	total: 2.83s	remaining: 3.94s
600:	learn: 0.8912918	total: 3.37s	remaining: 3.36s
700:	learn: 0.9002646	total: 3.92s	remaining: 2.79s
800:	learn: 0.9050961	total: 4.46s	remaining: 2.22s
900:	learn: 0.9108478	total: 5s	remaining: 1.66s
1000:	learn: 0.9182101	total: 5.54s	remaining: 1.1s
1100:	learn: 0.9246520	total: 6.1s	remaining: 548ms
1199:	learn: 0.9290234	total: 6.64s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x78e7cf4a3910>

## Create output

In [22]:
test_ids = test['PassengerId']

# CatBoost will return 0/1 labels because y is 0/1
y_test_pred = cat_final.predict(X_test)

# Convert 0/1 → True/False for Kaggle
y_test_pred_bool = y_test_pred.astype(bool)

submission_cat = pd.DataFrame({
    'PassengerId': test_ids,
    'Transported': y_test_pred_bool
})

print(submission_cat.head())
submission_cat['Transported'].value_counts()


  PassengerId  Transported
0     0013_01         True
1     0018_01        False
2     0019_01         True
3     0021_01         True
4     0023_01        False


Transported
False    2139
True     2138
Name: count, dtype: int64

In [23]:
submission_cat.to_csv('submission_catboost.csv', index=False)
print("Saved submission_catboost.csv")


Saved submission_catboost.csv
