<a href="https://www.kaggle.com/code/mariushinsberger/spaceship-titanic-first-approaches?scriptVersionId=160478714" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


# Imports

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from matplotlib import pyplot as plt
from xgboost import XGBClassifier
from category_encoders.target_encoder import TargetEncoder
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# Load data

In [3]:
path = str("/kaggle/input/spaceship-titanic/")
train_data = pd.read_csv(f"{path}train.csv")
test_data = pd.read_csv(f"{path}test.csv")
submission_data = pd.read_csv(f"{path}sample_submission.csv")

# Process data

In [4]:
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [5]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [6]:
train_data.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [7]:
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [8]:
from sklearn.preprocessing import StandardScaler

In [9]:
def clean_data(df):
    df.rename(columns={'PassengerId': 'passenger_id', 'HomePlanet': 'home_planet', 'CryoSleep': 'cryo_sleep', 'Cabin': 'cabin', 'Destination': 'destination', 'Age': 'age',
           'VIP': 'vip', 'RoomService': 'room_service', 'FoodCourt': 'food_court', 'ShoppingMall': 'shopping_mall', 'Spa': 'spa', 'VRDeck': 'vr_deck',
           'Name': 'name', 'Transported': 'transported'}, inplace=True)
    
    # Fill missing categorical feature values with mode. 
    df["home_planet"].fillna(df["home_planet"].mode()[0], inplace=True)
    df["cryo_sleep"].fillna(df["cryo_sleep"].mode()[0], inplace=True)
    df["cabin"].fillna(df["cabin"].mode()[0], inplace=True)
    df["destination"].fillna(df["destination"].mode()[0], inplace=True)
    df["vip"].fillna(df["vip"].mode()[0], inplace=True)
    # Fill numerical feature values with median.
    df["age"].fillna(df["age"].median(), inplace= True)
    df["room_service"].fillna(df["room_service"].median(), inplace= True)
    df["food_court"].fillna(df["food_court"].median(), inplace= True) 
    df["shopping_mall"].fillna(df["shopping_mall"].median(), inplace= True)
    df["spa"].fillna(df["spa"].median(), inplace= True)
    df["vr_deck"].fillna(df["vr_deck"].median(), inplace= True) 
    
    df[["deck", "num", "side"]] = df["cabin"].str.split("/", expand=True)
    df.drop(columns=["cabin", "passenger_id", "name"], inplace=True)
    
    df["home_planet"] = le.fit_transform(df["home_planet"].values)
    df["cryo_sleep"] = le.fit_transform(df["cryo_sleep"].values)
    df["destination"] = le.fit_transform(df["destination"].values)
    df["vip"] = le.fit_transform(df["vip"].values)
    df["deck"] = le.fit_transform(df["deck"].values)
    df["num"] = le.fit_transform(df["num"].values)
    df["side"] = le.fit_transform(df["side"].values)
    
    df["age"] = df["age"].astype(int)
    df["room_service"] = df["room_service"].astype(int)
    df["food_court"] = df["food_court"].astype(int)
    df["shopping_mall"] = df["shopping_mall"].astype(int)
    df["spa"] = df["spa"].astype(int)
    df["vr_deck"] = df["vr_deck"].astype(int)
    return df


In [10]:
le = LabelEncoder()
#scaler = StandardScaler()
train_df = clean_data(train_data)
test_df = clean_data(test_data)

In [11]:
train_df.head()

Unnamed: 0,home_planet,cryo_sleep,destination,age,vip,room_service,food_court,shopping_mall,spa,vr_deck,transported,deck,num,side
0,1,0,2,39,0,0,0,0,0,0,False,1,0,0
1,0,0,2,24,0,109,9,25,549,44,True,5,0,1
2,1,0,2,58,1,43,3576,0,6715,49,False,0,0,1
3,1,0,2,33,0,0,1283,371,3329,193,False,0,0,1
4,0,0,2,16,0,303,70,151,565,2,True,5,1,1


In [12]:
train_df.corrwith(train_df["transported"]).abs().sort_values(ascending=False)

transported      1.000000
cryo_sleep       0.460132
room_service     0.241124
spa              0.218545
vr_deck          0.204874
home_planet      0.115461
deck             0.113992
destination      0.108152
side             0.101397
age              0.074233
food_court       0.045583
vip              0.037261
num              0.019393
shopping_mall    0.009391
dtype: float64

# Build model

In [13]:
result_dicts = []
X_train = train_df.drop(columns=["transported"])
y_train = train_df["transported"]
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

## Random Forest

In [14]:
random_forest_model = RandomForestClassifier(n_estimators=100,max_depth=8,min_samples_split=0.001).fit(X_train,y_train)
acc_train = accuracy_score(random_forest_model.predict(X_train),y_train)
acc_val = accuracy_score(random_forest_model.predict(X_val),y_val)
result_dicts.append({"model": "random_forest_model", "train_accuracy": f"{acc_train:.4f}", "test_accuracy": f"{acc_val:.4f}"})

## XGBoost

### Simple model

In [15]:
xgb_model = XGBClassifier(n_estimators=80, learning_rate=0.1,verbosity=1, random_state= 42)
xgb_model.fit(X_train,y_train, eval_set=[(X_val,y_val)])
acc_train = accuracy_score(xgb_model.predict(X_train),y_train)
acc_val = accuracy_score(xgb_model.predict(X_val),y_val)
result_dicts.append({"model": "xgb_model", "train_accuracy": f"{acc_train:.4f}", "test_accuracy": f"{acc_val:.4f}"})

[0]	validation_0-logloss:0.65391
[1]	validation_0-logloss:0.62196
[2]	validation_0-logloss:0.59503
[3]	validation_0-logloss:0.57216
[4]	validation_0-logloss:0.55198
[5]	validation_0-logloss:0.53521
[6]	validation_0-logloss:0.52105
[7]	validation_0-logloss:0.50904
[8]	validation_0-logloss:0.49825
[9]	validation_0-logloss:0.48844
[10]	validation_0-logloss:0.48024
[11]	validation_0-logloss:0.47352
[12]	validation_0-logloss:0.46671
[13]	validation_0-logloss:0.46005
[14]	validation_0-logloss:0.45486
[15]	validation_0-logloss:0.44982
[16]	validation_0-logloss:0.44636
[17]	validation_0-logloss:0.44209
[18]	validation_0-logloss:0.43897
[19]	validation_0-logloss:0.43575
[20]	validation_0-logloss:0.43295
[21]	validation_0-logloss:0.43128
[22]	validation_0-logloss:0.42928
[23]	validation_0-logloss:0.42709
[24]	validation_0-logloss:0.42515
[25]	validation_0-logloss:0.42356
[26]	validation_0-logloss:0.42241
[27]	validation_0-logloss:0.42094
[28]	validation_0-logloss:0.41972
[29]	validation_0-loglos

### Grid search

In [16]:
xgb0 = XGBClassifier(
    learning_rate =0.1,
    n_estimators=1000,
    early_stopping_rounds=50,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'binary:logistic',
    nthread=-1,
    scale_pos_weight=1,
    seed=42)
xgb0.fit(X_train,y_train, eval_set = [(X_val, y_val)])

[0]	validation_0-logloss:0.65825
[1]	validation_0-logloss:0.62963
[2]	validation_0-logloss:0.60571
[3]	validation_0-logloss:0.58454
[4]	validation_0-logloss:0.56547
[5]	validation_0-logloss:0.54831
[6]	validation_0-logloss:0.53697
[7]	validation_0-logloss:0.52505
[8]	validation_0-logloss:0.51620
[9]	validation_0-logloss:0.50563
[10]	validation_0-logloss:0.49667
[11]	validation_0-logloss:0.48910
[12]	validation_0-logloss:0.48211
[13]	validation_0-logloss:0.47569
[14]	validation_0-logloss:0.46954
[15]	validation_0-logloss:0.46463
[16]	validation_0-logloss:0.46034
[17]	validation_0-logloss:0.45653
[18]	validation_0-logloss:0.45227
[19]	validation_0-logloss:0.44870
[20]	validation_0-logloss:0.44636
[21]	validation_0-logloss:0.44387
[22]	validation_0-logloss:0.44051
[23]	validation_0-logloss:0.43783
[24]	validation_0-logloss:0.43589
[25]	validation_0-logloss:0.43385
[26]	validation_0-logloss:0.43287
[27]	validation_0-logloss:0.43119
[28]	validation_0-logloss:0.42868
[29]	validation_0-loglos

Parameters: { "early_stopping" } are not used.



[36]	validation_0-logloss:0.41843
[37]	validation_0-logloss:0.41801
[38]	validation_0-logloss:0.41758
[39]	validation_0-logloss:0.41731
[40]	validation_0-logloss:0.41673
[41]	validation_0-logloss:0.41620
[42]	validation_0-logloss:0.41544
[43]	validation_0-logloss:0.41492
[44]	validation_0-logloss:0.41442
[45]	validation_0-logloss:0.41358
[46]	validation_0-logloss:0.41323
[47]	validation_0-logloss:0.41315
[48]	validation_0-logloss:0.41249
[49]	validation_0-logloss:0.41171
[50]	validation_0-logloss:0.41153
[51]	validation_0-logloss:0.41113
[52]	validation_0-logloss:0.41047
[53]	validation_0-logloss:0.41051
[54]	validation_0-logloss:0.41016
[55]	validation_0-logloss:0.40962
[56]	validation_0-logloss:0.40954
[57]	validation_0-logloss:0.40912
[58]	validation_0-logloss:0.40876
[59]	validation_0-logloss:0.40854
[60]	validation_0-logloss:0.40874
[61]	validation_0-logloss:0.40758
[62]	validation_0-logloss:0.40742
[63]	validation_0-logloss:0.40738
[64]	validation_0-logloss:0.40754
[65]	validatio

In [24]:
def xgboost_grid_search():
    param = {
        #"max_depth": [4,5,6],
        #"min_child_weight": [0,1,2],
        #"gamma": [i/10.0 for i in range(0,5)], 
        #"subsample": [0.6, 0.7, 0.8, 0.9, 1.0],
        #"colsample_bytree": [0.5, 0.6, 0.7],
        #"reg_alpha": [1e-5, 1e-4, 1e-6]
    }
    gs = GridSearchCV(
        estimator = XGBClassifier( 
            learning_rate=0.1, 
            n_estimators=250,
            early_stopping_rounds=50,
            max_depth=5,
            min_child_weight=1, 
            gamma=0, 
            subsample=0.9, 
            colsample_bytree=0.6,
            objective= 'binary:logistic',
            reg_alpha=1e-5,
            nthread=-1, 
            scale_pos_weight=1, 
            random_state= 42
        ), 
        param_grid = param, 
        scoring='roc_auc',
        n_jobs=4, 
        cv=5
    )
    gs.fit(X_train, y_train, eval_set = [(X_val, y_val)])
    print(f"Best params: {gs.best_params_}\n Best score: {gs.best_score_}")
#xgboost_grid_search()

[0]	validation_0-logloss:0.66729
[1]	validation_0-logloss:0.64103
[2]	validation_0-logloss:0.62032
[3]	validation_0-logloss:0.60029
[0]	validation_0-logloss:0.66690
[0]	validation_0-logloss:0.66749
[0]	validation_0-logloss:0.66728
[4]	validation_0-logloss:0.58173
[1]	validation_0-logloss:0.64103
[1]	validation_0-logloss:0.64026
[1]	validation_0-logloss:0.64136
[5]	validation_0-logloss:0.56275
[2]	validation_0-logloss:0.62020
[2]	validation_0-logloss:0.61977
[2]	validation_0-logloss:0.62085
[6]	validation_0-logloss:0.55112
[3]	validation_0-logloss:0.60055
[3]	validation_0-logloss:0.60042
[3]	validation_0-logloss:0.60174
[7]	validation_0-logloss:0.53848
[4]	validation_0-logloss:0.58153
[4]	validation_0-logloss:0.58089
[4]	validation_0-logloss:0.58286
[5]	validation_0-logloss:0.56269
[5]	validation_0-logloss:0.56194
[8]	validation_0-logloss:0.52951
[6]	validation_0-logloss:0.55047
[5]	validation_0-logloss:0.56357
[6]	validation_0-logloss:0.55011
[9]	validation_0-logloss:0.51815
[7]	valida

In [26]:
xgb_model_2 = XGBClassifier( 
        learning_rate =0.01, 
        n_estimators=5000,
        early_stopping_rounds=50,
        max_depth=5,
        min_child_weight=1, 
        gamma=0, 
        subsample=0.9, 
        colsample_bytree=0.6,
        objective= 'binary:logistic',
        reg_alpha=1e-5,
        nthread=-1, 
        scale_pos_weight=1, 
        random_state= 42
    )
xgb_model_2.fit(X_train,y_train, eval_set = [(X_val, y_val)])
acc_train = accuracy_score(xgb_model_2.predict(X_train),y_train)
acc_val = accuracy_score(xgb_model_2.predict(X_val),y_val)
result_dicts.append({"model": "xgb_model_2", "train_accuracy": f"{acc_train:.4f}", "test_accuracy": f"{acc_val:.4f}"})

[0]	validation_0-logloss:0.69037
[1]	validation_0-logloss:0.68725
[2]	validation_0-logloss:0.68433
[3]	validation_0-logloss:0.68121
[4]	validation_0-logloss:0.67799
[5]	validation_0-logloss:0.67451
[6]	validation_0-logloss:0.67167
[7]	validation_0-logloss:0.66853
[8]	validation_0-logloss:0.66588
[9]	validation_0-logloss:0.66290
[10]	validation_0-logloss:0.65989
[11]	validation_0-logloss:0.65764
[12]	validation_0-logloss:0.65478
[13]	validation_0-logloss:0.65192
[14]	validation_0-logloss:0.64910
[15]	validation_0-logloss:0.64664
[16]	validation_0-logloss:0.64422
[17]	validation_0-logloss:0.64171
[18]	validation_0-logloss:0.63929
[19]	validation_0-logloss:0.63684
[20]	validation_0-logloss:0.63419
[21]	validation_0-logloss:0.63206
[22]	validation_0-logloss:0.62955
[23]	validation_0-logloss:0.62711
[24]	validation_0-logloss:0.62449
[25]	validation_0-logloss:0.62203
[26]	validation_0-logloss:0.61971
[27]	validation_0-logloss:0.61758
[28]	validation_0-logloss:0.61536
[29]	validation_0-loglos

In [27]:
xgb_model_3 = XGBClassifier( 
    learning_rate =0.1, 
    n_estimators=164,
    early_stopping_rounds=20,
    max_depth=5,
    min_child_weight=1, 
    gamma=0, 
    subsample=0.9, 
    colsample_bytree=0.6,
    objective= 'binary:logistic',
    reg_alpha=1e-5,
    nthread=-1, 
    scale_pos_weight=1, 
    random_state= 42
)
xgb_model_3.fit(X_train,y_train, eval_set=[(X_val,y_val)])
acc_train = accuracy_score(xgb_model_3.predict(X_train),y_train)
acc_val = accuracy_score(xgb_model_3.predict(X_val),y_val)
result_dicts.append({"model": "xgb_model_3", "train_accuracy": f"{acc_train:.4f}", "test_accuracy": f"{acc_val:.4f}"})

[0]	validation_0-logloss:0.66700
[1]	validation_0-logloss:0.64094
[2]	validation_0-logloss:0.62078
[3]	validation_0-logloss:0.60158
[4]	validation_0-logloss:0.58235
[5]	validation_0-logloss:0.56326
[6]	validation_0-logloss:0.55107
[7]	validation_0-logloss:0.53847
[8]	validation_0-logloss:0.52932
[9]	validation_0-logloss:0.51792
[10]	validation_0-logloss:0.50925
[11]	validation_0-logloss:0.50411
[12]	validation_0-logloss:0.49595
[13]	validation_0-logloss:0.48742
[14]	validation_0-logloss:0.48179
[15]	validation_0-logloss:0.47598
[16]	validation_0-logloss:0.46930
[17]	validation_0-logloss:0.46482
[18]	validation_0-logloss:0.46106
[19]	validation_0-logloss:0.45758
[20]	validation_0-logloss:0.45464
[21]	validation_0-logloss:0.45221
[22]	validation_0-logloss:0.44854
[23]	validation_0-logloss:0.44657
[24]	validation_0-logloss:0.44377
[25]	validation_0-logloss:0.44106
[26]	validation_0-logloss:0.43916
[27]	validation_0-logloss:0.43639
[28]	validation_0-logloss:0.43400
[29]	validation_0-loglos

In [37]:
acc_train = accuracy_score(xgb_model_3.predict(X_train),y_train)
acc_val = accuracy_score(xgb_model_3.predict(X_val),y_val)
result_dicts.append({"model": "xgb_model_3", "train_accuracy": f"{acc_train:.4f}", "test_accuracy": f"{acc_val:.4f}"})

### Validation of best model

In [28]:
result_df = pd.DataFrame(columns=["model", "train_accuracy", "test_accuracy"], data=result_dicts)
result_df.head()

Unnamed: 0,model,train_accuracy,test_accuracy
0,random_forest_model,0.8325,0.7884
1,xgb_model,0.8637,0.7982
2,xgb_model_2,0.8677,0.7999
3,xgb_model_2,0.8746,0.7987
4,xgb_model_3,0.868,0.8016


In [33]:
best = xgb_model_2

# Make prediction

In [34]:
predictions = best.predict(test_df)
submission_data["Transported"] = predictions
# For XGBoost, comment out for Random Forest.
submission_data["Transported"] = submission_data["Transported"].map({0: "False", 1: "True"})

In [35]:
submission_data.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False


In [36]:
submission_data.to_csv('submission.csv', index=False)