# Spaceship Titanic


In [44]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


## Load data

In [45]:
# Files copied locally

train = pd.read_csv('sst_train.csv')
test = pd.read_csv('sst_test.csv')
print(train.shape)
print(test.shape)
train.info()

(8693, 14)
(4277, 13)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [46]:
# Quick view of training data
train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


## Data Cleaning

In [47]:

def split_cabin(x):
  if len(str(x).split('/')) < 3:
    return ['Missing', 0, "Missing"]
  else:   
    return str(x).split('/')


def cleandata(dfin): 
    
    df = dfin.copy()
    
    df['CryoSleep'] = pd.to_numeric(df['CryoSleep'])
    df['CryoSleep'] = df['CryoSleep'].astype(bool)
    df['VIP'] = pd.to_numeric(df['VIP'])
    df['VIP'] = df['VIP'].astype(bool)
    df[['group', 'groupno']] = df['PassengerId'].str.split('_', expand=True)
    df['group'] = pd.to_numeric(df['group'])
    df['groupno'] = pd.to_numeric(df['groupno'])
    df.drop(['PassengerId'], axis=1, inplace=True)
    # apply(lambda is inefficient
    df['TempCabin'] = df['Cabin'].apply(lambda x: split_cabin(x))
    df['Deck'] = df['TempCabin'].apply(lambda x: x[0])
    df['Room'] = df['TempCabin'].apply(lambda x: x[1])
    df['Side'] = df['TempCabin'].apply(lambda x: x[2])
    df['Room'] = pd.to_numeric(df['Room'])
    df.drop(['TempCabin', 'Cabin'], axis=1, inplace=True)  
    
    return df
#train[train['Cabin'].isnull()]
train_clean=cleandata(train)
test_clean=cleandata(test)
y_train = train_clean.Transported
train_clean.drop(['Transported'], axis=1, inplace=True)

print(train_clean.shape)
print(y_train.shape)
print(test_clean.shape)


(8693, 16)
(8693,)
(4277, 16)


In [48]:

if 1==2: # Activate

    import sweetviz as sv

    # Create the report
    report = sv.compare([train, "Training Data"], [test, "Test Data"])

    # Display the report within the Jupyter Notebook
    report.show_notebook()



## Pre Processing

In [57]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# General function for preprocessing

def preprocess_data(df, cols_to_exclude=None, exclude_output_cols=None, log_normalize_cols=None, one_hot_encode_cols=None, imput_strategy='mean'):
    # Copy the original DataFrame to avoid modifying it
    processed_df = df.copy()

    # Exclude specified columns from the output
    if exclude_output_cols:
        processed_df = processed_df.drop(columns=exclude_output_cols, errors='ignore')

    # Log normalization for selected columns
    if log_normalize_cols:
        for col in log_normalize_cols:
            if col in processed_df.columns:
                processed_df[col] = np.log1p(processed_df[col])

    

    # Exclude specified columns from scaling, imputing, and one-hot encoding
    if cols_to_exclude:
        cols_to_process = [col for col in processed_df.columns if col not in cols_to_exclude]
        numeric_cols = processed_df[cols_to_process].select_dtypes(include=['number']).columns
        imputer = SimpleImputer(strategy=imput_strategy)
        scaler = StandardScaler()
        processed_df[numeric_cols] = scaler.fit_transform(imputer.fit_transform(processed_df[numeric_cols]))

    # One-hot encode selected columns
    if one_hot_encode_cols:
        processed_df = pd.get_dummies(processed_df, columns=one_hot_encode_cols)
    
    
    # Fill missing values in non-numeric columns (excluding boolean columns) with 'Missing'
    # but only if there are missing values to begin with
    non_numeric_cols = processed_df.select_dtypes(exclude=['number', 'bool']).columns
    for col in non_numeric_cols:
        if processed_df[col].isnull().any():
            processed_df[col] = processed_df[col].fillna('Missing')


    return processed_df



#imput_strategy = mean, median, most_frequent
#cols_to_exclude= exclude from imputer and scaler
X_train = preprocess_data(train_clean,  cols_to_exclude=['PassengerId'], exclude_output_cols=['Name'], log_normalize_cols=['RoomService','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck'], one_hot_encode_cols=['Destination','HomePlanet','Deck','Side'],imput_strategy='mean')
X_test = preprocess_data(test_clean,  cols_to_exclude=['PassengerId'], exclude_output_cols=['Name'], log_normalize_cols=['RoomService','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck'], one_hot_encode_cols=['Destination','HomePlanet','Deck','Side'],imput_strategy='mean')


print(X_train.shape)
print(X_test.shape)


(8693, 29)
(4277, 29)


## EDA Tools

In [50]:
if 1==2: # Activate

    import sweetviz as sv

    report = sv.compare([X_train, "Training Data"], [X_test, "Test Data"])

    # Display the report within the Jupyter Notebook
    report.show_notebook()

In [51]:
if 1==2: # Activate
    import pygwalker as pyg
    pyg.walk(X_train)

## Modelling

In [52]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import RandomizedSearchCV 
from sklearn.metrics import precision_score, accuracy_score, recall_score
from sklearn.metrics import mean_squared_error
#from sklearn.base import BaseEstimator, ClassifierMixin
from xgboost import XGBClassifier
import tensorflow as tf


In [58]:
# Basic XGBoost

xgb = XGBClassifier(random_state =1)


xgb.fit(X_train,y_train)
y_pred = xgb.predict(X_train)


accuracy = accuracy_score(y_train,y_pred) # Accuracy is the ratio of correctly predicted observation to the total
precision = precision_score(y_train,y_pred) # Precision is the ratio of correctly predicted positive observations to the total predicted positives. 
recall = recall_score(y_train,y_pred) # Recall is the ratio of correctly predicted positive observations to the all actual positives. 
print(f'Metrics accuracy- {accuracy}, recall- {recall}, precision- {precision}')

mse = mean_squared_error(y_train,y_pred)
print("Mean Squared Error:", mse)

#Metrics accuracy- 0.9312090187507189, recall- 0.9323892188213796, precision- 0.9311131386861314
#Metrics accuracy- 0.9339698608075463, recall- 0.9321608040201005, precision- 0.9364387333639284

Metrics accuracy- 0.9339698608075463, recall- 0.9321608040201005, precision- 0.9364387333639284
Mean Squared Error: 0.0660301391924537


In [54]:

# XGBoost with grid search

"""param_grid = {
    'n_estimators': [20, 50, 100, 250, 500,1000],
    'colsample_bytree': [0.2, 0.5, 0.7, 0.8, 1],
    'max_depth': [2, 5, 10, 15, 20, 25, None],
    'reg_alpha': [0, 0.5, 1],
    'reg_lambda': [1, 1.5, 2],
    'subsample': [0.5,0.6,0.7, 0.8, 0.9],
    'learning_rate':[.01,0.1,0.2,0.3,0.5, 0.7, 0.9],
    'gamma':[0,.01,.1,1,10,100],
    'min_child_weight':[0,.01,0.1,1,10,100],
    'sampling_method': ['uniform', 'gradient_based']
}
"""



param_grid = {
    'n_estimators': [10, 20, 30],
    'colsample_bytree': [0.3, 0.5, 0.7],
    'max_depth': [ 10, 15, 20],
    'reg_alpha': [0, 0.5],
    'reg_lambda': [1.5, 2,2.5],
    'subsample': [ 0.8, 0.9,1,1.1],
    'learning_rate':[0.001,0.01,0.1],
    'gamma':[.1,1,10],
    'min_child_weight':[1,10,100],
    'sampling_method': ['uniform']
}


#clf_xgb = GridSearchCV(xgb, param_grid = param_grid, cv = 5, verbose = True, n_jobs = -1)
#best_clf_xgb = clf_xgb.fit(X_train,y_train)
#clf_performance(best_clf_xgb,'XGB')
clf_xgb_rnd = RandomizedSearchCV(xgb, param_distributions = param_grid, n_iter = 10000, cv = 5, verbose = True, n_jobs = -1)
best_clf_xgb_rnd = clf_xgb_rnd.fit(X_train,y_train)
y_pred = xgb.predict(X_train)


accuracy = accuracy_score(y_train,y_pred)
precision = precision_score(y_train,y_pred)
recall = recall_score(y_train,y_pred)
print(f'Metrics accuracy- {accuracy}, recall- {recall}, precision- {precision}')

print('Best Score: ' + str(best_clf_xgb_rnd.best_score_))
print('Best Parameters: ' + str(best_clf_xgb_rnd.best_params_))


#Best Score: 0.7876492779536142
#Best Parameters: {'subsample': 0.9, 'sampling_method': 'uniform', 'reg_lambda': 2, 'reg_alpha': 0, 'n_estimators': 20, 'min_child_weight': 10, 'max_depth': 15, 'learning_rate': 0.01, 'gamma': 1, 'colsample_bytree': 0.5}
#Best Score: 0.7901786074692081
#Best Parameters: {'subsample': 0.9, 'sampling_method': 'uniform', 'reg_lambda': 1.5, 'reg_alpha': 0, 'n_estimators': 10, 'min_child_weight': 10, 'max_depth': 15, 'learning_rate': 0.01, 'gamma': 1, 'colsample_bytree': 0.5}
#Fitting 5 folds for each of 100 candidates, totalling 500 fits
#Best Score: 0.7884539412953094
#Best Parameters: {'subsample': 0.8, 'sampling_method': 'uniform', 'reg_lambda': 2, 'reg_alpha': 0.5, 'n_estimators': 10, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 0.1, 'colsample_bytree': 0.5}
#Mean Squared Error: 0.1741631197515242
#Metrics accuracy- 0.9312090187507189, recall- 0.9323892188213796, precision- 0.9311131386861314
#Metrics accuracy- 0.9339698608075463, recall- 0.9321608040201005, precision- 0.9364387333639284
#Best Parameters: {'subsample': 0.9, 'sampling_method': 'uniform', 'reg_lambda': 2, 'reg_alpha': 0, 'n_estimators': 10, 'min_child_weight': 10, 'max_depth': 20, 'learning_rate': 0.01, 'gamma': 1, 'colsample_bytree': 0.5}

Fitting 5 folds for each of 10000 candidates, totalling 50000 fits


KeyboardInterrupt: 

In [None]:
# tensor flow with grid search
X_train = X_train * 1

# Custom wrapper for Keras model
class KerasClassifierWrapper(BaseEstimator, ClassifierMixin):
    def __init__(self, model, optimizer='adam', neurons1=8, neurons2=8, activation='relu', epochs=50, batch_size=10):
        self.model = model
        self.optimizer = optimizer
        self.neurons1 = neurons1
        self.neurons2 = neurons2
        self.activation = activation
        self.epochs = epochs
        self.batch_size = batch_size
        self._estimator_type = "classifier"

    def fit(self, X, y):
        X = np.array(X)
        y = np.array(y)
        self.model = self.model(optimizer=self.optimizer, neurons1=self.neurons1, neurons2=self.neurons2, activation=self.activation)
        self.model.fit(X, y, epochs=self.epochs, batch_size=self.batch_size, verbose=0)
        return self

    def predict(self, X):
        X = np.array(X)
        return (self.model.predict(X) > 0.5).astype(int)

    def score(self, X, y):
        X = np.array(X)
        y = np.array(y)
        predictions = self.predict(X)
        return accuracy_score(y, predictions)

# Function to create the Keras model
def create_model(optimizer='adam', neurons1=8, neurons2=8, activation='relu'):
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Dense(neurons1, input_dim=X_train.shape[1], activation=activation))
    model.add(tf.keras.layers.Dense(neurons2, activation=activation))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))  # Use sigmoid for binary classification
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# Use the wrapper with GridSearchCV
model = KerasClassifierWrapper(model=create_model)

param_grid = {
    'optimizer': ['SGD', 'Adam'],
    'neurons1': [8, 16, 32],
    'neurons2': [8, 16, 32],
    'activation': ['relu', 'tanh'],
    'batch_size': [10, 20],
    'epochs': [30, 50]
}

param_grid = {
    'optimizer': ['SGD'],
    'neurons1': [8],
    'neurons2': [8],
    'activation': ['relu', 'tanh'],
    'batch_size': [10],
    'epochs': [30]
}


grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', n_jobs=1, cv=3)
grid_result = grid.fit(X_train, y_train)

y_pred = grid_result.predict(X_train)


accuracy = accuracy_score(y_train,y_pred)
precision = precision_score(y_train,y_pred)
recall = recall_score(y_train,y_pred)
print(f'Metrics accuracy- {accuracy}, recall- {recall}, precision- {precision}')

# Print the best results
print(f"Best Accuracy: {grid_result.best_score_} using {grid_result.best_params_}")



#Best Accuracy: nan using {'activation': 'relu', 'batch_size': 10, 'epochs': 30, 'neurons1': 8, 'neurons2': 8, 'optimizer': 'SGD'}

#Mean Squared Error: 0.19544461060623491


In [None]:

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification

# RandomForest with grid search

# Define the RandomForestClassifier
rf_clf = RandomForestClassifier()

# Hyperparameters to be checked in the GridSearchCV
param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Create the GridSearchCV model
grid_search = GridSearchCV(estimator=rf_clf, param_grid=param_grid, 
                           cv=5, n_jobs=-1, verbose=2)

# Fit the GridSearchCV model
grid_search.fit(X_train, y_train)

y_pred = grid_search.predict(X_train)


accuracy = accuracy_score(y_train,y_pred)
precision = precision_score(y_train,y_pred)
recall = recall_score(y_train,y_pred)
print(f'Metrics accuracy- {accuracy}, recall- {recall}, precision- {precision}')

# Print the best parameters and the corresponding score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# If you wish to use the best estimator found:
best_rf_clf = grid_search.best_estimator_


Fitting 5 folds for each of 288 candidates, totalling 1440 fits
Metrics accuracy- 0.9012998964684229, recall- 0.8951576062128825, precision- 0.9075961093098657
Best Parameters: {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
Best Score: 0.7798265077015414


In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# GradientBoosting with grid search

# Create a GradientBoostingClassifier
gb_clf = GradientBoostingClassifier()

# Define a parameter grid to search
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'subsample': [0.8, 0.9, 1.0]
}

# Set up GridSearchCV
grid_search_gb = GridSearchCV(estimator=gb_clf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

# Fit the model
grid_search_gb.fit(X_train, y_train)


y_pred = grid_search_gb.predict(X_train)


accuracy = accuracy_score(y_train,y_pred)
precision = precision_score(y_train,y_pred)
recall = recall_score(y_train,y_pred)
print(f'Metrics accuracy- {accuracy}, recall- {recall}, precision- {precision}')

# Get the best parameters
best_params = grid_search.best_params_
print('Best parameters found: ', best_params)

# Get the best estimator
best_gb_clf = grid_search.best_estimator_

# Make predictions on the training data
train_predictions = best_gb_clf.predict(X_train)


# Calculate accuracy on the training data
train_accuracy = accuracy_score(y_train, train_predictions)
print('Training accuracy with best parameters: ', train_accuracy)


#Fitting 5 folds for each of 729 candidates, totalling 3645 fits
#Metrics accuracy- 0.7946623720234671, recall- 0.8042485153037917, precision- 0.7914138008541245
#Best parameters found:  {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
#Training accuracy with best parameters:  0.9012998964684229

Fitting 5 folds for each of 729 candidates, totalling 3645 fits
Metrics accuracy- 0.7946623720234671, recall- 0.8042485153037917, precision- 0.7914138008541245
Best parameters found:  {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
Training accuracy with best parameters:  0.9012998964684229


In [None]:
# Make predictions simple XGBoost

xgb.fit(X_train,y_train)

# Make predictions
predictions = xgb.predict(X_test).astype(bool)

test_sub = test.copy()

test_sub['Transported'] = predictions


test_df_sub = test_sub[['PassengerId', 'Transported']]

test_df_sub

test_df_sub.to_csv('submission.csv', index=False)

[False False  True ...  True  True  True]


In [None]:
# Make predictions XGB/Grid
predictions = best_clf_xgb.predict(X_test).astype(int)

test_sub = test.copy()

test_sub['Transported'] = predictions


test_df_sub = testsub[['PassengerId', 'Transported']]

test_df_sub

test_df_sub.to_csv('submission_xgb2.csv', index=False)

NameError: name 'best_clf_xgb' is not defined

In [None]:
# Make predictions Tensor Flow
predictions = grid.predict(X_test).astype(int)

test_sub = test.copy()

test_sub['Transported'] = predictions


test_df_sub = test_sub[['PassengerId', 'Transported']]

test_df_sub

test_df_sub.to_csv('submission_tf.csv', index=False)

In [None]:


# Make predictions RANDOM FOREST
predictions = grid_search.predict(X_test).astype(bool)

test_sub = test.copy()

test_sub['Transported'] = predictions


test_df_sub = test_sub[['PassengerId', 'Transported']]

test_df_sub

test_df_sub.to_csv('submission_rf.csv', index=False)

In [None]:
# Make predictions GRADIENT BOOST

predictions = grid_search_gb.predict(X_test).astype(bool)

test_sub = test.copy()

test_sub['Transported'] = predictions


test_df_sub = test_sub[['PassengerId', 'Transported']]

test_df_sub

test_df_sub.to_csv('submission_gb.csv', index=False)

# Save, Load Model

#save load model

In [None]:
import pickle

with open('gradientboosted.pkl', 'wb') as f: 
  pickle.dump(fit_models['gb'], f)

with open('gradientboosted.pkl', 'rb') as f: 
  reloaded_model = pickle.load(f)