# Titanic Kaggle Submission

This notebook encapsulates all the lessons from the previous two notebooks, creating a final submission for Kaggle. 

In [1]:
import os
import sys
from collections import Counter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
from imblearn.over_sampling import SMOTENC

# Modelling
# Standard Machine Learning Algorithms
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost
from xgboost import XGBClassifier
# Silect XGBoost
xgboost.config_context(verbosity=0)

# Neural Networks
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import InputLayer, Dense
from keras.callbacks import EarlyStopping

sys.path.append("../")
from utils import preprocessing_tools

print("Tensorflow version " + str(tf.__version__))

config = {
    'seed': 14,
    'balance_dataset': True,

    # NN Parameters
    'batch_size': 50,
    "no_epochs": 25
}

train_data = pd.read_csv('../input/train.csv')
train_data.head(5)

  from pandas import MultiIndex, Int64Index
2023-01-23 22:14:42.865549: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Tensorflow version 2.10.0


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# This time, we also import the test dataset for submission
# We will process both in parallel
test_data = pd.read_csv('../input/test.csv')
test_data.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [3]:
# Extract relevant features
train_data = preprocessing_tools.titanic_feature_extraction(train_data)
test_data = preprocessing_tools.titanic_feature_extraction(test_data)

# Fill NA from Embarked
train_data['Embarked'] = train_data.Embarked.fillna('NoBoardingRecorded')
test_data['Embarked'] = test_data.Embarked.fillna('NoBoardingRecorded')

# OneHotEncode Categories
cat_encoder = OneHotEncoder()
cat_features = ['Embarked', 'cabinLetter', 'Pclass']
all_train_df = [train_data]
all_test_df = [test_data]
# Perform OneHotEncoder for each feature
for cat in cat_features:
    # Perform transformation first with train data 
    cat_x = cat_encoder.fit_transform(train_data[[cat]])
    all_train_df.append(pd.DataFrame(
        cat_x.toarray(), 
        columns=cat_encoder.categories_, 
        index=train_data.index))

    # Perform the same with test data, only transform
    cat_x = cat_encoder.transform(test_data[[cat]])
    all_test_df.append(pd.DataFrame(
        cat_x.toarray(), 
        columns=cat_encoder.categories_, 
        index=test_data.index))
    

# Concatenate OneHotEncoder results
train_data = pd.concat(all_train_df, axis=1)
test_data = pd.concat(all_test_df, axis=1)

# Input missing values of age as the median
median_imputer = SimpleImputer(strategy='median')
train_data['Age'] = median_imputer.fit_transform(train_data[['Age']])
test_data['Age'] = median_imputer.transform(test_data[['Age']])

# Standardise the numerical values
num_scaler = StandardScaler()
num_features = ['Age', 'Fare', 'SibSp', 'Parch']
train_data[num_features] = num_scaler.fit_transform(train_data[num_features])
test_data[num_features] = num_scaler.transform(test_data[num_features])

# Drop columns not used for prediction
clean_train_data = train_data.drop(['PassengerId', 'Name', 'Sex', 'Pclass', 'Ticket', 'Cabin', 'Embarked', 'cabinLetter'], axis=1)
clean_test_data = test_data.drop(['PassengerId', 'Name', 'Sex', 'Pclass', 'Ticket', 'Cabin', 'Embarked', 'cabinLetter'], axis=1)
clean_test_data.head()

Unnamed: 0,Age,SibSp,Parch,Fare,hasCabin,numCabins,isFemale,"(C,)","(NoBoardingRecorded,)","(Q,)",...,"(C,).1","(D,)","(E,)","(F,)","(G,)","(No Cabin,)","(T,)","(1,)","(2,)","(3,)"
0,0.394887,-0.474545,-0.473674,-0.490783,0,0.0,0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,1.35551,0.432793,-0.473674,-0.507479,0,0.0,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,2.508257,-0.474545,-0.473674,-0.453367,0,0.0,0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,-0.181487,-0.474545,-0.473674,-0.474005,0,0.0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,-0.565736,0.432793,0.76763,-0.401017,0,0.0,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [4]:
# Split train data into X_train, y_train (no test this time)
y = clean_train_data.Survived
X = clean_train_data.drop(["Survived"], axis=1)
X.head()

Unnamed: 0,Age,SibSp,Parch,Fare,hasCabin,numCabins,isFemale,"(C,)","(NoBoardingRecorded,)","(Q,)",...,"(C,).1","(D,)","(E,)","(F,)","(G,)","(No Cabin,)","(T,)","(1,)","(2,)","(3,)"
0,-0.565736,0.432793,-0.473674,-0.502445,0,0.0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.663861,0.432793,-0.473674,0.786845,1,1.0,1,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.258337,-0.474545,-0.473674,-0.488854,0,0.0,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.433312,0.432793,-0.473674,0.42073,1,1.0,1,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.433312,-0.474545,-0.473674,-0.486337,0,0.0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [5]:
# If requested, balance the dataset using SMOTEC
if config['balance_dataset']:
    smote_nc = SMOTENC(categorical_features=list(range(7, 23)), random_state=config['seed'])
    X, y = smote_nc.fit_resample(X, y)



In [6]:
# Perform grid search on the entire training set
gs = GridSearchCV(
    estimator=XGBClassifier(eval_metric='logloss'),
    param_grid={
        'n_estimators': [2, 5, 10, 100, 500], 
        "max_depth": [2, 5, 10, 20], 
        "random_state": [config['seed']], 
        "use_label_encoder": [False]
    }, 
    n_jobs=-1, 
    scoring='accuracy', 
    cv=5, 
    verbose=2)
gs.fit(X.values, y)
print(f"Best Estimator -> {gs.best_estimator_}")
print(f"Best Params -> {gs.best_params_}")
print(f"Best Params -> {gs.best_score_}")

Fitting 5 folds for each of 20 candidates, totalling 100 fits


  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index


[CV] END max_depth=2, n_estimators=2, random_state=14, use_label_encoder=False; total time=   0.1s
[CV] END max_depth=2, n_estimators=2, random_state=14, use_label_encoder=False; total time=   0.1s
[CV] END max_depth=2, n_estimators=2, random_state=14, use_label_encoder=False; total time=   0.1s
[CV] END max_depth=2, n_estimators=2, random_state=14, use_label_encoder=False; total time=   0.1s
[CV] END max_depth=2, n_estimators=2, random_state=14, use_label_encoder=False; total time=   0.1s
[CV] END max_depth=2, n_estimators=5, random_state=14, use_label_encoder=False; total time=   0.1s
[CV] END max_depth=2, n_estimators=5, random_state=14, use_label_encoder=False; total time=   0.1s
[CV] END max_depth=2, n_estimators=5, random_state=14, use_label_encoder=False; total time=   0.1s
[CV] END max_depth=2, n_estimators=5, random_state=14, use_label_encoder=False; total time=   0.1s
[CV] END max_depth=2, n_estimators=5, random_state=14, use_label_encoder=False; total time=   0.1s
[CV] END m

In [7]:
# Train the best estimator with all training data
gs.best_estimator_.fit(X.values, y)

In [11]:
# Predict the test dataset with the best estimator
y_pred = gs.best_estimator_.predict(clean_test_data)

# Create the submission dataframe
kaggle_submission = pd.DataFrame()
kaggle_submission['PassengerId'] = test_data['PassengerId']
kaggle_submission['Survived'] = y_pred
kaggle_submission.head(5)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [13]:
kaggle_submission.to_csv("../submission/titanic_submission.csv", index=False)