# Titanic Prediction, with all models explains

This notebook extracts the features, performs the preprocessing steps, and applies different models. 
Each model is described, highlighting its benefits and potential drawbacks

# 0. Load libraries and input data

In [1]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

sns.set()

sys.path.append("../")
from utils import preprocessing_tools

config = {
    'test_size': 0.2,
    'seed': 14
}

train_data = pd.read_csv('../input/train.csv')
train_data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# 1. Preprocessing steps
This section includes:
- Feature extraction
- Preprocessing using Pipelines, including
    - Fix categorical missing values with rules
    - One-hot encoding
    - Fix missing values with Imputer
    - Feature scaling
- Train/Test split

In [6]:
# Extract relevant features
train_data = preprocessing_tools.titanic_feature_extraction(train_data)

# Fill NA from Embarked
train_data['Embarked'] = train_data.Embarked.fillna('NoBoardingRecorded')

# OneHotEncode Categories
cat_encoder = OneHotEncoder()
cat_features = ['Embarked', 'cabinLetter', 'Pclass']
all_df = [train_data]
# Perform OneHotEncoder for each feature
for cat in cat_features:
    cat_x = cat_encoder.fit_transform(train_data[[cat]])
    all_df.append(pd.DataFrame(
        cat_x.toarray(), 
        columns=cat_encoder.categories_, 
        index=train_data.index))

# Concatenate OneHotEncoder results
train_data = pd.concat(all_df, axis=1)

# Input missing values of age as the median
median_imputer = SimpleImputer(strategy='median')
train_data['Age'] = median_imputer.fit_transform(train_data[['Age']])

# Standardise the numerical values
num_scaler = StandardScaler()
num_features = ['Age', 'Fare', 'SibSp', 'Parch']
train_data[num_features] = num_scaler.fit_transform(train_data[num_features])

# Drop columns not used for prediction
clean_train_data = train_data.drop(['PassengerId', 'Name', 'Sex', 'Pclass', 'Ticket', 'Cabin', 'Embarked', 'cabinLetter'], axis=1)
clean_train_data.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,hasCabin,numCabins,isFemale,"(C,)","(Q,)",...,"(C,).1","(D,)","(E,)","(F,)","(G,)","(No Cabin,)","(T,)","(1,)","(2,)","(3,)"
0,0,-0.565736,0.432793,-0.473674,-0.502445,0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,1,0.663861,0.432793,-0.473674,0.786845,1,1.0,1,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1,-0.258337,-0.474545,-0.473674,-0.488854,0,0.0,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,1,0.433312,0.432793,-0.473674,0.42073,1,1.0,1,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0,0.433312,-0.474545,-0.473674,-0.486337,0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [7]:
clean_train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 56 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Survived        891 non-null    int64  
 1   Age             891 non-null    float64
 2   SibSp           891 non-null    float64
 3   Parch           891 non-null    float64
 4   Fare            891 non-null    float64
 5   hasCabin        891 non-null    int64  
 6   numCabins       891 non-null    float64
 7   isFemale        891 non-null    int64  
 8   (C,)            891 non-null    float64
 9   (Q,)            891 non-null    float64
 10  (S,)            891 non-null    float64
 11  (nan,)          891 non-null    float64
 12  (A,)            891 non-null    float64
 13  (B,)            891 non-null    float64
 14  (C,)            891 non-null    float64
 15  (D,)            891 non-null    float64
 16  (E,)            891 non-null    float64
 17  (F,)            891 non-null    flo

In [4]:
# Split dataset between features and labels
y = clean_train_data.Survived
X = clean_train_data.drop(["Survived"], axis=1)
X.head()

Unnamed: 0,Age,SibSp,Parch,Fare,hasCabin,numCabins,isFemale,"(C,)","(Q,)","(S,)",...,"(C,).1","(D,)","(E,)","(F,)","(G,)","(No Cabin,)","(T,)","(1,)","(2,)","(3,)"
0,-0.565736,0.432793,-0.473674,-0.502445,0,0.0,0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.663861,0.432793,-0.473674,0.786845,1,1.0,1,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.258337,-0.474545,-0.473674,-0.488854,0,0.0,1,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.433312,0.432793,-0.473674,0.42073,1,1.0,1,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.433312,-0.474545,-0.473674,-0.486337,0,0.0,0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=14)