### Titanic Passenger Survival Prediction using Machine Learning
### Preprocessing: 
To make the data suitable for machine learning, we preprocess it in several steps: handling missing values, dropping unused columns, engineering and binning features, and encoding the categorical columns.


### Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import warnings
warnings.simplefilter(action = 'ignore', category = FutureWarning)
plt.rcParams['figure.figsize'] = (12.0, 5.0)

In [2]:
# Load the training CSV; index_col=0 turns the first CSV column into the row index.
df = pd.read_csv('../data/train.csv', index_col=0)
df

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
# Print per-column dtypes and non-null counts to see which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [4]:
# Handle missing data. From EDA, Age has 19.87%, Cabin has 77.10% and Embarked has 0.22% (2 of 891 rows) NaNs.
def process_missing(df, embarked_fill='S'):
    """Fill missing ``Age`` and ``Embarked`` values and return a new frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Sex``, ``Pclass``, ``Age`` and ``Embarked``.
    embarked_fill : str, default 'S'
        Value used for missing ``Embarked`` entries. The default keeps the
        original hard-coded choice ('S', noted in the notebook as the mode).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the same index and columns; the input frame is
        not modified.

    Notes
    -----
    * A missing age is replaced by the mean age of the rows sharing the same
      (Sex, Pclass) pair. If every age in such a group is missing, the group
      mean is NaN and those ages stay missing.
    * ``Cabin`` is intentionally left as-is here; it is dropped in a later
      step because of how many of its values are missing.
    """
    # Work on a copy so the caller's frame is not silently altered.
    df = df.copy()

    # transform('mean') returns one value per row, aligned on the original
    # index. The previous groupby(...).apply(lambda ...) form returns a
    # group-keyed MultiIndex on pandas >= 2.0, which cannot be assigned back.
    group_mean_age = df.groupby(['Sex', 'Pclass'])['Age'].transform('mean')
    df['Age'] = df['Age'].fillna(group_mean_age)

    # For Embarked, replace the few NaNs with the chosen fill value.
    df['Embarked'] = df['Embarked'].fillna(embarked_fill)
    return df

# Apply the missing-value handling and preview the first rows of the result.
df = process_missing(df)
df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
def unwanted_cols(df):
    """Return ``df`` without the ``Cabin``, ``Name`` and ``Ticket`` columns.

    These three columns are not used for EDA or modeling. The input frame is
    not modified; pandas raises ``KeyError`` if any of the columns is absent.
    """
    columns_to_remove = ['Cabin', 'Name', 'Ticket']
    return df.drop(columns=columns_to_remove)

# Drop the Cabin, Name and Ticket columns and display the remaining frame.
df = unwanted_cols(df)
df

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,male,22.00,1,0,7.2500,S
2,1,1,female,38.00,1,0,71.2833,C
3,1,3,female,26.00,0,0,7.9250,S
4,1,1,female,35.00,1,0,53.1000,S
5,0,3,male,35.00,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
887,0,2,male,27.00,0,0,13.0000,S
888,1,1,female,19.00,0,0,30.0000,S
889,0,3,female,21.75,1,2,23.4500,S
890,1,1,male,26.00,0,0,30.0000,C


In [6]:
def _family_size_bucket(size):
    """Map a family head-count to a bucket: 1 -> 1, 2 -> 2, 3-4 -> 3, 5+ -> 4, otherwise 0."""
    if size == 1:
        return 1
    if size == 2:
        return 2
    if 3 <= size <= 4:
        return 3
    if size >= 5:
        return 4
    return 0


def feature_transformation(df, age_bins=5, fare_bins=5):
    """Add a bucketed family-size feature and discretise ``Age`` and ``Fare``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``SibSp``, ``Parch``, ``Age`` and ``Fare``. ``Age`` and
        ``Fare`` must not contain NaN, because both are cast to ``int``.
    age_bins : int, default 5
        Number of equal-width bins passed to ``pd.cut`` for ``Age``.
    fare_bins : int, default 5
        Number of quantile bins passed to ``pd.qcut`` for ``Fare``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with a new ``fam_size`` column and with ``Age`` and
        ``Fare`` replaced by interval categories. The input frame is not
        modified.

    Raises
    ------
    ValueError
        Raised by pandas when ``Age`` or ``Fare`` contains NaN (the cast to
        ``int`` fails).

    Notes
    -----
    The bin edges are computed from the values in ``df`` itself, so calling
    this function on two different frames generally yields different
    intervals.
    """
    # Copy first: the previous version wrote the new columns straight into
    # the caller's frame.
    df = df.copy()

    # SibSp and Parch together tell whether a passenger travelled alone;
    # +1 counts the passenger themselves.
    family_count = df['SibSp'] + df['Parch'] + 1
    df['fam_size'] = family_count.map(_family_size_bucket)

    # Truncate to whole numbers, then bin: equal-width bins for Age,
    # equal-frequency (quantile) bins for Fare.
    df['Age'] = pd.cut(df['Age'].astype(int), age_bins)
    df['Fare'] = pd.qcut(df['Fare'].astype(int), fare_bins)
    return df

# Add fam_size, bin Age and Fare, and preview the first rows.
df = feature_transformation(df)
df.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,fam_size
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,3,male,"(16.0, 32.0]",1,0,"(-0.001, 7.0]",S,2
2,1,1,female,"(32.0, 48.0]",1,0,"(39.0, 512.0]",C,2
3,1,3,female,"(16.0, 32.0]",0,0,"(-0.001, 7.0]",S,1
4,1,1,female,"(32.0, 48.0]",1,0,"(39.0, 512.0]",S,2
5,0,3,male,"(32.0, 48.0]",0,0,"(7.0, 10.0]",S,1


In [7]:
def feature_encoding(df):
    """Label-encode the non-numeric features, then append one-hot columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Embarked``, ``Sex``, ``Age``, ``Fare``, ``Pclass`` and
        ``fam_size``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which ``Embarked``, ``Sex``, ``Age`` and ``Fare``
        hold integer label codes, followed by one block of one-hot columns per
        categorical feature. The input frame is not modified.

    Notes
    -----
    One-hot columns are named ``<feature>_<k>`` where ``k`` is the 1-based
    position of the column in the encoder output, not the category value
    itself (for example ``Embarked_1`` is the first encoded Embarked label).
    """
    # Copy first: the previous version overwrote the caller's columns with
    # their label codes.
    df = df.copy()

    # source:https://towardsdatascience.com/machine-learning-with-the-titanic-dataset-7f6909e58280
    # Label-encode the values of the non-numeric features to integers.
    non_num_features = ['Embarked', 'Sex', 'Age', 'Fare']
    for feature in non_num_features:
        df[feature] = LabelEncoder().fit_transform(df[feature])

    # One-hot encode every categorical feature into binary indicator columns.
    cat_features = ['Embarked', 'Sex', 'Age', 'Fare', 'Pclass', 'fam_size']
    encoded_features = []
    for feature in cat_features:
        encoded_feat = OneHotEncoder().fit_transform(df[feature].values.reshape(-1, 1)).toarray()
        # Take the column count from the encoded matrix itself so the names
        # always match its width (nunique() ignores NaN and could disagree).
        n_categories = encoded_feat.shape[1]
        cols = ['{}_{}'.format(feature, position) for position in range(1, n_categories + 1)]
        # Reuse the frame's index so the concat below aligns row for row.
        encoded_features.append(pd.DataFrame(encoded_feat, columns=cols, index=df.index))

    return pd.concat([df, *encoded_features], axis=1)

# Label-encode and one-hot encode the categorical features, then display the widened frame.
df = feature_encoding(df)
df

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,fam_size,Embarked_1,...,Fare_3,Fare_4,Fare_5,Pclass_1,Pclass_2,Pclass_3,fam_size_1,fam_size_2,fam_size_3,fam_size_4
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,3,1,1,1,0,0,2,2,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,1,1,0,2,1,0,4,0,2,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1,3,0,1,0,0,0,2,1,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,1,1,0,2,1,0,4,2,2,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
5,0,3,1,2,0,0,1,2,1,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,1,1,0,0,2,2,1,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
888,1,1,0,1,0,0,3,2,1,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
889,0,3,0,1,1,2,3,2,3,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
890,1,1,1,1,0,0,3,0,1,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [8]:
def more_unwanted_cols(df):
    """Return ``df`` without the source columns that are no longer needed for modeling.

    Removes ``SibSp`` and ``Parch`` together with every column that already
    has a label-encoded / one-hot encoded variant. The input frame is not
    modified; pandas raises ``KeyError`` if any listed column is absent.
    """
    superseded_columns = [
        'Pclass', 'Sex', 'Age', 'SibSp',
        'Parch', 'Fare', 'Embarked', 'fam_size',
    ]
    return df.drop(columns=superseded_columns)

# Drop the original source columns listed in more_unwanted_cols and display the result.
df = more_unwanted_cols(df)
df

Unnamed: 0_level_0,Survived,Embarked_1,Embarked_2,Embarked_3,Sex_1,Sex_2,Age_1,Age_2,Age_3,Age_4,...,Fare_3,Fare_4,Fare_5,Pclass_1,Pclass_2,Pclass_3,fam_size_1,fam_size_2,fam_size_3,fam_size_4
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
5,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
888,1,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
889,0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
890,1,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [9]:
# Summary statistics of the final frame (count, mean, std, min, quartiles, max per column).
df.describe()

Unnamed: 0,Survived,Embarked_1,Embarked_2,Embarked_3,Sex_1,Sex_2,Age_1,Age_2,Age_3,Age_4,...,Fare_3,Fare_4,Fare_5,Pclass_1,Pclass_2,Pclass_3,fam_size_1,fam_size_2,fam_size_3,fam_size_4
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,...,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,0.188552,0.08642,0.725028,0.352413,0.647587,0.112233,0.555556,0.242424,0.077441,...,0.191919,0.20202,0.197531,0.242424,0.20651,0.551066,0.602694,0.180696,0.147026,0.069585
std,0.486592,0.391372,0.281141,0.446751,0.47799,0.47799,0.315831,0.497183,0.42879,0.26744,...,0.394031,0.401733,0.39836,0.42879,0.405028,0.497665,0.489615,0.384982,0.354331,0.254589
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
