### Titanic Passenger Survival Prediction using Machine Learning
### Preprocessing: 
To make the data suitable for machine learning, we apply some preprocessing: handling missing data, dropping unneeded columns, transforming features, and encoding categorical variables.


### Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import warnings
# Ignore every FutureWarning raised from here on.
warnings.simplefilter(action = 'ignore', category = FutureWarning)
# Default matplotlib figure size (width, height) for all plots created afterwards.
plt.rcParams['figure.figsize'] = (12.0, 5.0)

In [2]:
# Load the training CSV; the first column of the file becomes the DataFrame index
# (presumably PassengerId in the Titanic dataset -- confirm against the file).
df = pd.read_csv('../data/train.csv', index_col=0)

In [3]:
def _bucket_family_size(size):
    """Map a raw family size to a bucket code.

    1 -> 1 (alone), 2 -> 2, 3..4 -> 3, 5 or more -> 4, anything else -> 0.
    """
    if size == 1:
        return 1
    if size == 2:
        return 2
    if 3 <= size <= 4:
        return 3
    if size >= 5:
        return 4
    return 0


def _fill_missing(df):
    """Impute missing ``Age`` and ``Embarked`` values on *df* and return it."""
    # From EDA: Age has 19.87%, Cabin has 77.10% and Embarked has 0.25% NaNs.
    ages = df['Age']

    # Fill missing ages with the mean age of passengers sharing the same Sex
    # and Pclass. ``transform`` keeps the result aligned with df's own index;
    # ``groupby(...).apply(...)`` prepends the group keys to the index on
    # recent pandas versions, which makes the column assignment fail.
    group_mean_age = df.groupby(['Sex', 'Pclass'])['Age'].transform('mean')
    filled = ages.fillna(group_mean_age)

    # A (Sex, Pclass) group without a single known age has a NaN mean; fall
    # back to the overall mean so the later integer cast does not fail.
    df['Age'] = filled.fillna(ages.mean())

    # Embarked: the few NaNs are replaced with 'S' (the mode found in EDA).
    df['Embarked'] = df['Embarked'].fillna('S')
    return df


def _drop_unused_columns(df):
    """Return *df* without the columns not used for modeling."""
    return df.drop(['Cabin', 'Name', 'Ticket'], axis=1)


def _transform_features(df):
    """Add the bucketed ``fam_size`` feature and bin ``Age`` and ``Fare``."""
    # SibSp + Parch + 1 is the number of people travelling together
    # (1 means the passenger was alone); it is then reduced to a bucket code.
    df['fam_size'] = (df['SibSp'] + df['Parch'] + 1).map(_bucket_family_size)

    # Age: 5 equal-width bins; Fare: 5 quantile bins. Both are truncated to
    # integers before binning.
    df['Age'] = pd.cut(df['Age'].astype(int), 5)
    df['Fare'] = pd.qcut(df['Fare'].astype(int), 5)
    return df


def _encode_features(df):
    """Label-encode the non-numeric columns and append one-hot columns.

    The label-encoded source columns are kept; the one-hot columns are
    appended next to them.
    """
    # source: https://towardsdatascience.com/machine-learning-with-the-titanic-dataset-7f6909e58280
    # Label-encode the non-numeric features (including the interval bins
    # produced for Age and Fare) into integer codes.
    for feature in ('Embarked', 'Sex', 'Age', 'Fare'):
        df[feature] = LabelEncoder().fit_transform(df[feature])

    # One-hot encode every categorical feature into indicator columns.
    one_hot_frames = []
    for feature in ('Embarked', 'Sex', 'Age', 'Fare', 'Pclass', 'fam_size'):
        encoded = OneHotEncoder().fit_transform(
            df[feature].values.reshape(-1, 1)
        ).toarray()

        # Name the columns '<feature>_1' .. '<feature>_k'. The suffix is the
        # 1-based position of the encoder output column, not the category
        # value itself. The count comes from the encoded array so names and
        # data can never disagree.
        cols = [
            '{}_{}'.format(feature, position)
            for position in range(1, encoded.shape[1] + 1)
        ]
        one_hot_frames.append(
            pd.DataFrame(encoded, columns=cols, index=df.index)
        )

    return pd.concat([df, *one_hot_frames], axis=1)


def preprocess(df):
    """Prepare the raw Titanic passenger frame for machine learning.

    Steps, in order: impute missing ``Age``/``Embarked``; drop ``Cabin``,
    ``Name`` and ``Ticket``; add the bucketed ``fam_size`` feature and bin
    ``Age``/``Fare``; label-encode the non-numeric columns and append
    one-hot indicator columns for every categorical feature.

    Args:
        df: DataFrame with at least the columns ``Age``, ``Sex``, ``Pclass``,
            ``Embarked``, ``Cabin``, ``Name``, ``Ticket``, ``SibSp``,
            ``Parch`` and ``Fare``.

    Returns:
        A new DataFrame holding the remaining original columns (with
        ``Embarked``, ``Sex``, ``Age`` and ``Fare`` label-encoded), the
        ``fam_size`` column, and the appended one-hot columns. The input
        frame is left unmodified.
    """
    # Work on a copy so the imputation step does not write into the caller's
    # DataFrame.
    df = df.copy()

    df = _fill_missing(df)
    df = _drop_unused_columns(df)
    df = _transform_features(df)
    df = _encode_features(df)

    return df