## Predicting Survival on the Titanic

### History
Perhaps one of the most infamous shipwrecks in history, the Titanic sank after colliding with an iceberg, killing 1502 out of 2224 people on board. Interestingly, by analysing the probability of survival based on a few attributes like gender, age, and social status, we can make very accurate predictions about which passengers would survive. Some groups of people were more likely to survive than others, such as women, children, and the upper class. Therefore, we can learn about society's priorities and privileges at the time.

### Assignment:

Build a Machine Learning Pipeline to engineer the features in the data set and predict who is more likely to survive the catastrophe.

Follow the Jupyter notebook below, and complete the missing bits of code, to achieve each one of the pipeline steps.

In [27]:
import re

# to handle datasets
import pandas as pd
import numpy as np

# for visualization
import matplotlib.pyplot as plt

# to divide train and test set
from sklearn.model_selection import train_test_split

# feature scaling
from sklearn.preprocessing import StandardScaler

# to build the models
from sklearn.linear_model import LogisticRegression

# to evaluate the models
from sklearn.metrics import accuracy_score, roc_auc_score

# to persist the model and the scaler
import joblib

# ========== NEW IMPORTS ========
# Additions with respect to notebook 02-Predicting-Survival-Titanic-Solution

# pipeline
from sklearn.pipeline import Pipeline

# for the preprocessors
from sklearn.base import BaseEstimator, TransformerMixin

# for imputation
from feature_engine.imputation import (
    CategoricalImputer,
    AddMissingIndicator,
    MeanMedianImputer)

# for encoding categorical variables
from feature_engine.encoding import (
    RareLabelEncoder,
    OneHotEncoder
)

## 1. Load the data set


In [8]:
# load the data - it is available open source and online

data = pd.read_csv('https://www.openml.org/data/get_csv/16826755/phpMYEkMl')

# display data
data.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,?,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,?,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,?,135,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"


In [9]:
# replace ? with NaN
data = data.replace('?', np.nan)

In [11]:
# retain only the first cabin if more than one is available per passenger
def get_first_cabin(row):
    """Return the first cabin code from a whitespace-separated cabin entry.

    Parameters
    ----------
    row : str or float
        Raw value of the ``cabin`` column, e.g. ``'C22 C26'``. Missing
        entries are expected to arrive as ``np.nan``.

    Returns
    -------
    str or float
        The first whitespace-delimited token, or ``np.nan`` when the value
        is not a string or contains no token at all.
    """
    try:
        return row.split()[0]
    except (AttributeError, IndexError):
        # AttributeError: non-string value (NaN, None, ...) has no .split()
        # IndexError: empty or whitespace-only string yields no tokens
        return np.nan

data['cabin'] = data['cabin'].apply(get_first_cabin)

In [12]:
data['cabin'].head()

0     B5
1    C22
2    C22
3    C22
4    C22
Name: cabin, dtype: object

In [13]:
# extracts the title (Mr, Ms, etc) from the name variable

def get_title(passenger):
    """Map a passenger name to 'Mrs', 'Mr', 'Miss', 'Master' or 'Other'.

    The candidate titles are searched for anywhere in the name, in a fixed
    order, and the first one found is returned. 'Mrs' has to be tried
    before 'Mr' because any name containing 'Mrs' also contains 'Mr'.
    """
    # order matters: the first title found in the name wins
    for title in ('Mrs', 'Mr', 'Miss', 'Master'):
        if re.search(title, passenger):
            return title
    # none of the known titles appears in the name
    return 'Other'

data['title'] = data['name'].apply(get_title)

In [14]:
data['title'].head()

0      Miss
1    Master
2      Miss
3        Mr
4       Mrs
Name: title, dtype: object

In [16]:
# cast numerical variables as floats
data['fare'] = data['fare'].astype('float')
data['age'] = data['age'].astype('float')

In [17]:
# drop unnecessary variables
data.drop(['name', 'ticket', 'boat', 'body', 'home.dest'], axis=1, inplace=True)


In [18]:
data.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,cabin,embarked,title
0,1,1,female,29.0,0,0,211.3375,B5,S,Miss
1,1,1,male,0.9167,1,2,151.55,C22,S,Master
2,1,0,female,2.0,1,2,151.55,C22,S,Miss
3,1,0,male,30.0,1,2,151.55,C22,S,Mr
4,1,0,female,25.0,1,2,151.55,C22,S,Mrs


In [19]:
# save the data set
data.to_csv('titanic.csv', index=False)

# Begin the assignment

In [20]:
# list variables to be used by the pipeline's transformers
# numerical: every column whose dtype is not object, excluding the target 'survived'
NUMERICAL_VARS = [ var for var in data.columns if data[var].dtype != 'O' and var != 'survived']
# categorical: every column whose dtype is object, excluding the target 'survived'
CATEGORICAL_VARS = [ var for var in data.columns if data[var].dtype == 'O' and var != 'survived']
# the cabin column is listed on its own for the first-letter extraction step
CABIN = ['cabin']


In [21]:
# split data into train and test set
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('survived', axis=1),
    data['survived'],
    test_size=0.2,
    random_state=0)  # we are setting the seed here

In [22]:
X_train.shape, X_test.shape

((1047, 9), (262, 9))

## 2. Preprocessors
### Class to extract first letter of cabin variable

In [24]:
class ExtractFirstLetter(BaseEstimator, TransformerMixin):
    """Transformer that reduces the given columns to their first character.

    Parameters
    ----------
    variables : list or single column name
        Columns to transform. A value that is not a list is wrapped in a
        one-element list.
    """

    def __init__(self, variables=None):
        # always keep the column names as a list
        self.variables = variables if isinstance(variables, list) else [variables]

    def fit(self, X, y=None):
        """Learn nothing; present so the step can be fitted inside a pipeline."""
        return self

    def transform(self, X):
        """Return a copy of X with each configured column cut to its first character."""
        # work on a copy so the caller's dataframe is left untouched
        result = X.copy()
        for column in self.variables:
            first_letters = result[column].str[0]
            result[column] = first_letters
        return result

## Pipeline
- Impute categorical variables with string missing
- Add a binary missing indicator to numerical variables with missing data
- Fill NA in original numerical variable with the median
- Extract first letter from cabin
- Group rare Categories
- Perform One hot encoding
- Scale features with standard scaler
- Fit a Logistic regression

In [28]:
# set up the pipeline: a list of (step name, transformer/estimator) pairs,
# given in the order in which the steps are meant to be applied
titanic_pipe = Pipeline([

    # ===== IMPUTATION =====
    # impute categorical variables with string 'missing'
     ('categorical_imputation', CategoricalImputer(
        imputation_method='missing', variables=CATEGORICAL_VARS)),
    # add missing indicator to numerical variables
    # (placed before the median imputation, which fills the missing values)
    ('missing_indicator', AddMissingIndicator(variables=NUMERICAL_VARS)),

    # impute numerical variables with the median
    ('median_imputation', MeanMedianImputer(
        imputation_method='median', variables=NUMERICAL_VARS)),


    # Extract first letter from cabin
    # NOTE(review): runs after the categorical imputation, so missing cabins
    # presumably already hold the imputed string at this point - confirm
    ('extract_letter', ExtractFirstLetter(variables=CABIN)),


    # == CATEGORICAL ENCODING ======
    # categories present in less than 5% of the observations (tol=0.05)
    # are not dropped: they are grouped in one category called 'Rare'
    ('rare_label_encoder', RareLabelEncoder(
        tol=0.05, n_categories=1, variables=CATEGORICAL_VARS)),


    # encode categorical variables using one hot encoding into k-1 variables
    # (drop_last=True)
    ('categorical_encoder', OneHotEncoder(
        drop_last=True, variables=CATEGORICAL_VARS)),
    # scale using standardization
    ('scaler', StandardScaler()),

    # final estimator: logistic regression with C=0.0005 and a fixed
    # random_state=0
    ('Logit', LogisticRegression(C=0.0005, random_state=0)),
])

In [30]:
# train the pipeline
titanic_pipe.fit(X_train, y_train)

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/manuel/.virtualenvs/deploying-machine-learning-models/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3505, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/w2/69b2sdj90tq97dgr3llxwvdm0000gn/T/ipykernel_47978/1898835097.py", line 2, in <module>
    titanic_pipe.fit(X_train, y_train)
  File "/Users/manuel/.virtualenvs/deploying-machine-learning-models/lib/python3.9/site-packages/sklearn/pipeline.py", line 401, in fit
  File "/Users/manuel/.virtualenvs/deploying-machine-learning-models/lib/python3.9/site-packages/sklearn/pipeline.py", line 359, in _fit
    X : iterable
  File "/Users/manuel/.virtualenvs/deploying-machine-learning-models/lib/python3.9/site-packages/joblib/memory.py", line 349, in __call__
    self.func = func
  File "/Users/manuel/.virtualenvs/deploying-machine-learning-models/lib/python3.9/site-packages/sklearn/pipeline.py", line 893, in _fit_transform_one
    hasa