# Titanic: Machine Learning from Disaster_2


This notebook is an improved version of my earlier project, "Titanic: Machine Learning from Disaster".  


## Overview 

This notebook runs through most of the basic components of a ML script on the Titanic dataset, using  
  
- Python
- pandas
- scikit-learn
- XGBoost

The goal is to use a simple and easy to understand implementation of:

- feature engineering
- feature selection using Greedy Search (RFECV)
- hyperparameter tuning using Grid Search
- XGBoost classifier

## Step 1 

***Import the packages.***

In [2]:
import re

import numpy as np
import pandas as pd
import xgboost as xgb
from IPython.display import display
from sklearn import preprocessing
from sklearn.feature_selection import RFECV

try:
    # Legacy scikit-learn modules, kept only for cells that still reference
    # them on old installations.
    from sklearn import cross_validation
    from sklearn.grid_search import GridSearchCV
except ImportError:
    # Both modules were removed in scikit-learn 0.20.
    cross_validation = None

# Imported last so these names always refer to the model_selection versions.
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score



***Functions to generate new features***

In [3]:
def extract_maritial(name):
    """Extract a person's title from their name and bin the rarer titles.

    The title is the first run of letters followed by a period and surrounded
    by spaces (e.g. ``' Mr. '``).  Titles listed in the table below are
    collapsed into 'Sir.', 'Officer.', 'Miss.' or 'High.'; any other title
    (such as 'Mr.', 'Mrs.' or 'Miss.') is returned unchanged.

    The binning assumes that a Miss, Lady or Countess has more chance to
    survive than a regular married woman.

    Raises:
        IndexError: if ``name`` contains no title.
    """
    re_maritial = r' ([A-Za-z]+\.) '   # a word ending in '.', between spaces
    found = re.findall(re_maritial, name)[0]

    replace = {
        'Dr.': 'Sir.',
        'Rev.': 'Sir.',
        'Major.': 'Officer.',
        'Mlle.': 'Miss.',
        'Col.': 'Officer.',
        'Master.': 'Sir.',
        'Jonkheer.': 'Sir.',
        'Sir.': 'Sir.',
        'Don.': 'Sir.',
        'Countess.': 'High.',
        'Capt.': 'Officer.',
        'Ms.': 'High.',
        'Mme.': 'High.',
        'Dona.': 'High.',
        'Lady.': 'High.',
    }
    # Titles without an entry in the table pass through untouched.
    return replace.get(found, found)


def father(sex, age, parch):
    """Return 1 for a male older than 16 with a positive Parch count, else 0."""
    is_father = sex == 'male' and age > 16 and parch > 0
    return 1 if is_father else 0
        
        
def mother(sex, age, parch):
    """Return 1 for a female older than 16 with a positive Parch count, else 0."""
    is_mother = sex == 'female' and age > 16 and parch > 0
    return 1 if is_mother else 0
        
        
def parent(sex, age, parch):
    """Return 1 if the passenger counts as a mother or as a father, else 0."""
    flags = (mother(sex, age, parch), father(sex, age, parch))
    return 1 if 1 in flags else 0
        
        
def extract_cabin_nr(cabin):
    """Extract the cabin number from a cabin string.

    When several cabins are listed (separated by spaces) only the last one is
    used.  The number is the run of digits that directly follows an uppercase
    letter, e.g. 85 for 'C85'.

    Returns:
        The cabin number as an int, or NaN when ``cabin`` is null or the last
        cabin entry has no letter-plus-digits part.
    """
    if pd.isnull(cabin):
        return np.nan

    last_cabin = cabin.split(' ')[-1]    # if several cabins on ticket, take last one
    match = re.search(r'[A-Z]([0-9]+)', last_cabin)
    # Explicit no-match check instead of a bare `except:`, which would also
    # hide unrelated errors (and KeyboardInterrupt).
    if match is None:
        return np.nan
    return int(match.group(1))
    
    
def extract_cabin_letter(cabin):
    """Extract the cabin letter from a cabin string.

    When several cabins are listed (separated by spaces) only the last one is
    used.  The letter is the uppercase character directly followed by digits,
    e.g. 'C' for 'C85'.  A cabin entry that is a letter without a number
    (e.g. 'D') therefore yields NaN.

    Returns:
        The cabin letter as a one-character string, or NaN when ``cabin`` is
        null or the last cabin entry has no letter-plus-digits part.
    """
    if pd.isnull(cabin):
        return np.nan

    last_cabin = cabin.split(' ')[-1]    # if several cabins on ticket, take last one
    match = re.search(r'([A-Z])[0-9]+', last_cabin)
    # Explicit no-match check instead of a bare `except:`, which would also
    # hide unrelated errors (and KeyboardInterrupt).
    if match is None:
        return np.nan
    return match.group(1)
        
        
def expand_sex(sex, age):
    """Return 'kid' for anyone younger than 14, otherwise the given sex.

    The idea is that below 14 years old, being male or female is irrelevant.
    """
    return 'kid' if age < 14 else sex

***Function to add the new features to the dataset***

In [5]:
def feat_eng(data):
    """Add the engineered feature columns to ``data`` and return it.

    New columns: Title, Cabin_char, Cabin_nr, Cabin_nr_odd, Father, Mother,
    Parent, has_parents_or_kids, FamilySize, FareBin and AgeBin.  The existing
    Sex column is overwritten with male/female/kid.

    The frame is modified in place; the same object is returned.
    """
    # Title extracted from the person's name.
    data['Title'] = [extract_maritial(name) for name in data['Name']]

    # Features extracted from the cabin string.
    data['Cabin_char'] = [extract_cabin_letter(cabin) for cabin in data['Cabin']]
    data['Cabin_nr'] = [extract_cabin_nr(cabin) for cabin in data['Cabin']]
    # NaN % 2 is NaN, so missing cabin numbers stay missing.  (The previous
    # `x == np.nan` guard never fired: NaN compares unequal to everything.)
    data['Cabin_nr_odd'] = data['Cabin_nr'] % 2

    # Family features.  These must be computed before Sex is overwritten
    # below, because father()/mother() test for 'male'/'female'.
    family_inputs = list(zip(data.Sex, data.Age, data.Parch))
    data['Father'] = [father(sex, age, parch) for sex, age, parch in family_inputs]
    data['Mother'] = [mother(sex, age, parch) for sex, age, parch in family_inputs]
    data['Parent'] = [parent(sex, age, parch) for sex, age, parch in family_inputs]
    data['has_parents_or_kids'] = (data.Parch > 0).astype(int)
    data['FamilySize'] = data.SibSp + data.Parch

    # Extend the male/female feature with kid, because for kids gender doesn't matter.
    data['Sex'] = [expand_sex(sex, age) for sex, age in zip(data['Sex'], data['Age'])]

    # Bins for Fare and Age; rows with a missing value get a missing bin.
    data['FareBin'] = pd.cut(data.Fare, bins=(-1000, 0, 8.67, 16.11, 32, 350, 1000))
    data['AgeBin'] = pd.cut(data.Age, bins=(0, 15, 25, 60, 90))

    return data

***Function to handle missing data***

In [6]:
def missing(data):
    """Fill missing values and drop the raw columns that are no longer needed.

    - Age: filled with the median Age of passengers sharing the same Title.
    - Fare: filled with the median Fare.
    - Embarked: filled with the most frequent value.
    - Cabin_char, Cabin_nr, Cabin_nr_odd: filled with the sentinel -9999.

    The fills are written to ``data`` itself; the returned frame is a new
    object without the Name, Cabin, Fare, Age and Ticket columns.

    NOTE(review): Age and Fare are imputed here and then dropped, and nothing
    in this function recomputes AgeBin/FareBin, so bins that were missing on
    entry stay missing -- confirm whether the bins should be rebuilt after
    imputation.
    """
    # If Age is null, impute it with the median Age for the person's title.
    title_median_age = data.groupby('Title')['Age'].transform('median')
    data['Age'] = data['Age'].fillna(title_median_age)

    # Assign the filled column back rather than calling fillna(inplace=True)
    # on a column selection, which is deprecated and has no effect under
    # pandas copy-on-write.
    data['Fare'] = data['Fare'].fillna(data['Fare'].median())
    data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

    # Deal with the NaNs in some of the newly created columns.
    for column in ('Cabin_char', 'Cabin_nr', 'Cabin_nr_odd'):
        data[column] = data[column].fillna(-9999)

    # After feature engineering, some of the original features are not needed anymore.
    return data.drop(columns=['Name', 'Cabin', 'Fare', 'Age', 'Ticket'])

## Step 2 

***Preparing the training set***

In [9]:
# Load the raw training data and show the first rows.
train = pd.read_csv('input/train.csv')
display("Unaltered training set:", train.head(8))

# Add the engineered features.
train = feat_eng(train)
display("After feature engineering:", train.head(8))

# Fill in missing values and drop the raw columns that are no longer needed.
train = missing(train)
display("After handling missing values:", train.head(8))

# One-hot encode the categorical columns (first level of each is dropped).
train = pd.get_dummies(train, drop_first=True)
display("After handling categorical values:", train.head(8))

'Unaltered training set:'

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S


'After feature engineering:'

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Cabin_char,Cabin_nr,Cabin_nr_odd,Father,Mother,Parent,has_parents_or_kids,FamilySize,FareBin,AgeBin
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,...,,,,0,0,0,0,1,"(0.0, 8.67]","(15, 25]"
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,C,85.0,1.0,0,0,0,0,1,"(32.0, 350.0]","(25, 60]"
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,...,,,,0,0,0,0,0,"(0.0, 8.67]","(25, 60]"
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,...,C,123.0,1.0,0,0,0,0,1,"(32.0, 350.0]","(25, 60]"
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,...,,,,0,0,0,0,0,"(0.0, 8.67]","(25, 60]"
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,...,,,,0,0,0,0,0,"(0.0, 8.67]",
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,...,E,46.0,0.0,0,0,0,0,0,"(32.0, 350.0]","(25, 60]"
7,8,0,3,"Palsson, Master. Gosta Leonard",kid,2.0,3,1,349909,21.075,...,,,,0,0,0,1,4,"(16.11, 32.0]","(0, 15]"


'After handling missing values:'

Unnamed: 0,PassengerId,Survived,Pclass,Sex,SibSp,Parch,Embarked,Title,Cabin_char,Cabin_nr,Cabin_nr_odd,Father,Mother,Parent,has_parents_or_kids,FamilySize,FareBin,AgeBin
0,1,0,3,male,1,0,S,Mr.,-9999,-9999.0,-9999.0,0,0,0,0,1,"(0.0, 8.67]","(15, 25]"
1,2,1,1,female,1,0,C,Mrs.,C,85.0,1.0,0,0,0,0,1,"(32.0, 350.0]","(25, 60]"
2,3,1,3,female,0,0,S,Miss.,-9999,-9999.0,-9999.0,0,0,0,0,0,"(0.0, 8.67]","(25, 60]"
3,4,1,1,female,1,0,S,Mrs.,C,123.0,1.0,0,0,0,0,1,"(32.0, 350.0]","(25, 60]"
4,5,0,3,male,0,0,S,Mr.,-9999,-9999.0,-9999.0,0,0,0,0,0,"(0.0, 8.67]","(25, 60]"
5,6,0,3,male,0,0,Q,Mr.,-9999,-9999.0,-9999.0,0,0,0,0,0,"(0.0, 8.67]",
6,7,0,1,male,0,0,S,Mr.,E,46.0,0.0,0,0,0,0,0,"(32.0, 350.0]","(25, 60]"
7,8,0,3,kid,3,1,S,Sir.,-9999,-9999.0,-9999.0,0,0,0,1,4,"(16.11, 32.0]","(0, 15]"


'After handling categorical values:'

Unnamed: 0,PassengerId,Survived,Pclass,SibSp,Parch,Cabin_nr,Cabin_nr_odd,Father,Mother,Parent,...,Cabin_char_F,Cabin_char_G,"FareBin_(0.0, 8.67]","FareBin_(8.67, 16.11]","FareBin_(16.11, 32.0]","FareBin_(32.0, 350.0]","FareBin_(350.0, 1000.0]","AgeBin_(15, 25]","AgeBin_(25, 60]","AgeBin_(60, 90]"
0,1,0,3,1,0,-9999.0,-9999.0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
1,2,1,1,1,0,85.0,1.0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2,3,1,3,0,0,-9999.0,-9999.0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
3,4,1,1,1,0,123.0,1.0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
4,5,0,3,0,0,-9999.0,-9999.0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
5,6,0,3,0,0,-9999.0,-9999.0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
6,7,0,1,0,0,46.0,0.0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
7,8,0,3,3,1,-9999.0,-9999.0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


***Training the XGBoost model***

In [10]:
# Every column except the label and the row identifier is a model input.
# (`columns=` replaces the positional axis argument, which pandas 2 rejects.)
feature_frame = train.drop(columns=['Survived', 'PassengerId'])
training_features = np.array(feature_frame.columns)

# Cast explicitly to float so that boolean dummy columns (the get_dummies
# default from pandas 2.0 on) do not turn X into an object array.
# No feature scaling is applied.
X = np.array(feature_frame, dtype=float)
y = np.array(train['Survived'])

In [11]:
# Baseline: default XGBoost classifier scored with 20-fold shuffled CV.
# KFold / cross_val_score come from sklearn.model_selection; the old
# sklearn.cross_validation module no longer exists in scikit-learn >= 0.20.
clf = xgb.XGBClassifier()
cv = KFold(n_splits=20, shuffle=True, random_state=1)
scores = cross_val_score(clf, X, y, cv=cv, n_jobs=1, scoring='accuracy')

# Fit on the full training set; the fitted model is used for feature selection.
clf.fit(X, y)
print(scores)
print('Accuracy: %.3f stdev: %.2f' % (np.mean(np.abs(scores)), np.std(scores)))

[ 0.73333333  0.71111111  0.82222222  0.73333333  0.82222222  0.71111111
  0.77777778  0.8         0.77777778  0.86666667  0.86666667  0.84090909
  0.90909091  0.84090909  0.77272727  0.81818182  0.88636364  0.88636364
  0.84090909  0.75      ]
Accuracy: 0.808 stdev: 0.06


***Feature selection with Greedy Search (RFECV)***

In [12]:
# Recursive feature elimination, scored by cross-validated accuracy.
featselect = RFECV(estimator=clf, cv=cv, scoring='accuracy').fit(X, y)

proposed_features = training_features[featselect.support_]

print("features used during training: ")
print(training_features)
print("")
print("features proposed by RFECV: ")
print(proposed_features)

# Note on the "Sex" feature (male/female/kid): the classifier only needs to
# know whether a person is male or not.  It expects women and children to
# have an equal chance of survival, which makes sense when we think about
# "Women and children first!".

features used during training: 
['Pclass' 'SibSp' 'Parch' 'Cabin_nr' 'Cabin_nr_odd' 'Father' 'Mother'
 'Parent' 'has_parents_or_kids' 'FamilySize' 'Sex_kid' 'Sex_male'
 'Embarked_Q' 'Embarked_S' 'Title_Miss.' 'Title_Mr.' 'Title_Mrs.'
 'Title_Officer.' 'Title_Sir.' 'Cabin_char_A' 'Cabin_char_B' 'Cabin_char_C'
 'Cabin_char_D' 'Cabin_char_E' 'Cabin_char_F' 'Cabin_char_G'
 'FareBin_(0.0, 8.67]' 'FareBin_(8.67, 16.11]' 'FareBin_(16.11, 32.0]'
 'FareBin_(32.0, 350.0]' 'FareBin_(350.0, 1000.0]' 'AgeBin_(15, 25]'
 'AgeBin_(25, 60]' 'AgeBin_(60, 90]']

features proposed by RFECV: 
['Pclass' 'Cabin_nr' 'FamilySize' 'Sex_male' 'FareBin_(32.0, 350.0]']


***Training the XGBoost model again after feature selection***

In [13]:
# Rebuild the full feature list from `train` instead of masking
# `training_features` in place: the mask in `featselect.support_` matches the
# full list, so masking the already-reduced array would fail on a re-run.
all_features = np.array(train.drop(columns=['Survived', 'PassengerId']).columns)
training_features = all_features[featselect.support_]

selection = np.append(training_features, ['Survived', 'PassengerId'])
train2 = train[selection]

# Explicit float cast keeps X numeric even when the dummy columns are boolean.
# No feature scaling is applied.
X = np.array(train2[training_features], dtype=float)
y = np.array(train2['Survived'])

# Same evaluation as the baseline, now on the selected features only.
# KFold / cross_val_score come from sklearn.model_selection; the old
# sklearn.cross_validation module no longer exists in scikit-learn >= 0.20.
clf = xgb.XGBClassifier()
cv = KFold(n_splits=20, shuffle=True, random_state=1)
scores = cross_val_score(clf, X, y, cv=cv, n_jobs=1, scoring='accuracy')
print(scores)
print('Accuracy: %.3f stdev: %.2f' % (np.mean(np.abs(scores)), np.std(scores)))
clf.fit(X, y)

[ 0.8         0.75555556  0.86666667  0.77777778  0.84444444  0.71111111
  0.8         0.82222222  0.77777778  0.91111111  0.86666667  0.86363636
  0.90909091  0.90909091  0.77272727  0.84090909  0.88636364  0.86363636
  0.88636364  0.75      ]
Accuracy: 0.831 stdev: 0.06


XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

***Hyper parameter tuning using Grid Search***

In [14]:
# Example only: two parameters are tuned.
# Start with a wide range, e.g. [0, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0],
# then narrow it down to the values below.
grid = dict(
    learning_rate=[0, 0.001, 0.002, 0.004, 0.006, 0.008, 0.010],
    reg_lambda=[0, 0.01, 0.10, 0.50, 1],
)

search = GridSearchCV(
    estimator=clf,
    param_grid=grid,
    scoring='accuracy',
    n_jobs=1,
    refit=True,
    cv=cv,
).fit(X, y)

print(search.best_params_)
print(search.best_score_)

{'reg_lambda': 1, 'learning_rate': 0.004}
0.8372615039281706


***Making prediction for testing set***

In [15]:
# Read the test set and pull it through the same feature engineering,
# missing-value handling and one-hot encoding as the training set.
test = pd.read_csv('input/test.csv')
test = feat_eng(test)
test = missing(test)
test = pd.get_dummies(test, drop_first=True)

# Keep only the features the model was trained on.  reindex (rather than
# test[training_features]) adds an all-zero column when a selected dummy
# column does not exist in the test set, instead of raising KeyError.
# `test` itself is left as it is, because PassengerId is still needed below.
test2 = test.reindex(columns=training_features, fill_value=0)

# Explicit float cast keeps X numeric even when the dummy columns are boolean.
# No feature scaling is applied.
X = np.array(test2, dtype=float)

# NOTE(review): this predicts with `clf`, not with the grid-search result
# (`search.best_estimator_`) -- confirm that ignoring the tuned parameters
# is intended.
y_predict = clf.predict(X)
dfresult = pd.DataFrame(y_predict, index=test.PassengerId)

***Write the result to predictions.csv***

In [16]:
# Name the single prediction column, then write the frame (index included).
dfresult.columns = ['Survived']
dfresult.to_csv(path_or_buf='predictions.csv')
print("done")

done
