In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline


from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, cross_val_predict
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [2]:
# reference: https://medium.com/@v_winn/botnoi-classroom-titanic-dataset-pipeline-8e106e5c68a5

In [2]:
#train_file = 'new_Titanic_train.csv'
#test_file = 'new_Titanic_test.csv'

train_file = 'data/titanic_train.csv'
test_file = 'data/titanic_test.csv'

In [3]:
# Load the raw frames from the paths configured in the cell above
# (train_file / test_file) instead of repeating hard-coded file names, so the
# whole notebook reads from a single, consistent data location.
# The first CSV column is used as the index.
train = pd.read_csv(train_file, index_col=0)
test = pd.read_csv(test_file, index_col=0)

In [4]:
def get_data(filepath):
    """Read a CSV and split it into features and the 'Survived' target.

    Parameters
    ----------
    filepath : path or file-like object accepted by ``pd.read_csv``.
        The first column of the file is used as the index.

    Returns
    -------
    (X, y) : tuple
        ``X`` is the frame without the 'Survived' column and ``y`` is the
        'Survived' column itself.
    """
    frame = pd.read_csv(filepath, index_col=0)
    target = frame['Survived']
    features = frame.drop(columns="Survived")
    return features, target

In [169]:
# using sklearn pipeline
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Imputer, OrdinalEncoder, OneHotEncoder, LabelEncoder, PowerTransformer
from sklearn.impute import MissingIndicator, SimpleImputer

from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, cross_val_score

from sklearn.base import BaseEstimator, TransformerMixin

In [59]:
# Shuffled, seeded stratified splitters reused by the experiments below.
k5 = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
k10 = StratifiedKFold(n_splits=10, shuffle=True, random_state=888)


def compute_cv_score(pipe, X, y, cv, scoring, text=""):
    """Cross-validate ``pipe`` and print the mean and std of the fold scores.

    Parameters
    ----------
    pipe : estimator passed to ``cross_val_score``.
    X, y : features and target forwarded to ``cross_val_score``.
    cv : cross-validation splitter (e.g. ``k5`` or ``k10``).
    scoring : scoring name forwarded to ``cross_val_score``.
    text : free-form label printed in the report header.

    Returns
    -------
    None. The report is printed; the scores are not returned.
    """
    fold_scores = cross_val_score(pipe, X, y, cv=cv, scoring=scoring, n_jobs=2)
    mean_score = fold_scores.mean()
    std_score = fold_scores.std()
    report = "CV-score: {}\n- mean={:.3f}, std={:.3f}".format(text, mean_score, std_score)
    print(report)

In [8]:
# Split each CSV into a feature frame and its 'Survived' target via get_data,
# using the paths configured in train_file / test_file.
X_train ,y_train = get_data(train_file)
X_test, y_test = get_data(test_file)

In [7]:
X_train.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
75,3,"Bing, Mr. Lee",male,32.0,0,0,1601,56.4958,,S
205,3,"Cohen, Mr. Gurshon ""Gus""",male,18.0,0,0,A/5 3540,8.05,,S
234,3,"Asplund, Miss. Lillian Gertrud",female,5.0,4,2,347077,31.3875,,S
634,1,"Parr, Mr. William Henry Marsh",male,,0,0,112052,0.0,,S
844,3,"Lemberopolous, Mr. Peter L",male,34.5,0,0,2683,6.4375,,C


In [12]:
X_train.isnull().sum() / X_train.shape[0]

Pclass      0.000000
Name        0.000000
Sex         0.000000
Age         0.205056
SibSp       0.000000
Parch       0.000000
Ticket      0.000000
Fare        0.000000
Cabin       0.765449
Embarked    0.002809
dtype: float64

In [11]:
X_test.isnull().sum()

Pclass        0
Name          0
Sex           0
Age          31
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       142
Embarked      0
dtype: int64

In [13]:
dtype_by_cols = X_train.dtypes
dtype_by_cols

Pclass        int64
Name         object
Sex          object
Age         float64
SibSp         int64
Parch         int64
Ticket       object
Fare        float64
Cabin        object
Embarked     object
dtype: object

In [71]:
# Columns fed to the numeric branch (num_prep) of the preprocessing pipeline.
# Age is not listed here; it goes through its own age_* pipelines.
num_cols = ['Pclass', 'SibSp', 'Parch', 'Fare']

In [196]:
# customized transformer

class TextExtractor(BaseEstimator, TransformerMixin):
    '''Extract the first regex match from every string of the input.

    Parameters
    ----------
    text_pat : str
        Regular expression compiled in ``fit``. The value extracted for each
        string is the first element returned by ``regex.findall``.
    default : object, optional
        Value returned for strings with no match. When ``None`` (the default)
        an ``IndexError`` is raised for such strings, as before.

    Attributes
    ----------
    regex_ : compiled pattern built from ``text_pat``.
    extract_func_ : element-wise (``np.vectorize``) extractor.
    '''

    def __init__(self, text_pat="", default=None):
        self.text_pat = text_pat
        self.default = default

    def fit(self, X, y=None):
        '''Compile the pattern; nothing is learned from ``X``.'''
        import re
        self.regex_ = re.compile(self.text_pat)
        # otypes=[object] keeps the call valid on empty input and returns
        # Python strings rather than a fixed-width unicode array, so longer
        # replacement values written into the result later are not truncated.
        # A bound method (not a lambda) keeps the fitted object picklable.
        self.extract_func_ = np.vectorize(self._extract_first, otypes=[object])
        return self

    def _extract_first(self, text):
        '''Return the first match in ``text`` or fall back to ``default``.'''
        matches = self.regex_.findall(text)
        if matches:
            return matches[0]
        if self.default is None:
            raise IndexError(
                "pattern {!r} did not match {!r}".format(self.text_pat, text))
        return self.default

    def transform(self, X, y=None):
        '''Apply the extractor element-wise; the output has the shape of ``X``.'''
        return self.extract_func_(X)
    
class GroupNominal(BaseEstimator, TransformerMixin):
    '''Group infrequent values of nominal columns under the label "Other".

    Parameters
    ----------
    threshold : float, default 0.05
        A value is kept only if its relative frequency in the fitted column
        is strictly greater than ``threshold``; every other value is replaced
        by "Other" in ``transform``.

    Attributes
    ----------
    n_cols_ : int
        Number of columns seen in ``fit``.
    col_dict_ : dict
        Per column index: ``'name_to_keep'`` (set of kept values) and
        ``'func'`` (element-wise mapper applied in ``transform``).
    '''

    def __init__(self, threshold=0.05):
        self.threshold = threshold

    def fit(self, X, y=None):
        '''Learn, for every column, which values are frequent enough to keep.'''
        X = np.asarray(X, dtype=object)
        n_rows, self.n_cols_ = X.shape
        self.col_dict_ = {}
        for i in range(self.n_cols_):
            # Relative frequency of each distinct value (missing values are
            # not counted by value_counts).
            name_count = pd.Series(X[:, i]).value_counts() / n_rows
            # Use the instance parameter; a bare `threshold` is undefined here.
            name_to_keep = set(name_count.index[name_count.values > self.threshold])

            # Bind the keep-set as a default argument so each column's mapper
            # uses its own set instead of the one from the last loop iteration.
            # otypes=[object] avoids fixed-width string truncation and makes
            # the mapper usable on empty input.
            mapper = np.vectorize(
                lambda x, keep=name_to_keep: x if x in keep else "Other",
                otypes=[object])

            self.col_dict_[i] = {
                'name_to_keep': name_to_keep,
                'func': mapper,
            }
        return self

    def transform(self, X, y=None):
        '''Return an object-dtype copy of ``X`` with rare values set to "Other".'''
        X = np.asarray(X, dtype=object)
        assert X.shape[1]==self.n_cols_, 'number of columns mismatch'
        # Object dtype so that "Other" is stored in full whatever the width of
        # the strings in X.
        Xt = X.copy()
        for i in range(self.n_cols_):
            Xt[:, i] = self.col_dict_[i]['func'](X[:, i])
        return Xt
    
class RandomImputer(BaseEstimator, TransformerMixin):
    '''Fill missing values with draws from a standard normal distribution.

    Parameters
    ----------
    random_state : None, int or numpy.random.RandomState, default None
        ``None`` draws from the global ``np.random`` state (previous
        behaviour). An int seeds a fresh generator on every ``transform``
        call, so repeated calls on the same input give the same fill values.
        A ``RandomState`` instance is used as is.

    Notes
    -----
    The fill values always come from N(0, 1), independent of the data.
    NOTE(review): this presumably relies on the column being standardised
    upstream (e.g. by a power transform) - confirm for other uses.
    '''

    def __init__(self, random_state=None):
        self.random_state = random_state

    def fit(self, X, y=None):
        '''Record the number of columns; nothing else is learned.'''
        self.n_cols_ = X.shape[1]
        return self

    def _get_rng(self):
        '''Return the object whose ``normal`` method supplies the fill values.'''
        seed = self.random_state
        if seed is None:
            # Legacy behaviour: global NumPy random state.
            return np.random
        if isinstance(seed, np.random.RandomState):
            return seed
        return np.random.RandomState(seed)

    def transform(self, X, y=None):
        '''Return a copy of ``X`` with missing entries replaced by random draws.'''
        # Accept DataFrames as well as arrays; positional slicing needs an array.
        X = np.asarray(X)
        assert self.n_cols_ == X.shape[1], 'number of columns mismatch'

        rng = self._get_rng()
        Xt = np.copy(X)
        for i in range(self.n_cols_):
            # Locate and count the missing values of this column.
            na_mask = pd.isnull(X[:, i])
            na_count = np.sum(na_mask)
            if na_count > 0:
                Xt[na_mask, i] = rng.normal(size=na_count)
        return Xt

In [197]:
# Classifier
# Seeded so that fits and cross-validation scores are reproducible.
RANDOM_STATE = 123
gb = GradientBoostingClassifier(random_state=RANDOM_STATE)


# Missing value: Age(~20%), Cabin(~76%), Embarked(< 1%)

# Embarked preprocess: Impute with mode -> Onehot encoding
embarked_selector = ColumnTransformer(transformers=[
                        ('impute', SimpleImputer(strategy="most_frequent"), ['Embarked']),
                    ], remainder='drop')

embark_prep = Pipeline(steps=[
    ('impute', embarked_selector),
    # handle_unknown='ignore': a category absent from a training fold is
    # encoded as all zeros instead of raising at transform time.
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

# Cabin / Age preprocess: replace each column by a missing-value indicator
missing_indicator_prep = ColumnTransformer(transformers=[
    ('missing', MissingIndicator(sparse=False), ['Cabin', 'Age'])
], remainder='drop')


# Sex: ordinal encoding (one integer code per category)
sex_prep = ColumnTransformer(transformers=[
    ('sex', OrdinalEncoder(), ['Sex'])
], remainder='drop')

# num columns: ['Pclass', 'SibSp', 'Parch', 'Fare'], imputed with SimpleImputer defaults
num_prep = ColumnTransformer(transformers=[
    ('numeric', SimpleImputer(), num_cols)
], remainder='drop')

# Name: extract title
# NOTE(review): the '.' after the capture group is an unescaped regex wildcard,
# so it matches any character, not only a literal period - confirm whether
# r'\.' was intended before changing it (names without "Title. " would then
# stop matching).
title_extract = ColumnTransformer(transformers=[
    ('title', TextExtractor(r', ([A-Za-z]+). '), ['Name'])
], remainder='drop')

title_prep = Pipeline(steps=[
    ('get_title', title_extract),
    ('group_title', GroupNominal()),
    # Same rationale as for Embarked: tolerate titles unseen during fit.
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

# Age -> power_transform -> fill nan [mean, median, random, end of distribution]

age_power_transform = ColumnTransformer(transformers=[
    ('yeo_johnson', PowerTransformer(), ['Age'])
], remainder='drop')

age_mean_prep = Pipeline(steps=[
    ('selector', age_power_transform),
    ('impute', SimpleImputer(strategy='mean'))
])

age_median_prep = Pipeline(steps=[
    ('selector', age_power_transform),
    ('impute', SimpleImputer(strategy='median'))
])

# Constant fill at 3.0 on the transformed scale ("end of distribution").
age_3sd_prep = Pipeline(steps=[
    ('selector', age_power_transform),
    ('impute', SimpleImputer(strategy='constant', fill_value=3.0))
])

age_rnd_prep = Pipeline(steps=[
    ('selector', age_power_transform),
    ('impute', RandomImputer())
])

# Age -> bucket


# combine features
# Each entry is one feature branch; commented-out entries are alternatives
# that are currently switched off.
all_prep = FeatureUnion(transformer_list=[
    #('embark_prep', embark_prep),
    ('missing_indicator', missing_indicator_prep),
    ('sex_prep', sex_prep),
    ('num_prep', num_prep),
    ('title_prep', title_prep),
    ('age_mean', age_mean_prep),        
    #('age_median', age_median_prep),  
    #('age_3sd', age_3sd_prep),
    ('age_rnd', age_rnd_prep)
    
], n_jobs=2)




# full pipeline
full_pipe = Pipeline(steps=[
    ('prep', all_prep),
    ('clf', gb)
])

In [142]:
temp = title_prep.fit(X_train).transform(X_train)
temp[:5]

array([[0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.]])

In [140]:
temp = all_prep.fit_transform(X_train)
temp[:5]

array([[ 1.    ,  0.    ,  1.    ,  3.    ,  0.    ,  0.    , 56.4958,
         0.    ,  1.    ,  0.    ,  0.    ],
       [ 1.    ,  0.    ,  1.    ,  3.    ,  0.    ,  0.    ,  8.05  ,
         0.    ,  1.    ,  0.    ,  0.    ],
       [ 1.    ,  0.    ,  0.    ,  3.    ,  4.    ,  2.    , 31.3875,
         1.    ,  0.    ,  0.    ,  0.    ],
       [ 1.    ,  1.    ,  1.    ,  1.    ,  0.    ,  0.    ,  0.    ,
         0.    ,  1.    ,  0.    ,  0.    ],
       [ 1.    ,  0.    ,  1.    ,  3.    ,  0.    ,  0.    ,  6.4375,
         0.    ,  1.    ,  0.    ,  0.    ]])

In [42]:
# check if pipe can work with test set
y_pred = full_pipe.fit(X_train, y_train).predict(X_test)

In [63]:
compute_cv_score(full_pipe, X_train, y_train, cv=k10, 
                 scoring='accuracy', text="Embarked Onehot")

CV-score: Embarked Onehot
- mean=0.625, std=0.044


In [65]:
compute_cv_score(full_pipe, X_train, y_train, cv=k10, 
                 scoring='accuracy', text="Embarked Onehot + Missing[Cabin, Age]")

CV-score: Embarked Onehot + Missing[Cabin, Age]
- mean=0.692, std=0.029


In [61]:
compute_cv_score(full_pipe, X_train, y_train, cv=k10, 
                 scoring='accuracy', text="Embarked Onehot + Missing[Cabin, Age] + Sex")

CV-score: Embarked Onehot + Missing[Cabin, Age] + Sex
- mean=0.773, std=0.039


In [74]:
compute_cv_score(full_pipe, X_train, y_train, cv=k10, 
                 scoring='accuracy', text="Embarked Onehot + Missing[Cabin, Age] + Sex + numeric_columns")

CV-score: Embarked Onehot + Missing[Cabin, Age] + Sex + numeric_columns
- mean=0.784, std=0.035


In [76]:
compute_cv_score(full_pipe, X_train, y_train, cv=k10, 
                 scoring='accuracy', text="Missing[Cabin, Age] + Sex + numeric_columns")

CV-score: Missing[Cabin, Age] + Sex + numeric_columns
- mean=0.795, std=0.030


In [148]:
compute_cv_score(full_pipe, X_train, y_train, cv=k10, scoring='accuracy', 
                 text="Missing[Cabin, Age] + Sex + numeric_columns + Title")

CV-score: Missing[Cabin, Age] + Sex + numeric_columns + Title
- mean=0.812, std=0.025


In [177]:
compute_cv_score(full_pipe, X_train, y_train, cv=k10, scoring='accuracy', 
                 text="Missing[Cabin, Age] + Sex + numeric_columns + Title + Age[median]")

CV-score: Missing[Cabin, Age] + Sex + numeric_columns + Title + Age[median]
- mean=0.824, std=0.035


In [179]:
compute_cv_score(full_pipe, X_train, y_train, cv=k10, scoring='accuracy', 
                 text="Missing[Cabin, Age] + Sex + numeric_columns + Title + Age[3sd]")

CV-score: Missing[Cabin, Age] + Sex + numeric_columns + Title + Age[3sd]
- mean=0.823, std=0.029


In [198]:
compute_cv_score(full_pipe, X_train, y_train, cv=k10, scoring='accuracy', 
                 text="Missing[Cabin, Age] + Sex + numeric_columns + Title + Age[rnd]")

CV-score: Missing[Cabin, Age] + Sex + numeric_columns + Title + Age[rnd]
- mean=0.824, std=0.040


In [175]:
compute_cv_score(full_pipe, X_train, y_train, cv=k10, scoring='accuracy', 
                 text="Missing[Cabin, Age] + Sex + numeric_columns + Title + Age[mean]")

CV-score: Missing[Cabin, Age] + Sex + numeric_columns + Title + Age[mean]
- mean=0.829, std=0.036
