# titanic from Kaggle

## Setup from book

In [362]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Where to save the figures
PROJECT_ROOT_DIR = "."

def save_fig(fig_id, tight_layout=True):
    path = os.path.join(PROJECT_ROOT_DIR, "images", fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

In [363]:
import pandas as pd

PATH = "dataset"

def load_data(filename, path=PATH):
    csv_path = os.path.join(path, filename)
    return pd.read_csv(csv_path)

train = load_data("train.csv")
test = load_data("test.csv")

In [364]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [365]:
train.shape

(891, 12)

In [366]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [367]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [368]:
train["Survived"].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [369]:
train["Survived"] = train["Survived"].astype("bool")
train["Survived"].value_counts()

False    549
True     342
Name: Survived, dtype: int64

In [370]:
train["Pclass"].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [371]:
train["Pclass"] = train["Pclass"].astype("category")
train["Pclass"].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [372]:
train["Sex"] = train["Sex"].astype("category")
train["Sex"].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [373]:
train["Embarked"] = train["Embarked"].astype("object")
train["Embarked"].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [374]:
train.dtypes

PassengerId       int64
Survived           bool
Pclass         category
Name             object
Sex            category
Age             float64
SibSp             int64
Parch             int64
Ticket           object
Fare            float64
Cabin            object
Embarked         object
dtype: object

In [375]:
from sklearn.base import BaseEstimator, TransformerMixin

# A class to select numerical or categorical columns 
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names]

In [376]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer

imputer = Imputer(strategy="median")

num_pipeline = Pipeline([
        ("select_numeric", DataFrameSelector(["Age", "SibSp", "Parch", "Fare"])),
        ("imputer", Imputer(strategy="median")),
    ])

In [377]:
num_pipeline.fit_transform(train)

array([[22.    ,  1.    ,  0.    ,  7.25  ],
       [38.    ,  1.    ,  0.    , 71.2833],
       [26.    ,  0.    ,  0.    ,  7.925 ],
       ...,
       [28.    ,  1.    ,  2.    , 23.45  ],
       [26.    ,  0.    ,  0.    , 30.    ],
       [32.    ,  0.    ,  0.    ,  7.75  ]])

In [378]:
# Inspired from stackoverflow.com/questions/25239958
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],
                                        index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)

In [379]:
most_frequent_ = pd.Series([train[c].value_counts().index[0] for c in train], index=train.columns)
most_frequent_

PassengerId                    891
Survived                     False
Pclass                           3
Name           Kraeff, Mr. Theodor
Sex                           male
Age                             24
SibSp                            0
Parch                            0
Ticket                        1601
Fare                          8.05
Cabin                      B96 B98
Embarked                         S
dtype: object

In [380]:
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer

In [381]:
class CustomLabelBinarizer(BaseEstimator, TransformerMixin):
    def __init__(self, sparse_output=False):
        self.sparse_output = sparse_output
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        enc = LabelBinarizer(sparse_output=self.sparse_output)
        return enc.fit_transform(X)

In [382]:
pclass_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["Pclass"])),
        ("cat_encoder", OneHotEncoder(sparse=False)),
    ])

pclass_pipeline.fit_transform(train)

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.]])

In [383]:
emb_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["Embarked"])),
        ("imputer", MostFrequentImputer()),
        ("cat_encoder", CustomLabelBinarizer()),
    ])

emb_pipeline.fit_transform(train)

array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       ...,
       [0, 0, 1],
       [1, 0, 0],
       [0, 1, 0]])

In [465]:
sex_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["Sex"])),
        ("imputer", MostFrequentImputer()),
        ("cat_encoder", CustomLabelBinarizer()),
    ])

sex_pipeline.fit_transform(train).shape

(891, 1)

In [385]:
from sklearn.pipeline import FeatureUnion
preprocess_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("pclass_pipeline", pclass_pipeline),
        ("emb_pipeline", emb_pipeline),
        ("sex_pipeline", sex_pipeline)
    ])

In [386]:
X_train = preprocess_pipeline.fit_transform(train)

In [387]:
X_train.shape

(891, 11)

In [388]:
X_train

array([[22.,  1.,  0., ...,  0.,  1.,  1.],
       [38.,  1.,  0., ...,  0.,  0.,  0.],
       [26.,  0.,  0., ...,  0.,  1.,  0.],
       ...,
       [28.,  1.,  2., ...,  0.,  1.,  0.],
       [26.,  0.,  0., ...,  0.,  0.,  1.],
       [32.,  0.,  0., ...,  1.,  0.,  1.]])

In [389]:
y_train = train["Survived"]

In [390]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

forest_clf = RandomForestClassifier(random_state=42)
forest_clf.fit(X_train, y_train)
forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)
forest_scores.mean()

0.8070874475087958

In [391]:
y_test = preprocess_pipeline.fit_transform(test)
forest_test_pred = forest_clf.predict(y_test)

In [392]:
import numpy as np

PATH = "predict"

def save_csv(x, name, path=PATH):
    csv_path = os.path.join(path, name + "_result.csv")
    pd.DataFrame(data={'PassengerId': test["PassengerId"], 'Survived': x}, dtype='int32').to_csv(csv_path, index=False)
    
save_csv(forest_test_pred, "forest")

 - bad score : 0.76

In [393]:
X_train_pd = pd.DataFrame(X_train)
X_train_pd.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,22.0,1.0,0.0,7.25,0.0,0.0,1.0,0.0,0.0,1.0,1.0
1,38.0,1.0,0.0,71.2833,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,26.0,0.0,0.0,7.925,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,35.0,1.0,0.0,53.1,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,35.0,0.0,0.0,8.05,0.0,0.0,1.0,0.0,0.0,1.0,1.0


 - Actually : "Age", "SibSp", "Parch", "Fare", "Pclass", "Embarked", "Sex" used

In [469]:
train_tr_cabin = train["Cabin"].fillna("G")
train_tr_cabin = [x[0] for x in train_tr_cabin]
train_tr_cabin[0:10]

['G', 'C', 'G', 'C', 'G', 'G', 'E', 'G', 'G', 'G']

In [395]:
train["Name"].head()

0                              Braund, Mr. Owen Harris
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                               Heikkinen, Miss. Laina
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                             Allen, Mr. William Henry
Name: Name, dtype: object

In [396]:
import re
p = re.compile(',(.*)\.')

train_tr_name = [p.search(x).group() for x in train["Name"]]
pd.DataFrame(train_tr_name)[0].value_counts()[4:].index

Index([', Dr.', ', Rev.', ', Col.', ', Mlle.', ', Major.', ', Ms.', ', Capt.',
       ', Mme.', ', Don.', ', Sir.', ', Lady.', ', Mrs. Martin (Elizabeth L.',
       ', the Countess.', ', Jonkheer.'],
      dtype='object')

In [411]:
def my_func(x):
    if x in pd.DataFrame(train_tr_name)[0].value_counts()[:4].index :
        return x
    else:
        return "other"

train_tr_name = [my_func(x) for x in train_tr_name]
pd.DataFrame(train_tr_name)[0].value_counts()

, Mr.        517
, Miss.      182
, Mrs.       124
, Master.     40
other         28
Name: 0, dtype: int64

In [455]:
class cabin_tr(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        ret = X.fillna('H')
        ret = [x[0] for x in ret["Cabin"]]
        ret = ["H" if x not in ["A", "B", "C", "D", "E", "F", "G", "H"] else x for x in ret]
        return ret

In [456]:
cabin_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["Cabin"])),
        ("cabin_tr", cabin_tr()),
        ("cat_encoder", CustomLabelBinarizer()),
    ])

cabin_pipeline.fit_transform(train).shape

(891, 8)

In [457]:
class name_tr(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        import re
        p = re.compile(',(.*)\.')
        
        X_name = [p.search(x).group() for x in X["Name"]]
        
        def my_func(x):
            if x in pd.DataFrame(X_name)[0].value_counts()[:4].index :
                return x
            else:
                return "other"
        
        ret = [my_func(x) for x in X_name]
        return ret

In [458]:
name_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["Name"])),
        ("name_tr", name_tr()),
        ("cat_encoder", CustomLabelBinarizer()),
    ])

name_pipeline.fit_transform(train).shape

(891, 5)

In [459]:
from sklearn.pipeline import FeatureUnion
preprocess_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("pclass_pipeline", pclass_pipeline),
        ("emb_pipeline", emb_pipeline),
        ("sex_pipeline", sex_pipeline),
        ("cabin_pipeline", cabin_pipeline),
        ("name_pipeline", name_pipeline)
    ])

In [460]:
X_train = preprocess_pipeline.fit_transform(train)
X_train.shape

(891, 24)

In [461]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

forest_clf = RandomForestClassifier(random_state=42)
forest_clf.fit(X_train, y_train)
forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)
forest_scores.mean()

0.8092722165474975

In [462]:
y_test = preprocess_pipeline.fit_transform(test)
forest_test_pred = forest_clf.predict(y_test)

In [463]:
print(y_test.shape, X_train.shape)

(418, 24) (891, 24)


In [464]:
save_csv(forest_test_pred, "forest2")

 - very bad score 0.72 on Kaggle
 
surely the add of feature like this overfiting the result