In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Location of the passenger CSV, relative to the notebook's working directory.
PASSENGER_PATH = "datasets/titanic/passengers.csv"
# Load the full dataset.  NOTE: a later cell rebinds `passengers` to the
# training predictors only, so re-run this cell before re-running the split.
passengers = pd.read_csv(PASSENGER_PATH)

In [4]:
# Frequency of each value in the "Embarked" column (value_counts skips
# missing entries by default, so the counts may sum to less than len(passengers)).
passengers["Embarked"].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

Step 1: Split train/test data

In [5]:
from sklearn.model_selection import StratifiedShuffleSplit

# Single 80/20 shuffle split, stratified on the "Sex" column so both sets keep
# the same male/female proportions; seeded for reproducibility.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) index pair.
train_index, test_index = next(split.split(passengers, passengers["Sex"]))

# split() returns *positional* row indices, so select with .iloc.  (.loc is
# label-based and only coincides with positions for a default RangeIndex.)
strat_train_set = passengers.iloc[train_index]
strat_test_set = passengers.iloc[test_index]

In [6]:
# Separate predictors from the target.  `drop` already returns a new frame,
# so the training set does not need to be copied first.
# NOTE: this rebinds `passengers` (previously the full dataset) to the
# training predictors; re-run the load cell before re-running the split cell.
passengers = strat_train_set.drop("Survived", axis=1)

# Take an independent copy of the label column so that later edits to the
# labels cannot write through to (or warn about) strat_train_set.
passengers_labels = strat_train_set["Survived"].copy()

# Preview a handful of rows rather than rendering the whole training frame.
passengers.head(10)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
788,789,3,"Dean, Master. Bertram Vere",male,1.0,1,2,C.A. 2315,20.5750,,S
347,348,3,"Davison, Mrs. Thomas Henry (Mary E Finck)",female,,1,0,386525,16.1000,,S
629,630,3,"O'Connell, Mr. Patrick D",male,,0,0,334912,7.7333,,Q
734,735,2,"Troupiansky, Mr. Moses Aaron",male,23.0,0,0,233639,13.0000,,S
106,107,3,"Salkjelsvik, Miss. Anna Kristine",female,21.0,0,0,343120,7.6500,,S
763,764,1,"Carter, Mrs. William Ernest (Lucile Polk)",female,36.0,1,2,113760,120.0000,B96 B98,S
334,335,1,"Frauenthal, Mrs. Henry William (Clara Heinshei...",female,,1,0,PC 17611,133.6500,,S
209,210,1,"Blank, Mr. Henry",male,40.0,0,0,112277,31.0000,A31,C
389,390,2,"Lehmann, Miss. Bertha",female,17.0,0,0,SC 1748,12.0000,,C
471,472,3,"Cacic, Mr. Luka",male,38.0,0,0,315089,8.6625,,S


In [7]:
# Definition of the CategoricalEncoder class, copied from PR #9151.
# Just run this cell, or copy it to your code, do not try to understand it (yet).

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.preprocessing import LabelEncoder
from scipy import sparse

class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """Encode categorical features as a numeric array.
    The input to this transformer should be a matrix of integers or strings,
    denoting the values taken on by categorical (discrete) features.
    The features can be encoded using a one-hot aka one-of-K scheme
    (``encoding='onehot'``, the default) or converted to ordinal integers
    (``encoding='ordinal'``).
    This encoding is needed for feeding categorical data to many scikit-learn
    estimators, notably linear models and SVMs with the standard kernels.
    Read more in the :ref:`User Guide <preprocessing_categorical_features>`.
    Parameters
    ----------
    encoding : str, 'onehot', 'onehot-dense' or 'ordinal'
        The type of encoding to use (default is 'onehot'):
        - 'onehot': encode the features using a one-hot aka one-of-K scheme
          (or also called 'dummy' encoding). This creates a binary column for
          each category and returns a sparse matrix.
        - 'onehot-dense': the same as 'onehot' but returns a dense array
          instead of a sparse matrix.
        - 'ordinal': encode the features as ordinal integers. This results in
          a single column of integers (0 to n_categories - 1) per feature.
    categories : 'auto' or a list of lists/arrays of values.
        Categories (unique values) per feature:
        - 'auto' : Determine categories automatically from the training data.
        - list : ``categories[i]`` holds the categories expected in the ith
          column. The passed categories are sorted before encoding the data
          (used categories can be found in the ``categories_`` attribute).
    dtype : number type, default np.float64
        Desired dtype of output.
    handle_unknown : 'error' (default) or 'ignore'
        Whether to raise an error or ignore if an unknown categorical feature
        is present during transform (default is to raise). When this
        parameter is set to 'ignore' and an unknown category is encountered
        during transform, the resulting one-hot encoded columns for this
        feature will be all zeros.
        Ignoring unknown categories is not supported for
        ``encoding='ordinal'``.
    Attributes
    ----------
    categories_ : list of arrays
        The categories of each feature determined during fitting. When
        categories were specified manually, this holds the sorted categories
        (in order corresponding with output of `transform`).
    Examples
    --------
    Given a dataset with three features and four samples, we let the encoder
    find the unique values per feature and transform the data to a binary
    one-hot encoding.
    >>> from sklearn.preprocessing import CategoricalEncoder
    >>> enc = CategoricalEncoder(handle_unknown='ignore')
    >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
    ... # doctest: +ELLIPSIS
    CategoricalEncoder(categories='auto', dtype=<... 'numpy.float64'>,
              encoding='onehot', handle_unknown='ignore')
    >>> enc.transform([[0, 1, 1], [1, 0, 4]]).toarray()
    array([[ 1.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.],
           [ 0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.]])
    See also
    --------
    sklearn.preprocessing.OneHotEncoder : performs a one-hot encoding of
      integer ordinal features. The ``OneHotEncoder`` assumes that input
      features take on values in the range ``[0, max(feature)]`` instead of
      using the unique values.
    sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of
      dictionary items (also handles string-valued features).
    sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot
      encoding of dictionary items or strings.
    """

    def __init__(self, encoding='onehot', categories='auto', dtype=np.float64,
                 handle_unknown='error'):
        # Parameters are stored unvalidated; validation happens in ``fit``.
        self.encoding = encoding
        self.categories = categories
        self.dtype = dtype
        self.handle_unknown = handle_unknown

    def fit(self, X, y=None):
        """Fit the CategoricalEncoder to X.
        Parameters
        ----------
        X : array-like, shape [n_samples, n_feature]
            The data to determine the categories of each feature.
        y : ignored
            Present only for pipeline compatibility.
        Returns
        -------
        self
        Raises
        ------
        ValueError
            If ``encoding`` or ``handle_unknown`` has an unsupported value,
            if ``handle_unknown='ignore'`` is combined with
            ``encoding='ordinal'``, or if explicit ``categories`` were given,
            ``handle_unknown='error'`` and X contains a value outside them.
        """

        if self.encoding not in ['onehot', 'onehot-dense', 'ordinal']:
            template = ("encoding should be either 'onehot', 'onehot-dense' "
                        "or 'ordinal', got %s")
            # Report the offending *encoding* value (the original interpolated
            # handle_unknown here, producing a misleading message).
            raise ValueError(template % self.encoding)

        if self.handle_unknown not in ['error', 'ignore']:
            template = ("handle_unknown should be either 'error' or "
                        "'ignore', got %s")
            raise ValueError(template % self.handle_unknown)

        if self.encoding == 'ordinal' and self.handle_unknown == 'ignore':
            raise ValueError("handle_unknown='ignore' is not supported for"
                             " encoding='ordinal'")

        # Builtin ``object`` instead of the ``np.object`` alias, which recent
        # NumPy releases no longer provide.
        X = check_array(X, dtype=object, accept_sparse='csc', copy=True)
        _, n_features = X.shape

        # One LabelEncoder per input column.
        self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]

        for i in range(n_features):
            le = self._label_encoders_[i]
            Xi = X[:, i]
            if self.categories == 'auto':
                # Learn the categories from the data itself.
                le.fit(Xi)
            else:
                # Categories were supplied: optionally verify the data only
                # contains known values, then install the sorted categories.
                valid_mask = np.in1d(Xi, self.categories[i])
                if not np.all(valid_mask):
                    if self.handle_unknown == 'error':
                        diff = np.unique(Xi[~valid_mask])
                        msg = ("Found unknown categories {0} in column {1}"
                               " during fit".format(diff, i))
                        raise ValueError(msg)
                le.classes_ = np.array(np.sort(self.categories[i]))

        self.categories_ = [le.classes_ for le in self._label_encoders_]

        return self

    def transform(self, X):
        """Transform X using the encoding selected at construction time.
        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to encode.
        Returns
        -------
        X_out : sparse matrix or a 2-d array
            Transformed input: a sparse CSR matrix for ``'onehot'``, a dense
            array for ``'onehot-dense'`` and ``'ordinal'``.
        Raises
        ------
        ValueError
            If ``handle_unknown='error'`` and X contains a category that was
            not seen during ``fit``.
        """
        # Builtin object/int/bool replace the removed np.object/np.int/np.bool
        # aliases; the resulting dtypes are the same.
        X = check_array(X, accept_sparse='csc', dtype=object, copy=True)
        n_samples, n_features = X.shape
        X_int = np.zeros_like(X, dtype=int)
        X_mask = np.ones_like(X, dtype=bool)

        for i in range(n_features):
            valid_mask = np.in1d(X[:, i], self.categories_[i])

            if not np.all(valid_mask):
                if self.handle_unknown == 'error':
                    diff = np.unique(X[~valid_mask, i])
                    msg = ("Found unknown categories {0} in column {1}"
                           " during transform".format(diff, i))
                    raise ValueError(msg)
                else:
                    # Set the problematic rows to an acceptable value and
                    # continue. The rows are marked in `X_mask` and will be
                    # removed later.
                    X_mask[:, i] = valid_mask
                    X[:, i][~valid_mask] = self.categories_[i][0]
            X_int[:, i] = self._label_encoders_[i].transform(X[:, i])

        if self.encoding == 'ordinal':
            return X_int.astype(self.dtype, copy=False)

        # Build the one-hot matrix in COO form: each (sample, feature) cell
        # contributes a single 1 at column offset[feature] + category index.
        mask = X_mask.ravel()
        n_values = [cats.shape[0] for cats in self.categories_]
        n_values = np.array([0] + n_values)
        indices = np.cumsum(n_values)

        # Cells flagged as unknown are dropped via `mask`, leaving all-zero
        # columns for that feature in the affected rows.
        column_indices = (X_int + indices[:-1]).ravel()[mask]
        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
                                n_features)[mask]
        data = np.ones(n_samples * n_features)[mask]

        out = sparse.csc_matrix((data, (row_indices, column_indices)),
                                shape=(n_samples, indices[-1]),
                                dtype=self.dtype).tocsr()
        if self.encoding == 'onehot-dense':
            return out.toarray()
        else:
            return out

Step 2: Remove useless columns and prepare the remaining features

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that pulls the given columns out of a DataFrame.

    Parameters
    ----------
    attribute_names : column label or list of column labels
        Passed unchanged to ``X[...]`` in ``transform``.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return the selected columns of ``X`` as their ``.values`` array."""
        selected = X[self.attribute_names]
        return selected.values

In [67]:
# Columns routed through base_pipeline (selected via DataFrameSelector).
passenger_base = ["Pclass", "SibSp", "Parch"]
# Columns that get per-column treatment in the DataFrameMapper below.
# NOTE: the mapper spells these names out itself and does not read this list,
# so keep the two in sync when adding or removing a column.
passenger_tweaked = ["Fare", "Age", "Sex", "Embarked"]

from sklearn.preprocessing import StandardScaler, Imputer, LabelEncoder
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn_pandas import DataFrameMapper, CategoricalImputer

# Age: fill missing values with the median, then standardise.
age_pipeline = Pipeline([
    ('median_imputer', Imputer(strategy="median")),
    ('standard_scaler', StandardScaler()),
])
# Embarked: fill missing values with CategoricalImputer's default strategy,
# then one-hot encode (CategoricalEncoder defaults to encoding='onehot').
embarked_pipeline = Pipeline([
    ('categorical_imputer', CategoricalImputer()),
    ('binary_encoder', CategoricalEncoder()),
])
# Sex: one-hot encode only; no imputation step is applied.
sex_pipeline = Pipeline([
    ('binary_encoder', CategoricalEncoder()),
])
# Per-column preprocessing for the columns that need individual handling.
tweaked_pipeline = DataFrameMapper([
    (['Fare'], StandardScaler()),
    (['Age'], age_pipeline),
    (['Sex'], sex_pipeline),
    (['Embarked'], embarked_pipeline),
])
# Base columns: impute first, then scale -- the same order age_pipeline uses.
# Scaling first would compute the mean/variance before gaps are filled, and a
# scaler that rejects NaN would raise before the imputer ever ran.  When the
# columns contain no missing values the result is unchanged.
base_pipeline = Pipeline([
    ('selector', DataFrameSelector(passenger_base)),
    ('imputer', Imputer(strategy="median")),
    ('standard_scaler', StandardScaler()),
])
# Concatenate both feature blocks side by side: base columns first, then the
# per-column ("tweaked") features.
full_pipeline = FeatureUnion(transformer_list=[
    ("base_pipeline", base_pipeline),
    ("tweaked_pipeline", tweaked_pipeline),
])

In [68]:
# Fit every transformer in full_pipeline on the training predictors and build
# the combined feature matrix used by the model-search cells below.
passengers_prepared = full_pipeline.fit_transform(passengers)
passengers_prepared



array([[ 0.80807448,  0.40887812,  2.02647087, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.80807448,  0.40887812, -0.46549331, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.80807448, -0.49242597, -0.46549331, ...,  0.        ,
         1.        ,  0.        ],
       ..., 
       [-1.57926589, -0.49242597, -0.46549331, ...,  0.        ,
         0.        ,  1.        ],
       [-1.57926589,  0.40887812, -0.46549331, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.80807448,  3.1127904 ,  2.02647087, ...,  0.        ,
         0.        ,  1.        ]])

Step 3: Search for fitting model

In [73]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
# Grid over the minimum node size for a split and the number of features
# considered at each split.
param_grid = [
    {'min_samples_split': [2, 4, 6, 8, 10, 20, 40, 60, 80], 'max_features': [2, 3, 6, 8, 10]},
]
# Seed the tree: with max_features below the total feature count the candidate
# features are drawn at random at every split, so an unseeded estimator gives
# different scores (and possibly different "best" params) on every run.
tree_classifier = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(tree_classifier, param_grid, cv=5, scoring="neg_mean_squared_error")
grid_search.fit(passengers_prepared, passengers_labels)
print("Best params:", grid_search.best_params_)

# The scorer is negated MSE, so sqrt(-score) is an RMSE (lower is better).
# NOTE(review): presumably the labels are 0/1, in which case this equals
# sqrt(misclassification rate) -- confirm against the dataset.
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print("Score:", np.sqrt(-mean_score), "Params:", params)

Best params: {'max_features': 10, 'min_samples_split': 80}
Score: 0.494350101114 Params: {'max_features': 2, 'min_samples_split': 2}
Score: 0.482852013992 Params: {'max_features': 2, 'min_samples_split': 4}
Score: 0.469580257227 Params: {'max_features': 2, 'min_samples_split': 6}
Score: 0.476999046003 Params: {'max_features': 2, 'min_samples_split': 8}
Score: 0.465072182388 Params: {'max_features': 2, 'min_samples_split': 10}
Score: 0.471073361972 Params: {'max_features': 2, 'min_samples_split': 20}
Score: 0.460519979681 Params: {'max_features': 2, 'min_samples_split': 40}
Score: 0.462042363932 Params: {'max_features': 2, 'min_samples_split': 60}
Score: 0.45283129284 Params: {'max_features': 2, 'min_samples_split': 80}
Score: 0.487195597847 Params: {'max_features': 3, 'min_samples_split': 2}
Score: 0.502801142365 Params: {'max_features': 3, 'min_samples_split': 4}
Score: 0.465072182388 Params: {'max_features': 3, 'min_samples_split': 6}
Score: 0.463559748522 Params: {'max_features': 3,

In [76]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
# Candidate MLP settings: three regularisation strengths crossed with three
# hidden-layer layouts, with the solver and seed held fixed.
param_grid = [
    {
        'solver': ['lbfgs'],
        'alpha': [1e-6, 1e-7, 1e-8],
        'hidden_layer_sizes': [(5, 2), (6, 2), (7, 3)],
        'random_state': [1],
    },
]
mlp_classifier = MLPClassifier()
grid_search = GridSearchCV(
    estimator=mlp_classifier,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=3,
)
grid_search.fit(passengers_prepared, passengers_labels)
print("Best params:", grid_search.best_params_)

# Report sqrt of the (negated) mean CV score for every parameter combination.
cvres = grid_search.cv_results_
for params, mean_score in zip(cvres["params"], cvres["mean_test_score"]):
    print("Score:", np.sqrt(-mean_score), "Params:", params)

Best params: {'alpha': 1e-08, 'hidden_layer_sizes': (5, 2), 'random_state': 1, 'solver': 'lbfgs'}
Score: 0.445009784011 Params: {'alpha': 1e-06, 'hidden_layer_sizes': (5, 2), 'random_state': 1, 'solver': 'lbfgs'}
Score: 0.445009784011 Params: {'alpha': 1e-06, 'hidden_layer_sizes': (6, 2), 'random_state': 1, 'solver': 'lbfgs'}
Score: 0.458992546022 Params: {'alpha': 1e-06, 'hidden_layer_sizes': (7, 3), 'random_state': 1, 'solver': 'lbfgs'}
Score: 0.443428927206 Params: {'alpha': 1e-07, 'hidden_layer_sizes': (5, 2), 'random_state': 1, 'solver': 'lbfgs'}
Score: 0.45592232731 Params: {'alpha': 1e-07, 'hidden_layer_sizes': (6, 2), 'random_state': 1, 'solver': 'lbfgs'}
Score: 0.457460012374 Params: {'alpha': 1e-07, 'hidden_layer_sizes': (7, 3), 'random_state': 1, 'solver': 'lbfgs'}
Score: 0.438652174664 Params: {'alpha': 1e-08, 'hidden_layer_sizes': (5, 2), 'random_state': 1, 'solver': 'lbfgs'}
Score: 0.458992546022 Params: {'alpha': 1e-08, 'hidden_layer_sizes': (6, 2), 'random_state': 1, 's

In [77]:
from sklearn.svm import SVC
# Candidate SVC settings: eight values of C crossed with three kernels.
param_grid = [
    {
        'C': [0.2, 0.5, 0.8, 1.0, 1.5, 2.0, 5.0, 10.0],
        'kernel': ['rbf', 'sigmoid', 'poly'],
    },
]
sv_classifier = SVC()
grid_search = GridSearchCV(
    estimator=sv_classifier,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=10,
)
grid_search.fit(passengers_prepared, passengers_labels)
print("Best params:", grid_search.best_params_)

# Report sqrt of the (negated) mean CV score for every parameter combination.
cvres = grid_search.cv_results_
for params, mean_score in zip(cvres["params"], cvres["mean_test_score"]):
    print("Score:", np.sqrt(-mean_score), "Params:", params)

Best params: {'C': 2.0, 'kernel': 'rbf'}
Score: 0.440250184235 Params: {'C': 0.2, 'kernel': 'rbf'}
Score: 0.478469000323 Params: {'C': 0.2, 'kernel': 'sigmoid'}
Score: 0.521988419519 Params: {'C': 0.2, 'kernel': 'poly'}
Score: 0.432201056002 Params: {'C': 0.5, 'kernel': 'rbf'}
Score: 0.512484584586 Params: {'C': 0.5, 'kernel': 'sigmoid'}
Score: 0.505586765578 Params: {'C': 0.5, 'kernel': 'poly'}
Score: 0.419000951971 Params: {'C': 0.8, 'kernel': 'rbf'}
Score: 0.527342286983 Params: {'C': 0.8, 'kernel': 'sigmoid'}
Score: 0.487195597847 Params: {'C': 0.8, 'kernel': 'poly'}
Score: 0.41732158268 Params: {'C': 1.0, 'kernel': 'rbf'}
Score: 0.529998940003 Params: {'C': 1.0, 'kernel': 'sigmoid'}
Score: 0.462042363932 Params: {'C': 1.0, 'kernel': 'poly'}
Score: 0.415635427976 Params: {'C': 1.5, 'kernel': 'rbf'}
Score: 0.533959136592 Params: {'C': 1.5, 'kernel': 'sigmoid'}
Score: 0.415635427976 Params: {'C': 1.5, 'kernel': 'poly'}
Score: 0.410535413628 Params: {'C': 2.0, 'kernel': 'rbf'}
Score: 

In [78]:
# Best parameter combination from the most recently fitted search.
# `grid_search` is rebound by each search cell above, so this reflects
# whichever of those cells ran last (the SVC search in the recorded run).
grid_search.best_params_

{'C': 2.0, 'kernel': 'rbf'}