# Section 1-5 - Final Checks

We now arrive at the last piece of puzzle - comparing the mean against the median when filling in the training data.

## Pandas - Extracting data

In [59]:
import pandas as pd
import numpy as np

df = pd.read_csv('../data/train.csv')

## Pandas - Cleaning data

In [60]:
df = df.drop(['Name', 'Ticket', 'Cabin'], axis=1)

As we don't know whether the mean or the median will do better, we calculate both.

In [61]:
age_mean = df['Age'].mean()
age_median = df['Age'].median()

In [62]:
from scipy.stats import mode

mode_embarked = mode(df['Embarked'])[0][0]
df['Embarked'] = df['Embarked'].fillna(mode_embarked)

df['Gender'] = df['Sex'].map({'female': 0, 'male': 1}).astype(int)

pd.get_dummies(df['Embarked'], prefix='Embarked').head(10)
df = pd.concat([df, pd.get_dummies(df['Embarked'], prefix='Embarked')], axis=1)

df = df.drop(['Sex', 'Embarked'], axis=1)

cols = df.columns.tolist()
cols = [cols[1]] + cols[0:1] + cols[2:]
df = df[cols]

df = df.fillna(-1)

train_data = df.values

## Scikit-learn - Training the model

In [63]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV

imputer = Imputer(missing_values=-1)

classifier = RandomForestClassifier(n_estimators=100)

pipeline = Pipeline([
    ('imp', imputer),
    ('clf', classifier),
])

We now include the mean-median comparison into our pipeline.

In [64]:
parameter_grid = {
    'imp__strategy': ['mean', 'median'],
    'clf__max_features': [0.5, 1],
    'clf__max_depth': [5, None],
}

In [65]:
grid_search = GridSearchCV(pipeline, parameter_grid, cv=5, verbose=3)
grid_search.fit(train_data[0:,2:], train_data[0:,0])

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] clf__max_features=0.5, clf__max_depth=5, imp__strategy=mean .....
[CV]  clf__max_features=0.5, clf__max_depth=5, imp__strategy=mean, score=0.793296 -   0.2s
[CV] clf__max_features=0.5, clf__max_depth=5, imp__strategy=mean .....
[CV]  clf__max_features=0.5, clf__max_depth=5, imp__strategy=mean, score=0.826816 -   0.2s
[CV] clf__max_features=0.5, clf__max_depth=5, imp__strategy=mean .....
[CV]  clf__max_features=0.5, clf__max_depth=5, imp__strategy=mean, score=0.825843 -   0.2s
[CV] clf__max_features=0.5, clf__max_depth=5, imp__strategy=mean .....
[CV]  clf__max_features=0.5, clf__max_depth=5, imp__strategy=mean, score=0.786517 -   0.2s
[CV] clf__max_features=0.5, clf__max_depth=5, imp__strategy=mean .....
[CV]  clf__max_features=0.5, clf__max_depth=5, imp__strategy=mean, score=0.853107 -   0.2s
[CV] clf__max_features=0.5, clf__max_depth=5, imp__strategy=median ...
[CV]  clf__max_features=0.5, clf__max_depth=5, imp__strateg

[Parallel(n_jobs=1)]: Done  31 tasks       | elapsed:    5.1s
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    6.7s finished





GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('imp', Imputer(axis=0, copy=True, missing_values=-1, strategy='mean', verbose=0)), ('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'clf__max_features': [0.5, 1], 'clf__max_depth': [5, None], 'imp__strategy': ['mean', 'median']},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=3)

In [66]:
sorted(grid_search.grid_scores_, key=lambda x: x.mean_validation_score)
print grid_search.best_score_, grid_search.best_params_

0.821548821549 {'clf__max_features': 0.5, 'clf__max_depth': 5, 'imp__strategy': 'median'}


As before, we replace the -1 values in the column Age by the better performer.

In [67]:
if grid_search.best_params_['imp__strategy'] == 'mean':
    print 'in age mean', age_median
    df['Age'] = df['Age'].map(lambda x: age_mean if x == -1 else x)
else:
    print 'in age median', age_median
    df['Age'] = df['Age'].map(lambda x: age_median if x == -1 else x)

in age median 28.0


In [68]:
train_data = df.values

In [69]:
model = RandomForestClassifier(
    n_estimators = 100, 
    max_features=grid_search.best_params_['clf__max_features'], 
    max_depth=grid_search.best_params_['clf__max_depth'])
model = model.fit(train_data[0:,2:],train_data[0:,0])

## Scikit-learn - Making predictions

In [70]:
df_test = pd.read_csv('../data/test.csv')

df_test = df_test.drop(['Name', 'Ticket', 'Cabin'], axis=1)

Similarly we fill in the NAs in the test data with the better performer.

In [71]:
df_test['Age'] = df_test['Age'].fillna(age_mean)

In [72]:
fare_means = df.pivot_table('Fare', index='Pclass', aggfunc='mean')
df_test['Fare'] = df_test[['Fare', 'Pclass']].apply(lambda x:
                            fare_means[x['Pclass']] if pd.isnull(x['Fare'])
                            else x['Fare'], axis=1)

df_test['Gender'] = df_test['Sex'].map({'female': 0, 'male': 1}).astype(int)
df_test = pd.concat([df_test, pd.get_dummies(df_test['Embarked'], prefix='Embarked')],
                axis=1)

df_test = df_test.drop(['Sex', 'Embarked'], axis=1)

test_data = df_test.values

output = model.predict(test_data[:,1:])

  app.launch_new_instance()


## Pandas - Preparing for submission

In [73]:
result = np.c_[test_data[:,0].astype(int), output.astype(int)]


df_result = pd.DataFrame(result[:,0:2], columns=['PassengerId', 'Survived'])
df_result.to_csv('../results/titanic_1-5.csv', index=False)