In [1]:
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
import numpy as np
import matplotlib as plt
%matplotlib inline
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split

In [2]:
#Split values based on the two CSVs
training = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv")
testing = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv")

data = training
target = data['Survived'] 

In [3]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


In [4]:
# Show percentage survivors by sex and by class
pvt_train = pd.pivot_table(data=training, index=['Survived'], columns=['Sex', 'Embarked'], values =['PassengerId'], aggfunc='count')
pvt_train.apply(lambda x: x / (x.max() + x.min()))

Unnamed: 0_level_0,PassengerId,PassengerId,PassengerId,PassengerId,PassengerId,PassengerId
Sex,female,female,female,male,male,male
Embarked,C,Q,S,C,Q,S
Survived,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
0,0.123288,0.25,0.310345,0.694737,0.926829,0.825397
1,0.876712,0.75,0.689655,0.305263,0.073171,0.174603


In [5]:
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
def title_extract(name):
    title = name.split(',')[1].split('.')[0].strip()
    if title in title_map: title = title_map[title]
    return title

In [38]:

data['expensive_coach_woman'] = ((data.Sex == 'female') & (data.Pclass==3) & (data.Fare >= 20)) * 1
# based on looking at the values there are a few that appear very infrequently, we will map them to more useful values
title_map = {'Capt': 'Sir', 'Don':'Sir', 'Major':'Sir', 'Sir':'Sir', 'Col':'Sir', 'Mlle':'Sir', 'Jonkheer':'Sir',
             'Mme': 'Lady', 'Lady':'Lady', 'the Countess':'Lady', 'Ms': 'Miss'}

data['Title'] = data.Name.apply(lambda x: title_extract(x))
data['Surname'] = data.Name.apply(lambda x: x.split(',')[0])
data['Family_size'] = data.SibSp + data.Parch + 1
data['FamilyID'] = (data.Family_size.astype(str) + data.Surname) * ((data.Family_size >= 2) * 1)
data['FamilyID'] = data['FamilyID'].apply(lambda x: 'small' if x == '' else x)

In [39]:
data.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,expensive_coach_woman,Title,Surname,Family_size,FamilyID
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S,0,Mr,Braund,2,2Braund
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C,0,Mrs,Cumings,2,2Cumings


In [40]:
features = data[['expensive_coach_woman', 'Pclass', 'Age', 
                 'Fare', 'Family_size']].copy()
features = pd.concat([features, pd.get_dummies(data['Sex'], prefix='Sex').drop('Sex_male', axis=1)], axis=1)
features = pd.concat([features, pd.get_dummies(data[data['Title'].isin(['Mr','Miss','Mrs'])]['Title'])], axis=1)
features[['Mr', 'Miss', 'Mrs']] = features[['Mr', 'Miss', 'Mrs']].fillna(0) 
features = pd.concat([features, pd.get_dummies(data['Embarked'], prefix='Embarked')], axis=1).copy()
features.head()

Unnamed: 0,expensive_coach_woman,Pclass,Age,Fare,Family_size,Sex_female,Miss,Mr,Mrs,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22,7.25,2,0,0,1,0,0,0,1
1,0,1,38,71.2833,2,1,0,0,1,1,0,0
2,0,3,26,7.925,1,1,1,0,0,0,0,1
3,0,1,35,53.1,2,1,0,0,1,0,0,1
4,0,3,35,8.05,1,0,0,1,0,0,0,1


In [41]:
# Handle missing values
# We will try to loop through the different classes and sex to apply the median age
for sex in xrange(0,2):
    for cabin_class in xrange(0,4):
        feature_group = features[
            (features.Pclass == cabin_class) & 
            (features.Sex_female == sex)]
        median_age_group = feature_group.Age.dropna().median()
        missing_age_ix = feature_group[pd.isnull(feature_group.Age)].index
        features.loc[missing_age_ix, 'Age'] = median_age_group

In [42]:
# Feature Enineering
features['child'] = (features.Age.dropna() <= 18)*1

In [43]:
features.head()

Unnamed: 0,expensive_coach_woman,Pclass,Age,Fare,Family_size,Sex_female,Miss,Mr,Mrs,Embarked_C,Embarked_Q,Embarked_S,child
0,0,3,22,7.25,2,0,0,1,0,0,0,1,0
1,0,1,38,71.2833,2,1,0,0,1,1,0,0,0
2,0,3,26,7.925,1,1,1,0,0,0,0,1,0
3,0,1,35,53.1,2,1,0,0,1,0,0,1,0
4,0,3,35,8.05,1,0,0,1,0,0,0,1,0


### Building the model 

In [13]:
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.decomposition import PCA, RandomizedPCA

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC


In [14]:
scaler = StandardScaler()

In [15]:
features.describe()

Unnamed: 0,expensive_coach_woman,Pclass,Age,Fare,Family_size,Sex_female,Miss,Mr,Mrs,Embarked_C,Embarked_Q,Embarked_S,child
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.041526,2.308642,29.112424,32.204208,1.904602,0.352413,0.205387,0.580247,0.140292,0.188552,0.08642,0.722783,0.156004
std,0.199616,0.836071,13.304424,49.693429,1.613459,0.47799,0.404211,0.493796,0.347485,0.391372,0.281141,0.447876,0.363063
min,0.0,1.0,0.42,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,21.5,7.9104,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,3.0,26.0,14.4542,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
75%,0.0,3.0,36.0,31.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
max,1.0,3.0,80.0,512.3292,11.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [16]:
classifiers = [RandomForestClassifier(), GradientBoostingClassifier(), 
               ExtraTreesClassifier(), LogisticRegression(), DecisionTreeClassifier(),
               KNeighborsClassifier(), SVC(), LinearSVC(), GaussianNB()]

print '\n'
for classifier in classifiers:
    pipeline = Pipeline([
        ('scl', scaler),
        #('imp', imputer),
        ('clf', classifier),
    ])
    #scores = cross_val_score(pipeline, features.values, target, cv=5, scoring='accuracy')
    scores = cross_val_score(pipeline, features, target, cv=10, scoring='accuracy')
    print str(classifier)
    print('min -', scores.min(), 'mean -', scores.mean(), 'max -', scores.max())
    print '\n\n'



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
('min -', 0.7415730337078652, 'mean -', 0.79802292588809443, 'max -', 0.84269662921348309)



GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)
('min -', 0.7528089887640449, 'mean -', 0.83282998524571572, 'max -', 0.9101123595505618)



ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max

In [None]:
imputer = Imputer(strategy='median', missing_values=-1)
scalar = StandardScaler()
pca = PCA()

classifier = GradientBoostingClassifier(n_estimators=100, subsample=.8)

params = {
    'pca__n_components': np.arange(2,features.shape[1]+1).tolist() ,
    'clf__learning_rate': [0.1, 0.15, 0.2],
    'clf__max_features': [0.76, 0.77, 0.78],
    'clf__max_depth': [3.6, 3.63],
}

pipeline = Pipeline([
    ('scl', scalar),
    ('imp', imputer),
    ('pca', pca),
    ('clf', classifier),
])

grid_search = GridSearchCV(pipeline, params, cv=10, scoring='accuracy')
grid_search.fit(features.values, target)
print grid_search.best_score_
print grid_search.best_params_
# with pca
# 0.866883208974
# {'clf__max_features': 0.77, 'clf__max_depth': 3.63, 'clf__learning_rate': 0.1, 'pca__n_components': 3}
# without pca without scalar
# 0.87819667245
# {'clf__max_features': 0.77, 'clf__max_depth': 3.63, 'clf__learning_rate': 0.1}
# without pca with scalar
# 0.878755362749
# {'clf__max_features': 0.76, 'clf__max_depth': 3.63, 'clf__learning_rate': 0.1}

0.818181818182
{'clf__max_features': 0.76, 'clf__max_depth': 3.6, 'clf__learning_rate': 0.1, 'pca__n_components': 9}


In [None]:
imputer = Imputer(strategy='median', missing_values=-1)
scalar = StandardScaler()
pca = PCA()

classifier = RandomForestClassifier(n_estimators=100)

params = {
    'pca__n_components': np.arange(2,features.shape[1]+1).tolist() ,
    'clf__criterion':['gini','entropy'],
    'clf__max_features': [0.75, 0.8, 0.9],
    'clf__max_depth': [3.75, 3.9, 4, 4.1],
    'clf__min_samples_split': [2, 10, 50],
}

pipeline = Pipeline([
    ('scl', scalar),
    ('imp', imputer),
    ('pca', pca),
    ('clf', classifier),
])

grid_search2 = GridSearchCV(pipeline, params, cv=10, scoring='accuracy')
grid_search2.fit(features.values, target)
print grid_search2.best_score_
print grid_search2.best_params_
# with pca
# 0.86815556909
# {'pca__n_components': 15, 'clf__criterion': 'entropy', 'clf__max_depth': 4, 'clf__max_features': 0.8, 'clf__min_samples_split': 50}
# without pca, with scalar
# 0.884116233304
# {'clf__criterion': 'entropy', 'clf__max_depth': 4.1, 'clf__max_features': 0.9, 'clf__min_samples_split': 2}
# without pca, without scalar
# 0.882875577875
# {'clf__criterion': 'entropy', 'clf__max_depth': 4, 'clf__max_features': 0.9, 'clf__min_samples_split': 2}

# Try building the actual model

In [19]:
testing['expensive_coach_woman'] = ((testing.Sex == 'female') & (testing.Pclass==3) & (testing.Fare >= 20)) * 1
# based on looking at the values there are a few that appear very infrequently, we will map them to more useful values
title_map = {'Capt': 'Sir', 'Don':'Sir', 'Major':'Sir', 'Sir':'Sir', 'Col':'Sir', 'Mlle':'Sir', 'Jonkheer':'Sir',
             'Mme': 'Lady', 'Lady':'Lady', 'the Countess':'Lady', 'Ms': 'Miss'}

testing['Title'] = testing.Name.apply(lambda x: title_extract(x))
testing['Surname'] = testing.Name.apply(lambda x: x.split(',')[0])
testing['Family_size'] = testing.SibSp + testing.Parch + 1
testing['FamilyID'] = (testing.Family_size.astype(str) + testing.Surname) * ((testing.Family_size >= 2) * 1)
testing['FamilyID'] = testing['FamilyID'].apply(lambda x: 'small' if x == '' else x)

In [20]:
testing.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,expensive_coach_woman,Title,Surname,Family_size,FamilyID
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0,Mr,Kelly,1,small
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,0,Mrs,Wilkes,2,2Wilkes
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0,Mr,Myles,1,small
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0,Mr,Wirz,1,small
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,0,Mrs,Hirvonen,3,3Hirvonen


In [21]:
features = testing[['expensive_coach_woman', 'Pclass', 'Age', 'Fare', 'Family_size']].copy()

In [22]:
features = testing[['expensive_coach_woman', 'Pclass', 'Age', 'Fare', 'Family_size']].copy()
features = pd.concat([features, pd.get_dummies(testing['Sex'], prefix='Sex').drop('Sex_male', axis=1)], axis=1)
features = pd.concat([features, pd.get_dummies(testing[testing['Title'].isin(['Mr','Miss','Mrs'])]['Title'])], axis=1)
features[['Mr', 'Miss', 'Mrs']] = features[['Mr', 'Miss', 'Mrs']].fillna(0) 
features = pd.concat([features, pd.get_dummies(testing['Embarked'], prefix='Embarked')], axis=1).copy()
features.head()

Unnamed: 0,expensive_coach_woman,Pclass,Age,Fare,Family_size,Sex_female,Miss,Mr,Mrs,Embarked_C,Embarked_Q,Embarked_S
0,0,3,34.5,7.8292,1,0,0,1,0,0,1,0
1,0,3,47.0,7.0,2,1,0,0,1,0,0,1
2,0,2,62.0,9.6875,1,0,0,1,0,0,1,0
3,0,3,27.0,8.6625,1,0,0,1,0,0,0,1
4,0,3,22.0,12.2875,3,1,0,0,1,0,0,1


In [23]:
# Handle missing values
# We will try to loop through the different classes and sex to apply the median age
for sex in xrange(0,2):
    for cabin_class in xrange(0,4):
        feature_group = features[
            (features.Pclass == cabin_class) & 
            (features.Sex_female == sex)]
        median_age_group = feature_group.Age.dropna().median()
        missing_age_ix = feature_group[pd.isnull(feature_group.Age)].index
        features.loc[missing_age_ix, 'Age'] = median_age_group

In [24]:
med_fare = features['Fare'].dropna().median()
features['Fare'] = features['Fare'].fillna(med_fare)

In [25]:
# Feature Enineering
features['child'] = (features.Age.dropna() <= 18)*1

In [26]:
features_scaled = scaler.fit_transform(features)

In [28]:
features_scaled

array([[-0.16439899,  0.87348191,  0.39945123, ...,  2.84375747,
        -1.35067551, -0.38516444],
       [-0.16439899,  0.87348191,  1.35927311, ..., -0.35164743,
         0.74037028, -0.38516444],
       [-0.16439899, -0.31581919,  2.51105936, ...,  2.84375747,
        -1.35067551, -0.38516444],
       ..., 
       [-0.16439899,  0.87348191,  0.70659423, ..., -0.35164743,
         0.74037028, -0.38516444],
       [-0.16439899,  0.87348191, -0.40679915, ..., -0.35164743,
         0.74037028, -0.38516444],
       [-0.16439899,  0.87348191, -0.40679915, ..., -0.35164743,
        -1.35067551, -0.38516444]])

In [29]:
predict1 = pd.DataFrame(data=grid_search.predict(features_scaled), columns=['Survived'])
predict1['PassengerId'] = testing['PassengerId']

In [30]:
predict2 = pd.DataFrame(data=grid_search2.predict(features_scaled), columns=['Survived'])
predict2['PassengerId'] = testing['PassengerId']

In [31]:
predict1.to_csv('prediction_gbc_20160128.csv', index=False)
predict1.to_csv('prediction_rf_20160128.csv', index=False)