In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
import re
import csv
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder



In [2]:
train = pd.read_csv("train.csv", header = 0)
test = pd.read_csv("test.csv", header = 0)

In [18]:
def clean_data(df):
    
    #gender and embarked binarization
    df = pd.concat([df, pd.get_dummies(df['Sex'])], axis = 1)
    df = pd.concat([df, pd.get_dummies(df['Embarked'].fillna('S'), prefix= 'Embarked')], axis = 1)
      
    #create new feature called famsize
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1.0
    df['Family']=df['SibSp']*df['Parch']
    #fare fix
    df.loc[ (df.Fare.isnull())&(df.Pclass==1),'Fare'] =np.median(df[df['Pclass'] == 1]['Fare'].dropna())
    df.loc[ (df.Fare.isnull())&(df.Pclass==2),'Fare'] =np.median( df[df['Pclass'] == 2]['Fare'].dropna())
    df.loc[ (df.Fare.isnull())&(df.Pclass==3),'Fare'] = np.median(df[df['Pclass'] == 3]['Fare'].dropna())
    df['Fare_Per_Person'] = df['Fare'] / df['FamilySize']

    #title
    title_transforms = {'Dona.': 'Mrs.',
                        'Countess.': 'Mrs.',
                        'Jonkheer.': 'Mr.',
                        'Capt.': 'Mr.',
                        'Don.': 'Mr.',
                        'Lady.': 'Mrs.',
                        'Major.': 'Mr.',
                        'Mlle.': 'Mrs.',
                        'Mme.': 'Mrs.',
                        'Sir.': 'Master.'}
    
    df['Title'] = df['Name'].str.extract("([\w]+\.)")
    df['Title'] = df.replace({'Title': title_transforms})['Title']
    def replace_titles(x):
        title=x['Title']
        if title in ['Mr.','Don.', 'Major.', 'Capt.', 'Jonkheer.', 'Rev.', 'Col.']:
            return 'Mr.'
        elif title in ['Master.']:
            return 'Master.'
        elif title in ['Countess.', 'Mme.','Mrs.']:
            return 'Mrs.'
        elif title in ['Mlle.', 'Ms.','Miss.']:
            return 'Miss.'
        elif title =='Dr.':
            if x['Sex']=='Male':
                return 'Mr.'
            else:
                return 'Mrs.'
        elif title =='':
            if x['Sex']=='Male':
                return 'Master.'
            else:
                return 'Miss.'
        else:
            return title

    df['Title']=df.apply(replace_titles, axis=1)
    df = pd.concat([df, pd.get_dummies(df['Title'], prefix = 'Title')], axis = 1)
    
    #age fix
    df['AgeFill'] = df['Age'] \
        .groupby([df['Sex'], df['Title']]) \
        .apply(lambda x: x.fillna(x.mean()))
    
    df['AgeCat']=df['AgeFill']
    df.loc[ (df.AgeFill<=10) ,'AgeCat'] = 'child'
    df.loc[ (df.AgeFill>60),'AgeCat'] = 'aged'
    df.loc[ (df.AgeFill>10) & (df.AgeFill <=30) ,'AgeCat'] = 'adult'
    df.loc[ (df.AgeFill>30) & (df.AgeFill <=60) ,'AgeCat'] = 'senior'
    df = pd.concat([df, pd.get_dummies(df['AgeCat'], prefix = 5)], axis = 1)
    
    #cabin fix
    df['Cabin_Category'] = df['Cabin'].str.extract("(\w)").fillna("")
    df['Cabin_Category'] = np.where(df['Cabin_Category'] == "", 0, 1)
    
    
    #added features
    df['AgeClass']=df['AgeFill']*df['Pclass']
    df['ClassFare']=df['Pclass']*df['Fare_Per_Person']
    df['HighLow']=df['Pclass']
    df.loc[ (df.Fare_Per_Person<8) ,'HighLow'] = 0
    df.loc[ (df.Fare_Per_Person>=8) ,'HighLow'] = 1
    
    le = LabelEncoder()
    le.fit(df['Ticket'])
    df['Ticket'] = le.transform(df['Ticket'])
    
    df = df.drop(['PassengerId', 'Age', 'AgeCat', 'Name', 'Sex', 'Embarked', 'Cabin', 'Ticket', 'Title'], axis = 1)
    return df

In [19]:
x = clean_data(train).drop('Survived', axis = 1)
y = train['Survived']
test_new = clean_data(test)

In [20]:
x

Unnamed: 0,Pclass,SibSp,Parch,Fare,female,male,Embarked_C,Embarked_Q,Embarked_S,FamilySize,Family,Fare_Per_Person,Title_Master.,Title_Miss.,Title_Mr.,Title_Mrs.,AgeFill,5_adult,5_aged,5_child,5_senior,Cabin_Category,AgeClass,ClassFare,HighLow
0,3,1,0,7.2500,0,1,0,0,1,2,0,3.625000,0,0,1,0,22.000000,1,0,0,0,0,66.000000,10.875000,0
1,1,1,0,71.2833,1,0,1,0,0,2,0,35.641650,0,0,0,1,38.000000,0,0,0,1,1,38.000000,35.641650,1
2,3,0,0,7.9250,1,0,0,0,1,1,0,7.925000,0,1,0,0,26.000000,1,0,0,0,0,78.000000,23.775000,0
3,1,1,0,53.1000,1,0,0,0,1,2,0,26.550000,0,0,0,1,35.000000,0,0,0,1,1,35.000000,26.550000,1
4,3,0,0,8.0500,0,1,0,0,1,1,0,8.050000,0,0,1,0,35.000000,0,0,0,1,0,105.000000,24.150000,1
5,3,0,0,8.4583,0,1,0,1,0,1,0,8.458300,0,0,1,0,32.852798,0,0,0,1,0,98.558394,25.374900,1
6,1,0,0,51.8625,0,1,0,0,1,1,0,51.862500,0,0,1,0,54.000000,0,0,0,1,1,54.000000,51.862500,1
7,3,3,1,21.0750,0,1,0,0,1,5,3,4.215000,1,0,0,0,2.000000,0,0,1,0,0,6.000000,12.645000,0
8,3,0,2,11.1333,1,0,0,0,1,3,0,3.711100,0,0,0,1,27.000000,1,0,0,0,0,81.000000,11.133300,0
9,2,1,0,30.0708,1,0,1,0,0,2,0,15.035400,0,0,0,1,14.000000,1,0,0,0,0,28.000000,30.070800,1


In [22]:
test_new

Unnamed: 0,Pclass,SibSp,Parch,Fare,female,male,Embarked_C,Embarked_Q,Embarked_S,FamilySize,Family,Fare_Per_Person,Title_Master.,Title_Miss.,Title_Mr.,Title_Mrs.,AgeFill,5_adult,5_aged,5_child,5_senior,Cabin_Category,AgeClass,ClassFare,HighLow
0,3,0,0,7.8292,0,1,0,1,0,1,0,7.829200,0,0,1,0,34.500000,0,0,0,1,0,103.500000,23.487600,0
1,3,1,0,7.0000,1,0,0,0,1,2,0,3.500000,0,0,0,1,47.000000,0,0,0,1,0,141.000000,10.500000,0
2,2,0,0,9.6875,0,1,0,1,0,1,0,9.687500,0,0,1,0,62.000000,0,1,0,0,0,124.000000,19.375000,1
3,3,0,0,8.6625,0,1,0,0,1,1,0,8.662500,0,0,1,0,27.000000,1,0,0,0,0,81.000000,25.987500,1
4,3,1,1,12.2875,1,0,0,0,1,3,1,4.095833,0,0,0,1,22.000000,1,0,0,0,0,66.000000,12.287500,0
5,3,0,0,9.2250,0,1,0,0,1,1,0,9.225000,0,0,1,0,14.000000,1,0,0,0,0,42.000000,27.675000,1
6,3,0,0,7.6292,1,0,0,1,0,1,0,7.629200,0,1,0,0,30.000000,1,0,0,0,0,90.000000,22.887600,0
7,2,1,1,29.0000,0,1,0,0,1,3,1,9.666667,0,0,1,0,26.000000,1,0,0,0,0,52.000000,19.333333,1
8,3,0,0,7.2292,1,0,1,0,0,1,0,7.229200,0,0,0,1,18.000000,1,0,0,0,0,54.000000,21.687600,0
9,3,2,0,24.1500,0,1,0,0,1,3,0,8.050000,0,0,1,0,21.000000,1,0,0,0,0,63.000000,24.150000,1


In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation
from sklearn import preprocessing
from sklearn.cross_validation import KFold
from sklearn.metrics import accuracy_score
import scipy
import matplotlib as mpl
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix

In [25]:
x_train, x_test, y_train, y_test = cross_validation.train_test_split(x, y, test_size = .33)

In [41]:
clfl2=XGBClassifier()
parameters = {'n_estimators': [100, 200, 300, 400, 500], 
              'max_depth': range(1, 6), 
              'reg_lambda': [1],
              'reg_alpha': [1]}
fitmodel = GridSearchCV(clfl2, param_grid=parameters, cv=10, scoring="accuracy")
fitmodel.fit(x_train, y_train)
fitmodel.best_estimator_, fitmodel.best_params_, fitmodel.best_score_, fitmodel.grid_scores_

(XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
        gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=1,
        min_child_weight=1, missing=None, n_estimators=200, nthread=-1,
        objective='binary:logistic', reg_alpha=1, reg_lambda=1,
        scale_pos_weight=1, seed=0, silent=True, subsample=1),
 {'max_depth': 1, 'n_estimators': 200, 'reg_alpha': 1, 'reg_lambda': 1},
 0.85402684563758391,
 [mean: 0.84564, std: 0.04062, params: {'n_estimators': 100, 'reg_lambda': 1, 'reg_alpha': 1, 'max_depth': 1},
  mean: 0.85403, std: 0.04195, params: {'n_estimators': 200, 'reg_lambda': 1, 'reg_alpha': 1, 'max_depth': 1},
  mean: 0.85067, std: 0.04078, params: {'n_estimators': 300, 'reg_lambda': 1, 'reg_alpha': 1, 'max_depth': 1},
  mean: 0.84732, std: 0.04136, params: {'n_estimators': 400, 'reg_lambda': 1, 'reg_alpha': 1, 'max_depth': 1},
  mean: 0.84732, std: 0.04680, params: {'n_estimators': 500, 'reg_lambda': 1, 'reg_alpha': 1, 'max_depth': 1},
  mean: 0.84

In [105]:
clfl2=RandomForestClassifier()
parameters = {'n_estimators': [100, 200, 300, 400, 500], 
              'max_depth': range(1, 6)}
fitmodel = GridSearchCV(clfl2, param_grid=parameters, cv=10, scoring="accuracy")
fitmodel.fit(x_train, y_train)
fitmodel.best_estimator_, fitmodel.best_params_, fitmodel.best_score_, fitmodel.grid_scores_

(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=5, max_features='auto', max_leaf_nodes=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
             oob_score=False, random_state=None, verbose=0,
             warm_start=False),
 {'max_depth': 5, 'n_estimators': 200},
 0.83389261744966447,
 [mean: 0.78523, std: 0.06279, params: {'n_estimators': 100, 'max_depth': 1},
  mean: 0.78188, std: 0.06171, params: {'n_estimators': 200, 'max_depth': 1},
  mean: 0.78188, std: 0.06392, params: {'n_estimators': 300, 'max_depth': 1},
  mean: 0.77517, std: 0.06613, params: {'n_estimators': 400, 'max_depth': 1},
  mean: 0.78020, std: 0.06399, params: {'n_estimators': 500, 'max_depth': 1},
  mean: 0.80369, std: 0.06175, params: {'n_estimators': 100, 'max_depth': 2},
  mean: 0.79698, std: 0.05583, params: {'n_estimators': 200, 'max_depth': 2},
  mean: 0.80369, std: 0.05973,

In [28]:
for i in range(1, 11):
    clf = RandomForestClassifier(n_estimators = 500, max_depth = i)
    clf.fit(x_train, y_train)
    x_train_pred = clf.predict(x_train)
    x_test_pred = clf.predict(x_test)
    print "train-test accuracy: %s - %s %s" %(accuracy_score(x_train_pred, y_train), accuracy_score(x_test_pred, y_test), i)

train-test accuracy: 0.781879194631 - 0.796610169492 1
train-test accuracy: 0.791946308725 - 0.810169491525 2
train-test accuracy: 0.828859060403 - 0.840677966102 3
train-test accuracy: 0.838926174497 - 0.847457627119 4
train-test accuracy: 0.86744966443 - 0.840677966102 5
train-test accuracy: 0.889261744966 - 0.84406779661 6
train-test accuracy: 0.919463087248 - 0.847457627119 7
train-test accuracy: 0.93288590604 - 0.84406779661 8
train-test accuracy: 0.947986577181 - 0.84406779661 9
train-test accuracy: 0.956375838926 - 0.84406779661 10


In [101]:
for i in range(1, 20):
    clf = RandomForestClassifier(n_estimators = 500, max_depth = 7)
    clf.fit(x_train, y_train)
    x_train_pred = clf.predict(x_train)
    x_test_pred = clf.predict(x_test)
    print "train-test accuracy: %s - %s %s" %(accuracy_score(x_train_pred, y_train), accuracy_score(x_test_pred, y_test), i)

train-test accuracy: 0.887583892617 - 0.837288135593 1
train-test accuracy: 0.88255033557 - 0.837288135593 2
train-test accuracy: 0.880872483221 - 0.833898305085 3
train-test accuracy: 0.887583892617 - 0.830508474576 4
train-test accuracy: 0.885906040268 - 0.823728813559 5
train-test accuracy: 0.887583892617 - 0.830508474576 6
train-test accuracy: 0.879194630872 - 0.820338983051 7
train-test accuracy: 0.889261744966 - 0.827118644068 8
train-test accuracy: 0.892617449664 - 0.833898305085 9
train-test accuracy: 0.889261744966 - 0.823728813559 10
train-test accuracy: 0.879194630872 - 0.820338983051 11
train-test accuracy: 0.887583892617 - 0.830508474576 12
train-test accuracy: 0.879194630872 - 0.833898305085 13
train-test accuracy: 0.877516778523 - 0.830508474576 14
train-test accuracy: 0.894295302013 - 0.837288135593 15
train-test accuracy: 0.875838926174 - 0.823728813559 16
train-test accuracy: 0.874161073826 - 0.827118644068 17
train-test accuracy: 0.884228187919 - 0.833898305085 18
tr

In [100]:
test_new.describe()

Unnamed: 0,Pclass,SibSp,Parch,Ticket,Fare,female,male,Embarked_C,Embarked_Q,Embarked_S,FamilySize,Family,Fare_Per_Person,Title_Master.,Title_Miss.,Title_Mr.,Title_Mrs.,AgeFill,5_adult,5_aged,5_child,5_senior,Cabin_Category,AgeClass,ClassFare,HighLow
count,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,2.26555,0.447368,0.392344,180.944976,35.560845,0.363636,0.636364,0.244019,0.110048,0.645933,1.839713,0.444976,21.770888,0.050239,0.188995,0.583732,0.177033,30.222256,0.461722,0.026316,0.062201,0.449761,0.217703,63.568484,33.040328,0.557416
std,0.841838,0.89676,0.981429,107.533763,55.856972,0.481622,0.481622,0.430019,0.313324,0.478803,1.519072,1.668752,35.603507,0.2187,0.391974,0.49353,0.382154,13.021233,0.49913,0.160265,0.24181,0.498066,0.413179,29.8499,34.798837,0.497288
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17,0.0,0.0,0.0,0.0,0.0,0.51,0.0,0.0
25%,1.0,0.0,0.0,85.25,7.8958,0.0,0.0,0.0,0.0,0.0,1.0,0.0,7.6344,0.0,0.0,0.0,0.0,21.831133,0.0,0.0,0.0,0.0,0.0,42.0,21.675,0.0
50%,3.0,0.0,0.0,181.0,14.4542,0.0,1.0,0.0,0.0,1.0,1.0,0.0,8.6625,0.0,0.0,1.0,0.0,30.0,0.0,0.0,0.0,0.0,0.0,61.5,23.6874,1.0
75%,3.0,1.0,0.0,279.75,31.471875,1.0,1.0,0.0,0.0,1.0,2.0,0.0,25.982813,0.0,0.0,1.0,0.0,36.875,1.0,0.0,0.0,1.0,0.0,86.75,27.7208,1.0
max,3.0,8.0,9.0,362.0,512.3292,1.0,1.0,1.0,1.0,1.0,11.0,16.0,262.375,1.0,1.0,1.0,1.0,76.0,1.0,1.0,1.0,1.0,1.0,181.5,262.375,1.0


In [29]:
clf = RandomForestClassifier(n_estimators = 500, max_depth = 7)
clf.fit(x, y)
output = clf.predict(test_new)
test_ids = test['PassengerId']

In [30]:
predictions_file = open("titanic_20160704.csv", "wb")
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["PassengerId","Survived"])
open_file_object.writerows(zip(test_ids, output))
predictions_file.close()

This will result in a prediction accuracy of 78.469%, which is better but not there yet :(

In [86]:
test_new.describe()

Unnamed: 0,Pclass,SibSp,Parch,Fare,female,male,Embarked_C,Embarked_Q,Embarked_S,FamilySize,Family,Fare_Per_Person,Title_Master.,Title_Miss.,Title_Mr.,Title_Mrs.,AgeFill,AgeCategory_adult,AgeCategory_aged,AgeCategory_child,AgeCategory_senior,Cabin_Category,AgeClass,ClassFare,HighLow
count,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,2.26555,0.447368,0.392344,35.560845,0.363636,0.636364,0.244019,0.110048,0.645933,1.839713,0.444976,21.770888,0.050239,0.188995,0.583732,0.177033,30.222256,0.461722,0.026316,0.062201,0.449761,0.217703,63.568484,33.040328,0.557416
std,0.841838,0.89676,0.981429,55.856972,0.481622,0.481622,0.430019,0.313324,0.478803,1.519072,1.668752,35.603507,0.2187,0.391974,0.49353,0.382154,13.021233,0.49913,0.160265,0.24181,0.498066,0.413179,29.8499,34.798837,0.497288
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17,0.0,0.0,0.0,0.0,0.0,0.51,0.0,0.0
25%,1.0,0.0,0.0,7.8958,0.0,0.0,0.0,0.0,0.0,1.0,0.0,7.6344,0.0,0.0,0.0,0.0,21.831133,0.0,0.0,0.0,0.0,0.0,42.0,21.675,0.0
50%,3.0,0.0,0.0,14.4542,0.0,1.0,0.0,0.0,1.0,1.0,0.0,8.6625,0.0,0.0,1.0,0.0,30.0,0.0,0.0,0.0,0.0,0.0,61.5,23.6874,1.0
75%,3.0,1.0,0.0,31.471875,1.0,1.0,0.0,0.0,1.0,2.0,0.0,25.982813,0.0,0.0,1.0,0.0,36.875,1.0,0.0,0.0,1.0,0.0,86.75,27.7208,1.0
max,3.0,8.0,9.0,512.3292,1.0,1.0,1.0,1.0,1.0,11.0,16.0,262.375,1.0,1.0,1.0,1.0,76.0,1.0,1.0,1.0,1.0,1.0,181.5,262.375,1.0
