In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
import matplotlib.pyplot as plt
plt.style.use('bmh')
%matplotlib inline

In [2]:
# Load the training CSV (the path is relative to the notebook's working directory)
train_raw = pd.read_csv('data_sets/train.csv')

In [3]:
# Scratch copy of the raw data with missing ages replaced by the mean age.
# NOTE(review): `foo` is not referenced by any other cell visible here -- confirm before removing.
foo = train_raw.copy()
foo['Age'] = foo['Age'].fillna(foo['Age'].mean())

In [4]:
def preprocess_categorical(data,column):
    """One-hot encode ``column`` of ``data`` into 0/1 indicator columns.

    For every distinct value of ``data[column]`` (in order of first
    appearance) an integer column named ``column + '_' + str(value)`` is
    added, then the original column is dropped.  ``data`` itself is left
    untouched; the work is done on a copy.

    Missing values get their own indicator (``<column>_nan`` for NaN).
    NaN never compares equal to anything, itself included, so that
    indicator is built with ``isnull()``; an ``==`` test would leave the
    column all zeros.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the categorical column to expand.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with ``column`` replaced by its indicator columns.
    """
    processed = data.copy()
    values = data[column]
    for status in values.unique():
        if pd.isnull(status):
            # Equality cannot detect missing values; flag them explicitly.
            indicator = values.isnull()
        else:
            indicator = values == status
        processed[column+'_'+str(status)] = indicator.astype(int)
    processed = processed.drop(column,axis=1)
    return processed
def deck_letter(x):
    """Return the first character of a cabin string (its deck letter).

    Any value that is not a non-empty string -- including the NaN pandas
    uses for a missing cabin -- yields ``np.nan``.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses; the truthiness check avoids an IndexError on ''.
    if isinstance(x, str) and x:
        return x[0]
    return np.nan
def preprocess_cabin(data):
    """Return a copy of ``data`` with 'Cabin' reduced via ``deck_letter``.

    Every value of the 'Cabin' column is passed through ``deck_letter``.
    The replacement is made on a copy so the frame handed in by the caller
    (e.g. the raw training data) keeps its original 'Cabin' values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a 'Cabin' column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the transformed 'Cabin' column.
    """
    processed = data.copy()
    processed['Cabin'] = processed['Cabin'].apply(deck_letter)
    return processed
def preprocess_name(data):
    """Return a copy of ``data`` with a 'Title' column derived from 'Name'.

    The title is the first whitespace-separated token after the first comma
    of the name, e.g. ``'Braund, Mr. Owen Harris'`` -> ``'Mr.'``.  The new
    column is added to a copy so the caller's frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a 'Name' column whose values contain a comma.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the extra 'Title' column.
    """
    processed = data.copy()
    processed['Title'] = processed['Name'].apply(lambda x: x.split(',')[1].split()[0])
    return processed
def preprocess(data):
    """Run the full feature-engineering pipeline on a passenger frame.

    Steps, in order:
      1. transform 'Cabin' with ``preprocess_cabin``;
      2. add 'Title' with ``preprocess_name``;
      3. expand 'Sex', 'Embarked', 'Cabin' and 'Title' into indicator
         columns with ``preprocess_categorical``;
      4. fill remaining missing values in numeric columns with that
         column's mean.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least 'Name', 'Sex', 'Embarked' and 'Cabin' columns.

    Returns
    -------
    pandas.DataFrame
        The processed frame.
    """
    data = preprocess_cabin(data)
    data = preprocess_name(data)
    for column in ['Sex','Embarked','Cabin','Title']:
        data = preprocess_categorical(data,column)
    # Text columns such as 'Name' and 'Ticket' are still present at this
    # point.  numeric_only=True restricts the means to numeric columns;
    # without it pandas >= 2.0 raises TypeError instead of skipping them.
    data = data.fillna(data.mean(numeric_only=True))
    return data
def preprocess_test(test,train):
    """Preprocess ``test`` and align its columns with those of ``train``.

    After running ``preprocess`` on ``test`` the result is reindexed on
    ``train.columns``, which in one step

    * adds every column present only in ``train``, filled with 0,
    * drops every column present only in ``test``, and
    * puts the columns in the same order as ``train``.

    The shared ordering matters when the frame is later consumed
    positionally (for instance converted to an array for an estimator):
    indicator columns are created in order of first appearance, so two
    data sets would otherwise end up with differently ordered columns.

    Parameters
    ----------
    test : pandas.DataFrame
        Raw frame to preprocess.
    train : pandas.DataFrame
        Already preprocessed frame whose columns define the target layout.

    Returns
    -------
    pandas.DataFrame
        Preprocessed ``test`` with exactly the columns of ``train``.
    """
    test = preprocess(test)
    # fill_value only applies to newly created columns; values (including
    # any NaN) in columns that already exist are kept as they are.
    test = test.reindex(columns=train.columns, fill_value=0)
    return test

In [5]:
# Build the engineered frame from the raw training data
train_p = preprocess(train_raw)
# Free-text / identifier columns that are dropped before modelling
ignore = ['Name','Ticket','PassengerId']
train = train_p.drop(ignore,axis=1)

In [6]:
# Summary statistics of the preprocessed training frame
train_p.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Sex_female,Embarked_S,...,Title_Mme.,Title_Ms.,Title_Major.,Title_Lady.,Title_Sir.,Title_Mlle.,Title_Col.,Title_Capt.,Title_the,Title_Jonkheer.
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,...,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208,0.647587,0.352413,0.722783,...,0.001122,0.001122,0.002245,0.001122,0.001122,0.002245,0.002245,0.001122,0.001122,0.001122
std,257.353842,0.486592,0.836071,13.002015,1.102743,0.806057,49.693429,0.47799,0.47799,0.447876,...,0.033501,0.033501,0.047351,0.033501,0.033501,0.047351,0.047351,0.033501,0.033501,0.033501
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,446.0,0.0,3.0,29.699118,0.0,0.0,14.4542,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
# Split into features and target: every column except 'Survived' is a feature
train_x = train.drop(['Survived'],axis=1)
# Target values as a plain NumPy array
train_y = train['Survived'].values
train_x.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Sex_female,Embarked_S,Embarked_C,Embarked_Q,...,Title_Mme.,Title_Ms.,Title_Major.,Title_Lady.,Title_Sir.,Title_Mlle.,Title_Col.,Title_Capt.,Title_the,Title_Jonkheer.
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,...,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,2.308642,29.699118,0.523008,0.381594,32.204208,0.647587,0.352413,0.722783,0.188552,0.08642,...,0.001122,0.001122,0.002245,0.001122,0.001122,0.002245,0.002245,0.001122,0.001122,0.001122
std,0.836071,13.002015,1.102743,0.806057,49.693429,0.47799,0.47799,0.447876,0.391372,0.281141,...,0.033501,0.033501,0.047351,0.033501,0.033501,0.047351,0.047351,0.033501,0.033501,0.033501
min,1.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,22.0,0.0,0.0,7.9104,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,29.699118,0.0,0.0,14.4542,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,3.0,35.0,1.0,0.0,31.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,3.0,80.0,8.0,6.0,512.3292,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
# List every feature column by name; the describe() table below elides the middle ones.
# print(...) with a single argument produces the same output on Python 2 and 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(train_x.columns)
train_x.describe()

Index([u'Pclass', u'Age', u'SibSp', u'Parch', u'Fare', u'Sex_male',
       u'Sex_female', u'Embarked_S', u'Embarked_C', u'Embarked_Q',
       u'Embarked_nan', u'Cabin_nan', u'Cabin_C', u'Cabin_E', u'Cabin_G',
       u'Cabin_D', u'Cabin_A', u'Cabin_B', u'Cabin_F', u'Cabin_T',
       u'Title_Mr.', u'Title_Mrs.', u'Title_Miss.', u'Title_Master.',
       u'Title_Don.', u'Title_Rev.', u'Title_Dr.', u'Title_Mme.', u'Title_Ms.',
       u'Title_Major.', u'Title_Lady.', u'Title_Sir.', u'Title_Mlle.',
       u'Title_Col.', u'Title_Capt.', u'Title_the', u'Title_Jonkheer.'],
      dtype='object')


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Sex_female,Embarked_S,Embarked_C,Embarked_Q,...,Title_Mme.,Title_Ms.,Title_Major.,Title_Lady.,Title_Sir.,Title_Mlle.,Title_Col.,Title_Capt.,Title_the,Title_Jonkheer.
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,...,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,2.308642,29.699118,0.523008,0.381594,32.204208,0.647587,0.352413,0.722783,0.188552,0.08642,...,0.001122,0.001122,0.002245,0.001122,0.001122,0.002245,0.002245,0.001122,0.001122,0.001122
std,0.836071,13.002015,1.102743,0.806057,49.693429,0.47799,0.47799,0.447876,0.391372,0.281141,...,0.033501,0.033501,0.047351,0.033501,0.033501,0.047351,0.047351,0.033501,0.033501,0.033501
min,1.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,22.0,0.0,0.0,7.9104,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,29.699118,0.0,0.0,14.4542,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,3.0,35.0,1.0,0.0,31.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,3.0,80.0,8.0,6.0,512.3292,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
# AdaBoost classifier with n_estimators=100 and otherwise default settings.
# NOTE(review): no random_state is passed, so fitted results may differ between runs -- consider fixing a seed.
clf = AdaBoostClassifier(n_estimators=100)