In [13]:
import numpy as np
import pandas as pd
import re
from datetime import datetime
from preprocess import *
from useful_tools import *
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
import matplotlib.pyplot as plt
plt.style.use('bmh')
%matplotlib inline

In [14]:
# Read the raw training table from the CSV path (relative to the notebook's
# working directory) into a pandas DataFrame.
train_raw = pd.read_csv('data_sets/train.csv')

In [15]:
# Turn the raw frame into the model-ready frame used by the cells below.
# NOTE(review): `preprocess` arrives via one of the star imports at the top of
# the notebook (presumably the local `preprocess` module) -- confirm there
# which columns it adds or encodes.
train = preprocess(train_raw)

In [16]:
# Separate the 'Survived' label column from the feature columns.
train_y = train['Survived'].values
train_x = train.drop(['Survived'], axis=1)
# Last expression of the cell: summary statistics of the feature frame.
train_x.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,gender,Same_Last,ticket_suffix,Embarked_S,Embarked_C,...,ticket_prefix_A/S,ticket_prefix_SC/AH Basle,ticket_prefix_A/4,ticket_prefix_WE/P,ticket_prefix_S.W./PP,ticket_prefix_S.O./P.P.,ticket_prefix_F.C.,ticket_prefix_SOTON/O2,ticket_prefix_S.C./PARIS,ticket_prefix_C.A./SOTON
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,...,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,2.308642,29.361582,0.523008,0.381594,32.204208,0.647587,0.882155,297745.874299,0.722783,0.188552,...,0.001122,0.001122,0.003367,0.002245,0.001122,0.003367,0.001122,0.002245,0.002245,0.001122
std,0.836071,13.019697,1.102743,0.806057,49.693429,0.47799,1.488014,656159.899304,0.447876,0.391372,...,0.033501,0.033501,0.057961,0.047351,0.033501,0.057961,0.033501,0.047351,0.047351,0.033501
min,1.0,0.42,0.0,0.0,0.0,0.0,0.0,541.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,22.0,0.0,0.0,7.9104,0.0,0.0,14882.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,28.0,0.0,0.0,14.4542,1.0,0.0,112379.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,3.0,35.0,1.0,0.0,31.0,1.0,1.0,347082.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,3.0,80.0,8.0,6.0,512.3292,1.0,8.0,3101317.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [17]:
# Keep 15% of the rows aside as a validation split; the fixed seed makes the
# split repeatable.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    train_x,
    train_y,
    test_size=0.15,
    random_state=0,
)

In [6]:
# Baseline model: AdaBoost with 1000 boosting rounds and every other setting
# left at the library default (no random_state is passed).
clf = AdaBoostClassifier(n_estimators=1000)

In [7]:
# Train the baseline on the training split; being the last expression, the
# call's return value is what the cell displays.
clf.fit(X_train,Y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=1000, random_state=None)

In [8]:
# Score the baseline classifier on the held-out validation split.
val_predictions = clf.predict(X_valid)
# NOTE(review): `error_rate` comes from one of the star imports (presumably
# `useful_tools`); assumed to return the fraction of mismatching labels.
# Single-argument print(...) prints the same under Python 2 and Python 3.
print(error_rate(val_predictions,Y_valid))

0.171641791045


In [9]:
def cross_validation(train_x,train_y,p_range,k=50,parameter='n_estimators',silence=True,**kwargs):
    '''Choose the best value of one AdaBoostClassifier argument by k-fold CV.

    For every candidate in `p_range` an AdaBoostClassifier is built with
    `parameter` set to that candidate (plus the fixed `kwargs`), fitted on
    the training part of each fold and scored on the held-out part. A
    candidate's score is its misclassification rate averaged over the k folds.

    Parameters
    ----------
    train_x : pandas.DataFrame
        Feature rows; rows are selected positionally with `.iloc`.
    train_y : numpy.ndarray
        Labels aligned row-for-row with `train_x`.
    p_range : sequence
        Candidate values for `parameter`.
    k : int
        Number of folds.
    parameter : str
        Name of the AdaBoostClassifier keyword argument being searched.
    silence : bool
        When falsy, print "<candidate> : <mean error>" after each candidate.
    **kwargs
        Extra keyword arguments passed unchanged to AdaBoostClassifier.

    Returns
    -------
    (best, errors)
        `best` is the element of `p_range` with the lowest mean error (the
        first one on ties); `errors` lists the mean error of every candidate
        in `p_range` order.
    '''
    # Size the folds from the data actually passed in, not from the
    # notebook-level X_train, so the function works on any dataset.
    n_samples = len(train_x)
    errors = []
    for p in p_range:
        # A fresh, unseeded shuffle is drawn for every candidate, so the
        # candidates are not scored on identical folds.
        kfold = KFold(n_samples, n_folds=k, shuffle=True)
        params = dict({parameter: p}, **kwargs)
        error = 0
        for train_index, val_index in kfold:
            x_train, y_train = train_x.iloc[train_index], train_y[train_index]
            x_val, y_val = train_x.iloc[val_index], train_y[val_index]
            clf = AdaBoostClassifier(**params)
            clf.fit(x_train, y_train)
            predictions = clf.predict(x_val)
            # Fraction of held-out rows that were misclassified in this fold.
            err = np.sum((predictions != y_val), dtype=float) / len(y_val)
            # Accumulate the running mean over the k folds.
            error += err / k
        if not silence:
            # One pre-formatted string prints the same on Python 2 and 3.
            print('{0} : {1}'.format(p, error))
        errors.append(error)
    best = p_range[np.argmin(errors)]
    return best, errors

In [10]:
def parameter_search(train_x,train_y,parameters,k=50,max_cycles=3,step=1.):
    '''Iteratively refine several hyper-parameters, one at a time.

    Start from an initial guess for every parameter. In each cycle the
    parameters are visited in a random order; for the visited parameter a
    log-spaced integer range around its current value is cross-validated
    (all other parameters held at their current values) and the value is
    moved towards the best candidate. Comparable to stochastic gradient
    descent, but over hyper-parameters.

    Parameters
    ----------
    train_x, train_y
        Training features and labels, passed straight to `cross_validation`.
    parameters : dict
        Maps parameter name to its initial guess. Candidate ranges are built
        with log10 of the value and integer truncation, so positive
        integer-like values are expected.
    k : int
        Number of folds used for every cross-validation run.
    max_cycles : int
        Number of full passes over all parameters.
    step : float
        Fraction of the distance to the best candidate moved per update
        (1.0 jumps directly to the best candidate).

    Returns
    -------
    (params, values)
        NumPy arrays holding the parameter names and their final values, in
        matching order.
    '''
    # list(...) keeps this working when keys()/values() return views.
    params = np.array(list(parameters.keys()))
    # All values share one array dtype; with an integer array the update
    # below is truncated to an integer on assignment.
    values = np.array(list(parameters.values()))
    for iters in range(max_cycles):
        print('{0} : {1}'.format(iters, dict(zip(params, values))))
        # Visit every parameter exactly once per cycle, in random order.
        for i in np.random.choice(np.arange(0, len(params)), len(params), replace=False):
            param, val = params[i], values[i]
            # Current values of all the other parameters, held fixed.
            p_dict = {name: v for name, v in zip(params, values) if name != param}
            # Candidates from sqrt(val) up to val ...
            lower = np.logspace(0.5*np.log10(val), np.log10(val), 3, dtype=int)
            if lower[0] <= 0:
                lower[0] = 1
            # ... and from val up to 10*val.
            upper = np.logspace(np.log10(val), np.log10(val)+1, 3, dtype=int)
            param_range = np.unique(np.concatenate((lower, upper)))
            print('{0} : {1}'.format(param, val))
            # Forward k so the caller's fold count is actually honoured.
            best_p, errors = cross_validation(train_x, train_y, param_range,
                                              k=k, parameter=param, **p_dict)
            p_diff = best_p - val
            values[i] = val + step*p_diff
    return params, values

In [21]:
# Nine log-spaced integer candidates between 10**0 and 10**3 boosting rounds.
estimator_range = np.logspace(0, 3, num=9, dtype=int)
# Cross-validate each candidate on the training split, printing its mean error.
best_n_estimator, estimator_error = cross_validation(
    X_train,
    Y_train,
    estimator_range,
    k=50,
    parameter='n_estimators',
    silence=False,
)

1 : 0.208416666667
2 : 0.208583333333
5 : 0.1865
13 : 0.17575
31 : 0.190083333333
74 : 0.187666666667
177 : 0.183666666667
421 : 0.19175
1000 : 0.200916666667


In [24]:
# One candidate weak learner per tree depth from 1 to 19.
base_estimators = [
    DecisionTreeClassifier(max_depth=n)
    for n in np.arange(1, 20)
]
# The number of boosting rounds is held fixed while the base estimator varies.
args = {'n_estimators': 1000}
best_estimator, estimator_error = cross_validation(
    X_train,
    Y_train,
    base_estimators,
    k=50,
    parameter='base_estimator',
    silence=False,
    **args
)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='best') : 0.190833333333
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='best') : 0.22625
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='best') : 0.21
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_

In [25]:
# Display the base estimator that had the lowest cross-validated error in the
# sweep above.
best_estimator

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='best')