In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn import preprocessing
import matplotlib.pyplot as plt
plt.style.use('bmh')
%matplotlib inline

In [2]:
# Load the raw training table; the path is relative to the notebook's working directory.
train_raw = pd.read_csv('data_sets/train.csv')

In [6]:
# Labels: the TARGET column as a plain array.
train_y = train_raw['TARGET'].values

# Features: every column except the row identifier and the label.
train = train_raw.drop(['ID', 'TARGET'], axis=1)

# Standardize the feature columns with sklearn's preprocessing.scale (default settings).
train_x = preprocessing.scale(train)

In [8]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    train_x,
    train_y,
    test_size=0.2,
    random_state=42
)

In [12]:
def cross_validation(classifier,train_x,train_y,p_range,k=10,parameter='n_estimators',silence=True,**kwargs):
    """Choose the best value of one hyper-parameter by k-fold cross-validation.

    For every candidate in ``p_range`` a fresh model is built as
    ``classifier(**{parameter: candidate, **kwargs})``, fitted on the training
    part of each fold and scored by its misclassification rate on the held-out
    part. The per-fold rates are averaged into one error per candidate.

    Parameters
    ----------
    classifier : callable
        Called with keyword arguments only; the returned object must provide
        ``fit(x, y)`` and ``predict(x)``.
    train_x, train_y : array-like
        Samples and labels. Both are indexed with the integer index arrays
        produced by ``KFold``, so they must support that kind of indexing
        (numpy arrays do).
    p_range : sequence
        Candidate values for ``parameter``; must be non-empty and indexable.
    k : int
        Number of folds.
    parameter : str
        Name of the keyword argument that receives each candidate.
    silence : bool
        When falsy, print ``candidate : error`` after each candidate.
    **kwargs
        Extra keyword arguments forwarded unchanged to ``classifier``.

    Returns
    -------
    best
        The element of ``p_range`` with the lowest mean error (the first one
        on ties, as decided by ``np.argmin``).
    errors : list
        Mean misclassification rate for each candidate, in ``p_range`` order.

    Raises
    ------
    ValueError
        If ``p_range`` is empty.
    """
    if len(p_range) == 0:
        raise ValueError('p_range must contain at least one candidate value')

    # Size the folds from the data that was actually passed in. Reading the
    # notebook-level X_train here would break on a fresh kernel and would
    # produce wrong indices for any other data set.
    n_samples = len(train_x)

    errors = []
    for p in p_range:
        # A new shuffled split is drawn for every candidate.
        kfold = KFold(n_samples, n_folds=k, shuffle=True)
        params = dict({parameter: p}, **kwargs)

        fold_errors = []
        for train_index, val_index in kfold:
            clf = classifier(**params)
            clf.fit(train_x[train_index], train_y[train_index])
            predictions = clf.predict(train_x[val_index])
            # Fraction of held-out samples that were misclassified.
            fold_errors.append(np.mean(predictions != train_y[val_index]))

        error = np.mean(fold_errors)
        if not silence:
            # Single pre-formatted string: prints identically on Python 2 and 3.
            print('%s : %s' % (p, error))
        errors.append(error)

    best = p_range[np.argmin(errors)]
    return best,errors

In [None]:
# Candidate ensemble sizes on a log scale: 1, 10, 100, 1000.
estimators_range = np.logspace(0, 3, num=4, dtype=int)

# Fixed AdaBoost setting: each boosting round fits a forest of 100 depth-1 trees.
args = dict(base_estimator=RandomForestClassifier(n_estimators=100, max_depth=1))

# 3-fold search over n_estimators, printing the error for each candidate.
best_n_estimators, n_estimators_errors = cross_validation(
    AdaBoostClassifier,
    X_train,
    Y_train,
    estimators_range,
    k=3,
    parameter='n_estimators',
    silence=False,
    **args
)

In [9]:
# Mean of the training labels.
# NOTE(review): this reads as the positive-class rate if TARGET is 0/1 — confirm.
np.mean(Y_train)

0.039479742173112342

In [10]:
# Mean of the validation labels.
# NOTE(review): this reads as the positive-class rate if TARGET is 0/1 — confirm.
np.mean(Y_valid)

0.039923704288345171