In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn import preprocessing
import matplotlib.pyplot as plt
plt.style.use('bmh')
%matplotlib inline

In [2]:
train_raw = pd.read_csv('data_sets/train.csv')

In [3]:
train = train_raw.drop(['ID','TARGET'],axis=1)
#train_x = train_x.reindex(np.random.permutation(train_x.index))
train_x = preprocessing.scale(train)
train_y = train_raw['TARGET'].values



In [4]:
class0 = train_x[train_y == 0]
class1 = train_x[train_y == 1]

In [5]:
X_train, X_valid, Y_train, Y_valid = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

In [8]:
def cross_validation(classifier,train_x,train_y,p_range,k=10,parameter='n_estimators',silence=True,**kwargs):
    errors = []
    for p in p_range:
        kfold = KFold(len(X_train), n_folds=k,shuffle=True)
        error = 0
        params = dict({parameter:p},**kwargs)
        for train_index, val_index in kfold:
            x_train = train_x[train_index]
            y_train = train_y[train_index]
            x_val = train_x[val_index]
            y_val = train_y[val_index]
            clf = classifier(**params)
            clf.fit(x_train,y_train)
            predictions = clf.predict(x_val)
            err = np.sum((predictions != y_val),dtype=float)/len(y_val)
            error += err/k
        if silence != True:
            print p,':',error
        errors.append(error)
    best = p_range[np.argmin(errors)]
    return best,errors

def major_class(train_y):
    n_class0 = np.sum(train_y == 0)
    n_class1 = np.sum(train_y == 1)
    if n_class0 > n_class1:
        return 0
    else:
        return 1
def minor_class(train_y):
    n_class0 = np.sum(train_y == 0)
    n_class1 = np.sum(train_y == 1)
    if n_class0 < n_class1:
        return 0
    else:
        return 1
    
def class_diff(train_y):
    n_class0 = np.sum(train_y == 0)
    n_class1 = np.sum(train_y == 1)
    min_class = minor_class(train_y)
    if min_class:
        class_diff = n_class0 - n_class1
    else:
        class_diff = n_class1 - n_class0
    return class_diff

def resample(train_x,train_y,k=5):
    '''Resample the data to fix class imbalance
        uses nearest neighbors method
        data should be scaled to mean 0 and std 1
        Returns synthetic data needed to balance
    '''
    min_class = minor_class(train_y)
    mc_data = train_x[train_y == min_class]
    cdiff = class_diff(train_y)
    
    nearest_neighbors = NearestNeighbors()
    nearest_neighbors.fit(mc_data)
    
    idxs = np.random.choice(np.arange(0,len(mc_data)),cdiff,replace=True)
    
    data_points = mc_data[idxs]
    dist,nni = nearest_neighbors.kneighbors(data_points,k)
    pivot_points = mc_data[nni[:,np.random.randint(0,k)]]
    print 'pp',pivot_points.shape
    print 'dp',data_points.shape
    scale = np.random.random(cdiff)
    synthetic_data = data_points + scale*pivot_points

    return synthetic_data

# Adaboost Random Forest

In [9]:
train_x.shape

(76020, 369)

In [None]:
synthetic_data = resample(train_x,train_y,k=5)

In [None]:
syn_train_x = np.vstack((train_x,synthetic_data)) 

In [14]:
estimators_range = np.logspace(0,3,4,dtype=int)
args = {'base_estimator':RandomForestClassifier(n_estimators=100,max_depth=1)}
best_n_estimators, n_estimators_errors = cross_validation(AdaBoostClassifier,
                                                         X_train,Y_train,
                                                         estimators_range,
                                                         parameter='n_estimators',
                                                         k=3,
                                                         silence=False,
                                                         **args)

1 : 0.0394797421731
10 : 0.0394797421731
100 : 0.0393975269666
1000 : 0.0403512233623


In [26]:
depth_range = np.arange(1,11)
args = {'base_estimator':RandomForestClassifier(n_estimators=100,max_depth=1)}
best_n_estimators, n_estimators_errors = cross_validation(AdaBoostClassifier,
                                                         X_train,Y_train,
                                                         estimators_range,
                                                         parameter='n_estimators',
                                                         k=3,
                                                         silence=False,
                                                         **args)

ValueError: operands could not be broadcast together with shapes (70004,) (70004,369) 

array([[   0, 2878, 1675, 1504, 1819,  189, 1223, 2839,  344, 1940]])

In [41]:
nearest_neighbors = NearestNeighbors()
nearest_neighbors.fit(class1)
nearest_neighbors.kneighbors(class1[np.random.randint(0,len(class1),len(class1))])

73012
3008
