# How bagging ensembles resample data

In [100]:
# load required packages
import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

np.random.seed(1)

In [101]:
# Bagging flow
data = pd.DataFrame({
        'animal_type':['dog','wolf','wolf','dog','dog','wolf','dog'],
        'weight':[30, 40, 45, 50, 25, 15, 35],
        'anger_level':[0, 10, 7, 8, 5, 3, 2],
        'cuddly':['cuddly','not','not','not','cuddly','cuddly','cuddly']
    })

In [102]:
data = pd.concat([data for x in range(2)], axis=0)
data.reset_index(inplace=True)

In [103]:
Y = data.animal_type.map(lambda x: 1 if x == 'dog' else 0).values

In [104]:
Y

array([1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1])

In [105]:
import patsy

X = patsy.dmatrix('~ cuddly + anger_level + weight', data=data,
                  return_type='dataframe')

In [106]:
X.drop('Intercept', axis=1, inplace=True)

In [107]:
Xmat = X.values

In [108]:
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydot

dot_data = StringIO()  

#export_graphviz(dtc, out_file=dot_data,
#                feature_names=X.columns,
#                filled=True, rounded=True,
#                special_characters=True)  

#graph = pydot.graph_from_dot_data(dot_data.getvalue())  
#Image(graph.create_png()) 

In [109]:
#from sklearn.cross_validation import cross_val_score

#scores = cross_val_score(dtc, Xmat, Y, cv=3)

#print scores
#print np.mean(scores)

In [110]:
dtc_simple = DecisionTreeClassifier(max_depth=2)

scores_simple = cross_val_score(dtc_simple, Xmat, Y, cv=3)

print scores_simple
print np.mean(scores_simple)

[ 0.8   1.    0.75]
0.85


In [111]:
# first we create the "base model" estimator
dtc = DecisionTreeClassifier(max_depth=None)

# n_estimators is the number of "base models" (copies of the dtc classifier blueprint)
# that we want to fit and then average
n_estimators=11

# max_samples tells us what fraction of the size of the original dataset the bootstrapped
# datasets are going to be.
max_samples=0.5

max_features=0.66

#bag = BaggingClassifier(dtc)

In [112]:
rows = range(X.shape[0])
cols = range(X.shape[1])
print 'original rows:', rows
print 'original cols:', cols

number_of_resampled_rows = int(round(max_samples * X.shape[0]))
number_of_columns = int(round(max_features * X.shape[1]))

#print number_of_resampled_rows
#print number_of_columns

resampled_rows = np.random.choice(rows, size=number_of_resampled_rows, replace=True)
resampled_cols = np.random.choice(cols, size=number_of_columns, replace=False)

print 'resampled rows:', resampled_rows
print 'resampled_cols', resampled_cols

#print resampled_rows

#X

original rows: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
original cols: [0, 1, 2]
resampled rows: [ 8  9 11  5  0  0  1]
resampled_cols [2 1]


In [113]:
X.iloc[resampled_rows, resampled_cols]

Unnamed: 0,weight,anger_level
8,40.0,10.0
9,45.0,7.0
11,25.0,5.0
5,15.0,3.0
0,30.0,0.0
0,30.0,0.0
1,40.0,10.0


In [114]:
from sklearn.cross_validation import KFold
from sklearn.metrics import accuracy_score

Xcopy = X.copy()

In [120]:
folds = KFold(len(rows), n_folds=3)

accuracy_for_each_fold = []

# 1. Split our X and Y data into 3 folds:
# (KFold gives us the indexes/rows of the training and testing portions
# instead of the actual subsets.)
for train_indices, test_indicies in folds:
    
    # Uset the training and testing indicies to make our training and testing
    # subsets of the data:
    #print range(X.shape[0])
    #print train_indices, test_indicies
    Xtrain, Xtest = X.iloc[train_indices, :], X.iloc[test_indicies, :]
    Ytrain, Ytest = Y[train_indices], Y[test_indicies]
    
    # Make a list to keep track of the predictions from our classifiers
    # inside the bagging procedure
    estimator_predictions = []
    
    # 2. Now we will do the Bagging procedure:
    for estimator_num in range(n_estimators):
        #print 'estimator number:', estimator_num
        
        # get the number of rows and columns that are in Xtrain, Ytrain (same)
        train_rows = range(Xtrain.shape[0])
        train_cols = range(Xtrain.shape[1])
        
        # 3. We set max_samples to 0.5 earlier, so we will resample a dataset
        # to have half the rows that are in Xtrain/Ytrain
        #
        # We set max_features to 0.66, so we'll resample the same dataset
        # to have 2 columns instead of 3, at random
        number_of_resampled_rows = int(round(max_samples * Xtrain.shape[0]))
        number_of_columns = int(round(max_features * Xtrain.shape[1]))
        
        # 4. actually do the resampling:
        # We need random row and column numbers from the number of rows and
        # columns in Xtrain/Ytrain
        # First, use np.random.choice to choose from elements in train_rows
        # with replacement, and only choose how many is specified in
        # number_of_resampled_rows
        #
        # Do the same for the columns, but NOT with replacement, since then
        # we would have duplicated prediction data.
        resampled_rows = np.random.choice(train_rows, size=number_of_resampled_rows, 
                                          replace=True)
        resampled_cols = np.random.choice(train_cols, size=number_of_columns,
                                          replace=False)
        
        # 5. Use the randomly selected rows and columns to make
        # our resampled Xtrain and Ytrain
        Xtrain_resamp = Xtrain.iloc[resampled_rows, resampled_cols].values
        Ytrain_resamp = Ytrain[resampled_rows]
        
        # 6. Make the classifier for this iteration through n_estimators
        dtc = DecisionTreeClassifier(max_depth=None)
        dtc.fit(Xtrain_resamp, Ytrain_resamp)
        
        # 7. Predict the values of Ytest (the testing data for this fold)
        Y_pred = dtc.predict(Xtest.iloc[:, resampled_cols].values)
        
        # 8. Append our predictions to the estimator_predictions list
        estimator_predictions.append(Y_pred)
        
    # 9. The bagging for loop has completed
    voted_on_y_pred = []
    votes_list = []
    # Go through the rows of Ytest:
    # Y    row
    # dog  0
    # wolf 1
    # dog  2
    # 
    # current_y_row will be 0 then 1 then 2
    for current_y_row in range(len(Ytest)):
        # keep track of the estimators votes for that Y row
        estimator_votes = []
        
        # Iterate through the estimators' votes for Y
        # GET THE Y PREDICTION FOR THE CURRENT ROW
        for prediction in estimator_predictions:
            estimator_votes.append(prediction[current_y_row])
        
        # how many votes are for 1
        how_many_in_class_1 = len([c for c in estimator_votes if c == 1])
        
        # how many votes are 0
        how_many_in_class_0 = n_estimators - how_many_in_class_1
        
        votes_list.append(estimator_votes)
        
        if how_many_in_class_1 > how_many_in_class_0:
            voted_on_y_pred.append(1)
        else:
            voted_on_y_pred.append(0)
    
    
    #print 'FOLD:'
    for i, y in enumerate(Ytest):
        pct_voted_dog = np.mean(votes_list[i])
        
        print 'record:', test_indicies[i], 'true Y:', y, '\tvotes:', votes_list[i], 'pct vote dog:', "{0:.3f}".format(pct_voted_dog)
        Xcopy.ix[test_indicies[i], 'pct_voted_dog'] = np.mean(votes_list[i])
        
        animal = None
        if y == 1:
            animal = 'dog'
        else:
            animal = 'wolf'
        
        Xcopy.ix[test_indicies[i], 'animal_type'] = animal
    
    # the accuracy for the current fold
    fold_accuracy = accuracy_score(Ytest, voted_on_y_pred)
    
    # keep track of this fold's accuracy
    accuracy_for_each_fold.append(fold_accuracy)
  

record: 0 true Y: 1 	votes: [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1] pct vote dog: 0.909
record: 1 true Y: 0 	votes: [1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0] pct vote dog: 0.455
record: 2 true Y: 0 	votes: [1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0] pct vote dog: 0.455
record: 3 true Y: 1 	votes: [1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0] pct vote dog: 0.545
record: 4 true Y: 1 	votes: [0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0] pct vote dog: 0.545
record: 5 true Y: 0 	votes: [1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1] pct vote dog: 0.636
record: 6 true Y: 1 	votes: [1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1] pct vote dog: 0.727
record: 7 true Y: 1 	votes: [1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1] pct vote dog: 0.818
record: 8 true Y: 0 	votes: [0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1] pct vote dog: 0.273
record: 9 true Y: 0 	votes: [0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1] pct vote dog: 0.455
record: 10 true Y: 1 	votes: [0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0] pct vote dog: 0.364
record: 11 true Y: 1 	votes: [1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0] pct vote dog: 0.545
record: 12 tru

In [116]:
print np.mean(accuracy_for_each_fold)
Xcopy[['animal_type','pct_voted_dog'] + [x for x in Xcopy.columns if x not in ['animal_type','pct_voted_dog']]]

0.916666666667


Unnamed: 0,animal_type,pct_voted_dog,cuddly[T.not],anger_level,weight
0,dog,1.0,0.0,0.0,30.0
1,wolf,0.363636,1.0,10.0,40.0
2,wolf,0.272727,1.0,7.0,45.0
3,dog,0.545455,1.0,8.0,50.0
4,dog,0.545455,0.0,5.0,25.0
5,wolf,0.454545,0.0,3.0,15.0
6,dog,0.909091,0.0,2.0,35.0
7,dog,1.0,0.0,0.0,30.0
8,wolf,0.272727,1.0,10.0,40.0
9,wolf,0.272727,1.0,7.0,45.0


In [117]:
# all that above would be the same as this:

# NOTE: due to random sampling, this is not guaranteed to get the same CV accuracy
# as my implementation above.
dtc = DecisionTreeClassifier(max_depth=None)
bag = BaggingClassifier(dtc, n_estimators=11, max_samples=0.5, max_features=0.66)
scores = cross_val_score(bag, X.values, Y, verbose=1)
print scores
print np.mean(scores)

[ 0.8  0.6  0.5]
0.633333333333


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s finished
