In [16]:
import numpy as np
from sklearn.base import clone
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import StratifiedKFold,cross_validate
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score
import pandas as pd
import warnings


In [17]:
warnings.filterwarnings('ignore')

In [18]:
pinguin = pd.read_csv('penguins.csv')

In [19]:
pinguin.dropna(inplace = True)

In [20]:
class BaggedTreeClassifier(object):
    #initializer
    def __init__(self,n_elements=100):
        self.n_elements = n_elements
        self.models     = []
    
    def __del__(self):
        del self.n_elements
        del self.models
        
    def __make_bootstraps(self,data):
        dc   = {}
        unip = 0
        b_size = data.shape[0]
        idx = [i for i in range(b_size)]
        for b in range(self.n_elements):
            sidx   = np.random.choice(idx,replace=True,size=b_size)
            b_samp = data[sidx,:]
            unip  += len(set(sidx))
            oidx   = list(set(idx) - set(sidx))
            o_samp = np.array([])
            if oidx:
                o_samp = data[oidx,:]
            dc['boot_'+str(b)] = {'boot':b_samp,'test':o_samp}
        return(dc)
    def get_params(self, deep = False):
        return {'n_elements':self.n_elements}

    def fit(self,X_train,y_train,print_metrics=False):
        training_data = np.concatenate((X_train,y_train.reshape(-1,1)),axis=1)
        dcBoot = self.__make_bootstraps(training_data)
        accs = np.array([])
        pres = np.array([])
        recs = np.array([])
        cls = DecisionTreeClassifier(class_weight='balanced')
        for b in dcBoot:
            model = clone(cls)
            model.fit(dcBoot[b]['boot'][:,:-1],dcBoot[b]['boot'][:,-1].reshape(-1, 1))
            self.models.append(model)
            if dcBoot[b]['test'].size:
                yp  = model.predict(dcBoot[b]['test'][:,:-1])
                acc = accuracy_score(dcBoot[b]['test'][:,-1],yp)
                pre = precision_score(dcBoot[b]['test'][:,-1],yp)   
                rec = recall_score(dcBoot[b]['test'][:,-1],yp)
                accs = np.concatenate((accs,acc.flatten()))
                pres = np.concatenate((pres,pre.flatten()))
                recs = np.concatenate((recs,rec.flatten()))
        if print_metrics:
            print("Standard error in accuracy: %.2f" % np.std(accs))
            print("Standard error in precision: %.2f" % np.std(pres))
            print("Standard error in recall: %.2f" % np.std(recs))
            
    def predict(self,X):
        if not self.models:
            print('You must train the ensemble before making predictions!')
            return(None)
        predictions = []
        for m in self.models:
            yp = m.predict(X)
            predictions.append(yp.reshape(-1,1))
        ypred = np.round(np.mean(np.concatenate(predictions,axis=1),axis=1)).astype(int)
        return(ypred)

In [21]:
pinguin

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE
...,...,...,...,...,...,...,...
338,Gentoo,Biscoe,47.2,13.7,214.0,4925.0,FEMALE
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,FEMALE
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,MALE
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,FEMALE


In [22]:
pinguin.dropna(inplace = True)

In [23]:
pinguin_temp = pinguin.copy()
for labels, content in pinguin_temp.items():
    if not pd.api.types.is_numeric_dtype(content):
        pinguin_temp[labels] = pd.Categorical(content).codes

In [24]:
X = pinguin_temp.drop("sex",axis = 1).values
y = pinguin_temp["sex"].values

In [25]:
ens = BaggedTreeClassifier()

## train the ensemble & view estimates for prediction error ##
ens.fit(X,y,print_metrics=True)

Standard error in accuracy: 0.03
Standard error in precision: 0.05
Standard error in recall: 0.05


In [26]:
scoring_metrics = ['accuracy','precision','recall']
dcScores        = cross_validate(ens,X,y,cv=StratifiedKFold(10),scoring=scoring_metrics)
print('Mean Accuracy: %.2f' % np.mean(dcScores['test_accuracy']))
print('Mean Precision: %.2f' % np.mean(dcScores['test_precision']))
print('Mean Recall: %.2f' % np.mean(dcScores['test_recall']))

Mean Accuracy: 0.89
Mean Precision: 0.89
Mean Recall: 0.89


In [27]:
from sklearn.ensemble import BaggingClassifier

ens = BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight='balanced'),n_estimators=100)

scoring_metrics = ['accuracy','precision','recall']
dcScores        = cross_validate(ens,X,y,cv=StratifiedKFold(10),scoring=scoring_metrics)
print('Mean Accuracy: %.2f' % np.mean(dcScores['test_accuracy']))
print('Mean Precision: %.2f' % np.mean(dcScores['test_precision']))

Mean Accuracy: 0.88
Mean Precision: 0.89
