In [1]:
import os
import math
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import datasets

In [26]:
class DecisionStump:
    
    def __init__(self):
        self.d = None
        self.best_split_idx = None
        self.best_split_val = None
        self.min_wt_err = None
        
    def fit(self, data, target, sample_weights=None, verbose=0):
        assert len(data.shape) == 2, 'Expected 2-d input data'
        assert data.shape[0] == target.shape[0], 'Expected data and target to have equal number of samples'
        assert len(target.shape) == 1 or target.shape[1] == 1, 'Only supports binary classification'
        data = pd.DataFrame(data)
        target = pd.DataFrame(target)
        assert all(target.isin([1, -1])), 'targets need to be in [+1, -1]'
        n, d = data.shape
        self.d = d
        if sample_weights is None:
            sample_weights = (1/n) * np.ones((n, ))

        self.min_wt_err = np.inf
        for i in range(d):
            combined = pd.concat([data.iloc[:, i], target], axis = 1)
            combined.sort_values(by = combined.columns[0], inplace = True)
            # x[d] <= thr -> negative class, x[d] > thr -> positive class
            preds = np.ones((n, ))
            wt_err = np.sum(sample_weights*(preds != np.array(combined.iloc[:, 1])).astype('float64'))
            if wt_err < self.min_wt_err:
                self.min_wt_err = wt_err
                self.best_split_idx = i
                self.best_split_val = combined.iat[0, 0]
            for j in range(n):
                prev_contrib = sample_weights[j] * (float(combined.iat[j, 1] != 1))
                new_contrib = sample_weights[j] * (float(combined.iat[j, 1] != -1))
                wt_err = wt_err - prev_contrib + new_contrib
                if wt_err < self.min_wt_err:
                    self.min_wt_err = wt_err
                    self.best_split_idx = i
                    self.best_split_val = combined.iat[j, 0]
                    if verbose:
                        print('New best split with feature {}, split val {}, err {}'.format(i,
                                                                                            combined.iat[j, 0],
                                                                                            wt_err))
        print('Minimum weighted classification error = {}'.format(self.min_wt_err))
            
    def predict(self, data):
        assert len(data.shape) == 2, 'Expectd 2-d input'
        assert data.shape[1] == self.d, 'Expected input to have {} features'.format(self.d)
        n, d = data.shape
        data = pd.DataFrame(data)
        preds = np.zeros((n, ), dtype = 'int8')
        for i in range(n):
            if data.iat[i, self.best_split_idx] <= self.best_split_val:
                preds[i] = -1
            else:
                preds[i] = +1
            
        return preds

In [95]:
class BoostedStumps:
    
    def __init__(self, n_estimators=10):
        self.sample_weights = None
        self.d = None
        self.n_estimators = n_estimators
    
    def fit(self, data, target):
        assert len(data.shape) == 2, 'Expected 2-d input data'
        assert data.shape[0] == target.shape[0], 'Expected data and target to have equal number of samples'
        assert len(target.shape) == 1 or target.shape[1] == 1, 'Only supports binary classification'
        data = pd.DataFrame(data)
        target = pd.DataFrame(target)
        assert all(target.isin([1, -1])), 'targets need to be in [+1, -1]'

        n, d = data.shape
        self.d = d
        self.sample_weights = (1/n) * np.ones((n, ))
        self.classifier = [DecisionStump() for i in range(self.n_estimators)]
        self.clf_wts = np.zeros((self.n_estimators, ))
        for i in range(self.n_estimators):
            # update sample weights
            y_net = np.zeros((n, ))
            for j in range(i - 1):
                y_net += self.clf_wts[j] * self.classifier[j].predict(data)
            y_net = np.exp(-y_net * (target[target.columns[0]]))
            self.sample_weights = self.sample_weights * y_net
            self.sample_weights /= np.sum(self.sample_weights)
            # fit new classifer and calculate its weight
            self.classifier[i].fit(data, target, self.sample_weights)
            eps = self.classifier[i].min_wt_err
            self.clf_wts[i] = (1/2) * np.log((1 - eps)/eps)
#             y_preds = self.classifier[i].predict(data)
#             self.sample_weights *= np.exp(-self.clf_wts[i] * y_preds * np.array(target[target.columns[0]]))
#             self.sample_weights /= np.sum(self.sample_weights)
        
    def predict(self, data):
        assert len(data.shape) == 2, 'Expectd 2-d input'
        assert data.shape[1] == self.d, 'Expected input to have {} features'.format(self.d)
        n, d = data.shape
        data = pd.DataFrame(data)
        y = np.zeros((n, ))
        for i in range(self.n_estimators):
            y += self.clf_wts[i] * self.classifier[i].predict(data)
        y = np.sign(y)
        return y
            

In [4]:
dataset = datasets.load_breast_cancer()

In [5]:
dataset.keys()

dict_keys(['feature_names', 'target_names', 'data', 'DESCR', 'target'])

In [6]:
dataset.data.shape

(569, 30)

In [7]:
y = pd.DataFrame(dataset.target, columns= [dataset.target_names[0]], dtype = 'int8')
y[y['malignant'] == 0] = -1
y.head()

Unnamed: 0,malignant
0,-1
1,-1
2,-1
3,-1
4,-1


In [29]:
y_arr = np.array(y['malignant'])
# y_arr[y_arr == -1].shape
y_df = pd.DataFrame(y)
all(y_df.isin([1, -1]))
# y_df.values

True

In [9]:
X = pd.DataFrame(dataset.data, columns = dataset.feature_names)
X.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [103]:
model = DecisionStump()
model.fit(X, y, verbose = 0)

Minimum weighted classification error = 0.3479789103690687


In [104]:
print(model.best_split_idx, model.best_split_val, model.min_wt_err)

9 0.05534 0.3479789103690687


In [105]:
y_pred = model.predict(X)
acc = (pd.DataFrame(y_pred, columns = ['malignant']) == y).astype('float64')
acc.mean()
# print('acc = {}'.format(acc)))

malignant    0.652021
dtype: float64

In [106]:
model = BoostedStumps(n_estimators = 15)
model.fit(X, y)

Minimum weighted classification error = 0.3479789103690688
Minimum weighted classification error = 0.3479789103690686
Minimum weighted classification error = 0.3434139236026029
Minimum weighted classification error = 0.2900114260454331
Minimum weighted classification error = 0.2515070363591458
Minimum weighted classification error = 0.2392687736406172
Minimum weighted classification error = 0.23744255971008055
Minimum weighted classification error = 0.23730502231342546
Minimum weighted classification error = 0.2372923521778353
Minimum weighted classification error = 0.2372893325174103
Minimum weighted classification error = 0.23728847650697094
Minimum weighted classification error = 0.23728823269540467
Minimum weighted classification error = 0.23728816325066904
Minimum weighted classification error = 0.2372881434708368
Minimum weighted classification error = 0.2372881378369857


In [100]:
model.clf_wts

array([0.31396752, 0.31396752, 0.32405865, 0.44766428, 0.54529542,
       0.57834631, 0.58337605, 0.58375593, 0.58379093, 0.58379927,
       0.58380164, 0.58380231, 0.5838025 , 0.58380256, 0.58380257])

In [101]:
y_pred = model.predict(X)
acc = (pd.DataFrame(y_pred, columns = ['malignant']) == y).astype('float64')
acc.mean()

malignant    0.604569
dtype: float64