# Boosting Decision Stumps

## The banknotes dataset


In [1]:
# read the dataset
import pandas as pd

banknotes = pd.read_csv('banknotes-data.csv', sep=',', header=0).sample(frac=1)

banknotes_x = banknotes.values[:, :-1]
banknotes_y = banknotes.values[:, -1]

# split the data into the training and test dataset
test_X = banknotes_x[:500]
test_y = banknotes_y[:500]
train_X = banknotes_x[500:]
train_y = banknotes_y[500:]

## Finding the best decision stump for each coordinate


In [2]:
def findDecisionStump(train_X, train_y):
    #variables
    features = train_X.shape[1]
    threshold = [0 for _ in range(features)]
    error = [float('inf') for _ in range(features)]
    stump_type = ['minus' for _ in range(features)]  

    #finding best threshold for each feature
    for i in range(features):
        #get unique values for each feature
        feature = train_X[:, i]
        threshold_values = pd.unique(feature)

        #itterating over unique feature values to find best threshold for each feature
        for value in threshold_values:
            err_minus = err_plus = 0
            for j in range(len(train_y)):
                if feature[j] < value:
                    err_minus += (train_y[j] != 1)
                else:
                    err_minus += (train_y[j] != 0)

                if feature[j] >= value:
                    err_plus += (train_y[j] != 1)
                else:
                    err_plus += (train_y[j] != 0)

            err_minus /= len(train_y)
            err_plus /= len(train_y)

            if err_minus < error[i]:
                threshold[i] = value
                error[i] = err_minus
                stump_type[i] = 'minus'

            if err_plus < error[i]:
                threshold[i] = value
                error[i] = err_plus
                stump_type[i] = 'plus'

    return threshold, error, stump_type

## Boosting


In [3]:
import cvxpy as cvx
import numpy as np

# Create two scalar optimization variables
x = cvx.Variable()
y = cvx.Variable()

# Create constraints
constraints = [x >= 0,
               y >= 0,
               x + y <= 180,
               x + 2*y <= 240,
               3 * x + y <= 300]

# Form objective
obj = cvx.Maximize(140*x +235*y)

# Form and solve problem.
prob = cvx.Problem(obj, constraints)
prob.solve()  

print("status:", prob.status)
print("optimal value", prob.value)
print("optimal var", x.value, y.value)

status: optimal
optimal value 29819.999988680618
optimal var 71.99999982368749 84.00000005686965


    Your problem is being solved with the ECOS solver by default. Starting in 
    CVXPY 1.5.0, Clarabel will be used as the default solver instead. To continue 
    using ECOS, specify the ECOS solver explicitly using the ``solver=cp.ECOS`` 
    argument to the ``problem.solve`` method.
    


CVXPY also allows for multi-dimensional variables and point-wise constraints, and combinations:

In [4]:
x = cvx.Variable(2, nonneg=True)
A = np.array(((1, 2), (3, 4)))
y = cvx.Variable()
c = np.ones(2)

constrains = [A @ x <= 4,
              x >= y]

obj = cvx.Maximize(c @ x + y)

prob = cvx.Problem(obj, constrains)
prob.solve()

print("status:", prob.status)
print("optimal value", prob.value)
print("optimal var", x.value)

status: optimal
optimal value 1.71428571412859
optimal var [0.57142857 0.57142857]


In [14]:
threshold, trainingerror, stump_types = findDecisionStump(train_X, train_y)

print("Empirical Weak Learner Training Error:", trainingerror)

#matrix A
A = np.zeros((len(train_y), 4))

#filling A
for i in range(len(train_y)):
    for j in range(4):
        #best stump is s^-_{j,t}
        if stump_types[j] == 'minus':
            #marking 1 for misclassification
            A[i][j] = 1 if (train_X[i][j] < threshold[j] and train_y[i] != 1) or (train_X[i][j] >= threshold[j] and train_y[i] == 1) else 0
        #best stump is s^+_{j,t}
        elif stump_types[j] == 'plus':
            #marking 1 for misclassification
            A[i][j] = 1 if (train_X[i][j] >= threshold[j] and train_y[i] != 1) or (train_X[i][j] < threshold[j] and train_y[i] == 1) else 0


#variables to max over
p = cvx.Variable(4) #dsitribution over the 4 features
t = cvx.Variable()   

constraints = [cvx.sum(p) == 1,  #p must sum to 1
               p >= 0,           #p must be non-negative
               A @ p >= t        #for each i, (Ap)_i >= t
              ]


objective = cvx.Maximize(t)
#defining and solving the problem
problem = cvx.Problem(objective, constraints)
problem.solve(solver=cvx.ECOS)

#empirical prediction error for aggregated classifier
empirical_error = 0

#classifiction of each feature weak learner for each data point on the test data
A = np.zeros((len(test_y), 4))
for i in range(len(test_y)):
    for j in range(4):
        #best stump is s^-_{j,t}, 
        if stump_types[j] == 'minus':
            #marking 1 for classification
            A[i][j] = 1 if test_X[i][j] < threshold[j] else 0
        #best stump is s^+_{j,t}    
        elif stump_types[j] == 'plus':
            #marking 1 for classification
            A[i][j] = 1 if test_X[i][j] >= threshold[j] else 0

#empirical prediction error for aggregated classifier
for i in range(len(test_y)):
    prediction_error = np.dot(A[i], p.value)
    if (prediction_error < .5 and test_y[i] == 1) or (prediction_error >= .5 and test_y[i] == 0):
        empirical_error += 1
empirical_error /= len(test_y)

#calculate learner errors for test data
error = [0, 0, 0, 0]
for i in range(4):
    for j in range(len(test_y)):
        #best stump is s^-_{j,t}
        if stump_types[i] == 'minus':
            #adding 1 to error if misclassified
            error[i] += 1 if (test_X[j][i] < threshold[i] and test_y[j] != 1) or (test_X[j][i] >= threshold[i] and test_y[j] == 1) else 0
        #best stump is s^+_{j,t}
        elif stump_types[i] == 'plus':
            #adding 1 to error if misclassified
            error[i] += 1 if (test_X[j][i] >= threshold[i] and test_y[j] != 1) or (test_X[j][i] < threshold[i] and test_y[j] == 1) else 0

    error[i] /= len(test_y)
print("Status:", problem.status)
print("Optimal LP Values", p.value, t.value)
print("Empirical Error of the Boosted Classifier on the Test Data:", empirical_error)
print("Empirical Errors for Each Feature Weak Learner:", error)
print("We can see that the aggregated classifier has a lower error than any of the weak learners. The worst weak learner has an error of .124 and the aggregated classifier has an error of .098.")

Empirical Weak Learner Training Error: [0.15958668197474168, 0.29965556831228474, 0.37083811710677383, 0.4397244546498278]
Status: optimal
Optimal LP Values [0.32839585 0.34782904 0.05824235 0.26553275] -5.012127671565756e-12
Empirical Error of the Boosted Classifier on the Test Data: 0.098
Empirical Errors for Each Feature Weak Learner: [0.124, 0.29, 0.378, 0.434]
We can see that the aggregated classifier has a lower error than any of the weak learners. The worst weak learner has an error of .124 and the aggregated classifier has an error of .098.
