In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy import linalg as LA

In [2]:
class LogitBoost:
    def __init__(self, xt, yt, xv, yv, k):
        self.load_data(xt, yt, xv, yv)
        
        self.N_iter = k
        
        self.M = self.x_t.shape[1]
        self.N = self.x_t.shape[0]
        self.B = np.zeros(self.M)
        
        self.losses = []
    
    
    def load_data(self, x_train_file, y_train_file, x_valid_file, y_valid_file):
        if "dexter" in x_train_file:
            self.x_t = np.loadtxt(x_train_file, delimiter=',')
            self.x_v = np.loadtxt(x_valid_file, delimiter=',')
        else:
            self.x_t = np.loadtxt(x_train_file)
            self.x_v = np.loadtxt(x_valid_file)
        
        self.y_t = np.loadtxt(y_train_file) 
        self.y_v = np.loadtxt(y_valid_file) 
        
        self.y_t = self.y_t.flatten()
        self.y_v = self.y_v.flatten()
        
        self.x_t = np.insert(self.x_t, 0, 1., axis=1)
        self.x_v = np.insert(self.x_v, 0, 1., axis=1)
        
        self.N = self.x_t.shape[0]   # Rows
        self.M = self.x_t.shape[1]   # Columns         

                
    def gradient_update(self):
        
        return 0
    
    def loss(self):
        
        return 0

    def predict(self):
        xB = np.matmul(self.x_t, self.B)
        yPred = np.sign(xB)
        trainAcc = 1- np.mean(yPred == self.y_t.flatten())
        
        xB = np.matmul(self.x_v, self.B)
        yPred = np.sign(xB)
        testAcc = 1 - np.mean(yPred == self.y_v.flatten())
        return trainAcc, testAcc
    
    def train(self):
        self.losses = np.zeros(self.N_iter)
        for i in range(self.N_iter):
            H = np.matmul(self.x_t, self.B)
            #H = np.sign(H)  #may not be needed
            
            p = np.exp(H)/np.exp(H) + np.exp(-H)
            w = (p)*(1 - p)
            z = (self.y_t - p)/(p)*(1-p)

            #print("H shape: ", H.shape)
            #print("p shape: ", p.shape)
            #print("w shape: ", w.shape)
            #print("z shape: ", z.shape)
            
            for j in range(self.M+1):
                x_j = self.x_t[:j+1]
            
            
            H_j = np.argmin(H)
            loss = np.sum(np.log(1 + np.exp((-2 * self.y_t - 1) *H[H_j])))
            self.losses[i] = loss
            
            print("\rk: ", str(self.N_iter), " Iteration: ", str(i), " Loss: ", round(loss, 2), sep='', end='', flush=True)
        print("")
    
    def print_features(self):
        print("X_train Objs: ", self.x_t.shape[0], "X_train Feats: ", self.x_t.shape[1])
        print("Y_train Objs: ", self.y_t.shape[0])

        print("\nX_valid Objs: ", self.x_v.shape[0], "X_valid Feats: ", self.x_v.shape[1])
        print("Y_valid Objs: ", self.y_v.shape[0])

                
        #self.x_t, self.y_t, self.x_v, self.y_v = x_train, y_train, x_valid, y_valid 



In [3]:
def plotLosses(xVals, losses):
    plt.plot(xVals[1:], losses[1:], linestyle='-', marker='o', color='g')
    plt.grid(True)
    plt.title('Training Loss vs Boosted Iterations k=500')
    plt.xlabel('Iteration Number (i)')
    plt.ylabel('Training Loss (L_i)')
    plt.show()

In [4]:
def plotErrors(K, trainErrors, testErrors):
    plt.plot(K, trainErrors, linestyle='-', marker='o', color='r', label='Train')
    plt.plot(K, testErrors, linestyle='-', marker='o', color='b', label='Test')
    plt.grid(True)
    plt.title('Misclassification Error vs Number of Boosted Iterations')
    plt.xlabel('Number of Boosted Iterations (k)')
    plt.ylabel('Misclassification Error (%)')
    plt.legend()
    plt.show()

In [5]:
def genErrorTable(K, trainErrors, testErrors):
    errorTable = pd.DataFrame({"Training Error (%)":[0, 0, 0, 0, 0], "Test Error (%)":[0, 0, 0, 0, 0]}, index=K)
    errorTable.index.name = "Iterations"
    for k, trainErr, testErr in zip(K, trainErrors, testErrors):
        errorTable.loc[k, 'Training Error (%)'] = trainErr
        errorTable.loc[k, 'Test Error (%)'] = testErr

    return errorTable

In [6]:
def runModel(K):
    trainErrors = []
    testErrors = []
    
    for k in K:
        model = LogitBoost(x_train_file, y_train_file, x_valid_file, y_valid_file, k)
        model.train()
        if k == 500:
            losses = model.losses
        trainAccuracy, testAccuracy = model.predict()
        #print(trainAccuracy, testAccuracy)
        trainErrors.append(round((trainAccuracy) * 100, 2))
        testErrors.append(round((testAccuracy) * 100, 2))
    print("Train Errors: ", trainErrors)
    print("Test Errors: ", testErrors)
    
    return losses, trainErrors, testErrors

In [7]:
K = [10, 30, 100, 300, 500]
xVals = xVals = np.arange(1, 501)

# Madelon

In [8]:
x_train_file = "../data/MADELON/madelon_train.data"
y_train_file = "../data/MADELON/madelon_train.labels"

x_valid_file = "../data/MADELON/madelon_valid.data"
y_valid_file = "../data/MADELON/madelon_valid.labels"

In [9]:
losses, trainErrors, testErrors = runModel(K)

k: 10 Iteration: 9 Loss: 1386.2943611198907
k: 30 Iteration: 29 Loss: 1386.2943611198907
k: 100 Iteration: 99 Loss: 1386.2943611198907 Loss: 1386.2943611198907100 Iteration: 15 Loss: 1386.2943611198907 Iteration: 35 Loss: 1386.294361119890746 Loss: 1386.2943611198907 Iteration: 61 Loss: 1386.2943611198907 Loss: 1386.2943611198907
k: 300 Iteration: 299 Loss: 1386.294361119890700 Iteration: 11 Loss: 1386.2943611198907300 Iteration: 13 Loss: 1386.29436111989071386.2943611198907300 Iteration: 39 Loss: 1386.2943611198907300 Iteration: 112 Loss: 1386.2943611198907300 Iteration: 114 Loss: 1386.2943611198907300 Iteration: 120 Loss: 1386.2943611198907 Iteration: 121 Loss: 1386.2943611198907122 Loss: 1386.2943611198907 Loss: 1386.2943611198907300 Iteration: 140 Loss: 1386.2943611198907147 Loss: 1386.2943611198907 Iteration: 164 Loss: 1386.2943611198907300 Iteration: 180 Loss: 1386.2943611198907300 Iteration: 183 Loss: 1386.2943611198907 Loss: 1386.29436111989071386.2943611198907300 Iteration: 21

KeyboardInterrupt: 

In [None]:
plotLosses(xVals, losses)

In [None]:
plotErrors(K, trainErrors, testErrors)

In [None]:
table = genErrorTable(K, trainErrors, testErrors)
table

# Dexter

In [None]:
x_train_file = "../data/dexter/dexter_train.csv"
y_train_file = "../data/dexter/dexter_train.labels"

x_valid_file = "../data/dexter/dexter_valid.csv"
y_valid_file = "../data/dexter/dexter_valid.labels"

In [None]:
losses, trainErrors, testErrors = runModel(K)

In [None]:
plotLosses(xVals, losses)

In [None]:
plotErrors(K, trainErrors, testErrors)

In [None]:
genErrorTable(K, trainErrors, testErrors)

# Gisette

In [None]:
x_train_file = "../data/Gisette/gisette_train.data"
y_train_file = "../data/Gisette/gisette_train.labels"

x_valid_file = "../data/Gisette/gisette_valid.data"
y_valid_file = "../data/Gisette/gisette_valid.labels"


In [None]:
losses, trainErrors, testErrors = runModel(K)

In [None]:
plotLosses(xVals, losses)

In [None]:
plotErrors(K, trainErrors, testErrors)

In [None]:
genErrorTable(K, trainErrors, testErrors)