In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy import linalg as LA

In [None]:
class LogitBoost:
    """Linear classifier trained by gradient steps while pruning features.

    The model keeps a weight vector ``B`` (bias first), repeatedly moves it
    against the gradient returned by :meth:`loss`, and after each step keeps
    only the columns with the largest ``|B|`` according to a shrinking
    feature-count schedule.

    NOTE(review): despite the class name, the code performs plain gradient
    steps on a margin-based loss with feature pruning - confirm that this is
    the intended algorithm.

    Parameters
    ----------
    xt, yt : str
        Paths of the training feature matrix and training labels.
    xv, yv : str
        Paths of the validation feature matrix and validation labels.
    k : int
        Used both as the number of gradient iterations and as the number of
        columns the pruning schedule converges to.
    lr : float, optional
        Step size of the gradient update. The default is the value found in
        a commented-out line of the original notebook.
    s : float, optional
        Coefficient of the ``2 * s * B`` term added to the gradient.
    u : float, optional
        Constant in the denominator of the feature-count schedule.

    NOTE(review): ``s`` and ``u`` were read but never defined in the original
    notebook; their defaults here are placeholders and should be tuned.
    """

    def __init__(self, xt, yt, xv, yv, k, lr=0.0001, s=0.001, u=100.0):
        self.load_data(xt, yt, xv, yv)

        self.k = k
        self.N_iter = k
        self.lr = lr
        self.s = s
        self.u = u

        # Shapes are taken after load_data has prepended the bias column.
        self.M = self.x_t.shape[1]
        self.N = self.x_t.shape[0]
        self.B = np.zeros(self.M)

        # One gradient norm per iteration, filled in by train().
        self.losses = []

    @staticmethod
    def _read_features(path):
        """Read a headerless feature matrix as float64.

        Paths containing "dexter" are parsed as comma separated, every other
        path as whitespace separated.
        """
        # sep=r"\s+" replaces the deprecated delim_whitespace=True.
        sep = "," if "dexter" in path else r"\s+"
        return pd.read_csv(path, sep=sep, header=None).to_numpy(dtype=np.float64)

    @staticmethod
    def _read_labels(path):
        """Read a headerless label file as an (n, 1) float64 array."""
        return pd.read_csv(path, header=None).to_numpy(dtype=np.float64)

    @staticmethod
    def _standardize(train, valid):
        """Scale both matrices in place with the training mean and std.

        Columns that are constant in the training data (std == 0) are set to
        zero in both matrices instead of being divided by zero.
        """
        mean = train.mean(axis=0)
        std = train.std(axis=0)
        constant = std == 0
        safe_std = np.where(constant, 1.0, std)

        for matrix in (train, valid):
            matrix -= mean
            matrix /= safe_std
            matrix[:, constant] = 0.0
        return train, valid

    def load_data(self, x_train_file, y_train_file, x_valid_file, y_valid_file):
        """Load, standardise and bias-augment the training/validation data.

        Sets ``x_t``, ``y_t``, ``x_v``, ``y_v`` as well as ``N`` (rows) and
        ``M`` (columns before the bias column is added).
        """
        self.x_t = self._read_features(x_train_file)
        self.y_t = self._read_labels(y_train_file)
        self.x_v = self._read_features(x_valid_file)
        self.y_v = self._read_labels(y_valid_file)

        self.N = self.x_t.shape[0]   # Rows
        self.M = self.x_t.shape[1]   # Columns

        # Validation data is scaled with the training statistics so both
        # sets live in the same feature space.
        self.x_t, self.x_v = self._standardize(self.x_t, self.x_v)

        # Prepend a column of ones that acts as the bias feature.
        self.x_v = np.insert(self.x_v, 0, 1, axis=1)
        self.x_t = np.insert(self.x_t, 0, 1, axis=1)

    def gradient_update(self):
        """Take one gradient step; return the new ``B`` and the gradient norm."""
        grad = self.loss()
        self.B = self.B - self.lr * grad
        return self.B, LA.norm(grad)

    def loss(self):
        """Return the gradient of the training objective with respect to ``B``.

        Samples whose margin ``y * (x . B)`` exceeds 1 contribute nothing;
        the remaining samples contribute ``2 (m - 1) / (1 + (m - 1)^2) * y * x``.
        A ``2 * s * B`` term is added on top.
        """
        y = self.y_t.flatten()
        # The margin must stay per-sample: the mask below is element-wise.
        margins = y * np.matmul(self.x_t, self.B)
        shifted = margins - 1.0

        weights = 2.0 * shifted / (1.0 + shifted * shifted) * y
        weights[margins > 1.0] = 0.0

        return np.matmul(self.x_t.T, weights) + 2.0 * self.s * self.B

    def predict(self):
        """Return ``(train_error, validation_error)`` as fractions in [0, 1].

        Predictions are ``sign(x . B)``; a score of exactly 0 never matches a
        label and therefore counts as an error.
        """
        trainPred = np.sign(np.matmul(self.x_t, self.B))
        trainErr = 1 - np.mean(trainPred == self.y_t.flatten())

        validPred = np.sign(np.matmul(self.x_v, self.B))
        validErr = 1 - np.mean(validPred == self.y_v.flatten())
        return trainErr, validErr

    def train(self):
        """Run ``N_iter`` gradient steps, pruning columns after each step.

        The gradient norm of every step is appended to ``self.losses``.
        Returns 0.
        """
        mPrev = self.M
        for i in range(self.N_iter):
            _, gradNorm = self.gradient_update()
            self.losses.append(gradNorm)

            # Number of columns to keep after this step: starts at M and
            # reaches k once 2 * i >= N_iter.
            temp = (self.N_iter - 2 * i) / (2 * i * self.u + self.N_iter)
            temp = max(0., temp)
            mi = int(self.k + (self.M - self.k) * temp)

            if mi < mPrev:
                # Keep the mi columns whose weights are largest in magnitude.
                keep = np.argsort(np.absolute(self.B))[-mi:]
                self.B = self.B[keep]
                self.x_t = self.x_t[:, keep]
                self.x_v = self.x_v[:, keep]
                mPrev = mi

        return 0

    def print_features(self):
        """Print the row/column counts of the loaded matrices and labels."""
        print("X_train Objs: ", self.x_t.shape[0], "X_train Feats: ", self.x_t.shape[1])
        print("Y_train Objs: ", self.y_t.shape[0])

        print("\nX_valid Objs: ", self.x_v.shape[0], "X_valid Feats: ", self.x_v.shape[1])
        print("Y_valid Objs: ", self.y_v.shape[0])



In [None]:
def plotLosses(xVals, losses):
    """Draw the recorded loss values against the iteration numbers.

    The first element of both ``xVals`` and ``losses`` is left out of the
    plot. The figure is shown immediately.
    """
    line_format = {'linestyle': '-', 'marker': 'o', 'color': 'g'}
    plt.plot(xVals[1:], losses[1:], **line_format)

    plt.title('Training Loss vs Boosted Iterations k=500')
    plt.xlabel('Iteration Number (i)')
    plt.ylabel('Training Loss (L_i)')
    plt.grid(True)
    plt.show()

In [None]:
def plotErrors(K, trainErrors, testErrors):
    """Plot train (red) and test (blue) misclassification error against K.

    Both curves share the x values in ``K``. The figure is shown immediately.
    """
    curves = (
        (trainErrors, 'r', 'Train'),
        (testErrors, 'b', 'Test'),
    )
    for values, colour, name in curves:
        plt.plot(K, values, linestyle='-', marker='o', color=colour, label=name)

    plt.title('Misclassification Error vs Number of Boosted Iterations')
    plt.xlabel('Number of Boosted Iterations (k)')
    plt.ylabel('Misclassification Error (%)')
    plt.grid(True)
    plt.legend()
    plt.show()

In [None]:
def genErrorTable(K, trainErrors, testErrors):
    """Build a table of training/test error per iteration count.

    Parameters
    ----------
    K : sequence
        Iteration counts; they become the index, named "Iterations".
    trainErrors, testErrors : sequence of float
        Error percentages, one per entry of ``K``.

    Returns
    -------
    pandas.DataFrame
        Columns "Training Error (%)" and "Test Error (%)". Returning the
        frame lets a notebook cell that ends with this call display it.

    Raises
    ------
    ValueError
        If the three sequences do not have the same length.
    """
    # Built in one step: works for any number of rows and keeps float
    # columns, instead of filling a hard-coded 5-row integer table.
    errorTable = pd.DataFrame(
        {
            "Training Error (%)": list(trainErrors),
            "Test Error (%)": list(testErrors),
        },
        index=pd.Index(list(K), name="Iterations"),
    )
    return errorTable

In [None]:
def runModel(K, loss_k=500):
    """Train one model per entry of ``K`` and collect its error rates.

    The dataset is taken from the module-level variables ``x_train_file``,
    ``y_train_file``, ``x_valid_file`` and ``y_valid_file``, which must be
    set before calling.

    Parameters
    ----------
    K : iterable of int
        Values passed as the last constructor argument of ``LogitBoost``.
    loss_k : int, optional
        The ``model.losses`` of the run whose k equals this value are
        returned. Defaults to 500.

    Returns
    -------
    losses : list
        ``model.losses`` of the ``loss_k`` run, or an empty list when
        ``loss_k`` is not in ``K``.
    trainErrors, testErrors : list of float
        Errors in percent, rounded to two decimals, in the order of ``K``.
    """
    # Bound up front so the return below cannot hit an unbound local when
    # loss_k is absent from K.
    losses = []
    trainErrors = []
    testErrors = []

    for k in K:
        model = LogitBoost(x_train_file, y_train_file, x_valid_file, y_valid_file, k)
        model.train()
        if k == loss_k:
            losses = model.losses

        # predict() returns error fractions (1 - accuracy), not accuracies.
        trainError, testError = model.predict()
        trainErrors.append(round(trainError * 100, 2))
        testErrors.append(round(testError * 100, 2))

    print("Train Errors: ", trainErrors)
    print("Test Errors: ", testErrors)

    return losses, trainErrors, testErrors

In [None]:
# Values handed to runModel; one model is trained per entry.
K = [10, 30, 100, 300, 500]
# x-axis values for the loss plot: the integers 1..500.
xVals = np.arange(1, 501)

# Gisette

In [None]:
# Gisette files; runModel reads these module-level names.
x_train_file, y_train_file = (
    "../data/Gisette/gisette_train.data",
    "../data/Gisette/gisette_train.labels",
)
x_valid_file, y_valid_file = (
    "../data/Gisette/gisette_valid.data",
    "../data/Gisette/gisette_valid.labels",
)

In [None]:
# Train one model per entry of K on the files named by the module-level
# path variables set in the previous cell.
losses, trainErrors, testErrors = runModel(K)

In [None]:
# Plot the values returned in `losses` against xVals (plotLosses skips the first point).
plotLosses(xVals, losses)

In [None]:
# Train vs. test misclassification error (%) for each entry of K.
plotErrors(K, trainErrors, testErrors)

In [None]:
# Tabulate the same errors, indexed by the entries of K.
genErrorTable(K, trainErrors, testErrors)

# Dexter

In [None]:
# Dexter files; runModel reads these module-level names.
x_train_file, y_train_file = (
    "../data/dexter/dexter_train.csv",
    "../data/dexter/dexter_train.labels",
)
x_valid_file, y_valid_file = (
    "../data/dexter/dexter_valid.csv",
    "../data/dexter/dexter_valid.labels",
)

In [None]:
# Train one model per entry of K on the files named by the module-level
# path variables set in the previous cell.
losses, trainErrors, testErrors = runModel(K)

In [None]:
# Plot the values returned in `losses` against xVals (plotLosses skips the first point).
plotLosses(xVals, losses)

In [None]:
# Train vs. test misclassification error (%) for each entry of K.
plotErrors(K, trainErrors, testErrors)

In [None]:
# Tabulate the same errors, indexed by the entries of K.
genErrorTable(K, trainErrors, testErrors)

# Madelon

In [None]:
# Madelon files; runModel reads these module-level names.
x_train_file, y_train_file = (
    "../data/MADELON/madelon_train.data",
    "../data/MADELON/madelon_train.labels",
)
x_valid_file, y_valid_file = (
    "../data/MADELON/madelon_valid.data",
    "../data/MADELON/madelon_valid.labels",
)

In [None]:
# Train one model per entry of K on the files named by the module-level
# path variables set in the previous cell.
losses, trainErrors, testErrors = runModel(K)

In [None]:
# Plot the values returned in `losses` against xVals (plotLosses skips the first point).
plotLosses(xVals, losses)

In [None]:
# Train vs. test misclassification error (%) for each entry of K.
plotErrors(K, trainErrors, testErrors)

In [None]:
# Tabulate the same errors, indexed by the entries of K.
genErrorTable(K, trainErrors, testErrors)