# NAMES: Timothy Barao, Marlan McInnes-Taylor
# FSUIDS: tjb13b, mm05f

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy import linalg as LA

In [2]:
class Logiboost:
    def __init__(self, xt, yt, xv, yv):
        self.load_data(xt, yt, xv, yv)
        self.B_all = np.zeros(self.M)
     
    def load_data(self, x_train_file, y_train_file, x_valid_file, y_valid_file):
        if "dexter" in x_train_file:
            self.x_t = np.loadtxt(x_train_file, delimiter=',')
            self.x_v = np.loadtxt(x_valid_file, delimiter=',')
        else:
            self.x_t = np.loadtxt(x_train_file)
            self.x_v = np.loadtxt(x_valid_file)
        
        self.y_t = np.loadtxt(y_train_file) 
        self.y_v = np.loadtxt(y_valid_file) 
        
        self.y_t = self.y_t.flatten()
        self.y_v = self.y_v.flatten()
        
        self.x_t = np.insert(self.x_t, 0, 1., axis=1)
        self.x_v = np.insert(self.x_v, 0, 1., axis=1)
        
        self.N = self.x_t.shape[0]   # Rows
        self.M = self.x_t.shape[1]   # Columns         
    
    def WLS(self, w, z, j):
        
        x_j = self.x_t[:, j + 1]

        a = np.sum(w)
        b = np.sum(w * x_j)
        c = np.sum(w * (x_j ** 2))
        d = np.sum(w * z)
        e = np.sum(w * x_j * z)

        f = np.array([c*d-b*e, a*e - b*d])
        g = a * c - (b**2)
        if g == 0:
            B_j = np.array([d/a, 0])
        else:
            B_j = ((1)/(g)) * f  
       
        return B_j
            
    def plot(self, k):
        plt.plot(np.arange(0, k), self.losses, linestyle='-', marker='o', color='g')
        plt.grid(True)
        plt.title('Training Loss vs Iteration Number k=30')
        plt.xlabel('Iteration Number (i)')
        plt.ylabel('Training Loss (L_i)')
        plt.show()
    
    def train(self, k):
        self.losses = np.zeros(k)
        for i in range(k):
            H = np.matmul(self.x_t,self.B_all) #???
               
            p = (1)/(1 + np.exp(-2 * H))
            w = p * (1 - p)
            z = (0.5*(self.y_t + 1) - p)/(w)     
            
            coef = np.zeros((2, self.M - 1))
            newloss = np.zeros((self.M - 1, 1))
            
            a = np.sum(w)
            b = np.dot(w, self.x_t)
            c = np.dot(w, (self.x_t ** 2))
            d = np.dot(w, z)
            e = np.dot(np.dot(w, self.x_t), z)
                       
                       
            
            for j in range(self.M-1):        
                x_j = self.x_t[:, j + 1]

                a = np.sum(w)
                b = np.sum(w * x_j)
                c = np.sum(w * (x_j ** 2))
                d = np.sum(w * z)
                e = np.sum(w * x_j * z)

                f = np.array([c*d-b*e, a*e - b*d])
                g = a * c - (b**2)
                if g == 0:
                    B_j = np.array([d/a, 0])
                else:
                    B_j = ((1)/(g)) * f  
                
                H_j = H + 0.5*(B_j[0] + B_j[1] * x_j)
                                                      
                loss = np.sum(np.log(1 + np.exp((-2 * self.y_t - 1) *H_j)))
                
                coef[:, j] = B_j
                newloss[j] = loss
           
            j_hat = np.argmin(newloss)
            
            self.B_all[0] = self.B_all[0] +  0.5 * coef[0, j_hat]  
            self.B_all[j_hat + 1] = self.B_all[j_hat + 1] +  0.5 * coef[1, j_hat]
            self.losses[i] = newloss[j_hat]+0.00001
            print("\rk: ", k, "Iteration: ", i, "Loss: ", newloss[j_hat], end="", flush=True)
            
            
        y_train_p = np.sign(np.matmul(self.x_t,self.B_all))
        y_test_p = np.sign(np.matmul(self.x_v,self.B_all))
        
        tr_err = 1 - np.mean(np.equal(y_train_p, self.y_t))
        v_err = 1 - np.mean(np.equal(y_test_p, self.y_v))
        
        print("\nTraining Error: ", tr_err)
        print("Testing Error: ", v_err)
        
    def print_features(self):
        print("\n=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=")
        print("X_train Objs: ", self.x_t.shape[0], "X_train Feats: ", self.x_t.shape[1])
        print("Y_train Objs: ", self.y_t.shape[0])
        print("\nX_valid Objs: ", self.x_v.shape[0], "X_valid Feats: ", self.x_v.shape[1])
        print("Y_valid Objs: ", self.y_v.shape[0])
        print("=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=")

In [3]:
#K = [10, 30, 100, 300, 500]
K = [300]
trainErrors = []
testErrors = []
losses = []

# Madelon

In [4]:
x_train_file = "../data/MADELON/madelon_train.data"
y_train_file = "../data/MADELON/madelon_train.labels"

x_valid_file = "../data/MADELON/madelon_valid.data"
y_valid_file = "../data/MADELON/madelon_valid.labels"

In [5]:
for k in K:
    model = Logiboost(x_train_file, y_train_file, x_valid_file, y_valid_file)
    model.train(k)
    model.plot(k)
    
    

ValueError: shapes (501,) and (2000,) not aligned: 501 (dim 0) != 2000 (dim 0)

# Dexter

In [None]:
x_train_file = "../data/dexter/dexter_train.csv"
y_train_file = "../data/dexter/dexter_train.labels"

x_valid_file = "../data/dexter/dexter_valid.csv"
y_valid_file = "../data/dexter/dexter_valid.labels"

In [None]:
for k in K:
    model = Logiboost(x_train_file, y_train_file, x_valid_file, y_valid_file)
    model.train(k)
    model.plot(k)

# Gisette 

In [None]:
x_train_file = "../data/Gisette/gisette_train.data"
y_train_file = "../data/Gisette/gisette_train.labels"

x_valid_file = "../data/Gisette/gisette_valid.data"
y_valid_file = "../data/Gisette/gisette_valid.labels"

In [None]:
for k in K:
    model = Logiboost(x_train_file, y_train_file, x_valid_file, y_valid_file)
    model.train(k)
    model.plot(k)

In [None]:
h(x, O_k) = a * x_j + b

In [None]:
class Logiboost:
    def __init__(self, xt, yt, xv, yv):
        self.load_data(xt, yt, xv, yv)
        self.B_all = np.zeros(self.M)
     
    def load_data(self, x_train_file, y_train_file, x_valid_file, y_valid_file):
        if "dexter" in x_train_file:
            self.x_t = np.loadtxt(x_train_file, delimiter=',')
            self.x_v = np.loadtxt(x_valid_file, delimiter=',')
        else:
            self.x_t = np.loadtxt(x_train_file)
            self.x_v = np.loadtxt(x_valid_file)
        
        self.y_t = np.loadtxt(y_train_file) 
        self.y_v = np.loadtxt(y_valid_file) 
        
        self.y_t = self.y_t.flatten()
        self.y_v = self.y_v.flatten()
        
        self.x_t = np.insert(self.x_t, 0, 1., axis=1)
        self.x_v = np.insert(self.x_v, 0, 1., axis=1)
        
        self.N = self.x_t.shape[0]   # Rows
        self.M = self.x_t.shape[1]   # Columns         
 
        #self.print_features()
              
    #def gradient_update(self):  

    
    #def loss(self):


    #def predict(self):
    
    def WLS(self, w, z, j):
        
        x_j = self.x_t[:, j + 1]

        a = np.sum(w)
        b = np.sum(w * x_j)
        c = np.sum(w * (x_j ** 2))
        d = np.sum(w * z)
        e = np.sum(w * x_j * z)

        f = np.array([c*d-b*e, a*e - b*d])
        g = a * c - (b**2)
        if g == 0:
            B_j = np.array([d/a, 0])
        else:
            B_j = ((1)/(g)) * f  
       
        return B_j
            
    def plot(self, k):
        plt.plot(np.arange(0, k), self.losses, linestyle='-', marker='o', color='g')
        plt.grid(True)
        plt.title('Training Loss vs Iteration Number k=30')
        plt.xlabel('Iteration Number (i)')
        plt.ylabel('Training Loss (L_i)')
        plt.show()
    
    def train(self, k):
        self.losses = np.zeros(k)
        for i in range(k):
            H = np.matmul(self.x_t,self.B_all) #???
               
            p = (1)/(1 + np.exp(-2 * H))
            w = p * (1 - p)
            z = (0.5*(self.y_t + 1) - p)/(w)     
            
            coef = np.zeros((2, self.M - 1))
            newloss = np.zeros((self.M - 1, 1))
            
            for j in range(self.M-1):
                B_j = self.WLS(w, z, j)
                
                H_j = H + 0.5*(B_j[0] + B_j[1] * x_j)
                                                      #predicted values? 
                loss = np.sum(np.log(1 + np.exp((-2 * self.y_t - 1) *H_j)))
                
                coef[:, j] = B_j
                newloss[j] = loss
           
            j_hat = np.argmin(newloss)
            
            self.B_all[0] = self.B_all[0] +  0.5 * coef[0, j_hat]  
            self.B_all[j_hat + 1] = self.B_all[j_hat + 1] +  0.5 * coef[1, j_hat]
            self.losses[i] = newloss[j_hat]
            print("\rk: ", k, "Iteration: ", i, "Loss: ", newloss[j_hat], end="", flush=True)
            
            
        y_train_p = np.sign(np.matmul(self.x_t,self.B_all))
        y_test_p = np.sign(np.matmul(self.x_v,self.B_all))
        
        tr_err = 1 - np.mean(np.equal(y_train_p, self.y_t))
        v_err = 1 - np.mean(np.equal(y_test_p, self.y_v))
        
        print("\nTraining Error: ", tr_err)
        print("Testing Error: ", v_err)
        
    def print_features(self):
        print("\n=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=")
        print("X_train Objs: ", self.x_t.shape[0], "X_train Feats: ", self.x_t.shape[1])
        print("Y_train Objs: ", self.y_t.shape[0])
        print("\nX_valid Objs: ", self.x_v.shape[0], "X_valid Feats: ", self.x_v.shape[1])
        print("Y_valid Objs: ", self.y_v.shape[0])
        print("=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=")

In [None]:
class Logiboost:
    def __init__(self, xt, yt, xv, yv):
        self.load_data(xt, yt, xv, yv)
        self.B_all = np.zeros(self.M)
     
    def load_data(self, x_train_file, y_train_file, x_valid_file, y_valid_file):
        if "dexter" in x_train_file:
            self.x_t = np.loadtxt(x_train_file, delimiter=',')
            self.x_v = np.loadtxt(x_valid_file, delimiter=',')
        else:
            self.x_t = np.loadtxt(x_train_file)
            self.x_v = np.loadtxt(x_valid_file)
        
        self.y_t = np.loadtxt(y_train_file) 
        self.y_v = np.loadtxt(y_valid_file) 
        
        self.y_t = self.y_t.flatten()
        self.y_v = self.y_v.flatten()
        
        self.x_t = np.insert(self.x_t, 0, 1., axis=1)
        self.x_v = np.insert(self.x_v, 0, 1., axis=1)
        
        self.N = self.x_t.shape[0]   # Rows
        self.M = self.x_t.shape[1]   # Columns         
 
        #self.print_features()
              
    #def gradient_update(self):  

    
    #def loss(self):


    #def predict(self):
    
    def WLS(self, w, z, j, x_j):
        a = np.sum(w)
        b = np.sum(w * x_j)
        c = np.sum(w * (x_j ** 2))
        d = np.sum(w * z)
        e = np.sum(w * x_j * z)

        f = np.array([c*d-b*e, a*e - b*d])
        g = a * c - (b**2)
        if g == 0:
            B_j = np.array([d/a, 0])
        else:
            B_j = ((1)/(g)) * f  
       
        return B_j
            
    def plot(self, k):
        plt.plot(np.arange(0, k), self.losses, linestyle='-', marker='o', color='g')
        plt.grid(True)
        plt.title('Training Loss vs Iteration Number k=30')
        plt.xlabel('Iteration Number (i)')
        plt.ylabel('Training Loss (L_i)')
        plt.show()
    
    def train(self, k):
        self.losses = np.zeros(k)
        #H = np.zeros(self.M+1)
        for i in range(k):
            H = np.matmul(self.x_t, self.B_all)
            #H = np.sign(H)  #may not be needed
            
            p = np.exp(H)/np.exp(H) + np.exp(-H)
            w = (p)*(1 - p)
            z = (self.y_t - p)/(p)*(1-p)
            
            print("H shape: ", H.shape)
            print("p shape: ", p.shape)
            print("w shape: ", w.shape)
            print("z shape: ", z.shape)
            
            for j in range(self.M+1):
                x_j = self.x_t[:j+1]
                
                
            #H_j = np.argmin(H)
            #loss = np.sum(np.log(1 + np.exp((-2 * self.y_t - 1) *H[H_j])))
            #self.losses[i] = loss
            
            #print("H shape: ", H.shape)
            #print("p shape: ", p.shape)
            #print("w shape: ", w.shape)
            #print("z shape: ", z.shape)
            
            #for j in range(self.M-1):
            #self.B_all = self.B_all + 0.5 * H[H_j]
            #print("\rk: ", k, "Iteration: ", i, "Loss: ", loss, end="", flush=True)
            
            
        y_train_p = np.sign(np.matmul(self.x_t,self.B_all))
        y_test_p = np.sign(np.matmul(self.x_v,self.B_all))
        
        tr_err = 1 - np.mean(np.equal(y_train_p, self.y_t))
        v_err = 1 - np.mean(np.equal(y_test_p, self.y_v))
        
        print("\nTraining Error: ", tr_err)
        print("Testing Error: ", v_err)
        
    def print_features(self):
        print("\n=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=")
        print("X_train Objs: ", self.x_t.shape[0], "X_train Feats: ", self.x_t.shape[1])
        print("Y_train Objs: ", self.y_t.shape[0])
        print("\nX_valid Objs: ", self.x_v.shape[0], "X_valid Feats: ", self.x_v.shape[1])
        print("Y_valid Objs: ", self.y_v.shape[0])
        print("=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=")