In [None]:
# Import Relevant Libraries
import os
import numpy as np
import dask.dataframe as dd
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_curve
from scipy.special import expit
import matplotlib.pyplot as plt
from IPython.display import display, Markdown

# Homework 6
Author: Mao Nishino

## Problem 1
Implement LogitBoost using univariate (based on a single feature, with intercept) linear
regressors as weak learners. At each boosting iteration, choose the weak learner that
obtains the largest reduction in the loss function on the training set
$D = \{(\mathbf{x}_i, y_i),\ i = 1, \dots, N\}$, with $y_i \in \{0, 1\}$,
$$L = \sum_{i=1}^{N}\ln\left(1+\exp\left(-\tilde{y}_i h(\mathbf{x}_i)\right)\right)$$

where $\tilde{y}_i = 2y_i - 1$ takes values $\pm 1$ and $h(\mathbf{x}) = h_1(\mathbf{x}) + \dots + h_k(\mathbf{x})$ is the boosted
classifier. Please note that the LogitBoost algorithm from the slides uses $y_i \in \{0, 1\}$,
while the loss uses $\tilde{y}_i \in \{-1, 1\}$.

In [16]:
# NOTE(review): NITER is not referenced by any code visible in this cell;
# presumably a default iteration count -- confirm it is still needed.
NITER = 100

def log_one_exp(x):
    """ Scalar helper for the log-loss: returns log(1 + exp(-|x|)).

    The value handed to np.exp is x for negative x and -x otherwise, so it is
    never positive and the exponential cannot overflow.
    Note the identity log(1 + exp(x)) = max(x, 0) + log_one_exp(x).
    Carried over from my HW3 solution.
    """
    exponent = x if x < 0 else -x
    return np.log(1+np.exp(exponent))

# Element-wise wrapper so the scalar helper can be applied to arrays
v_log_one_exp = np.vectorize(log_one_exp)

class AddBias():
    """ Stateless transformer that puts a column of ones in front of the data matrix.
    """
    def fit(self, X, y = None):
        # Nothing is learned from the data; the instance is returned unchanged
        return self

    def transform(self, X):
        n_rows = X.shape[0]
        bias_column = np.ones((n_rows, 1))
        return np.concatenate((bias_column, X), axis=1)

class LogitBoostClassifier():
    """ Binary Classification using LogitBoost
    Each weak learner h_k(x)=ax_i+b is characterized by the tuple (i, [a,b])
    where i is the feature to use and a, b are coefficients.
    The strong learner is characterized by the list of (i, [a,b]),
    and h(x) = h_1(x) + ... + h_k(x).

    Labels must be in {0, 1}. The training loss is
    sum_i ln(1 + exp(-(2*y_i - 1) * h(x_i))).

    Attributes:
        k (int): the number of total iterations
        strong_learner (list) : the strong learner described above.
        train_errors (list): the training loss after each boosting iteration
    """
    # Floor for the Newton weights p*(1-p); keeps the weighted fit finite
    # when the predicted probabilities saturate at 0 or 1.
    _MIN_WEIGHT = 1e-10
    # A feature whose weighted variance is below this fraction of its weighted
    # second moment is treated as constant (e.g. the bias column of ones).
    _VAR_TOL = 1e-12
    # Number of candidate features whose loss is evaluated at once; bounds the
    # size of the temporary (N, chunk) matrices.
    _CHUNK = 256

    def __init__(self, k: int):
        self.k = k
        self.strong_learner = []
        self.train_errors = []

    def _eval_learner(self, strong_learner, X) -> np.ndarray:
        """ Using a strong learner h,
            output an array of h(x)'s
            where x is each row of X

            Args:
                strong_learner: a strong learner described above
                X(np.array): the matrix of data
            Returns:
                h(X) (np.array): array of h(x) where x is each row of X
                                 (all zeros if strong_learner is empty)
        """
        pred = np.zeros(X.shape[0])
        for feature, (slope, intercept) in strong_learner:
            pred += slope*X[:, feature]+intercept
        return pred

    def _fit_weak_learners(self, X, w, wz):
        """ Weighted least-squares fit of z ~ a*x_j + b for every feature j at once.

            Args:
                X(np.array): (N, d) matrix of data
                w(np.array): (N,) positive weights
                wz(np.array): (N,) weights times working responses, i.e. y - p
            Returns:
                (slopes, intercepts): two (d,) arrays with a and b per feature
        """
        total_w = w.sum()
        mean_x = (w @ X)/total_w
        mean_z = wz.sum()/total_w
        second_moment = np.einsum('i,ij,ij->j', w, X, X)/total_w
        var_x = second_moment - mean_x**2
        cov_xz = (wz @ X)/total_w - mean_x*mean_z
        # Constant features cannot carry a slope; they reduce to intercept-only
        degenerate = var_x <= self._VAR_TOL*second_moment
        slopes = np.where(degenerate, 0.0,
                          cov_xz/np.where(degenerate, 1.0, var_x))
        intercepts = mean_z - slopes*mean_x
        return slopes, intercepts

    def _candidate_losses(self, X, y_signed, h, slopes, intercepts):
        """ Training loss of h + h_j for every candidate weak learner h_j.

            Args:
                X(np.array): (N, d) matrix of data
                y_signed(np.array): (N,) labels mapped to {-1, +1}
                h(np.array): (N,) current strong learner output
                slopes, intercepts(np.array): (d,) candidate coefficients
            Returns:
                losses(np.array): (d,) loss obtained by adding each candidate
        """
        n_features = X.shape[1]
        losses = np.empty(n_features)
        for start in range(0, n_features, self._CHUNK):
            cols = slice(start, start+self._CHUNK)
            cand = h[:, None] + X[:, cols]*slopes[cols] + intercepts[cols]
            # logaddexp(0, t) = ln(1 + exp(t)) without overflow
            losses[cols] = np.logaddexp(0.0, -y_signed[:, None]*cand).sum(axis=0)
        return losses

    def fit(self, X, y):
        """ Run k LogitBoost iterations on the training data.

            At each iteration a univariate weighted least-squares regressor is
            fitted on every feature, and the one giving the smallest training
            loss is appended to the strong learner.

            Args:
                X: (N, d) matrix of data
                y: N labels in {0, 1}
            Returns:
                self
            Raises:
                ValueError: if X is not a non-empty 2-D matrix, if X and y
                            disagree in length, or if y has labels outside {0, 1}
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()
        if X.ndim != 2 or X.size == 0:
            raise ValueError('X must be a non-empty 2-D matrix')
        if X.shape[0] != y.shape[0]:
            raise ValueError('X and y must have the same number of rows')
        if not np.isin(y, (0.0, 1.0)).all():
            raise ValueError('y must only contain the labels 0 and 1')
        y_signed = 2.0*y - 1.0

        # Start from scratch so that calling fit twice does not stack models
        self.strong_learner = []
        self.train_errors = []
        h = np.zeros(X.shape[0])
        for _ in range(self.k):
            p = expit(h)
            w = np.maximum(p*(1-p), self._MIN_WEIGHT)
            # w*z with z = (y-p)/w; kept as a product to avoid the division
            wz = y - p
            slopes, intercepts = self._fit_weak_learners(X, w, wz)
            losses = self._candidate_losses(X, y_signed, h, slopes, intercepts)
            best = int(np.argmin(losses))
            a = float(slopes[best])
            b = float(intercepts[best])
            self.strong_learner.append((best, [a, b]))
            self.train_errors.append(float(losses[best]))
            h += a*X[:, best]+b

        return self

    def predict(self, X):
        """ Predict labels in {0, 1}: 1 where h(x) > 0, else 0.

            Returns:
                (np.array): (N,) integer labels
        """
        scores = self._eval_learner(self.strong_learner,
                                    np.asarray(X, dtype=float))
        return (scores > 0).astype(int)

    def predict_proba(self, X):
        """ Predict P(y = 1 | x) = expit(h(x)) for each row of X.

            Returns:
                (np.array): (N,) probabilities of the positive class
        """
        scores = self._eval_learner(self.strong_learner,
                                    np.asarray(X, dtype=float))
        return expit(scores)

class HW6():
    """ A class that achieves everything required for this assignment.

    One pipeline (standardize -> add bias column -> LogitBoost) is trained for
    each k in [500, 300, 100, 30, 10] and the quantities needed for the plots
    and the table are stored.

    Attributes:
    trainloss_300 (list) : train_errors of the classifier trained with k = 300,
                           i.e. the training loss for the i+1th iteration
    misclass_all (list) : one row [k, train error, test error] per model,
                          ordered k=500, k=300, k=100, k=30, k=10
    train30_pred_proba_y : predict_proba output on the training set for the
                           k = 30 model, used for ROC
    test30_pred_proba_y : predict_proba output on the test set for the
                          k = 30 model, used for ROC
    """

    def __init__(self, 
                 train_x : dd.DataFrame,
                 train_y : dd.DataFrame,
                 test_x : dd.DataFrame,
                 test_y : dd.DataFrame) -> None:
        """ Finds all of the attributes.
            Args:
            train_x, train_y : training data
            test_x, test_y : test data
        """
        ks = [500, 300, 100, 30, 10]
        train_errors = []
        test_errors = []
        for k in ks:
            # The classifier requires the number of boosting iterations
            pipe = Pipeline([('scaler', StandardScaler()),
                            ('addbias', AddBias()),
                            ('lb', LogitBoostClassifier(k))])
            pipe.fit(train_x, train_y)
            # Calculate train errors
            train_pred_y = pipe.predict(train_x)
            train_errors.append(1-accuracy_score(train_y, train_pred_y))
            # Calculate test errors
            test_pred_y = pipe.predict(test_x)
            test_errors.append(1-accuracy_score(test_y, test_pred_y))

            if k == 300:
                self.trainloss_300 = pipe['lb'].train_errors
            if k == 30:
                self.train30_pred_proba_y = pipe.predict_proba(train_x)
                self.test30_pred_proba_y = pipe.predict_proba(test_x)

        self.misclass_all = [list(x) for x in zip(ks, train_errors, test_errors)]

    def plot_trainloss_300(self):
        """ Plot the training loss against the iteration number for k = 300. """
        plt.figure(figsize = (10,6))
        # x-axis length follows the recorded losses instead of a hard-coded 300
        iterations = range(1, len(self.trainloss_300)+1)
        plt.plot(iterations, self.trainloss_300)
        plt.title('# Iteration vs Training Loss When #Features = 300')
        plt.xlabel('# Iteration')
        plt.ylabel('Training_Loss')
        plt.grid(True)
        plt.show()

    def plot_misclass_all(self):
        """ Plot training and test misclassification errors against k. """
        plt.figure(figsize = (10,6))
        # Each row of misclass_all is [k, train error, test error]
        ks = [item[0] for item in self.misclass_all]
        train_errors = [item[1] for item in self.misclass_all]
        test_errors = [item[2] for item in self.misclass_all]
        plt.plot(ks, train_errors, marker = 'o', label = 'Training Error')
        plt.plot(ks, test_errors, marker = 'x', label = 'Test Error')        
        plt.title('k vs Misclassification Errors')
        plt.xlabel('k')
        plt.ylabel('Misclassification Errors')
        plt.legend()
        plt.grid(True)
        plt.show()

    def report_table(self):
        """ Display the misclassification errors for every k as a table. """
        # Each row of misclass_all is [k, train error, test error]
        ks = [item[0] for item in self.misclass_all]
        train_errors = [item[1] for item in self.misclass_all]
        test_errors = [item[2] for item in self.misclass_all]
        table = {
            '#Features' : ks,
            'Training Errors' : train_errors,
            'Test Errors' : test_errors
        }
        df = pd.DataFrame(table)
        display(df)

    def show_roc(self, y, test_y):
        """ Plot the train and test ROC curves of the k = 30 model.
            Args:
            y : true training labels
            test_y : true test labels
        """
        # NOTE(review): roc_curve needs one score per sample; this assumes
        # predict_proba returned a 1-D array of scores -- confirm.
        fpr_train, tpr_train, _ = \
            roc_curve(y, self.train30_pred_proba_y)
        fpr_test, tpr_test, _ =  \
            roc_curve(test_y, self.test30_pred_proba_y)

        # Plot ROC
        plt.figure(figsize=(10, 6))
        plt.plot(fpr_train, tpr_train, marker = 'o', label = 'Training ROC')
        plt.plot(fpr_test, tpr_test, marker = 'x', label = 'Test ROC')
        plt.xlabel('False Positive Rate (FPR)')
        plt.ylabel('True Positive Rate (TPR)')
        plt.legend()
        plt.title('ROC curves for k = 30')
        plt.grid(True)
        plt.show()
  

NameError: name 'np' is not defined

### Question (a)
Using the Gisette data, train a Logitboost classifier on the training set, with
$k \in \{10, 30, 100, 300, 500\}$ boosting iterations. Plot the training loss vs. iteration
number for k = 300. Report in a table the misclassification errors on the training
and test set for the models obtained for all these k. Plot the misclassification
errors on the training and test set vs k. Also plot the train and test ROC curves
of the obtained model with 30 features.