In [1]:
# Import relevant libralies
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_curve
from sklearn.pipeline import Pipeline
import pandas as pd
import dask.dataframe as dd
import numpy as np
from scipy.special import expit
import os
import matplotlib.pyplot as plt

# Homework 4
Author: Mao Nishino

## Problem 1
Implement the TISP variable selection method for classification (as described in
pages 12 of the Regularized Loss course notes), with the hard-thresholding penalty,
(which is described in page 11, where you take $η = 0$). Take special care to normal-
ize each column of the $X$ matrix to have zero mean and variance 1, and to use the
same mean and standard deviation that you used for normalizing the train set also for
normalizing the test set.

In [2]:
class TISP():
    """ TISP algorithm by She (2009)
    Reference:
    https://arxiv.org/pdf/0812.5061.pdf

    Args:
    lambda (float): the regularization hyperparameter
    learn_rate (float): the learning rate
    max_iter (int) : maximum number of iterations
    w_ (numpy.array) : the regression coefficients 
    """
    def __init__(
        self,
        lamda: float,
        learn_rate: float,
        max_iter = 100
        ):
        self.lamda = lamda
        self.learn_rate = learn_rate
        self.max_iter = max_iter
        self.w_ = None
    
    def _threshold(self, w):
        """ The hard thresholding function
        """
        return np.where(np.abs(w)<self.lamda,
            0,
            w
        )

    def fit(self, X, y):
        w = np.zeros(X.shape[1])
        for i in range(self.max_iter):
            Xw = np.matmul(X,w)
            # This is actually -1*gradient
            grad = np.matmul(X.T, y-expit(Xw))
            w = self._threshold(w+self.learn_rate*grad)
        self.w_ = w
        return self

    def predict(self, X):
        Xw = np.matmul(X, self.w_)
        return np.where(
            expit(Xw) > 0.5,
            1,
            0
        )

    def predict_proba(self, X):
        Xw = np.matmul(X, self.w_)
        return expit(Xw)

class AddBias():
    """ Prepends columns of 1s to the dataset
    """
    def fit(self, X, y = None):
        return self

    def transform(self, X):
        return np.hstack([np.ones((X.shape[0], 1)), X])

def show_nonzero_coeff(lamda_range, num_nonzero):
    plt.figure(figsize = (10,6))
    plt.plot(lamda_range, num_nonzero)
    plt.title('Lambda vs the number of nonzero weights')
    plt.xlabel('Lambda')
    plt.ylabel('# nonzero weights')
    plt.grid(True)
    plt.show()

def show_misclassification(pipe: Pipeline, x, y, test_x, test_y):
    # Make predictions
    train_pred = pipe.predict(x)
    test_pred = pipe.predict(test_x)

    train_error = 1-accuracy_score(y, train_pred)
    test_error = 1-accuracy_score(test_y, test_pred)

    # Put it in a DataFrame so that it looks nicer
    table = {
        'Training Error' : [train_error],
        'Test Error' : [test_error]
    }

    return pd.DataFrame(table)

def show_roc(pipe: Pipeline, x, y, test_x, test_y):
    # Make predictions
    train_pred = pipe.predict_proba(x)
    test_pred = pipe.predict_proba(test_x)

    fpr_train, tpr_train, thr = roc_curve(y, train_pred)
    fpr_test, tpr_test, thr = roc_curve(test_y, test_pred)

    # Plot ROC
    plt.figure(figsize=(10, 6))
    plt.plot(fpr_train, tpr_train, marker = 'o', label = 'Training ROC')
    plt.plot(fpr_test, tpr_test, marker = 'x', label = 'Test ROC')
    plt.xlabel('False Positive Rate (FPR)')
    plt.ylabel('True Positive Rate (TPR)')
    plt.legend()
    plt.title('ROC curves')
    plt.grid(True)
    plt.show()

## Problem (a)
Using the Gisette data, train a TISP classifier on the training set, starting with
w(0) = 0, with 100 iterations. Find appropriate thresholds λ to select approximately 10, 30, 100, 300, 500 features. Plot the train misclassification error vs
iteration number when selecting 30 features. Plot the final train and test misclassification error vs the number of selected features. Report in a table these
misclassification errors on the training and test set, the corresponding numbers
of selected features and the values of λ. Also plot the train and test ROC curves
of the obtained model with 100 features. (3 points)

In [7]:
#Load gisette
path = "./gisette/"
train_x = dd.read_csv(os.path.join(path, 'gisette_train.data'),
                         delim_whitespace = True,
                         header = None,
                         blocksize = None).compute()

train_y = dd.read_csv(os.path.join(path, 'gisette_train.labels'),
                         delim_whitespace=True,
                         header = None,
                         blocksize = None).compute()

test_x = dd.read_csv(os.path.join(path, 'gisette_valid.data'),
                         delim_whitespace=True,
                         header = None, 
                         blocksize = None).compute()

test_y = dd.read_csv(os.path.join(path, 'gisette_valid.labels'),
                         delim_whitespace=True,
                         header = None,
                         blocksize = None).compute()

# Replace -1 with 0 so that it fits the slides
train_y.replace(-1, 0, inplace = True)
test_y.replace(-1, 0, inplace = True)

num_nonzero = []

for lamda in np.arange(0.1, 1, 0.1):
    pipe = Pipeline([('scaler', StandardScaler()),
                      ('addbias', AddBias()),
                      ('tisp', TISP(lamda=lamda,
                                   learn_rate = 1.))])
    pipe.fit(train_x, train_y)
    num_nonzero.append(np.count_nonzero(pipe['tisp'].w_))

show_nonzero_coeff(np.arange(0.1, 1, 0.1), num_nonzero)

ValueError: Unable to coerce to Series, length must be 1: given 6000