# Programming assignment 6: Optimization: Logistic regression

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

## Your task

In this notebook code skeleton for performing logistic regression with gradient descent is given. 
Your task is to complete the functions where required. 
You are only allowed to use built-in Python functions, as well as any `numpy` functions. No other libraries / imports are allowed.

## Load and preprocess the data

In this assignment we will work with the UCI ML Breast Cancer Wisconsin (Diagnostic) dataset https://goo.gl/U2Uwz2.

Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. They describe characteristics of the cell nuclei present in the image. There are 212 malignant examples and 357 benign examples.

In [2]:
X, y = load_breast_cancer(return_X_y=True)

# Add a vector of ones to the data matrix to absorb the bias term
X = np.hstack([np.ones([X.shape[0], 1]), X])

# Set the random seed so that we have reproducible experiments
np.random.seed(0)

# Split into train and test
test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

## Task 1: Implement the sigmoid function 

In [3]:
def sigmoid(X):
    """
    Applies the sigmoid function elementwise to the input data.
    
    Parameters
    ----------
    X : array, arbitrary shape
        Input data.
        
    Returns
    -------
    X_sigmoid : array, arbitrary shape.
        Data after applying the sigmoid function.
    """
    
    sigm = 1/ (1+np.exp(-X))
    return sigm

## Task 2: Implement the negative log likelihood

In [5]:
def negative_log_likelihood(X, y, w):
    """
    Negative Log Likelihood of the Logistic Regression.
    
    Parameters
    ----------
    X : array, shape [N, D]
        (Augmented) feature matrix.
    y : array, shape [N]
        Classification targets.
    w : array, shape [D]
        Regression coefficients (w[0] is the bias term).
        
    Returns
    -------
    nnl : float
        The negative log likelihood.
    """    
    sig = sigmoid(np.matmul(X,w))
    nll = -np.sum(y*np.log(sig) + (1-y)*np.log(1-sig))
    return nll

## Task 3: Implement the gradient of the NLL w.r.t w

In [6]:
def get_gradient(X, y, w, mini_batch_indices, lmbda):
    """
    Calculates the gradient (full or mini-batch) of the negative log likelilhood w.r.t. w.
    
    Parameters
    ----------
    X : array, shape [N, D]
        (Augmented) feature matrix.
    y : array, shape [N]
        Classification targets.
    w : array, shape [D]
        Regression coefficients (w[0] is the bias term).
    mini_batch_indices: array, shape [mini_batch_size]
        The indices of the data points to be included in the (stochastic) calculation of the gradient.
        This includes the full batch gradient as well, if mini_batch_indices = np.arange(n_train).
    lmbda: float
        Regularization strentgh. lmbda = 0 means having no regularization.
        
    Returns
    -------
    dw : float
        array, shape [D]
        Gradient w.r.t. w
    """
    
    dw = 0
    return dw

In [7]:
def logistic_regression(X, y, num_steps, learning_rate, mini_batch_size, lmbda):
    """
    Performs logistic regression with (stochastic) gradient descent.
    
    Parameters
    ----------
    X : array, shape [N, D]
        (Augmented) feature matrix.
    y : array, shape [N]
        Classification targets.
    num_steps : int
        Number of steps of gradient descent to perform.
    learning_rate: float
        The learning rate to use when updating the parameters w.
    mini_batch_size: int
        The number of examples in each mini-batch.
        If mini_batch_size=n_train we perform full batch gradient descent. 
    lmbda: float
        Regularization strentgh. lmbda = 0 means having no regularization.
        
    Returns
    -------
    w : array, shape [D]
        Optimal regression coefficients (w[0] is the bias term).
    trace: list
        Trace of the negative log likelihood after each step of gradient descent.
    """
    
    trace = [] # saves the value of the log likelihood at each step to be able to plot it later
    n_train = X.shape[0] # number of training instances
    
    w = np.zeros(X.shape[1]) # initialize the parameters to zeros
    
    # run gradient descent for a given number of steps
    for step in range(num_steps):
        
        permuted_idx = np.random.permutation(n_train) # shuffle the data
        
        # go over each mini-batch and update the paramters
        # note that if mini_batch_size = n_train we perform full batch GD and this loop runs only once
        for idx in range(0, n_train, mini_batch_size):
            
            # get the random indices to be included in the mini batch
            mini_batch_indices = permuted_idx[idx:idx+mini_batch_size]
            
            gradient = get_gradient(X, y, w, mini_batch_indices, lmbda)

            # update the parameters
            w = w - learning_rate * gradient

            # calculate and save the negative log likelihood
            nnl = negative_log_likelihood(X, y, w)
            trace.append(nnl)
        
        # print the negative log likelihood every 10th iteration to monitor the progress
        if step % 10 == 0:
            print(nnl)
        
    return w, trace

## Task 4: Implement the function to obtain the predictions

In [8]:
def predict(X, w):
    """
    Parameters
    ----------
    X : array, shape [N_test, D]
        (Augmented) feature matrix.
    w : array, shape [D]
        Regression coefficients (w[0] is the bias term).
        
    Returns
    -------
    y_pred : array, shape [N_test]
        A binary array of predictions.
    """
    y_pred = np.matmul(w,X)
    return y_pred

### Full batch gradient descent without regularization

In [9]:
n_train = X_train.shape[0]
w, trace = logistic_regression(X_train, y_train, num_steps=8000, learning_rate=1e-8, mini_batch_size=n_train, lmbda=0.0)

TypeError: unsupported operand type(s) for *: 'float' and 'NoneType'

In [10]:
plt.plot(trace)
y_pred = predict(X_test, w)

print('accuracy: {:.2f}, f1_score: {:.2f}'.format(accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)))

NameError: name 'trace' is not defined

### Full batch gradient descent with regularization

In [11]:
w, trace = logistic_regression(X_train, y_train, num_steps=8000, learning_rate=1e-8, mini_batch_size=n_train, lmbda=1e-3)

TypeError: unsupported operand type(s) for *: 'float' and 'NoneType'

In [12]:
plt.plot(trace)
y_pred = predict(X_test, w)

print('accuracy: {:.2f}, f1_score: {:.2f}'.format(accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)))

NameError: name 'trace' is not defined

### Mini-batch gradient descent without regularization

In [13]:
w, trace = logistic_regression(X_train, y_train, num_steps=2000, learning_rate=1e-8, mini_batch_size=10, lmbda=0)

TypeError: unsupported operand type(s) for *: 'float' and 'NoneType'

In [14]:
plt.plot(trace)
y_pred = predict(X_test, w)

print('accuracy: {:.2f}, f1_score: {:.2f}'.format(accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)))

NameError: name 'trace' is not defined

### Mini-batch gradient descent with regularization

In [15]:
w, trace = logistic_regression(X_train, y_train, num_steps=2000, learning_rate=1e-8, mini_batch_size=10, lmbda=1e-3)

TypeError: unsupported operand type(s) for *: 'float' and 'NoneType'

In [16]:
plt.plot(trace)
y_pred = predict(X_test, w)

print('accuracy: {:.2f}, f1_score: {:.2f}'.format(accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)))

NameError: name 'trace' is not defined