# Homework 3 Code

In [2]:
# Add import statements here
import numpy as np, matplotlib as plt, pandas as pd, time
from scipy import stats

In [3]:
# To access files in your Google Drive, run this block and follow the instructions
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
# To test if the above block worked, run this block
!ls '/content/gdrive/My Drive/Wash U Fall 2020/417T Intro to Machine Learning F2020/CSV_Files'

cleveland_test.csv  cleveland_train.csv


 ## Find test error

The `find_test_error` function computes the test error of a linear classifier $w$. 

The hypothesis is assumed to be of the form $sign([1, x(N,:)] \cdot w)$.

Inputs:
* `w` is the weight vector
* `X` is the data matrix (without an initial column of 1's)
* `y` are the data labels (plus or minus 1)

Outputs:
* `test_error` is the binary error of $w$ on the data set $(X, y)$ error; this should be between 0 and 1. 

In [9]:
def find_test_error(w, X, y):

  N = X.shape[0]
  col = X.shape[1]
  ones = np.ones(N)
  X = np.c_[ones, X]
  
  #Calcuate the binary error 1/N * Sum ( sign(w*x) != y | 1; else 0)
  hx = np.sign(np.matmul(X,w))
  test_error = (hx - y)/2          # 1-1=0, 1-(-1)=2
  test_error = abs(np.sum(test_error) / test_error.shape[0])

  #Equivalent sigmoid function
  # xw = np.matmul(X, w)
  # yxw = y*xw
  # sigmoid = np.exp(yxw)/(1+np.exp(yxw))
  # test_error = np.sum(sigmoid > 0.5)/N  #The cutoff is 0.5

  return test_error

 ## Logistic Regression

The `logistic_reg`  learn a logistic regression model using gradient descent.

Inputs:
* `X` is the data matrix (without an initial column of 1's)
* `y` are the data labels (plus or minus 1)
* `w_init` is the initial value of the w vector ($d+1$ dimensional)
* `max_its` is the maximum number of iterations to run for
* `eta` is the learning rate

Outputs:
* t is the number of iterations gradient descent ran for
* w is the learned weight vector
* e_in is the in-sample (cross-entropy) error 

In [36]:
def logistic_reg(X, y, w_init, max_its, eta):
  
  #Add ones to X
  N = X.shape[0]
  ones = np.ones(N)
  X = np.c_[ones, X]
  col = X.shape[1]

  #Set up variables to start
  eps = 10**(-6)
  t = 0
  w = w_init
  gt = np.ones(col) #Gradient vector with number of elements equal to weights

  num = y*X # numerator of the gradient
  #Run the gradient descent algorithm to find the weights
  while np.nanmax(abs(gt)) > eps and t < max_it:  #while less than max iterations and magnitude of one element of gradient > eps
    h = np.matmul(X, w)
    gt = 1+np.exp(h*y) # denominator of gradient, constant value, changes after each iteration
    gt = (1/N)*np.sum(num/gt, axis=0)
    gt = np.reshape(gt, (col,1))
    w += gt*eta #update weights
    t+=1 #update iterations


  r1 = X[0]
  r1 = w*r1[:,np.newaxis]
  h = np.matmul(X, w)
  e_in = np.log(1+np.exp(-y*h))
  #print(X, r1, np.matmul(X[0],w))
  e_in = 1/N*np.sum(e_in)

  #clean it up
  w = w[:,:]
  return t, w, e_in, np.nanmax(abs(gt))

## Run and Plot

Run your code and plot figures below

In [37]:
X = np.genfromtxt('/content/gdrive/My Drive/Wash U Fall 2020/417T Intro to Machine Learning F2020/CSV_Files/cleveland_train.csv', skip_header=True, delimiter=',')
X_test = np.genfromtxt('/content/gdrive/My Drive/Wash U Fall 2020/417T Intro to Machine Learning F2020/CSV_Files/cleveland_test.csv', skip_header=True, delimiter=',')

col = X.shape[1] 
y = X[:,col-1]          #Initialize y vector
y = np.where(y==0,-1,y) #In y vector, change all 0s to -1s
y = y.reshape(y.shape[0],1)
X = X[:,0:col-1]          #Remove y vector from X

#Test set
y_test = X_test[:,col-1]
y_test = np.where(y_test==0,-1,y_test)
y_test = y_test.reshape(y_test.shape[0],1)
X_test = X_test[:,0:col-1]

w = np.zeros((col,1)) #initialize weight vector to zeros

X_z = stats.zscore(X, axis=0)
X_testZ = stats.zmap(X_test, X, axis=0)

eta = 10**(-5)
max_it = 10**6
start_time = time.time()
t, w, e_in, stp_cdn = logistic_reg(X_z ,y,w,max_it, eta)
end_time = time.time()
time_passed = abs(start_time - end_time)
test_error_training_set = find_test_error(w, X_z, y)
test_error_test_set = find_test_error(w, X_testZ, y_test)
print("Learning rate {}: \n E_in is {}. \n The binary error on training set is {}. \n The binary error on test set is {}. \n The training process took {} seconds over {} iterations. \n Stop condition was at {}"
.format(eta, e_in, test_error_training_set, test_error_test_set, time_passed, t, stp_cdn))

Learning rate 1e-05: 
 E_in is 0.4166924959018434. 
 The binary error on training set is 0.006578947368421052. 
 The binary error on test set is 0.06896551724137931. 
 The training process took 45.64273929595947 seconds over 1000000 iterations. 
 Stop condition was at 0.02221937354957484


In [237]:
w

array([[ 0.00052699],
       [ 0.00682633],
       [ 0.04066563],
       [ 0.10819079],
       [ 0.01244496],
       [-0.00082465],
       [-0.01090669],
       [ 0.0394019 ],
       [-0.02658727],
       [ 0.0377911 ],
       [ 0.10391188],
       [ 0.03779678],
       [ 0.11746569],
       [ 0.25456501]])