# **ECE420 - Assignment 1**
### Saud Badar - 1002554595
### Kyu Bum Kim - 1003969100

In [0]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import os
import time

from google.colab import drive
drive.mount('/content/drive')
os.chdir('/content/drive/My Drive/ECE421/Lab1')

def loadData():
    with np.load('notMNIST.npz') as data :
        Data, Target = data ['images'], data['labels']
        posClass = 2
        negClass = 9
        dataIndx = (Target==posClass) + (Target==negClass)
        Data = Data[dataIndx]/255.
        Target = Target[dataIndx].reshape(-1, 1)
        Target[Target==posClass] = 1
        Target[Target==negClass] = 0
        np.random.seed(421)
        randIndx = np.arange(len(Data))
        np.random.shuffle(randIndx)
        Data, Target = Data[randIndx], Target[randIndx]
        trainData, trainTarget = Data[:3500], Target[:3500]
        validData, validTarget = Data[3500:3600], Target[3500:3600]
        testData, testTarget = Data[3600:], Target[3600:]
    return trainData, validData, testData, trainTarget, validTarget, testTarget


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Section 1: Linear Regression [18 points]**
## **1) Loss Function and Gradient [4pt]**
Implement two vectorized Numpy functions to compute the loss and gradient respectively. Both functions should accept 5 arguments - the weight vector, the bias, the data matrix, the labels, and λ, the regularization parameter. The loss function return a number (indicating total loss). The gradient function should be an analytical expression of the loss function and return the gradient with respect to the weights and the gradient with respect to the bias. Both function headers are below. Include both the analytical expression for the gradient and
the Python code snippet in your report.

### Answer: 

In [0]:
"""
calculates the mean squared error and returns the total loss
W is a weight, 1D-array of size: 784 (28 x 28) 
b is the bias
x is the batch of Mnist data images : (3500, 28, 28)
y is the batch of labels : (3500, 1)
reg is λ, the regularization parameter
"""
def MSE(W, b, x, y, reg):
  #reshaping the batch to satisfy the size of weights for multiplication 
  #(x shape = (3500, 784), W shape = (784, 1))
  x_flat = np.reshape(x, (x.shape[0], x.shape[1] * x.shape[2]))
 
  #calculate error using the formula (e shape = (3500, 1))
  e = np.subtract(np.matmul(x_flat, W) + b, y)
 
  #calculating overall mean squared error (loss function)
  mse = np.square(e).mean()
 
  #calculating weight decay loss
  weight_loss = reg / 2 * (np.linalg.norm(W) ** 2)
  
  #return the sum of the two losses for overall loss
  return weight_loss + mse
 

In [0]:
def gradMSE(W, b, x, y, reg):
  # y = y.squeeze()
  #   pred = np.matmul(x,W) + b
  #   err = y - pred
    
  #   #second part of this multiplies each error value for each input
  #   df_dw = -2*np.matmul(x.transpose(), err)/len(x) + reg*W
  #   df_db = np.mean(-2*err)

  #   return df_dw, df_db
    # Your implementation here
    # Reshaping part of the code 
    newX = np.reshape(x, (x.shape[0], x.shape[1] * x.shape[2])) # New shape of x is [3500, 784]

    #calculate error using the formula (e shape = (3500, 1))
    e_in = np.subtract(np.matmul(newX, newW) + b, y)

    #Find the gradient of weights
    w_gradient = (2 * np.matmul(newX.transpose(), e_in))/(x.shape[0])

    #Add the regulization factor
    w_gradient = w_gradient + reg*newW 

    #Find the gradient of the bias
    b_gradient = np.sum(e_in) / (x.shape[0])

    return w_gradient, b_gradient
   

## **2) Gradient Descent Implementation [6 pts]:**
Using the gradient computed from Part A, implement the batch Gradient Descent algorithm
to classify the two classes in the notMNIST dataset. The function should accept 8 arguments
the weight vector, the bias, the data matrix, the labels, the learning rate, the number of
epochs 1 , λ and an error tolerance (set to 1 × 10 −7 ). The error tolerance will be used to
compute the difference between the old and updated weights per iteration. The function
should return the optimized weight vector and bias.
### Answer:

In [0]:
"""
calculates optimal values of the vectors weights and bias determined by
the gradient of mean squared error loss function (MSE) and returns these 
optimal values 
returns weight, then bias
W is the weight
b is the bias
x is the data
y is the labels
alpha is learning rate (step)
epochs are iterations for each batch
reg is λ, the regularization parameter
error_tol is the error tolerance (set to 1 × 10 −7 )
lossType is the type of loss function (MSE or CE)
"""
def grad_descent(W, b, x, y, alpha, epochs, reg, error_tol, lossType):
  trainData, validData, testData, trainTarget, validTarget, testTarget = loadData()
  train_loss = []
  test_loss = []
  valid_loss = []
  train_acc = []
  valid_acc = []
  test_acc = []

  for i in range(epochs):
    train_acc.append(accuracy(W, b, x, y, lossType))
    valid_acc.append(accuracy(W, b, validData, validTarget, lossType))
    test_acc.append(accuracy(W, b, testData, testTarget, lossType))
    #find loss and gradient weight and bias determined by input of this function
    if lossType == 'MSE':
      loss = MSE(W, b, x, y, reg)
      train_loss.append(loss)
      valid_loss.append(MSE(W, b, validData, validTarget, reg))
      test_loss.append(MSE(W, b, testData, testTarget, reg))
      weights, bias = gradMSE(W, b, x, y, reg)

    elif lossType == 'CE':
      loss = crossEntropyLoss(W, b, x, y, reg)
      train_loss.append(loss)
      valid_loss.append(crossEntropyLoss(W, b, validData, validTarget, reg))
      test_loss.append(crossEntropyLoss(W, b, testData, testTarget, reg))

      weights, bias = gradCE(W, b, x, y, reg)
      
      #if error is smaller than the tolerance, we have the optimal values
      #for weights and bias, thus we can stop iterating

    if loss <= error_tol:
      break 

    #update ewight and bias from calculated gradient
    W -= alpha * weights
    b -= alpha * bias
  return W, b, train_loss, valid_loss, test_loss, train_acc, valid_acc, test_acc

## **3) Tuning the Learning Rate [3 pts]:**

Test your implementation of Gradient Descent with 5000 epochs and λ = 0. Investigate the impact of learning rate, α = 0.005, 0.001, 0.0001 on the performance of your classifier. Plot the training, validation and test losses.

### Answer:
**Learning Rate = 0.005 Graphs**

Loss/Error: 

![Learning_Rate = 0.005 (Error)](https://i.imgur.com/wlJnMsk.jpg)

Accuracy:


![Learning_Rate = 0.005 (Acc)](https://i.imgur.com/44QTwRT.png)


**Learning Rate = 0.001 Graphs**

Loss/Error:

![Learning_Rate = 0.001 (Error)](https://i.imgur.com/kequ8Lf.png)

Accuracy:

![Learning_Rate = 0.001 (Acc)](https://i.imgur.com/WuMUKhn.png)

**Learning Rate = 0.0001 Graphs**

Loss/Error:

![Learning_Rate = 0.0001 (Error)](https://i.imgur.com/7epr35j.png)

Accuracy:

![Learning_Rate = 0.0001 (Acc)](https://i.imgur.com/WVkYuXj.png)

From what we can tell all the the training, validation and the test loss are converging slowly towards zero. So the higher our learning rate is the faster the graphs converges to zero. We can tell this from the lowest losses in each graph, **0.005 is 0.0265, 0.001 is 0.0407, 0.0001 is 0.0611.** This shows the higher the learning rate is faster it converges to zero. We have to keep in mind not too increase the learning curve too much as that will cause our graph to never converge.

## **4) Generalization [3 pts]:**

Investigate impact by modifying the regularization parameter, λ = {0.001, 0.1, 0.5}. Plot the training, validation and test loss for α = 0.005 and report the final training, validation and test accuracy of your classifier.

### Answer:

**Regularization Parameter = 0.001**

Error:

![Regularization Parameter = 0.001 (Error)](https://i.imgur.com/LaVoRvu.png)

Accuracy:

![Regularization Parameter = 0.001 (Acc)](https://i.imgur.com/gZioAPA.png)

Final Training Accuracy: 0.986, Final Validation Accuracy: 1.0, Final Test Accuracy: 0.97931 


**Regularization Parameter = 0.1**

Error:

![Regularization Parameter = 0.1 (Error)](https://i.imgur.com/CGclalh.png)


Accuracy:

![Regularization Parameter = 0.1 (Acc)](https://i.imgur.com/78yq5Xb.png)

Final Training Accuracy: 0.98428, Final Validation Accuracy: 1.0, Final Test Accuracy: 0.97931 

**Regularization Parameter = 0.5**

Error:

![Regularization Parameter = 0.5 (Error)](https://i.imgur.com/TCwwK9X.png)


Accuracy:

![Regularization Parameter = 0.5 (Acc)](https://i.imgur.com/GaFJftJ.png)

Final Training Accuracy: 0.98057, Final Validation Accuracy: 1.0, Final Test Accuracy: 0.9862

## **5) Comparing Batch GD with normal equation [2 pts]:**

For linear regression, you can find the optimum weights using the closed form equation for the derivative of the means square error (normal equation). For zero weight decay, Write a Numpy script to find the optimal linear regression weights on the two-class notMNIST dataset using the ”normal equation” of the least squares formula. Compare in terms of final training MSE, accuracy and computation time between Batch GD and normal equation.

### Answer: 

In [0]:
"""
x is the data
y is the labels
"""
def leastSquares(x, y, b, reg):
  # Start the timer
  start = time.time()
 
  # Reshape the matrix and find the transpose of the new matrix
  x_Orig = x.reshape((x.shape[0], x.shape[1] * x.shape[2]))
 
  # Add the one bais vector to all the weights
  xTranspose = x_Orig.transpose()
 
  # Calculate the the error using thid formula W = [(X.T * X)^-1 *(X.T * Y)] 
  e_in = np.linalg.inv(np.matmul(xTranspose, x_Orig))
  wStar = np.matmul(e_in, np.matmul(xTranspose, y))
  b = wStar[0][0]
 
  # End the timer
  end = time.time()
  totalTime = (end - start)
  
  # Find the Accuracy 
  accuracyLeastSquares = accuracy(wStar, b, x, y, 'MSE')
  print("This is the accuracy of the normal equation: ",accuracyLeastSquares)
  print("This is the time it took to complete this computation: %.3lfs" % totalTime)
 
  return wStar, b

In [0]:
"""
HELPER FUNCTIONS
"""

def sigmoid(x):
  return 1 / (1 + np.exp(-x))

def plot (W, b, alpha, epochs, reg, lossType):
  trainData, validData, testData, trainTarget, validTarget, testTarget = loadData()
  n = epochs # number of epochs
  error_tol = 10**-7
  W, b, train_loss_arr, valid_loss_arr, test_loss_arr, train_acc, valid_acc, test_acc = grad_descent_V2(W, b, trainData, trainTarget, alpha, epochs, reg, error_tol, lossType)

  plt.figure(figsize=(20,10))
  plt.title("Train, Validation and Test Error (Regularization Parameter = 0.5)")
  print("This is the lowest Train loss: ", np.amin(train_loss_arr))
  print("This is the lowest Validation loss: ", np.amin(valid_loss_arr))
  print("This is the lowest Test loss: ", np.amin(test_loss_arr))
  plt.plot(range(n), train_loss_arr, label = "Training Set")
  plt.plot(range(n), valid_loss_arr, label = "Validation Set")
  plt.plot(range(n), test_loss_arr, label = "Test Set")
  plt.xlabel("Epoch")
  plt.ylabel("Error")
  plt.legend(loc='best')
  plt.show()

  plt.figure(figsize=(20,10))
  plt.title("Train, Validation and Test Accuracy (Regularization Parameter = 0.5)")
  print("This is the lowest Train accuracy: ", np.amin(train_acc))
  print("This is the lowest Validation accuracy: ", np.amin(valid_acc))
  print("This is the lowest Test accuracy: ", np.amin(test_acc))
  print("This is the maximum Train accuracy: ", np.amax(train_acc))
  print("This is the maximum Validation accuracy: ", np.amax(valid_acc))
  print("This is the maximum Test accuracy: ", np.amax(test_acc))
  print("This is the final Train accuracy: ", train_acc[epochs - 1])
  print("This is the final Validation accuracy: ", valid_acc[epochs - 1])
  print("This is the final Test accuracy: ", test_acc[epochs - 1])
  plt.plot(range(n), train_acc, label = "Training Set")
  plt.plot(range(n), valid_acc, label = "Validation Set")
  plt.plot(range(n), test_acc, label = "Test Set")
  plt.xlabel("Epoch")
  plt.ylabel("Accuracy (0 to 1)")
  plt.legend(loc='best')
  plt.show()
  
def accuracy(W, b, x, y, lossType):
  #reshaping the batch to satisfy the size of weights for multiplication 
  #(x shape = (3500, 784), W shape = (784, 1))
  x_flat = x.reshape((x.shape[0], x.shape[1] * x.shape[2]))
 
  #calculate error using the formula (e shape = (3500, 1))
  prediction = np.matmul(x_flat, W) + b

  if lossType == 'CE':
    prediction = sigmoid(prediction)

  N = x.shape[0]
  correct = 0
  for i in range(N):
    if (prediction[i] >= 0.5 and y[i] == 1) or (prediction[i] < 0.5 and y[i] == 0):
      correct += 1
  return correct / N

# **Section 2: Logistic Regression [10 points]**

## **1) Loss Function and Gradient [4 pts]:**

Implement two vectorized Numpy functions to compute the Binary Cross Entropy Error and its gradient respectively. Similar to Part 1.1, both functions should accept 5 arguments - the weight vector, the bias, the data matrix, the labels, and the regularization parameter. They should return the loss and gradients with respect to weights and bias respectively. Include the analytical expressions in your report as well as a snippet of your Python code.

### Answer:

In [0]:
"""
calculates the cross entropy and returns the total loss
W is a weight, 1D-array of size: 784 (28 x 28) 
b is the bias
x is the batch of Mnist data images : (3500, 28, 28)
y is the batch of labels : (3500, 1)
reg is λ, the regularization parameter
"""
def crossEntropyLoss(W, b, x, y, reg):
  # Your implementation here
  #reshaping the batch to satisfy the size of weights for multiplication 
  #(x shape = (3500, 784), W shape = (784, 1))
  x_flat = x.reshape((x.shape[0], x.shape[1] * x.shape[2]))

  #for sigmoid func (prediction matrix shape = (3500, 1))
  prediction_matrix = np.dot(x_flat, W) + b
  
  #obtain model output (finding values between 1 and 0)
  model_output = sigmoid(prediction_matrix)

  #calculate error using the formula 
  left_term = np.multiply(-1*y, np.log(model_output))
  right_term = np.multiply(1 - y, np.log(1 - model_output))
  ce = (left_term - right_term).mean()

  #calculating weight decay loss
  weight_loss = (reg / 2) * (np.linalg.norm(W) ** 2)

  #return the sum of the two losses for overall loss
  return weight_loss + ce

In [0]:
def gradCE(W, b, x, y, reg):
    # Your implementation here
    # Reshaping part of the code 
    newW = W.reshape((x.shape[1] * x.shape[2], 1))  # New shape of W is [784, 1]
    newX = x.reshape((x.shape[0], x.shape[1] * x.shape[2])) # New shape of x is [3500, 784]

    #Version 2
    length = x.shape[0]

    #Find our prediction
    x_Prediction = np.dot(newX, newW) + b

    #Obtain model output (finding values between 1 and 0)
    model_output = sigmoid(x_Prediction)
    e_in = model_output - y

    #Find the gradient with respect to the weight
    g_Gradient = (np.dot(newX.transpose(),  e_in))/length

    #Add the regularization factor
    g_Gradient = g_Gradient + reg * newW 

    #Find the gradient with respect to the bias
    b_Gradient = (np.sum(e_in))/ length

    return (g_Gradient, b_Gradient)

## **2) Learning [4 pts]:**
Modify the function from Part 1.2 to include a flag, specifying the type of loss/gradient to use in the classifier. Modify your function to update weights and bias using the Binary Cross Entropy loss and report on the performance of the Logistic Regression model by setting regularization parameter = 0.1, learning curve = 0.005, and 5000 epochs. Plot the loss and accuracy curves for training, validation, and test data set.

### Answer:
We already implemented this feture in Section 2 Part 2. We added a commented version of that code in the section down below.

In [0]:
"""
def grad_descent(W, b, x, y, alpha, epochs, reg, error_tol, lossType):
  trainData, validData, testData, trainTarget, validTarget, testTarget = loadData()
  train_loss = []
  test_loss = []
  valid_loss = []
  train_acc = []
  valid_acc = []
  test_acc = []

  for i in range(epochs):
    train_acc.append(accuracy(W, b, x, y, lossType))
    valid_acc.append(accuracy(W, b, validData, validTarget, lossType))
    test_acc.append(accuracy(W, b, testData, testTarget, lossType))
    #find loss and gradient weight and bias determined by input of this function
    if lossType == 'MSE':
      loss = MSE(W, b, x, y, reg)
      train_loss.append(loss)
      valid_loss.append(MSE(W, b, validData, validTarget, reg))
      test_loss.append(MSE(W, b, testData, testTarget, reg))
      weights, bias = gradMSE(W, b, x, y, reg)

    elif lossType == 'CE':
      loss = crossEntropyLoss(W, b, x, y, reg)
      train_loss.append(loss)
      valid_loss.append(crossEntropyLoss(W, b, validData, validTarget, reg))
      test_loss.append(crossEntropyLoss(W, b, testData, testTarget, reg))

      weights, bias = gradCE(W, b, x, y, reg)
      
      #if error is smaller than the tolerance, we have the optimal values
      #for weights and bias, thus we can stop iterating

    if loss <= error_tol:
      break 

    #update ewight and bias from calculated gradient
    W -= alpha * weights
    b -= alpha * bias
  return W, b, train_loss, valid_loss, test_loss, train_acc, valid_acc, test_acc
"""

## **3) Comparison to Linear Regression [2 pts]:**
For zero weight decay, learning rate of 0.005 and 5000 epochs , plot the training cross entropy loss and MSE loss for logistic regression and linear regression respectively. Comment on the effect of cross-entropy loss convergence behaviour.

### Answer:
From the loss function of cross-entropy we can easily tell that the logistic regression model using cross-entropy converges faster than linear regression model using MSE. This tells us that cross-entropy is a better at measuring the loss compared to MSE for classification problems. In our case just by looking at the graph we can tell this does not happen.


In [0]:
def buildGraph(loss, b1, b2, eps):
	#Initialize weight and bias tensors
	tf.set_random_seed(421)
	input_size = 28*28
	W = tf.Variable(tf.truncated_normal(stddev=0.5, shape=(input_size, 1), dtype=tf.float32))
	b = tf.Variable(name='bias', initial_value = tf.ones(1), dtype=tf.float32)
	x = tf.compat.v1.placeholder(dtype=tf.float32, shape=(None, input_size))
	y = tf.compat.v1.placeholder(dtype=tf.int8, shape=(None))
	reg = tf.placeholder(dtype=tf.float32, name='lambda')

	predicted_y = tf.add(tf.matmul(x, W), b)
	optimizer = tf.train.AdamOptimizer(learning_rate = 0.001, beta1=b1, beta2=b2, epsilon=eps)

	if loss == "MSE":
	# Your implementation
		err = tf.losses.mean_squared_error(y, predicted_y)
		
	elif loss == "CE":
	#Your implementation here
		predicted_y = tf.sigmoid(predicted_y)
		err = tf.losses.sigmoid_cross_entropy(y, predicted_y)
	
	optimal = optimizer.minimize(err)
	return W, b, x, predicted_y, y, err, optimal

	'''
W is the weight
b is the bias
x is the data
y is the labels
alpha is learning rate (step)
epochs are iterations for each batch
reg is λ, the regularization parameter
error_tol is the error tolerance (set to 1 × 10 −7 )
lossType is the type of loss function (MSE or CE)
'''
def SGD(alpha, epochs, reg, error_tol, batch_size, lossType, b1, b2, eps):
  trainData, validData, testData, trainTarget, validTarget, testTarget = loadData()
  train_loss_arr = []
  train_acc_arr = []
  valid_loss_arr = []
  valid_acc_arr = []
  test_loss_arr = []
  test_acc_arr = []
  train_flat = trainData.reshape((trainData.shape[0], trainData.shape[1] * trainData.shape[2]))
  valid_flat = validData.reshape((validData.shape[0], validData.shape[1] * validData.shape[2]))
  test_flat = testData.reshape((testData.shape[0], testData.shape[1] * testData.shape[2]))
  weights, bias, x, predicted_y, y, error, optimal = buildGraph(lossType, b1, b2, eps)
  with tf.Session() as session:
    session.run(tf.initialize_all_variables())
    for step in range(epochs):
      batches = shuffle(train_flat, trainTarget, batch_size)
      
      for batch in enumerate(batches):
        train_data = batch[1][:,0:784]
        train_label = batch[1][:,784]
        train_label = np.expand_dims(train_label, axis=1)
        session.run(optimal, feed_dict = {x: train_data, y: train_label})
      
      train_loss = session.run(error, feed_dict={x: train_data, y: train_label})
      valid_loss = session.run(error, feed_dict={x: valid_flat, y: validTarget})
      test_loss = session.run(error, feed_dict={x: test_flat, y: testTarget})

      train_loss_arr.append(train_loss)
      valid_loss_arr.append(valid_loss)
      test_loss_arr.append(test_loss)
      
      train_pred = session.run(predicted_y, feed_dict={x: train_data, y: train_label})
      valid_pred = session.run(predicted_y, feed_dict={x: valid_flat, y: validTarget})
      test_pred = session.run(predicted_y, feed_dict={x: test_flat, y: testTarget})

      train_acc_arr.append(accuracy_given_pred(train_pred, train_label))
      valid_acc_arr.append(accuracy_given_pred(valid_pred, validTarget))
      test_acc_arr.append(accuracy_given_pred(test_pred, testTarget))
  
  return train_loss_arr, train_acc_arr, valid_loss_arr, valid_acc_arr, test_loss_arr, test_acc_arr


def accuracy_given_pred(predicted, expected):
  N = predicted.shape[0]
  correct = 0
  for i in range(N):
    if (predicted[i] >= 0.5 and expected[i] == 1) or (predicted[i] < 0.5 and expected[i] == 0):
      correct += 1
  #print(correct/N)
  return correct / N


'''
shuffles the dataset into a random order for SGD
'''
def shuffle(data, labels, batch_size):
  #join the data and labels together
  data_with_labels = np.concatenate((data, labels), axis=1)

  #shuffle them
  np.random.shuffle(data_with_labels)

  #split each one by batch size
  randomized_data_with_labels = np.array_split(data_with_labels, data_with_labels.shape[0] // batch_size, axis = 0)

  #return each one
  return randomized_data_with_labels

def plot_sgd (train_loss_arr, train_acc, valid_loss_arr, valid_acc, test_loss_arr, test_acc, lossType, info):

  n = len(train_loss_arr)
  # plt.figure(figsize=(20,10))
  # plt.title('SGD ' + lossType + ' Loss (' + info + ')')
  # print("This is the lowest Train loss: ", np.amin(train_loss_arr))
  # print("This is the lowest Validation loss: ", np.amin(valid_loss_arr))
  # print("This is the lowest Test loss: ", np.amin(test_loss_arr))
  # plt.plot(range(n), train_loss_arr, label = "Training Set")
  # plt.plot(range(n), valid_loss_arr, label = "Validation Set")
  # plt.plot(range(n), test_loss_arr, label = "Test Set")
  # plt.xlabel("Epoch")
  # plt.ylabel("Error")
  # plt.legend(loc='best')
  # plt.show()

  # plt.figure(figsize=(20,10))
  # plt.title("SGD " + lossType + ' Accuracy (' + info  + ')')
  # print("This is the lowest Train accuracy: ", np.amin(train_acc))
  # print("This is the lowest Validation accuracy: ", np.amin(valid_acc))
  # print("This is the lowest Test accuracy: ", np.amin(test_acc))
  # print("This is the maximum Train accuracy: ", np.amax(train_acc))
  # print("This is the maximum Validation accuracy: ", np.amax(valid_acc))
  # print("This is the maximum Test accuracy: ", np.amax(test_acc))
  print("This is the final Train accuracy: ", train_acc[n - 1])
  print("This is the final Validation accuracy: ", valid_acc[n -1])
  print("This is the final Test accuracy: ", test_acc[n - 1])
  # plt.plot(range(n), train_acc, label = "Training Set")
  # plt.plot(range(n), valid_acc, label = "Validation Set")
  # plt.plot(range(n), test_acc, label = "Test Set")
  # plt.xlabel("Epoch")
  # plt.ylabel("Accuracy (0 to 1)")
  # plt.legend(loc='best')
  # plt.show()

In [0]:
"""
TEST
"""
W = np.zeros((28*28, 1))
b = 1
reg = 0
alpha = 0.001
epochs = 700 # number of epochs
error_tol = 10**-7
lossType = 'CE'
batch_size = 1750
b1=0.95
b2=0.999
eps=10**-8
# Testing Grad Descent
train_loss_arr, train_acc_arr, valid_loss_arr, valid_acc_arr, test_loss_arr, test_acc_arr = SGD(alpha, epochs, reg, error_tol, batch_size, lossType, b1, b2, eps)
plot_sgd(train_loss_arr, train_acc_arr, valid_loss_arr, valid_acc_arr, test_loss_arr, test_acc_arr, lossType, 'Experiment 1: Beta 1 = 0.95 (MSE)')

This is the final Train accuracy:  0.9845714285714285
This is the final Validation accuracy:  0.97
This is the final Test accuracy:  0.9862068965517241
