<a href="https://colab.research.google.com/github/oikn2018/CS6910_assignment_1/blob/main/dl_a1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#importing required packages
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
import matplotlib.pyplot as plt
import seaborn as sns
# import wandb
import warnings
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) 

In [3]:
# Load Fashion-MNIST and hold out 10% of the official training images for validation.
from keras.datasets import fashion_mnist

# Fixed seed so the train/validation split is the same on every Restart & Run All.
SPLIT_SEED = 42

(x_train,y_train), (x_test,y_test) = fashion_mnist.load_data()
x_train,x_val,y_train,y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.1,
    shuffle=True,
    random_state=SPLIT_SEED,
)
print(x_train.shape, x_val.shape, y_train.shape, y_val.shape)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz
(54000, 28, 28) (6000, 28, 28) (54000,) (6000,)


In [4]:
def flatten_and_scale(images):
  """Flatten each image to a column vector and divide pixel values by 255.

  Args:
    images: array whose first axis indexes the examples.

  Returns:
    Array of shape (features, examples).
  """
  return np.reshape(images, (images.shape[0], -1)).T / 255.


def one_hot(labels, num_classes):
  """Return a (num_classes, examples) one-hot matrix for integer ``labels``."""
  return np.eye(num_classes)[labels].T


# Use one class count for every split so the target matrices always have the
# same number of rows, even if a split happens to miss the highest label.
NUM_CLASSES = int(max(np.max(y_train), np.max(y_val), np.max(y_test))) + 1

X_train = flatten_and_scale(x_train)
Y_train = one_hot(y_train, NUM_CLASSES)

X_test = flatten_and_scale(x_test)
Y_test = one_hot(y_test, NUM_CLASSES)

X_val = flatten_and_scale(x_val)
Y_val = one_hot(y_val, NUM_CLASSES)

print(X_train.shape, X_val.shape, Y_train.shape, Y_val.shape, X_test.shape, Y_test.shape)

(784, 54000) (784, 6000) (10, 54000) (10, 6000) (784, 10000) (10, 10000)


In [5]:
class FeedForwardNN:
  """Fully connected feed-forward classifier trained with gradient descent.

  All matrices hold one example per column: inputs are (features, examples)
  and targets are (classes, examples).

  Parameters are kept as a tuple ``theta = (weights, biases)`` of 1-D object
  arrays with one entry per layer, so that arithmetic such as
  ``(1 - decay) * weights`` is applied layer by layer.

  NOTE: ``initialize``, ``optimizations`` and ``fit`` read the notebook-level
  arrays ``X_train``, ``Y_train``, ``X_val`` and ``Y_val``; those must exist
  before the class is instantiated.
  """

  def __init__(self,config=None,epochs=5,hidden_layers=[64,64,64],weight_decay=0,learning_rate=1e-3,optimizer='sgd',batch_size=16,weight_initialization='random',activations='sigmoid',loss_function='cross-entropy',output_function='softmax',gamma=0.9,beta=0.9,beta1=0.9,beta2=0.999,eps=1e-8):
    """Store the hyper-parameters and initialise the network parameters.

    Args:
      config: optional mapping; when given, it supplies "epochs",
        "learning_rate", "weight_decay", "optimizer", "batch_size",
        "weight_initialization", "activations", "hidden_layers_size" and
        "no_hidden_layers", and the matching keyword arguments are ignored.
      epochs, hidden_layers, weight_decay, learning_rate, optimizer,
      batch_size, weight_initialization, activations: used when ``config``
        is None.
      loss_function: 'cross-entropy' or 'squared-error'.
      output_function: output-layer function; only 'softmax' is implemented.
      gamma, beta, beta1, beta2, eps: stored on the instance; not read by
        any method of this class.
    """
    if config is not None:
      self.epochs = config["epochs"]
      self.learning_rate = config["learning_rate"]
      self.weight_decay = config["weight_decay"]
      self.optimizer = config["optimizer"]
      self.batch_size = config["batch_size"]
      self.weight_initialization = config["weight_initialization"]
      self.activations = config["activations"]
      self.hidden_layers = [config["hidden_layers_size"] for x in range(config["no_hidden_layers"])]
    else:
      self.epochs = epochs
      self.learning_rate = learning_rate
      self.weight_decay = weight_decay
      self.optimizer = optimizer
      self.batch_size = batch_size
      self.weight_initialization = weight_initialization
      self.activations = activations
      # Copy so the shared default list (or a caller's list) is never aliased.
      self.hidden_layers = list(hidden_layers)

    self.loss_function = loss_function
    self.output_function = output_function
    self.gamma = gamma
    self.beta = beta
    self.beta1 = beta1
    self.beta2 = beta2
    self.eps = eps

    self.initialize()

  # ---------------------------------------------------------------- helpers

  @staticmethod
  def _as_object_array(items):
    """Pack ``items`` into a 1-D object array, one entry per item.

    ``np.array(list_of_arrays)`` cannot be used here: the per-layer arrays
    have different shapes, and recent NumPy releases refuse to build such
    ragged arrays implicitly.
    """
    packed = np.empty(len(items), dtype=object)
    for idx, item in enumerate(items):
      packed[idx] = item
    return packed

  @staticmethod
  def _safe_log(p):
    """Natural log with the argument floored at the smallest positive float.

    Prevents ``0 * log(0) = nan`` in the cross-entropy when a non-target
    probability underflows to exactly zero.
    """
    return np.log(np.maximum(p, np.finfo(np.float64).tiny))

  def _l2_penalty(self,theta):
    """Sum of squared entries over all weights and biases in ``theta``."""
    W,B = theta
    return self.frobenius(W**2) + self.frobenius(B**2)

  # ------------------------------------------------------------ activations

  def sigmoid(self,x):
    """Logistic function, element-wise."""
    return 1.0 / (1.0 + np.exp(-x))

  def tanh(self,x):
    """Hyperbolic tangent, element-wise."""
    return np.tanh(x)

  def ReLu(self,x):
    """Rectified linear unit, element-wise.

    Uses ``np.maximum`` instead of ``np.vectorize(max)``: the latter infers
    the output dtype from the first element and silently truncates every
    value to an integer when that first element is negative.
    """
    return np.maximum(0, x)

  def d_sigmoid(self,x):
    """Derivative of the logistic function evaluated at ``x``."""
    s = self.sigmoid(x)
    return s*(1.0 - s)

  def d_tanh(self,x):
    """Derivative of tanh evaluated at ``x``."""
    t = self.tanh(x)
    return 1.0 - t*t

  def d_ReLu(self,x):
    """Derivative of ReLU evaluated at ``x`` (1 where x > 0, else 0)."""
    return 1*(x>0)

  def activation(self,x,n='sigmoid'):
    """Apply the hidden-layer activation named ``n`` ('sigmoid', 'tanh', 'ReLu').

    Raises:
      ValueError: if ``n`` is not one of the supported names.
    """
    if n == 'sigmoid':
      return self.sigmoid(x)
    elif n == 'tanh':
      return self.tanh(x)
    elif n == 'ReLu':
      return self.ReLu(x)
    raise ValueError("Unsupported activation: %r" % (n,))

  def d_activation(self,x,n='sigmoid'):
    """Derivative of the activation named ``n`` evaluated at pre-activation ``x``.

    Raises:
      ValueError: if ``n`` is not one of the supported names.
    """
    if n == 'sigmoid':
      return self.d_sigmoid(x)
    elif n == 'tanh':
      return self.d_tanh(x)
    elif n=='ReLu':
      return self.d_ReLu(x)
    raise ValueError("Unsupported activation: %r" % (n,))

  def softmax(self,x):
    """Column-wise softmax.

    The column maximum is subtracted first; this leaves the result unchanged
    mathematically but keeps ``exp`` from overflowing on large logits.
    """
    shifted = x - np.max(x,axis=0,keepdims=True)
    e = np.exp(shifted)
    return e / np.sum(e,axis=0,keepdims=True)

  def output(self,x,n='softmax'):
    """Apply the output-layer function named ``n`` (only 'softmax').

    Raises:
      ValueError: if ``n`` is not supported.
    """
    if n == 'softmax':
      return self.softmax(x)
    raise ValueError("Unsupported output function: %r" % (n,))

  # ------------------------------------------------------------------ losses

  def cross_error(self,Y,inputs):
    """Summed (not averaged) cross-entropy between ``Y`` and the network output.

    ``inputs`` is the ``(pre_activations, activations)`` pair returned by
    ``feedforward``; the last activation is the prediction.
    """
    Y_hat = inputs[1][-1]
    return -1*np.sum(Y*self._safe_log(Y_hat))

  def squared_error(self,Y,inputs):
    """Summed (not averaged) half squared error between ``Y`` and the output."""
    Y_hat = inputs[1][-1]
    return (1/2)*np.sum((Y_hat - Y)**2)

  def squared_error_val(self,Y,inputs):
    """Mean half squared error plus the L2 penalty on ``self.theta``."""
    Y_hat = inputs[1][-1]
    m = Y.shape[1]
    return (1/(2*m))*np.sum((Y_hat - Y)**2) + (self.lambd/(2*m))*self._l2_penalty(self.theta)

  def error(self,Y,inputs,n='cross-entropy'):
    """Summed training loss named ``n`` ('cross-entropy' or 'squared-error').

    Raises:
      ValueError: if ``n`` is not supported.
    """
    if n == 'cross-entropy':
      return self.cross_error(Y,inputs)
    elif n == 'squared-error':
      return self.squared_error(Y,inputs)
    raise ValueError("Unsupported loss function: %r" % (n,))

  def cross_error_val(self,Y,inputs):
    """Mean cross-entropy plus the L2 penalty on ``self.theta``."""
    Y_hat = inputs[1][-1]
    m = Y.shape[1]
    return (-1/m)*np.sum(Y*self._safe_log(Y_hat)) + (self.lambd/(2*m))*self._l2_penalty(self.theta)

  def val_error(self,Y,inputs,n='cross-entropy'):
    """Regularised mean loss named ``n`` ('cross-entropy' or 'squared-error').

    Raises:
      ValueError: if ``n`` is not supported.
    """
    if n == 'cross-entropy':
      return self.cross_error_val(Y,inputs)
    if n == 'squared-error':
      return self.squared_error_val(Y,inputs)
    raise ValueError("Unsupported loss function: %r" % (n,))

  # ---------------------------------------------------------- initialisation

  def random_initialize_parameters(self,n,layers):
    """Draw standard-normal weights and zero biases for every layer.

    Args:
      n: number of input features.
      layers: sizes of all layers after the input, output layer last.

    Returns:
      ``(weights, biases)`` as 1-D object arrays; ``weights[i]`` has shape
      (layers[i], fan_in) and ``biases[i]`` has shape (layers[i], 1).
    """
    biases = []
    weights = []
    fan_in = n
    for width in layers:
      # np.longdouble is the portable name for the extended-precision float
      # (np.float128 is not defined on every platform).
      biases.append(np.zeros((width,1),dtype=np.longdouble))
      weights.append(np.random.randn(width,fan_in).astype(np.longdouble))
      fan_in = width
    return (self._as_object_array(weights),self._as_object_array(biases))

  def initialize_parameters(self,n,layers,t):
    """Initialise parameters with scheme ``t`` (only 'random' is implemented).

    Raises:
      ValueError: if ``t`` is not supported.
    """
    if t == 'random':
      return self.random_initialize_parameters(n,layers)
    # A 'Xavier' scheme was stubbed out in the original notebook but never
    # implemented; fail loudly instead of returning None.
    raise ValueError("Unsupported weight initialization: %r" % (t,))

  def frobenius(self,X):
    """Sum every entry of every array in ``X``.

    Callers pass already-squared arrays, so the result is a sum of squares.
    """
    s=0
    for x in X:
      s += np.sum(x)
    return s

  # -------------------------------------------------------- forward/backward

  def feedforward(self,X,theta,L):
    """Run a forward pass through ``L`` layers.

    Returns:
      ``(pre_activations, activations)`` as 1-D object arrays with one entry
      per layer; ``activations[-1]`` is the network output.
    """
    H = X
    weights ,biases = theta
    activations = []
    pre_activations = []
    for k in range(L-1):
      A = biases[k] + (weights[k] @ H)
      H = self.activation(A,self.activations)
      pre_activations.append(A)
      activations.append(H)
    AL = biases[L-1] + (weights[L-1] @ H)
    Y_hat = self.output(AL,self.output_function)
    pre_activations.append(AL)
    activations.append(Y_hat)
    return (self._as_object_array(pre_activations),self._as_object_array(activations))

  def backprop(self,X,Y,inputs,theta,batch_size,L):
    """Back-propagate the loss and return batch-averaged gradients.

    Args:
      X, Y: inputs and one-hot targets of the batch.
      inputs: the pair returned by ``feedforward`` for this batch.
      theta: current ``(weights, biases)``.
      batch_size: divisor applied to the summed gradients.
      L: number of layers.

    Returns:
      ``(d_weights, d_biases)`` as 1-D object arrays aligned with ``theta``.

    Raises:
      ValueError: if ``self.loss_function`` is not supported.
    """
    pre_activations , activations = inputs
    weights,biases = theta
    Y_hat = activations[-1]
    if self.loss_function == 'squared-error':
      # Chain the loss gradient through the full softmax Jacobian:
      # dL/da_j = y_j * (g_j - sum_i g_i * y_i), with g = Y_hat - Y.
      g = Y_hat - Y
      delta = Y_hat*(g - np.sum(g*Y_hat,axis=0,keepdims=True))
    elif self.loss_function == 'cross-entropy':
      delta = Y_hat - Y
    else:
      raise ValueError("Unsupported loss function: %r" % (self.loss_function,))

    d_weights = []
    d_biases = []
    for k in range(L-1,-1,-1):
      layer_input = X if k == 0 else activations[k-1]
      d_weights.append((1/batch_size)*(delta @ layer_input.T))
      d_biases.append((1/batch_size)*np.sum(delta,axis=1,keepdims=True))
      if k > 0:
        d_H = weights[k].T @ delta
        delta = d_H*self.d_activation(pre_activations[k-1],self.activations)
    # Gradients were collected from the output layer backwards.
    d_weights.reverse()
    d_biases.reverse()
    return (self._as_object_array(d_weights),self._as_object_array(d_biases))

  def update_theta(self,theta,d_theta,learning_rate):
    """Apply one gradient step with multiplicative weight decay."""
    weights, biases = theta
    d_weights,d_biases = d_theta
    weights = (1 - self.weight_decay)*weights - learning_rate*d_weights
    biases = (1 - self.weight_decay)*biases - learning_rate*d_biases
    return (weights,biases)

  def mini_batch_gradient_descent(self,X,Y,theta,learning_rate,batch_size,L):
    """Run one pass over ``X``/``Y`` in consecutive mini-batches.

    Every column is visited exactly once; a final, shorter batch is used
    when the number of columns is not a multiple of ``batch_size``.

    Returns:
      ``(theta, loss)`` where ``loss`` is the accumulated per-batch loss
      (data term plus L2 term, evaluated before each update) divided by the
      number of columns.
    """
    m = X.shape[1]
    err = 0
    for start in range(0,m,batch_size):
      stop = start + batch_size
      X_batch = X[:,start:stop]
      Y_batch = Y[:,start:stop]
      inputs = self.feedforward(X_batch,theta,L)
      err += self.error(Y_batch,inputs,self.loss_function) + (self.lambd/2)*self._l2_penalty(theta)
      # Average over the actual batch width, which is smaller for the tail.
      d_theta = self.backprop(X_batch,Y_batch,inputs,theta,X_batch.shape[1],L)
      theta = self.update_theta(theta,d_theta,learning_rate)
    return (theta,err/m)

  def optimizations(self,theta,L):
    """Run one training pass with the configured optimizer.

    Raises:
      ValueError: if ``self.optimizer`` is not supported.
    """
    if self.optimizer == 'sgd':
      # NOTE(review): batch size is fixed at 1 here, so self.batch_size is
      # not used by 'sgd' -- confirm whether that is intended.
      return self.mini_batch_gradient_descent(X_train,Y_train,theta,self.learning_rate,1,L)
    raise ValueError("Unsupported optimizer: %r" % (self.optimizer,))

  def initialize(self):
    """Create ``self.theta``, ``self.lambd`` and ``self.L`` from the hyper-parameters.

    Layer sizes are the hidden layers followed by one output unit per row of
    the global ``Y_train``; the input width comes from the global ``X_train``.
    """
    layers = self.hidden_layers + [Y_train.shape[0]]
    self.theta = self.initialize_parameters(X_train.shape[0],layers,self.weight_initialization)
    self.lambd = self.weight_decay/self.learning_rate
    self.L = len(layers)

  def fit(self):
    """Train for one pass over the global training set and evaluate.

    Returns:
      ``(train_acc, train_loss, val_acc, val_loss)`` computed on the global
      ``X_train``/``Y_train`` and ``X_val``/``Y_val`` after the pass.
    """
    self.theta,train_loss = self.optimizations(self.theta,self.L)

    outputs_train = self.feedforward(X_train,self.theta,self.L)
    Y_pred_train = np.argmax(outputs_train[1][-1],0)
    Y_true_train = np.argmax(Y_train,0)
    train_acc = accuracy_score(Y_true_train,Y_pred_train)

    outputs_val = self.feedforward(X_val,self.theta,self.L)
    val_loss = self.val_error(Y_val,outputs_val,self.loss_function)
    Y_pred_val = np.argmax(outputs_val[1][-1],0)
    Y_true_val = np.argmax(Y_val,0)
    val_acc = accuracy_score(Y_true_val,Y_pred_val)

    return train_acc,train_loss,val_acc,val_loss

  def predict(self,X_test):
    """Return the predicted class index for every column of ``X_test``."""
    L = len(self.hidden_layers) + 1
    outputs = self.feedforward(X_test,self.theta,L)
    Y_pred = np.argmax(outputs[1][-1],0)
    return Y_pred

In [8]:
# NOTE(review): `Models` is not populated anywhere in the visible notebook.
Models = []
def train():
    """Train a default-configured FeedForwardNN and print per-epoch metrics.

    Each call to ``model.fit()`` performs one training pass and returns the
    training/validation accuracy and loss, which are printed as a dict.
    """
    model = FeedForwardNN()
    # Use the model's own epoch setting rather than a hard-coded count.
    for epoch in range(model.epochs):
        train_acc,train_loss,val_acc,val_loss = model.fit()
        metrics = {
            "accuracy":train_acc,
            "loss":train_loss,
            "validation_accuracy": val_acc,
            "validation_loss": val_loss,
            "epochs":epoch
        }

        print(metrics)

In [9]:
# Train a default-configured network; metrics for each epoch are printed below.
train()


{'accuracy': 0.6876296296296296, 'loss': 1.2751054884977098429, 'validation_accuracy': 0.6851666666666667, 'validation_loss': 0.8780605051328608062, 'epochs': 0}
{'accuracy': 0.7298888888888889, 'loss': 0.8064098620980784197, 'validation_accuracy': 0.7281666666666666, 'validation_loss': 0.76265300583265876815, 'epochs': 1}
{'accuracy': 0.7510370370370371, 'loss': 0.72098974199876615714, 'validation_accuracy': 0.7446666666666667, 'validation_loss': 0.70774396681783765243, 'epochs': 2}
{'accuracy': 0.7648703703703704, 'loss': 0.67260133508796052357, 'validation_accuracy': 0.7591666666666667, 'validation_loss': 0.6710351058654098344, 'epochs': 3}
{'accuracy': 0.7754074074074074, 'loss': 0.6390678139477666097, 'validation_accuracy': 0.7651666666666667, 'validation_loss': 0.64362896058784438685, 'epochs': 4}
