In [1]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, transforms
from torchvision.io import read_image
from torchvision.transforms import ToTensor, Lambda
import pandas as pd
import os
import torch.optim as optim

In [2]:
def _to_one_hot(label):
  """Encode a class index as a length-10 float vector with a single 1 at that index."""
  encoded = torch.zeros(10, dtype=torch.float)
  return encoded.scatter_(0, torch.tensor(label), value=1)


# Training and test splits share the same preprocessing: images go through ToTensor(),
# labels are replaced by their one-hot encoding.
train_data = datasets.FashionMNIST(
  root="data",
  train=True,
  download=True,
  transform=ToTensor(),
  target_transform=Lambda(_to_one_hot),
)
test_data = datasets.FashionMNIST(
  root="data",
  train=False,
  download=True,
  transform=ToTensor(),
  target_transform=Lambda(_to_one_hot),
)

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to data/FashionMNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/26421880 [00:00<?, ?it/s]

Extracting data/FashionMNIST/raw/train-images-idx3-ubyte.gz to data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to data/FashionMNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/29515 [00:00<?, ?it/s]

Extracting data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/4422102 [00:00<?, ?it/s]

Extracting data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz


  0%|          | 0/5148 [00:00<?, ?it/s]

Extracting data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to data/FashionMNIST/raw



In [3]:
# Mini-batch iterators: 64 samples per batch, order reshuffled on every pass over the data.
train_dataloader = DataLoader(
  train_data,
  batch_size=64,
  shuffle=True,
)
test_dataloader = DataLoader(
  test_data,
  batch_size=64,
  shuffle=True,
)

In [4]:
# Defining activation and loss functions

#relu class
class relu_activation(object):
  """Rectified linear unit and its derivative, both applied element-wise."""

  def activate(self, z):
    """Return max(z, 0) element-wise.

    Args:
      z: input tensor of any shape.

    Returns:
      A new tensor with the shape of ``z``; ``z`` itself is not modified.
    """
    return torch.maximum(torch.zeros_like(z), z)

  def activate_prime(self, z):
    """Return the derivative of ReLU at ``z`` (element-wise Heaviside step).

    Args:
      z: input tensor of any shape (including 0-dim).

    Returns:
      A new tensor with the shape and dtype of ``z`` holding 1 where ``z > 0``
      and 0 elsewhere; ``z`` itself is not modified.
    """
    # A single vectorised comparison replaces the former Python loop over every
    # element, which dominated the run time of each backward pass.
    return (z > 0).to(z.dtype)


#Softmax
class softmax_activation(object):
  """Softmax normalised over every element of the input tensor."""

  def activate(self, z):
    """Return exp(z) / sum(exp(z)), with the sum taken over all elements of ``z``.

    Args:
      z: tensor of pre-activations.

    Returns:
      A tensor with the shape of ``z`` whose entries sum to 1.
    """
    if z.numel() == 0:
      # Nothing to normalise; max() is undefined on an empty tensor.
      return torch.exp(z)
    # Subtracting the maximum leaves the result mathematically unchanged but keeps
    # exp() from overflowing to inf (which would yield inf/inf = NaN) for large inputs.
    out = torch.exp(z - z.max())
    return out / out.sum()

  def activate_prime(self, z):
    """Return a tensor of ones shaped like ``z``.

    Softmax is only used for the final layer, so its derivative is already
    folded into the derivative of the loss function; this factor is a no-op.
    """
    return torch.ones_like(z)


#Using KL divergence of softmax activations
def loss_fn_prime(xx,yy):
  """Return the element-wise difference ``xx - yy``.

  Network.backprop passes the network output as ``xx`` and the target as ``yy``
  and uses the result as the initial delta for the backward pass.
  """
  difference = xx - yy
  return difference


In [5]:
# connected layer class
class Connected(object):
  """Fully connected layer with hand-written forward and backward passes.

  ``tensor_shape`` lists the dimensions of the weight tensor: dimension 0 is the
  output dimension, the remaining dimensions are contracted with the input.
  ``activation_fn`` is an optional object exposing ``activate`` and
  ``activate_prime``; without it the layer is purely affine.
  """

  def __init__(self, tensor_shape, activation_fn=None):
    self.activation_fn = activation_fn
    self.shape = tensor_shape

    # Number of weight entries per output unit, kept as a tensor so it can be fed to torch.sqrt.
    total_entries = torch.tensor(tensor_shape).prod()
    self.input_size = total_entries / tensor_shape[0]

    # Standard-normal draws; the weights are divided by sqrt(input_size).
    self.weight = torch.randn(tuple(tensor_shape)) / torch.sqrt(self.input_size)
    self.bias = torch.randn(tensor_shape[0])

    # Gradient accumulators start out at zero.
    self.zero_derivative()

  def zero_derivative(self):
    """Reset the accumulated bias and weight gradients to zero."""
    self.dbias = torch.zeros_like(self.bias)
    self.dweight = torch.zeros_like(self.weight)

  def update(self, learning_rate, batch_size):
    """Subtract the accumulated gradients, scaled by learning_rate / batch_size, in place."""
    step = learning_rate / batch_size
    self.bias -= self.dbias * step
    self.weight -= self.dweight * step

  def set_input(self, inp):
    """Forward pass: store ``inp`` and fill ``self.preact`` and ``self.activ``."""
    self.input = inp
    contracted_dims = len(self.shape) - 1  # every weight dimension except the output one
    self.preact = torch.tensordot(self.weight, inp, dims=contracted_dims) + self.bias

    if not self.activation_fn:
      self.activ = self.preact
      return
    self.activ = self.activation_fn.activate(self.preact)

  def backward(self, delta):
    """Backward pass for the input stored by the last ``set_input`` call.

    Adds this sample's contribution to ``dbias`` and ``dweight`` and returns the
    delta contracted with the weights along the output dimension, i.e. the delta
    to hand to the preceding layer.
    """
    local_delta = delta
    if self.activation_fn:
      local_delta = delta * self.activation_fn.activate_prime(self.preact)

    self.dbias += local_delta
    self.dweight += torch.tensordot(local_delta, self.input, dims=0)  # outer product
    return torch.tensordot(local_delta, self.weight, dims=([0], [0]))




In [6]:
#convolutional-pooling layer class
class ConvPool(object):
  """Convolution followed by max pooling, with hand-written forward and backward passes.

  Both operations are written as tensor contractions with dense 0/1 selection
  tensors: ``conv_isometry`` gathers every filter-sized patch of the input and
  ``pool_isometry`` selects the maximal entry of each pooling window.
  """

  def __init__(self, image_shape, filter_shape, pooling_shape, activation_fn=None):
    """Allocate parameters, gradient accumulators and the selection tensors.

    Args:
      image_shape: list of dims of the input image (feature_num x image_size x image_size).
      filter_shape: list of convolution filter dimensions
        (feature_num_out x feature_num_inp x resolution x resolution).
      pooling_shape: (rows, cols) of one pooling window.
      activation_fn: optional object exposing ``activate`` and ``activate_prime``.
    """
    self.activation_fn = activation_fn

    self.filter_shape = filter_shape
    self.image_shape = image_shape
    self.pooling_shape = pooling_shape

    self.convolution_size = ( int(image_shape[1]- filter_shape[2] +1) , int(image_shape[2]- filter_shape[3] +1) )
    self.output_size = ( int( (image_shape[1]- filter_shape[2] +1)/pooling_shape[0]), int( (image_shape[2]- filter_shape[3] +1)/pooling_shape[1]) )

    # Weights are standard-normal draws divided by sqrt(number of inputs per filter).
    fan_in = filter_shape[1]*filter_shape[2]*filter_shape[3]
    self.weight = torch.randn(tuple(filter_shape))/torch.sqrt(torch.tensor(float(fan_in)))
    # One random bias per output feature, repeated over every output position.
    self.bias = torch.tensordot( torch.randn(filter_shape[0]), torch.ones(self.output_size), dims=0 )

    self.dbias = torch.zeros_like(self.bias)
    self.dweight = torch.zeros_like(self.weight)

    #initiating convolutional and pooling isometries
    self.conv_isometry = torch.zeros(self.convolution_size[0] , self.convolution_size[1], filter_shape[2], filter_shape[3], image_shape[1], image_shape[2])
    self.pool_isometry= torch.zeros(filter_shape[0], self.output_size[0], self.output_size[1], filter_shape[0], self.convolution_size[0], self.convolution_size[1])

    #loop defining the convolution isometry: entry (a,b,i,j) selects input pixel (a+i, b+j)
    for a in range(self.convolution_size[0]):
      for b in range(self.convolution_size[1]):
        for i in range( int(filter_shape[2]) ):
          for j in range( int(filter_shape[3]) ):
            self.conv_isometry[a,b,i,j,a+i, b+j] = 1

  def zero_derivative(self):
    """Reset the accumulated bias and weight gradients to zero."""
    self.dbias = torch.zeros_like(self.bias)
    self.dweight = torch.zeros_like(self.weight)

  def update(self, learning_rate, batch_size):
    """Subtract the accumulated gradients, scaled by learning_rate / batch_size, in place."""
    step = learning_rate / batch_size
    self.bias -= self.dbias * step
    self.weight -= self.dweight * step

  def set_input(self, inp):
    """Forward pass: convolve ``inp``, max-pool the result, add the bias and activate.

    Stores the gathered patches in ``self.input``, rebuilds ``self.pool_isometry``
    for this input, and fills ``self.preact`` and ``self.activ``.
    """
    self.input = torch.tensordot(inp, self.conv_isometry, dims= ([1,2],[4,5]))  #5-tensor (input_feature_num x filter_num x filter_num x resolution x resolution)
    act_int = torch.tensordot(self.weight, self.input, dims= ([1,2,3],[0,3,4]))   #3-tensor (output_num x filter_num x filter_num)

    # Forget the max locations recorded for the previous input. Without this reset the
    # selections of every earlier sample stay set to 1, so the "pooling" contraction would
    # sum several entries per window and backward() would route gradients to stale positions.
    self.pool_isometry.zero_()

    pool_h = int(self.pooling_shape[0])
    pool_w = int(self.pooling_shape[1])

    #loop for constructing pooling isometry tensor
    for k in range(int(self.filter_shape[0])):
      for i in range(int(self.output_size[0])):
        for j in range(int(self.output_size[1])):
          window = act_int[k, pool_h*i: pool_h*(i+1), pool_w*j: pool_w*(j+1)]
          maxim = torch.argmax(window).item()  #flat index of the maximum inside this pooling window
          row, col = divmod(maxim, pool_w)
          self.pool_isometry[k, i, j, k, pool_h*i + row, pool_w*j + col] = 1

    self.preact = torch.tensordot(self.pool_isometry, act_int, dims = 3) + self.bias
    if self.activation_fn:
      self.activ = self.activation_fn.activate(self.preact)
    else:
      self.activ = self.preact

  def backward(self, delta):
    """Backward pass for the input stored by the last ``set_input`` call.

    Adds this sample's contribution to ``dbias`` and ``dweight`` and returns the
    delta mapped back onto the input image (input_feature_num x input_size x input_size).
    """
    if self.activation_fn:
      delta_new = delta * self.activation_fn.activate_prime(self.preact)
    else:
      delta_new = delta

    # The bias is shared across positions: sum the delta per feature and spread it over the map.
    self.dbias += torch.tensordot( torch.sum(delta_new, dim = (1,2)), torch.ones(self.output_size), dims=0 )

    delta_new = torch.tensordot(delta_new, self.pool_isometry, dims=([0,1,2],[0,1,2]))  #3-tensor (output_feature_num x filter_num x filter_num)
    self.dweight += torch.tensordot(delta_new , self.input, dims=([1,2],[1,2]))  #4-tensor (output_feature_num x input_feature_num x resolution x resolution)

    delta_new = torch.tensordot(delta_new, self.weight, dims=([0],[0]))  #5-tensor (filter_num x filter_num x input_feature_num x resolution x resolution)
    #undoing convolution isometry to map delta back to the activation space of the previous layer
    delta_new = torch.tensordot(delta_new, self.conv_isometry, dims = ([0,1,3,4],[0,1,2,3]))  #3-tensor (input_feature_num x input_size x input_size)

    return delta_new



In [95]:
class Network(object):
  """Sequential stack of layers trained with hand-written mini-batch gradient descent.

  Every layer must provide ``set_input``, ``activ``, ``backward``, ``update`` and
  ``zero_derivative``, plus the ``weight``/``bias``/``dweight``/``dbias`` tensors
  that the training loop prints.
  """

  def __init__(self, layers):
    self.layers = layers


#--------------Stochastic gradient descent---------------------
  def grad_desc(self, train_data, lr, epochs, test_data=None):
    """Train the layers in place and optionally report accuracy after every epoch.

    Per epoch only a random window of 29 consecutive mini-batches is trained on:
    a start index is drawn from [0, 850) and the batches strictly between it and
    start + 30 are used. When ``test_data`` is given, the running accuracy over
    its first 10 mini-batches is printed.

    Args:
      train_data: iterable of (X, y) mini-batches, e.g. a DataLoader.
      lr: learning rate.
      epochs: number of passes over ``train_data``.
      test_data: optional iterable of (X, y) mini-batches used for evaluation.
    """
    for epoch in range(epochs):
      random= torch.randint(0,850,(1,)).item()
      for batch, (X,y) in enumerate(train_data):
        if batch >= random+30:
          # Past the training window: stop instead of loading the remaining batches for nothing.
          break
        if batch <= random:
          continue

        batch_size = len(X)
        # Gradients are accumulated one sample at a time, then applied once per mini-batch.
        for k in range(batch_size):
          self.backprop(X[k],y[k])
        for layer in self.layers:
          layer.update(lr, batch_size)
          print(f"{(torch.min(layer.dweight)*lr/batch_size, torch.max(layer.dweight)*lr/batch_size)}  {(torch.min(layer.dbias)*lr/batch_size, torch.max(layer.dbias)*lr/batch_size) }\n" )
          print(f"{(torch.min(layer.weight), torch.max(layer.weight))}  {(torch.min(layer.bias), torch.max(layer.bias)) }\n" )
          layer.zero_derivative()
          print(f"{(torch.min(layer.dweight), torch.max(layer.dweight))}  {(torch.min(layer.dbias), torch.max(layer.dbias)) }\n\n" )
        print(f"Minibatch {batch} complete \n\n")
      print(f"Epoch {epoch} complete!\n\n")

      # Explicit None check: truth-testing a loader calls len() on it, which raises
      # TypeError for loaders that have no length.
      if test_data is not None:
        correct = 0
        seen = 0
        for batcht, (Xt, yt) in enumerate(test_data):
          if batcht >= 10:
            # Only the first 10 mini-batches are evaluated.
            break
          for k in range(len(Xt)):
            out_pred = torch.argmax(self.feedforward(Xt[k]))
            out_true = torch.argmax(yt[k])
            seen += 1
            if out_pred == out_true:
              correct += 1
          print(f"Epoch {epoch} minibatch {batcht} achieved accuracy {correct/seen}")


#--------------backpropagation method---------------------
  def backprop(self,x,y):
    """Run one sample forward, then push the loss derivative back through every layer.

    Each layer adds this sample's gradient to its own accumulators; nothing is returned.
    """
    out = self.feedforward(x)

    delta = loss_fn_prime(out, y)

    for layer in reversed(self.layers):
      delta = layer.backward(delta)


#---------------feedforward method---------------------
  def feedforward(self, x):
    """Feed ``x`` through the layers in order and return the last layer's activation."""
    inp = x
    for layer in self.layers:
      layer.set_input(inp)
      inp = layer.activ
    return inp







In [89]:
# Two convolution+pooling stages (28x28 -> 12x12 -> 4x4 feature maps, 15 features each),
# followed by a 100-unit dense layer and a 10-way softmax output layer.
model = Network([
  ConvPool([1, 28, 28], [15, 1, 5, 5], [2, 2], relu_activation()),
  ConvPool([15, 12, 12], [15, 15, 5, 5], [2, 2], relu_activation()),
  Connected([100, 15, 4, 4], relu_activation()),
  Connected([10, 100], softmax_activation()),
])

In [94]:
# Train for 5 epochs with learning rate 0.04, reporting test accuracy after each epoch.
model.grad_desc(
  train_dataloader,
  lr=0.04,
  epochs=5,
  test_data=test_dataloader,
)

(tensor(-0.0176), tensor(0.0185))  (tensor(-0.0109), tensor(0.0098))

(tensor(-0.5625), tensor(0.5521))  (tensor(-2.1584), tensor(1.3059))

(tensor(0.), tensor(0.))  (tensor(0.), tensor(0.))


(tensor(-0.0638), tensor(0.0642))  (tensor(-0.0048), tensor(0.0041))

(tensor(-0.2021), tensor(0.2063))  (tensor(-0.3865), tensor(1.9883))

(tensor(0.), tensor(0.))  (tensor(0.), tensor(0.))


(tensor(-0.0363), tensor(0.0432))  (tensor(-0.0029), tensor(0.0037))

(tensor(-0.3261), tensor(0.2396))  (tensor(-2.4113), tensor(1.7104))

(tensor(0.), tensor(0.))  (tensor(0.), tensor(0.))


(tensor(-0.0305), tensor(0.0736))  (tensor(-0.0045), tensor(0.0107))

(tensor(-0.3020), tensor(0.3738))  (tensor(-1.6481), tensor(1.9002))

(tensor(0.), tensor(0.))  (tensor(0.), tensor(0.))


Minibatch 694 complete 


(tensor(-0.0116), tensor(0.1624))  (tensor(-0.0045), tensor(0.0836))

(tensor(-0.6434), tensor(0.4909))  (tensor(-2.1577), tensor(1.2481))

(tensor(0.), tensor(0.))  (tensor(0.), tensor(0.))


(tensor(-