In [0]:
class Value:
    """Node of a reverse-mode autodiff graph: stores a value and its gradient.

    ``data`` may be a Python scalar or a NumPy array.  Every operation returns
    a new ``Value`` that remembers its operands (``_prev``) and a closure
    (``_backward``) that adds the node's gradient contribution to those
    operands.  Calling ``backward()`` on the final node runs the closures in
    reverse topological order.

    Gradients are accumulated out-of-place (``g = g + delta``) rather than
    with ``+=``: an in-place add into an integer gradient array raises a NumPy
    casting error as soon as a float contribution arrives.
    """

    def __init__(self, data, _children=(), _op=''):
        self.data = data
        self.grad = 0
        # internal variables used for autograd graph construction
        self._backward = lambda: None
        self._prev = set(_children)
        self._op = _op  # the op that produced this node, for graphviz / debugging / etc

    @staticmethod
    def _unbroadcast(grad, like):
        """Reduce ``grad`` to the shape of ``like`` by summing broadcast axes.

        Elementwise ops broadcast their operands, so the incoming gradient has
        the shape of the *result*.  The gradient of an operand is the sum over
        every axis that broadcasting added or stretched.
        """
        target = getattr(like, 'shape', ())
        if getattr(grad, 'shape', ()) == target:
            # Common case (also the pure Python-scalar case): nothing to do.
            return grad
        grad = np.asarray(grad)
        lead = grad.ndim - len(target)
        if lead < 0:
            # Gradient has lower rank than the operand (e.g. a scalar seed).
            return np.broadcast_to(grad, target)
        if lead > 0:
            # Axes that broadcasting prepended to the operand.
            grad = grad.sum(axis=tuple(range(lead)))
        stretched = tuple(
            i for i, size in enumerate(target) if size == 1 and grad.shape[i] != 1
        )
        if stretched:
            # Size-1 axes of the operand that were stretched to match.
            grad = grad.sum(axis=stretched, keepdims=True)
        return grad

    def __add__(self, other):
        """Elementwise ``self + other``; ``other`` may be a raw number/array."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            self.grad = self.grad + Value._unbroadcast(out.grad, self.data)
            other.grad = other.grad + Value._unbroadcast(out.grad, other.data)
        out._backward = _backward

        return out

    def __mul__(self, other):
        """Elementwise ``self * other``; ``other`` may be a raw number/array."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            self.grad = self.grad + Value._unbroadcast(other.data * out.grad, self.data)
            other.grad = other.grad + Value._unbroadcast(self.data * out.grad, other.data)
        out._backward = _backward

        return out

    def __pow__(self, other):
        """Elementwise ``self ** other`` for a constant int/float exponent."""
        assert isinstance(other, (int, float)), "only supporting int/float powers for now"
        out = Value(self.data**other, (self,), f'**{other}')

        def _backward():
            self.grad = self.grad + (other * self.data**(other-1)) * out.grad
        out._backward = _backward

        return out

    def relu(self):
        """Elementwise ``max(x, 0)``."""
        out = Value(np.where(self.data < 0, 0, self.data), (self,), 'ReLU')

        def _backward():
            # Gradient passes through only where the output is strictly positive.
            self.grad = self.grad + np.where(out.data > 0, 1, 0) * out.grad
        out._backward = _backward

        return out

    def matmul(self, other):
        """Matrix product ``self @ other``.

        The backward pass transposes the operands with ``.T`` and combines
        them with ``np.dot``, i.e. it is written for 2-D operands.
        """
        other = other if isinstance(other, Value) else Value(other)
        out = Value(np.matmul(self.data, other.data), (self, other), 'matmul')

        def _backward():
            self.grad = self.grad + np.dot(out.grad, other.data.T)
            other.grad = other.grad + np.dot(self.data.T, out.grad)
        out._backward = _backward

        return out

    def softmax(self):
        """Softmax along axis 1.

        The per-row maximum is subtracted before exponentiating.  Softmax is
        invariant to that shift, and it keeps ``np.exp`` from overflowing to
        ``inf`` (which would produce ``nan`` probabilities) for large inputs.
        """
        shifted = self.data - np.max(self.data, axis=1, keepdims=True)
        exps = np.exp(shifted)
        out = Value(exps / np.sum(exps, axis=1, keepdims=True), (self,), 'softmax')
        probs = out.data

        def _backward():
            # Jacobian-vector product of softmax: (g - sum(g * p)) * p per row.
            row_dot = np.reshape(np.sum(out.grad * probs, 1), [-1, 1])
            self.grad = self.grad + (out.grad - row_dot) * probs
        out._backward = _backward

        return out

    def log(self):
        """Elementwise natural logarithm."""
        out = Value(np.log(self.data), (self,), 'log')

        def _backward():
            self.grad = self.grad + out.grad / self.data
        out._backward = _backward

        return out

    def reduce_sum(self, axis=None):
        """Sum over ``axis`` (``None`` sums every element).

        ``axis`` may be ``None``, an int (negative allowed) or a tuple of ints.
        """
        out = Value(np.sum(self.data, axis=axis), (self,), 'REDUCE_SUM')

        def _backward():
            # Every input element contributes with weight 1, so the gradient
            # is the output gradient copied back along the reduced axes.
            grad = np.broadcast_to(out.grad, np.shape(out.data))
            if axis is not None:
                grad = np.expand_dims(grad, axis)
            self.grad = self.grad + np.broadcast_to(grad, np.shape(self.data))
        out._backward = _backward

        return out

    def backward(self):
        """Backpropagate from this node through every node it depends on.

        The node's own gradient is seeded with ones of its shape (plain ``1``
        for scalars), i.e. for array outputs this differentiates the sum of
        the elements.
        """
        # Topological order of all ancestors.  Iterative post-order DFS, so
        # deep graphs cannot exhaust Python's recursion limit.
        topo = []
        visited = {self}
        stack = [(self, iter(self._prev))]
        while stack:
            node, pending = stack[-1]
            for child in pending:
                if child not in visited:
                    visited.add(child)
                    stack.append((child, iter(child._prev)))
                    break
            else:
                # All children emitted: the node itself can be emitted.
                topo.append(node)
                stack.pop()

        # go one variable at a time and apply the chain rule to get its gradient
        shape = getattr(self.data, 'shape', ())
        self.grad = np.ones(shape) if shape else 1
        for v in reversed(topo):
            v._backward()

    def __neg__(self): # -self
        return self * -1

    def __radd__(self, other): # other + self
        return self + other

    def __sub__(self, other): # self - other
        return self + (-other)

    def __rsub__(self, other): # other - self
        return other + (-self)

    def __rmul__(self, other): # other * self
        return self * other

    def __truediv__(self, other): # self / other
        return self * other**-1

    def __rtruediv__(self, other): # other / self
        return other * self**-1

    def __repr__(self):
        return f"Value(data={self.data}, grad={self.grad})"

In [2]:
from keras.datasets import mnist
import keras
import numpy as np

Using TensorFlow backend.


In [0]:
# Fetch the MNIST train/test split through Keras.
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Scale pixel values by 1/255 as float32 and flatten each image to a
# 784-element row vector.
train_images = (np.asarray(x_train, dtype=np.float32) / 255.0).reshape(60000, 784)
test_images = (np.asarray(x_test, dtype=np.float32) / 255.0).reshape(10000, 784)

# One-hot encode the training labels only; y_test is left as loaded.
y_train = keras.utils.to_categorical(y_train)

In [0]:
def calculate_loss(X,Y,W):
    """Mean softmax cross-entropy of the linear model ``X @ W``.

    Args:
        X: inputs, one example per row.
        Y: one-hot targets, one row per example.
        W: weight matrix mapping inputs to class logits.

    Returns:
        The cross-entropy averaged over the rows of ``X``.
    """
    # Compute the logits once (the product over the full dataset is the
    # expensive part) and use the log-sum-exp form of log-softmax: subtracting
    # the row maximum leaves the result unchanged mathematically but prevents
    # exp() from overflowing to inf, which would turn the loss into nan.
    logits = np.matmul(X, W)
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    log_probs = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
    return -(1/X.shape[0]) * np.sum(Y * log_probs)

In [5]:
# Mini-batch SGD for softmax regression (one 784x10 weight matrix),
# differentiated with the Value autograd class.
batch_size = 32
steps = 40000
Wb = Value(np.random.randn(784, 10))  # new initialized weights for gradient descent

for step in range(steps):
    # Random mini-batch: first `batch_size` entries of a fresh permutation.
    ri = np.random.permutation(train_images.shape[0])[:batch_size]
    Xb = Value(train_images[ri])
    yb = Value(y_train[ri])

    # Forward pass: logits -> probabilities -> log-probabilities.
    y_predW = Xb.matmul(Wb)
    probs = y_predW.softmax()
    log_probs = probs.log()

    # Cross-entropy: the one-hot mask keeps the log-probability of the true
    # class, summed per example and then averaged over the batch.
    zb = yb * log_probs
    outb = zb.reduce_sum(axis=1)
    finb = -(1/batch_size) * outb.reduce_sum()  # cross entropy loss
    finb.backward()

    # Every 1000 steps report the loss on the full training set
    # (measured before this step's update is applied).
    if not step % 1000:
        loss = calculate_loss(train_images, y_train, Wb.data)
        print(f'loss in step {step} is {loss}')

    # Plain SGD update with step size 0.01, then clear the gradient.
    Wb.data = Wb.data - 0.01 * Wb.grad
    Wb.grad = 0

loss = calculate_loss(train_images, y_train, Wb.data)
print(f'loss in final step {step+1} is {loss}')

loss in step 0 is 11.965824562043244
loss in step 1000 is 3.5340812250235207
loss in step 2000 is 2.3862026312303235
loss in step 3000 is 1.8992034209586457
loss in step 4000 is 1.6277956004915592
loss in step 5000 is 1.4518542304774498
loss in step 6000 is 1.3296696811452746
loss in step 7000 is 1.237307320629226
loss in step 8000 is 1.1640020398560544
loss in step 9000 is 1.1050946004669735
loss in step 10000 is 1.0552722325223898
loss in step 11000 is 1.0135529844636009
loss in step 12000 is 0.9776317683814056
loss in step 13000 is 0.9462063287287965
loss in step 14000 is 0.9188650484572274
loss in step 15000 is 0.8939651384373788
loss in step 16000 is 0.8716843701320222
loss in step 17000 is 0.8516149151684732
loss in step 18000 is 0.8326553726649465
loss in step 19000 is 0.8159560798766764
loss in step 20000 is 0.7998413369543754
loss in step 21000 is 0.7850434500979272
loss in step 22000 is 0.7710236421713172
loss in step 23000 is 0.7590027179646724
loss in step 24000 is 0.746461

In [6]:
from sklearn.metrics import accuracy_score

# Predicted class = index of the largest logit of the linear model.
test_predictions = np.argmax(np.matmul(test_images, Wb.data), axis=1)
# accuracy_score's documented signature is (y_true, y_pred); the original call
# passed them swapped.  Accuracy is symmetric, so the printed value is the
# same, but the documented order is used here.
print(f'accuracy on test data is {accuracy_score(y_test, test_predictions)*100} %')

accuracy on test data is 86.74 %


In [7]:
# Same accuracy as above, but taken from the softmax probabilities.  The
# logits are computed once, and the row maximum is subtracted before exp() so
# large logits cannot overflow to inf/nan (softmax is invariant to the shift).
linear_test_logits = np.matmul(test_images, Wb.data)
linear_test_exp = np.exp(linear_test_logits - np.max(linear_test_logits, axis=1, keepdims=True))
linear_test_probs = linear_test_exp / np.sum(linear_test_exp, axis=1, keepdims=True)
# accuracy_score expects (y_true, y_pred).
accuracy_score(y_test, np.argmax(linear_test_probs, axis=1))

0.8674

In [0]:
def calculate_loss1(X,Y,W1,W2):
    """Mean softmax cross-entropy of a one-hidden-layer ReLU network.

    The model is ``softmax(relu(X @ W1) @ W2)``.

    Args:
        X: inputs, one example per row.
        Y: one-hot targets, one row per example.
        W1: input-to-hidden weight matrix.
        W2: hidden-to-logit weight matrix.

    Returns:
        The cross-entropy averaged over the rows of ``X``.
    """
    # Each matrix product is evaluated once instead of twice.
    hidden = np.maximum(np.matmul(X, W1), 0)
    logits = np.matmul(hidden, W2)
    # Log-sum-exp form of log-softmax: subtracting the row maximum keeps exp()
    # from overflowing to inf (and the loss from becoming nan) when the logits
    # are large.
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    log_probs = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
    return -(1/X.shape[0]) * np.sum(Y * log_probs)

In [9]:
# Mini-batch SGD for a 784-128-10 network with a ReLU hidden layer,
# differentiated with the Value autograd class.
batch_size = 32
steps = 20000
Wb1 = Value(np.random.randn(784, 128))
Wb2 = Value(np.random.randn(128, 10))  # new initialized weights for gradient descent

for step in range(steps):
    # Random mini-batch: first `batch_size` entries of a fresh permutation.
    ri = np.random.permutation(train_images.shape[0])[:batch_size]
    Xb = Value(train_images[ri])
    yb = Value(y_train[ri])

    # Forward pass: hidden ReLU layer, then output logits.
    y_predW1 = Xb.matmul(Wb1).relu()
    y_predW = y_predW1.matmul(Wb2)

    probs = y_predW.softmax()
    log_probs = probs.log()

    # Cross-entropy: the one-hot mask keeps the log-probability of the true
    # class, summed per example and then averaged over the batch.
    zb = yb * log_probs
    outb = zb.reduce_sum(axis=1)
    finb = -(1/batch_size) * outb.reduce_sum()  # cross entropy loss
    finb.backward()

    # Every 1000 steps report the loss on the full training set
    # (measured before this step's update is applied).
    if not step % 1000:
        loss = calculate_loss1(train_images, y_train, Wb1.data, Wb2.data)
        print(f'loss in step {step} is {loss}')

    # Plain SGD update with step size 0.01 on both layers, then clear the
    # gradients.
    Wb1.data = Wb1.data - 0.01 * Wb1.grad
    Wb2.data = Wb2.data - 0.01 * Wb2.grad
    Wb1.grad = 0
    Wb2.grad = 0

loss = calculate_loss1(train_images, y_train, Wb1.data, Wb2.data)
print(f'loss in final step {step+1} is {loss}')

loss in step 0 is 104.55093089161471
loss in step 1000 is 4.801904007411178
loss in step 2000 is 3.2042965135586274
loss in step 3000 is 2.4715316092511848
loss in step 4000 is 2.025001506053377
loss in step 5000 is 1.763347535093437
loss in step 6000 is 1.5327570850072534
loss in step 7000 is 1.370981805304221
loss in step 8000 is 1.2276269130843802
loss in step 9000 is 1.1433440024735015
loss in step 10000 is 1.0325585242117796
loss in step 11000 is 0.9887109268906167
loss in step 12000 is 0.9003985043834779
loss in step 13000 is 0.8428990153052753
loss in step 14000 is 0.7845594497537397
loss in step 15000 is 0.7633394953693724
loss in step 16000 is 0.6938871874269439
loss in step 17000 is 0.6564890083807886
loss in step 18000 is 0.649455646441251
loss in step 19000 is 0.6222885484394804
loss in final step 20000 is 0.5905740356817294


In [10]:
from sklearn.metrics import accuracy_score

# Forward pass of the trained network on the test images.  Each matrix
# product is evaluated once instead of twice.
y1 = np.maximum(np.matmul(test_images, Wb1.data), 0)  # hidden ReLU activations
mlp_test_logits = np.matmul(y1, Wb2.data)
# Subtract the row maximum before exp(): softmax is unchanged by the shift,
# and large logits can no longer overflow to inf/nan.
mlp_test_exp = np.exp(mlp_test_logits - np.max(mlp_test_logits, axis=1, keepdims=True))
prob = mlp_test_exp / np.sum(mlp_test_exp, axis=1, keepdims=True)

# accuracy_score expects (y_true, y_pred).
accuracy_score(y_test, np.argmax(prob, axis=1))


0.9068