In [0]:
class Value:
    """ stores a value and its gradient """

    def __init__(self, data, _children=(), _op=''):
        self.data = data
        self.grad = 0
        # internal variables used for autograd graph construction
        self._backward = lambda: None
        self._prev = set(_children)
        self._op = _op # the op that produced this node, for graphviz / debugging / etc

    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            self.grad += out.grad
            other.grad += out.grad
        out._backward = _backward

        return out

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
            
            
        out._backward = _backward

        return out
    
    
        
        

    def __pow__(self, other):
        assert isinstance(other, (int, float)), "only supporting int/float powers for now"
        out = Value(self.data**other, (self,), f'**{other}')

        def _backward():
            
            
            self.grad += (other * self.data**(other-1)) * out.grad
        out._backward = _backward

        return out

    def relu(self):
        out = Value(0 if self.data < 0 else self.data, (self,), 'ReLU')

        def _backward():
            self.grad += (out.data > 0) * out.grad
        out._backward = _backward

        return out
    def matmul(self,other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(np.matmul(self.data , other.data), (self, other), 'matmul')
        def _backward():
            self.grad += np.dot(out.grad,other.data.T)
            other.grad += np.dot(self.data.T,out.grad)
            
            
        out._backward = _backward

        return out
    
    def reduce_sum(self,axis = None):
        out = Value(np.sum(self.data,axis = axis), (self,), 'REDUCE_SUM')
        
        def _backward():
            output_shape = np.array(self.data.shape)
            output_shape[axis] = 1
            tile_scaling = self.data.shape // output_shape
            grad = np.reshape(out.grad, output_shape)
            self.grad += np.tile(grad, tile_scaling)
            
        out._backward = _backward

        return out

    def backward(self):

        # topological order all of the children in the graph
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)

        # go one variable at a time and apply the chain rule to get its gradient
        self.grad = 1
        for v in reversed(topo):
            #print(v)
            v._backward()

    def __neg__(self): # -self
        return self * -1

    def __radd__(self, other): # other + self
        return self + other

    def __sub__(self, other): # self - other
        return self + (-other)

    def __rsub__(self, other): # other - self
        return other + (-self)

    def __rmul__(self, other): # other * self
        return self * other

    def __truediv__(self, other): # self / other
        return self * other**-1

    def __rtruediv__(self, other): # other / self
        return other * self**-1

    def __repr__(self):
        return f"Value(data={self.data}, grad={self.grad})"

In [0]:
import random
import numpy as np

In [0]:
import random
x_val = []
#generating 100 random data points
for i in range(100):
    a = random.randint(1,10)
    b = random.randint(1,10)
    x_val.append((a,b))
    
x_vals = Value(np.array(x_val))

In [4]:
x_vals

Value(data=[[ 9  9]
 [ 6  5]
 [ 7 10]
 [ 5  4]
 [ 8 10]
 [ 5  3]
 [10  6]
 [ 7  5]
 [ 2  4]
 [10  7]
 [ 4  6]
 [ 3  1]
 [ 8  1]
 [ 8  6]
 [ 8  7]
 [10  4]
 [ 1  9]
 [ 3  2]
 [ 7  2]
 [ 9 10]
 [ 9  5]
 [ 4  3]
 [10  1]
 [ 5  5]
 [ 4  8]
 [ 2  8]
 [ 6  5]
 [ 2  4]
 [ 8 10]
 [ 1  8]
 [ 9  3]
 [ 7  2]
 [ 5  2]
 [ 3  3]
 [ 8 10]
 [ 1  1]
 [ 1  4]
 [ 5  8]
 [ 8  3]
 [ 8  3]
 [ 1  5]
 [ 9  2]
 [ 1  8]
 [ 4  4]
 [ 3 10]
 [ 9  7]
 [ 7  2]
 [ 8  1]
 [ 4  7]
 [ 3  7]
 [ 3 10]
 [ 7  8]
 [ 6  4]
 [ 5  8]
 [ 8  3]
 [ 7  1]
 [ 3  7]
 [ 1  9]
 [ 6  9]
 [ 2  9]
 [ 7  8]
 [ 6  9]
 [ 2  7]
 [ 9  1]
 [ 4  8]
 [ 5 10]
 [ 5  7]
 [ 5  6]
 [ 3  4]
 [ 4  9]
 [ 5  9]
 [ 9  9]
 [ 6  8]
 [ 7  9]
 [ 7  5]
 [ 8  4]
 [ 9 10]
 [ 3 10]
 [ 1 10]
 [ 3  2]
 [ 8  3]
 [ 5  2]
 [ 5  6]
 [10  7]
 [ 9  4]
 [10  2]
 [ 1  7]
 [ 7  9]
 [10  7]
 [10  7]
 [ 7  6]
 [ 6  8]
 [ 1  9]
 [ 1  5]
 [ 5  8]
 [ 5  1]
 [ 7  8]
 [ 8  9]
 [ 9  1]
 [10  7]], grad=0)

In [0]:
y_true = [1.9*x1 + 2.2*x2 for x1,x2 in x_val]  # w1 and w2 are 1.9 and 2.2 respectively
y_true = np.array([y_true])

In [0]:
y_true = Value(np.transpose(y_true))

In [7]:
y_true

Value(data=[[36.9]
 [22.4]
 [35.3]
 [18.3]
 [37.2]
 [16.1]
 [32.2]
 [24.3]
 [12.6]
 [34.4]
 [20.8]
 [ 7.9]
 [17.4]
 [28.4]
 [30.6]
 [27.8]
 [21.7]
 [10.1]
 [17.7]
 [39.1]
 [28.1]
 [14.2]
 [21.2]
 [20.5]
 [25.2]
 [21.4]
 [22.4]
 [12.6]
 [37.2]
 [19.5]
 [23.7]
 [17.7]
 [13.9]
 [12.3]
 [37.2]
 [ 4.1]
 [10.7]
 [27.1]
 [21.8]
 [21.8]
 [12.9]
 [21.5]
 [19.5]
 [16.4]
 [27.7]
 [32.5]
 [17.7]
 [17.4]
 [23. ]
 [21.1]
 [27.7]
 [30.9]
 [20.2]
 [27.1]
 [21.8]
 [15.5]
 [21.1]
 [21.7]
 [31.2]
 [23.6]
 [30.9]
 [31.2]
 [19.2]
 [19.3]
 [25.2]
 [31.5]
 [24.9]
 [22.7]
 [14.5]
 [27.4]
 [29.3]
 [36.9]
 [29. ]
 [33.1]
 [24.3]
 [24. ]
 [39.1]
 [27.7]
 [23.9]
 [10.1]
 [21.8]
 [13.9]
 [22.7]
 [34.4]
 [25.9]
 [23.4]
 [17.3]
 [33.1]
 [34.4]
 [34.4]
 [26.5]
 [29. ]
 [21.7]
 [12.9]
 [27.1]
 [11.7]
 [30.9]
 [35. ]
 [19.3]
 [34.4]], grad=0)

In [8]:
W = Value(np.array([[0.9],[0.2]])) # w1 and w2 initialized to 0.9 and 0.2 respectively
W

Value(data=[[0.9]
 [0.2]], grad=0)

In [9]:
epochs = 10
#gradient descent over the whole dataset
for epoch in range(epochs):
    y_pred = x_vals.matmul(W)
    z = (y_true-y_pred)
    z_ = z**2
    out = z_.reduce_sum(axis = 1)
    fin = 0.01*out.reduce_sum()  #(0.01 = 1/100 = BATCH_SIZE)
    fin.backward()
    print(f'loss in epoch {epoch+1} is {fin}')
    W.data = W.data- 0.01*W.grad
    W.grad = 0

loss in epoch 1 is Value(data=344.91, grad=1)
loss in epoch 2 is Value(data=87.32914902839995, grad=1)
loss in epoch 3 is Value(data=22.826526386959987, grad=1)
loss in epoch 4 is Value(data=6.433547998975098, grad=1)
loss in epoch 5 is Value(data=2.10864601612934, grad=1)
loss in epoch 6 is Value(data=0.8643795163737399, grad=1)
loss in epoch 7 is Value(data=0.4417139197913734, grad=1)
loss in epoch 8 is Value(data=0.26096539672614016, grad=1)
loss in epoch 9 is Value(data=0.16557945665922355, grad=1)
loss in epoch 10 is Value(data=0.10824826286236029, grad=1)


In [10]:
W

Value(data=[[1.96191472]
 [2.13733132]], grad=0)

In [11]:
batch_size = 32
steps = 100
Wb = Value(np.array([[9.0],[22.2]]))# new initialized weights for gradient descent
for step in range(steps):
  ri = np.random.permutation(x_vals.data.shape[0])[:batch_size]
  Xb, yb = Value(x_vals.data[ri]), Value(y_true.data[ri])
  y_predW = Xb.matmul(Wb)
  zb = (yb-y_predW)
  z_b = zb**2
  outb = z_b.reduce_sum(axis = 1)
  finb = 0.32*outb.reduce_sum()  #(0.32 = 1/32 = BATCH_SIZE)
  finb.backward()
  print(f'loss in step {step+1} is {finb}')
  Wb.data = Wb.data- 0.001*Wb.grad
  Wb.grad = 0

loss in step 1 is Value(data=278226.4544, grad=1)
loss in step 2 is Value(data=63772.18526760565, grad=1)
loss in step 3 is Value(data=18953.449879588352, grad=1)
loss in step 4 is Value(data=7152.0502941471, grad=1)
loss in step 5 is Value(data=2834.9708044579033, grad=1)
loss in step 6 is Value(data=1012.7818149786664, grad=1)
loss in step 7 is Value(data=996.3690478488656, grad=1)
loss in step 8 is Value(data=485.70346024457206, grad=1)
loss in step 9 is Value(data=339.92461319700584, grad=1)
loss in step 10 is Value(data=169.6036274967689, grad=1)
loss in step 11 is Value(data=106.82733269226512, grad=1)
loss in step 12 is Value(data=72.95774869535279, grad=1)
loss in step 13 is Value(data=50.8637110315019, grad=1)
loss in step 14 is Value(data=28.83550919938446, grad=1)
loss in step 15 is Value(data=20.705095567942575, grad=1)
loss in step 16 is Value(data=20.090436223797578, grad=1)
loss in step 17 is Value(data=10.741854577055179, grad=1)
loss in step 18 is Value(data=7.62077211

In [12]:
Wb

Value(data=[[1.89999999]
 [2.20000001]], grad=0)