In [44]:
import math 
import numpy as np 
import plotly.express as px 
import plotly.io as pio
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import random

%matplotlib inline
pio.templates.default = "plotly_dark"

In [6]:
#Part 1- Derivatives

def f(x):
    return 3*x**2 - 4*x + 5

#Plot f(x) btween -5 and 5 in steps of .25 using plotly 
x = np.arange(-5,5,.25)
y = f(x)
fig = px.line(x=x,y=y)
fig.show()


In [7]:
#In neural nets, we dont calculate derivatives by hand (symbollically) 
#Derivative Definition:
#L = lim h->0 (f(x+h) - f(x))/h

h = 1E-13 #Something small
x = 3
derivative = (f(x+h) - f(x))/h
print("Derivative at x=3 is: ", derivative)

Derivative at x=3 is:  13.997691894473974


In [151]:
#Create a class "Value" that stores a value
class Value:
    def __init__(self, data, label = '', _children=(), _op=''): #Double underscore methods are called "magic methods"
        self.data = data
        self._prev = list(_children)
        self._op = _op
        self.grad = 0 
        self.label = label

    def __repr__(self):
        return f"Value({ self.data}, {self.label})"

    def __add__(self, other): #Called when we use the "+" operator
        if not isinstance(other, Value):
            other = Value(other)
        return Value(self.data+other.data, _children=(self,other), _op='+')

    def __radd__(self, other):
        return self + other

    def __sub__(self, other):
        if not isinstance(other, Value):
            other = Value(other)
        return Value(self.data-other.data, _children=(self,other), _op='-')

    def __rsub__(self, other):
        return self - other

    def __mul__(self, other): #Called when we use the "*" operator
        #If its a number, create a new Value object
        if not isinstance(other, Value):
            other = Value(other)

        return Value(self.data*other.data, _children = (self, other), _op='*')

    def __pow__(self,other):
        if not isinstance(other, Value):
            other = Value(other)
        return Value(self.data**other.data, _children=(self,other), _op='**')

    def __truediv__(self, other):
        if not isinstance(other, Value):
            other = Value(other)

        return Value(self.data/other.data, _children=(self,other), _op='/')

    def __rmul__(self, other): #other * self
        return self*other

    def tanh(self):
        return Value(math.tanh(self.data), _children=(self,), _op='tanh')

    def exp(self):
        return Value(math.exp(self.data), _children=(self,), _op='exp')


    #Backward pass
    def backwards(self, prevGrad = 1, prevOp = '', prevOther = 1):
        
        if prevOp == '*':
            self.grad += prevGrad*prevOther
        elif prevOp == "+":
            self.grad += prevGrad
        elif prevOp == "-":
            self.grad -= prevGrad
        elif prevOp == 'tanh':
            self.grad += (1 - prevOther**2)*prevGrad
        elif prevOp == 'exp':
            self.grad += math.exp(self.data)*prevGrad
        elif prevOp == '**':
            self.grad += prevOther*self.data**(prevOther-1)*prevGrad
        elif prevOp == '/':
            self.grad += prevGrad/prevOther

        else:
            self.grad += 1

        if not self._prev:
            return 

        prev2 = self._prev[1] if len(self._prev) > 1 else self 

        self._prev[0].backwards(self.grad,self._op,prev2.data) 
        if prev2 != self:
            self._prev[1].backwards(self.grad,self._op,self._prev[0].data) 


Chain Rule
----------
The chain rule is a fundamental rule in calculus that allows us to compute the derivative of a function that is composed of other functions. For example, if we have a function f(x) = g(h(x)), then the derivative of f with respect to x is given by:

$$\frac{dz}{dx} = \frac{dz}{dy} \frac{dy}{dx}$$


In [9]:
from graphviz import Digraph

def trace(root):
  # builds a set of all nodes and edges in a graph
  nodes, edges = set(), set()
  def build(v):
    if v not in nodes:
      nodes.add(v)
      for child in v._prev:
        edges.add((child, v))
        build(child)
  build(root)
  return nodes, edges

def draw_dot(root):
  dot = Digraph(format='svg', graph_attr={'rankdir': 'LR'}) # LR = left to right
  
  nodes, edges = trace(root)
  for n in nodes:
    uid = str(id(n))
    # for any value in the graph, create a rectangular ('record') node for it
    dot.node(name = uid, label = "{ %s | data %.4f | grad %.4f }" % ('', n.data, n.grad), shape='record')
    if n._op:
      # if this value is a result of some operation, create an op node for it
      dot.node(name = uid + n._op, label = n._op)
      # and connect this node to it
      dot.edge(uid + n._op, uid)

  for n1, n2 in edges:
    # connect n1 to the op node of n2
    dot.edge(str(id(n1)), str(id(n2)) + n2._op)

  return dot


In [10]:
a = Value(1,'a')
b = Value(7,'b') 
c = a + b 
c.label = 'c'
d = Value(3,'d')
e = c * d ; e.label = 'e'

e.backwards()

True


In [113]:
#Given a first node, generate a graph of the entire computation graph using plotly
import networkx as nx

def draw_graph(root):
    nodes, edges = trace(root)
    

    G = nx.DiGraph()
    G.add_nodes_from(nodes)
    G.add_edges_from(edges)

    pos = nx.spring_layout(G)
    
    node_x = []
    node_y = []

    node_label = []
    node_value = []

    for node in G.nodes():
        x, y = pos[node]
        node_x.append(x)
        node_y.append(y)
        node_label.append(node.label)
        node_value.append(node.data)

    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers+text',
        hoverinfo='text',
        text=node_label,

        marker=dict(
            size = 20,
            color = "darkblue",
            

        )
    )
    
    
    #Set text to , v = node_value, grad = node.grad (round to 2 decimal places)
    # node_trace.text = [f"{node.label} = {round(node.data,3)}, grad = {round(node.grad,3)}" for node in G.nodes()]
    node_trace.text = [f"{node.label}" for node in G.nodes()]



    #Make edges
    edge_x = []
    edge_y = []

    ops_x = []
    ops_y = []

    ops = []

    for edge in G.edges():
        x0, y0 = pos[edge[0]]
        x1, y1 = pos[edge[1]]
        edge_x.append(x0)
        edge_x.append(x1)
        edge_x.append(None)
        edge_y.append(y0)
        edge_y.append(y1)
        edge_y.append(None)

        ops.append(edge[1]._op)

        #Create an operations node in the middle of the edge
        ops_x.append((x0+x1)/2)
        ops_y.append((y0+y1)/2)


    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=3, color='#888'),
        hoverinfo='text',
        mode='lines',
    )
    edge_trace.text = ops

    ops_trace = go.Scatter(
        x=ops_x, y=ops_y,
        mode='markers+text',
        hoverinfo='text',
        text=ops,
        marker = dict(
            size = 10,
            color = 'darkred'
        ),
        textposition = 'middle center'
    )


    #Show 
    fig = go.Figure(data=[edge_trace, ops_trace, node_trace])
    fig.show()


In [43]:
#Basic Nueron
x1 = Value(2,'x1')
x2 = Value(0,'x2')

w1 = Value(-3,'w1')
w2 = Value(1,'w2')

b = Value(6.88137,'b')

y = w1*x1 + w2*x2 + b ; y.label = 'y'
f = y.tanh() ; f.label = 'f'

f.backwards()

draw_graph(f)

#Print Gradients
print(f"Gradient of w1: {w1.grad}")
print(f"Gradient of w2: {w2.grad}")
print(f"Gradient of x1: {x1.grad}")
print(f"Gradient of x2: {x2.grad}")
print(f"Gradient of y: {y.grad}")

0.8813700000000004


Gradient of w1: 1.000005072818119
Gradient of w2: 0.0
Gradient of x1: -1.5000076092271784
Gradient of x2: 0.5000025364090595
Gradient of y: 0.5000025364090595


In [42]:
a = Value(1,'a')
b = a+a 
b.backwards()
draw_graph(b)

In [23]:
import torch

In [28]:
x1 = torch.Tensor([2]).double() ; x1.requires_grad = True
x2 = torch.Tensor([0]).double() ; x2.requires_grad = True
w1 = torch.Tensor([-3]).double() ; w1.requires_grad = True
w2 = torch.Tensor([1]).double() ; w2.requires_grad = True
b = torch.Tensor([6.88137]).double() ; b.requires_grad = True
n = x1*w1 + x2*w2 + b
o = torch.tanh(n)

print(o.data.item())
o.backward()

print("x1.grad=",x1.grad.item()) 
print("x2.grad=",x2.grad.item())
print("w1.grad=",w1.grad.item())
print("w2.grad=",w2.grad.item())


0.7071050214706146
x1.grad= -1.500007465833125
x2.grad= 0.5000024886110417
w1.grad= 1.0000049772220834
w2.grad= 0.0


In [145]:
class Neuron: 
    def __init__(self, nin):
        #nin = number of inputs going into the nueron
        self.w = [Value(random.uniform(-1,1),f"w{i}") for i in range(nin)]
        self.b = Value(random.uniform(-1,1),'b')

    def __call__(self,x): 
        #w*x + b 
        return ( sum([w*x for w,x in zip(self.w,x)]) + self.b ).tanh()

    def parameters(self): 
        #Return a list of all the parameters of the neuron
        return self.w + [self.b]
        


In [146]:
#Layer of Neurons = List of Nuerons. They aren't connected to each other
class Layer: 
    def __init__(self, nin, nout):
        #nin = number of inputs going into the nueron (x)
        #nout = number of neurons in the layer
        self.neurons = [Neuron(nin) for i in range(nout)]

    def __call__(self,x):
        return [n(x) for n in self.neurons]

    def parameters(self):
        #Return a list of all the parameters of the layer
        return [p for n in self.neurons for p in n.parameters()]

In [94]:
x = [2,3]

#Inputs in Layer are the same (x)
l = Layer(2,3)
l(x)

[Value(-0.9934405903715453, ),
 Value(-0.9960895811299004, ),
 Value(0.9282588462227764, )]

In [147]:
#MLP (Multi Layer Perceptron)
#Layer of Layers

class MLP:
    def __init__(self, nin, nouts): 
        #nin = number of inputs going into the initial layer
        #nouts = list of number of outputs in each layer
        self.layers = [Layer(nin,nouts[0])] #First layer
        for i in range(1,len(nouts)):
            self.layers.append(Layer(nouts[i-1],nouts[i]))

    def __call__(self,x):
        for l in self.layers:
            x = l(x)
        return x

    def parameters(self):  
        #Return a list of all the parameters of the MLP
        return [p for l in self.layers for p in l.parameters()]


In [152]:
x = [2,3,-1]
n = MLP(3,[4, 4, 1])
n(x)

draw_graph(n(x)[0])



In [153]:
#Loss Function

#Say we have these inputs 
xs = [[2,3,-1],[3,-1,.5],[.5,1,1],[1,1,-1]]

#And we want these outputs
ys = [1,-1,-1,1]

#Right now we have 
ypred = [n(x)[0] for x in xs]
print(ypred)

#We want to minimize the loss function 

[Value(0.4542069349289467, ), Value(-0.15673021275560195, ), Value(-0.47424213042392616, ), Value(-0.26340178536793224, )]


In [155]:
#Mean Squared Error
def mse(y,ypred):
    mse = []
    for ygt, yOut in zip(y,ypred):
        #Distance from ground truth squared = MSE 
        mse.append( (ygt - yOut)**2 )

        

    return sum(mse)

loss = mse(ys,ypred)

loss.backwards()

len( n.parameters() )

41

In [144]:
n.layers[0].neurons[0].w[0].grad #Gradient of first weight in first neuron in first layer
#Slightly decreasing this would make the loss smaller (cause its calculated with respect to loss)

0.2888731551405782

In [297]:
yPred = [n(x)[0] for x in xs]
loss = mse(ys,yPred)
loss.backwards()
print("Loss=",loss.data)
print(yPred)
for p in n.parameters():
    p.data -= .001*p.grad


Loss= 12.0
[Value(-1.0, ), Value(1.0, ), Value(-1.0, ), Value(-1.0, )]
