# Deep Learning

### Tensor

In [2]:
Tensor = list

In [3]:
from typing import List

def shape(tensor: Tensor) -> List[int]:
    sizes: List[int] = []
    while isinstance(tensor, list):
        sizes.append(len(tensor))
        tensor = tensor[0]
    
    return sizes

In [4]:
print(shape([[23,4,5], [45,55]]))
print(shape([[[12,4,5], [32,5,6], [4,56,24]]]))

[2, 3]
[1, 3, 3]


In [5]:
def is_1d(tensor: Tensor) -> bool:
    '''
    If tensor[0] is a list, its a higher-order tensor.
    Otherwise, tensor is 1-dimensional (that is, a vector).'''
    return not isinstance(tensor[0], list)

In [6]:
print(is_1d([1,3,4]))
print(is_1d([[12,3], [325,5]]))

True
False


In [7]:
def tensor_sum(tensor: Tensor) -> float:
    '''sums up all the values in the tensor'''
    if is_1d(tensor):
        return sum(tensor)
    else:
        return sum(tensor_sum(tensor_i)
                   for tensor_i in tensor)

In [8]:
print(tensor_sum([[1,2,4], [1,3,4], [1,34,5]]))

55


In [9]:
from typing import Callable

In [10]:
def tensor_apply(f: Callable[[float], float], tensor: Tensor) -> Tensor:
    '''appliers f elementwise'''
    if is_1d(tensor):
        return [f(x) for x in tensor]
    else:
        return [tensor_apply(f, tensor_i) for tensor_i in tensor]

In [11]:
print(tensor_apply(lambda x: x+1, [1,2,3]))

print(tensor_apply(lambda x: 2*x, [[1,2], [3, 4]]))

[2, 3, 4]
[[2, 4], [6, 8]]


In [12]:
def zeros_like(tensor: Tensor) -> Tensor:
    return tensor_apply(lambda _ : 0.0, tensor)

In [13]:
print(zeros_like([1,2,4]))
print(zeros_like([[12,4,5], [12,4,5], [35,5, 4]]))

[0.0, 0.0, 0.0]
[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]


In [14]:
def tensor_combine(f: Callable[[float, float], float],
                   t1: Tensor,
                   t2: Tensor) -> Tensor:
    '''applies f to corresponding elements of t1 and t2'''
    if is_1d(t1):
        return [f(x, y) for x, y in zip(t1, t2)]
    else:
        return [tensor_combine(f, t1_i, t2_i)
                for t1_i, t2_i in zip(t1, t2)]

In [15]:
import operator

In [16]:
print(tensor_combine(operator.add, [1,2,3], [4,5,6]));print()
print(tensor_combine(operator.mul, [1,2,3], [4,5,6]))

[5, 7, 9]

[4, 10, 18]


## The Layer Abstraction

In [17]:
from typing import Iterable, Tuple

In [18]:
class Layer:
    '''our neural network will be composed of Layers, each of which knows how to do some 
    computation on its inputs in the "forward" direction and propagate gradients in the
    "backward", direction'''
    def forward(self, input):
        '''Note the lack of types. 
        We re not going to be prescriptive about what kinds of inputs layers can take 
        and what kinds of ouputs they can return '''
        
        raise NotImplementedError
    
    def backward(self, gradient):
        """
        Similarly, we're not going to be prescriptive about what the
        gradient looks like. It's up to you the user to make sure
        that you're doing things sensibly.
        """
        raise NotImplementedError

    def params(self) -> Iterable[Tensor]:
        """
        Returns the parameters of this layer. The default implementation
        returns nothing, so that if you have a layer with no parameters
        you don't have to implement this.
        """
        return ()

    def grads(self) -> Iterable[Tensor]:
        """
        Returns the gradients, in the same order as params()
        """
        return ()

In [19]:
from neural_network import sigmoid

In [20]:
class Sigmoid(Layer):
    
    def forward(self, input: Tensor) -> Tensor:
        '''Apply Sigmoid to each element of the input tensor,
        and save the result to use in backpropagration'''
        self.sigmoids = tensor_apply(sigmoid, input)
        return self.sigmoids
    
    def backwark(self, gradient: Tensor) -> Tensor:
        return tensor_combine(lambda sig, grad: sig * (1 - sig) * grad, self.sigmoids, gradient)

# The Linear Layer

In [21]:
import random
from probability import inverse_normal_cdf

In [22]:
def random_uniform(*dims: int) -> Tensor:
    if len(dims) == 1:
        return [random.random() for _ in range(dims[0])]
    else:
        return [random.uniform(*dims[1:]) for _ in range(dims[0])]

In [23]:
def random_normal(*dims: int,
                   mean: float = 0.0,
                   variance: float = 1.0) -> Tensor:
    
    if len(dims) == 1:
        return [mean + variance * inverse_normal_cdf(random.random())
                                                    for _ in range(dims[0])]
    else:
        return [random_normal(*dims[1:], mean = mean, variance = variance)
                                                    for _ in range(dims[0])]

In [24]:
print(shape(random_uniform(2,3,4)))
print(shape(random_normal(5,6,mean=10)))

[2]
[5, 6]


In [25]:
def random_tensor(*dims: int, init: str = 'normal') -> Tensor:
    if init == 'normal':
        return random_normal(*dims)
    elif init == 'uniform':
        return random_uniform(*dims)
    elif init == 'xavier':
        variance = len(dims) / sum(dims)
        return random_normal(*dims, variance = variance)
    else:
        raise ValueError(f"Unknown init: {init}")

In [26]:
from linearalgebra import dot

In [27]:
class Linear(Layer):
    def __init__(self,
                 input_dim: int,
                 output_dim: int,
                 init: str = 'xavier') -> None:
        
        '''
        A layer of output_dim neurons,
        each with input_dim weights (and a bias)'''
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.w = random_tensor(output_dim, input_dim, init = init)
        self.b = random_tensor(output_dim, init = init)
        
    
    def forward(self, input: Tensor) -> Tensor:
        # Saving the input to use in the backward pass.
        self.input = input
        return [dot(input, self.w[o]) + self.b[o]
                for o in range(self.output_dim)]
    
    def backward(self, gradient: Tensor) -> Tensor:
        # Each b[o] gets added to output[o], which means
        # the gradient of b is the same as the output gradient.
        self.b_grad = gradient
        # Each w[o][i] multiplies input[i] and gets added to output[o].
        # So its gradient is input[i] * gradient[o].
        self.w_grad = [[self.input[i] * gradient[o]
                        for i in range(self.input_dim)]
                        for o in range(self.output_dim)]
        # Each input[i] multiplies every w[o][i] and gets added to every
        # output[o]. So its gradient is the sum of w[o][i] * gradient[o]
        # across all the outputs.
        return [sum(self.w[o][i] * gradient[o] for o in range(self.output_dim))
                for i in range(self.input_dim)]
    
    
    def params(self) -> Iterable[Tensor]:
        return [self.w, self.b]
    
    def grads(self) -> Iterable[Tensor]:
        return [self.w_grad, self.b_grad]

## Neural Networks as a Sequance of Layers 

In [28]:
from typing import List

In [29]:
class Sequential(Layer):
    '''
    a layer consisting of a sequence of other layers.
    its up to you to make sure that the output of each layer makes sense as the
    input to the next layer.'''
    def __init__(self, layers: List[Layer]) -> None:
        self.layers = layers
        
    def forward(self, input):
        '''just forward the input through the layers in order.'''
        for layer in self.layers:
            input = layer.forward(input)
        return input
        
    def backward(self, gradient):
        '''just backpropagete the gradient through the layers in reverse.'''
        for layer in reversed(self.layers):
            gradient = layer.backward(gradient)
        
        return gradient
    
    def params(self) -> Iterable[Tensor]:
        '''just return the params from each layers'''
        return (param for layer in self.layers for param in layer.params())
    
    def grads(self) -> Iterable[Tensor]:
        '''just return the grad'''
        return (grad for layer in self.layers for grad in layer.grad())

In [30]:
xor_net = Sequential([Linear(input_dim=2, output_dim=2),
                      Sigmoid(),
                      Linear(input_dim=2, output_dim=1),
                      Sigmoid()])

## Loss and Optimization

In [31]:

class Loss:
    def loss(self, predicted: Tensor, actual: Tensor) -> float:
        """How good are our predictions? (Larger numbers are worse.)"""
        raise NotImplementedError

    def gradient(self, predicted: Tensor, actual: Tensor) -> Tensor:
        """How does the loss change as the predictions change?"""
        raise NotImplementedError

class SSE(Loss):
    """Loss function that computes the sum of the squared errors."""
    def loss(self, predicted: Tensor, actual: Tensor) -> float:
        # Compute the tensor of squared differences
        squared_errors = tensor_combine(
            lambda predicted, actual: (predicted - actual) ** 2,
            predicted,
            actual)

        # And just add them up
        return tensor_sum(squared_errors)

    def gradient(self, predicted: Tensor, actual: Tensor) -> Tensor:
        return tensor_combine(
            lambda predicted, actual: 2 * (predicted - actual),
            predicted,
            actual)

class Optimizer:
    """
    An optimizer updates the weights of a layer (in place) using information
    known by either the layer or the optimizer (or by both).
    """
    def step(self, layer: Layer) -> None:
        raise NotImplementedError

class GradientDescent(Optimizer):
    def __init__(self, learning_rate: float = 0.1) -> None:
        self.lr = learning_rate

    def step(self, layer: Layer) -> None:
        for param, grad in zip(layer.params(), layer.grads()):
            # Update param using a gradient step
            param[:] = tensor_combine(
                lambda param, grad: param - grad * self.lr,
                param,
                grad)


In [32]:
class Momentum(Optimizer):
    def __init__(self, learning_rate: float,
                 Momentum: float = 0.9) -> None:
          self.lr = learning_rate
          self.mo = Momentum
          self.updates: List[Tensor] = []
        
    def step(self, layers: Layer) -> None:
        if not self.updates:
            self.updates = [zeros_like(grad) for grad in layer.grads()]
            for update, param, grad in zip(self.updates, layer.param(),
                                           layer.grads()):
                    
                update[:] = tensor_combine(
                    lambda u, g: self.mo * u + (1- self.mo) * g,
                    update, grad
                )
                param[:] = tensor_combine(
                    lambda p, u: p - self.lr * u, param, update
                )