In [6]:
from grad import *
import numpy as np
import math

In [8]:
def l2_loss(truth, model):
    ''' Simple L2 loss for a single sample.
    
        Parameters
        ----------
        truth : Sequence[float]
            A single data point.
              
        model : Sequence[float]
            The parameters of a model.
              
        Returns
        -------
        Number : the L2 loss (squared error) of `model_params` evaluated on `truth`
    '''
    l = truth[0] - model[0] - sum(truth[i]*model[i] for i in range(1, len(model)))
    return l**2 if l.data > 0 else (-1*l)**2

In [10]:
def train_epoch(model, data_set, loss_fn, lr=0.001):
    ''' Train a model for a single pass (epoch) through the provided data.
    
        Parameters
        ----------
        model : Sequence[Number]
            The parameters of a model
            
        data_set : Sequence[Sequence[float]]
            The datapoints in a dataset
            
        Returns
        -------
        float : the mean loss for the epoch
    '''
    # compute the mean error over the dataset
    mean_loss = sum(loss_fn(sample, model) for sample in data_set) / len(data_set)
    
    # compute gradients for our parameters
    mean_loss.null_gradients()
    mean_loss.backprop()
    
    # update the model parameters using gradient descent
    for param in model:
        # recall: param.grad is d(L)/d(param)
        # thus this computes:
        # param_new = param_old - step-size * d(L)/d(param) 
        param.data -= lr*param.grad  
        
    # return the loss for visualization
    return mean_loss.data

In [4]:
np.random.seed(0)

def weight_matrix(shape, stupid=False):
    """weight matrix thingy.give dims. Not 0."""
    # np.as_array(Number(np.random.rand()) for i in range(M*N))
    number = 1
    if(type(shape) == int):
        shape = [shape]
    for i in shape:
        number*= i
    if stupid:
        return np.array([Number(i / 10) for i in range(number)]).reshape(*shape)
    return np.array([Number(np.random.uniform(low=-.5, high=.5, size=None)) for i in range(number)]).reshape(*shape)
 
def get_grads(x):
    vectorized_grad = np.vectorize(lambda x : x.grad) #not true vecotrization just for readibility
    return vectorized_grad(x)

def sigmoid(x):
    return np.vectorize(lambda x: 1/(1+math.e**-x))(x)

In [4]:
class Model():
    '''this only works w/ MSE and sigmoid because dik how to topological sort'''
    
    def __init__(self, input_size, output_size, hidden_layers, stupid=False):
        '''
        Takes list of # of things in their layers.
        Layers are outputs?
        '''
        self.layer_sizes = hidden_layers
        self.layers = []
        self.biases = []

        #Hidden states is after the *weight but before sigmoid
        self.hidden_states = []

        self.hidden_states_sigmoid = []
        
        prev = input_size
        for hidden_layer in hidden_layers:
            # self.layers, weight_matrix([prev, hidden_layer])
            self.layers.append(weight_matrix([prev, hidden_layer], stupid))

            self.biases.append(weight_matrix(hidden_layer, stupid))
            prev  = hidden_layer
        self.biases.append(weight_matrix([output_size]))
        self.layers.append(weight_matrix([prev, output_size]))
 

        # print("layers "  + str(self.layers))
        # print("biases " + str(self.biases))

    def fd(self, x):
        '''f pass with input. Input has to be flat like a pancake'''
        #sigma sigma boy.
        self.hidden_states_sigmoid = []
        self.hidden_states = []
        self.input = x
        for i in range(len(self.layers)):
            # print(np.max(x))
            x = x @ self.layers[i]
            x += self.biases[i]
            self.hidden_states.append(x)
            x = sigmoid(x)
            self.hidden_states_sigmoid.append(x)

        return x
        
    def train_epoch(self, x, y, lr=10**-2):
        '''
        f pass and then uh gradient descent?

        x: Input. Again, flat as a pancake.
        y: the goal. In sparese tensor. 
        lr: how quick it learns
        '''
        num_correct = 0
        losses = []
        for i in range(len(y)):
            # print(x)
            pred = self.fd(x[i])
            mse = np.sum((pred * pred -  y[i] *  y[i] + 2 * np.dot(pred, y[i]))) / len(y)
            self.backprop(mse, y[i], lr)

            num_correct += np.argmax(pred) == np.argmax(y[i])
            losses.append(mse)

            print(f"Acc: {num_correct/len(y)} Avg loss: {sum(losses)/len(y)}")


        # print("finished backpropring gradiests")

    def backprop(self, loss, y, lr):
        null_gradients(self.biases)
        null_gradients(self.layers)

         
        loss.null_gradients(recursive=False)
 
        #And here comes the hard part.
        for i in range(len(self.hidden_states_sigmoid[-1])):
            #the last one gets special treatment due to the mse. help this is so scuffed :sob:
            self.hidden_states_sigmoid[-1][i].grad = 2 * (self.hidden_states_sigmoid[-1][i]  - y[i])

        
        for k in range(len(self.hidden_states_sigmoid) - 1,0 , -1):
            curr_h = self.hidden_states[k]
            curr_hs = self.hidden_states_sigmoid[k]
            #Get the gradients of the hidden layer : h_k[i].grad = sh_k[i].sigmoid(h_k[i]) * ( 1 - sigmoid(h_k[i]))
            for i in range(len(curr_h)):
                curr_h[i].grad =  curr_hs[i].grad * sigmoid(curr_h[i]) * ( 1 - sigmoid(curr_h[i]))

            #fix gradients in w_k with w_k[i][j].grad = s_(k-1)[j] * h_k[j].grad
            #Or: grad of a weight d(input to weight layer) * 1 * grad after
            curr_w = self.layers[k]
            # print(self.hidden_states_sigmoid[k-1])
            # print(curr_w, "cur")
            for i in range(curr_w.shape[0]):
                for j in range(curr_w.shape[1]):
                    curr_w[i][j].grad = self.hidden_states_sigmoid[k-1][i] * self.hidden_states[k][j].grad 
                    
            #Get gradients in b_k with b_k[i].grad = h_k[i].grad
            curr_b = self.biases[k]
            for i in range(curr_b.shape[0]):
                curr_b[i].grad = self.hidden_states[k][i].grad 
                    
            #Compute next s(k-1) gradients.
            prev_hs = self.hidden_states_sigmoid[k - 1]
            prev_h = self.hidden_states[k - 1]
 

            for i in range(len(prev_hs)):
                #Gonna have to do iteration i think
                prev_hs[i].grad = 0
                for j in range(len(curr_w[i])):
                    prev_hs[i].grad += sigmoid(prev_h[i]) * ( 1 - sigmoid(prev_h[i])) * curr_w[i][j].grad


        curr_h = self.hidden_states[0]
        curr_hs = self.hidden_states_sigmoid[0]
        for i in range(len(curr_h)):
            curr_h[i].grad =  curr_hs[i].grad * sigmoid(curr_h[i]) * ( 1 - sigmoid(curr_h[i]))
            
        #Manual computation of the last layer that connects to input layer.
        curr_w = self.layers[0]
        for i in range(curr_w.shape[0]):
            for j in range(curr_w.shape[1]):
                curr_w[i][j].grad = self.input[j] * self.hidden_states[0][j].grad 
                
        #Get gradients in b_k with b_k[i].grad = h_k[j].grad
        curr_b = self.biases[0]
        for j in range(curr_b.shape[0]):
            curr_b[j].grad = 1 * self.hidden_states[0][j].grad 

        #Actual updates.
        #Subtract gradient from everything. Here b and w refer to individual weights and biases inconsistent notation woohoo
        for layer in self.layers:
            for w in layer.flat:
                 w -= w.grad * lr

        for bias in self.biases:
            for b in bias.flat:
                b -= b.grad * lr
                
    def get_gradients(layer):
        for weight in self.layers[i].flat:
            weight.backprop(recursive=false)
        for bias in self.biases[i].flat:
            bias.backprop(recursive=false)  
            
          
    def print_info(self, verbose=True):
        print("layers " )
        for i in range(len(self.layers)):
            print( f"Layer {i} of shape {self.layers[i].shape}")
            print(self.layers[i])
        print("biases ")
        for i in range(len(self.biases)):
            print( f"Layer {i} of shape {self.biases[i].shape}")
            print(self.biases[i])

In [14]:
tiny_x = [1, 2, 3, 4]
tiny_test = Model(4, 2, [3, 5], stupid=True)
tiny_test.print_info()
tiny_test.train_epoch(tiny_x, np.array([2,3]))
tiny_test.train_epoch(tiny_x, np.array([2,3]))

layers 
Layer 0 of shape (4, 3)
[[Number(0.0) Number(0.1) Number(0.2)]
 [Number(0.3) Number(0.4) Number(0.5)]
 [Number(0.6) Number(0.7) Number(0.8)]
 [Number(0.9) Number(1.0) Number(1.1)]]
Layer 1 of shape (3, 5)
[[Number(0.0) Number(0.1) Number(0.2) Number(0.3) Number(0.4)]
 [Number(0.5) Number(0.6) Number(0.7) Number(0.8) Number(0.9)]
 [Number(1.0) Number(1.1) Number(1.2) Number(1.3) Number(1.4)]]
Layer 2 of shape (5, 2)
[[Number(-0.3566467125909536) Number(0.4446689170495839)]
 [Number(0.021848321750071675) Number(-0.08533806000947641)]
 [Number(-0.23544438789537303) Number(0.27423368943421667)]
 [Number(-0.04384966778345145) Number(0.06843394886864851)]
 [Number(-0.48121019956364486) Number(0.11763549707587706)]]
biases 
Layer 0 of shape (3,)
[Number(0.0) Number(0.1) Number(0.2)]
Layer 1 of shape (5,)
[Number(0.0) Number(0.1) Number(0.2) Number(0.3) Number(0.4)]
Layer 2 of shape (2,)
[Number(-0.3817255741310668) Number(0.13992102132752382)]
passed null check 
passed null check 
pas

In [12]:
def null_gradients(layers):
    """nulls all gradients"""
    for layer in layers:
        for weight in layer.flat:
            weight.null_gradients()
            
def check_null(layers):
    """nulls all gradients"""
    for layer in layers:
        for weight in layer.flat:
            if weight.grad != None:
                print("hi welcome to another 5hrs of debuggin")
    print("passed null check ")



In [17]:
def fix_data(x, y):
    x = x.reshape(x.shape[0], 28*28)/255
    test = np.zeros((x.shape[0], 10))
    test[np.arange(x.shape[0]),y] = 1
    return (x, test)

In [None]:
(x_train, y_train), ds_info = tfds.load(
    'mnist',
    split=['train', 'test'],
    shuffle_files=True,
    as_supervised=True,
    with_info=True,
) 

In [19]:
import keras
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data(path="mnist.npz")
testx, testy = fix_data(x_train[0:2], y_train[0:2])
rawx = x_train[0:2]



In [None]:
fixed_x, fixed_y = fix_data(x_train[:100], y_train[:100])

In [None]:
testa = np.array([[[2, 3], [4, 5]], [[2, 3], [4, 5]]])
testa.reshape(2, 4)

In [21]:
#Moment of truth
plsfuckingwork = Model(28*28, 10, [16, 32])
plsfuckingwork.train_epoch(fixed_x, fixed_y)

NameError: name 'weight_matrix' is not defined

pred done 
mse calced 
nulled the layers
finished nullig all gradiests
finished backpropring gradiests


In [None]:
import tensorflow as tf

import torch

In [None]:

y = torch.tensor(np.arange(6).reshape(3,2).astype(float), requires_grad=True)
x = torch.tensor(np.arange(3).reshape(1,3).astype(float), requires_grad=True)
out = x@y
out.backward((torch.ones_like(out)))
print(y.grad)
print(x.grad)


In [None]:
matrix_1 = weight_matrix((3, 2))
matrix_2 = weight_matrix((1,3))
print("t1 ", matrix_1)
print("t2 ",matrix_2)

test3 = matrix_2@matrix_1
test4 = np.sum(test3)
test4.null_gradients()
test4.backprop()

print("result", test3)
print(test4)

print("grad")
print(get_grads(matrix_1))
print(get_grads(matrix_2))

In [None]:
model = create_model(4)
losses = []
for _ in range(1000):
    losses.append(train_epoch(model, data_set, l2_loss))