In [None]:
from torch import empty
import torch
torch.set_grad_enabled(False)
from relu import ReLU
from sgd import SGD, SGD_Sequential
from leakyrelu import LeakyReLU

### The `Module` base class, the `Linear` layer, and the SGD optimizer

In [None]:
class Module:
    """Common interface of every building block of the framework.

    Subclasses override ``forward`` and ``backward``; ``param`` only needs to
    be overridden by blocks that own parameters.
    """

    def forward(self, *input):
        """Compute the output for the given input tensor(s); abstract here."""
        raise NotImplementedError()

    def backward(self, *gradwrtoutput):
        """Propagate the gradient(s) w.r.t. the output; abstract here."""
        raise NotImplementedError()

    def param(self):
        """Return the parameters of the block: an empty list by default."""
        return list()


class Linear(Module):
    """Fully connected layer computing ``w @ x`` (plus ``b`` when enabled).

    Each tensor handed to ``forward`` is processed on its own, and
    ``backward`` keeps one gradient accumulator per tensor position.

    Parameters
    ----------
    in_features : int
        Size of each input tensor.
    out_features : int
        Size of each output tensor.
    bias : bool, optional
        When true (default) a bias vector of size ``out_features`` is added.
    """

    def __init__(self, in_features, out_features, bias=True):
        self.in_features = in_features
        self.out_features = out_features
        self.bias = bias

        # Weights (and bias below) start at the constant value 1.
        self.w = empty(out_features, in_features).fill_(1)
        # One accumulator per input position, created lazily by backward().
        self.gradwrt_w = []
        # `params` holds the very same list objects, so the in-place updates
        # done by backward() are visible through param().
        self.params = {'weight': [self.w], 'grad': [self.gradwrt_w]}
        if bias:
            self.b = empty(out_features).fill_(1)
            self.gradwrt_b = []
            self.params['weight'].append(self.b)
            self.params['grad'].append(self.gradwrt_b)

        # Inputs of the latest forward pass, needed by backward().
        self.input = None

    def forward(self, *input):
        """Apply the layer to every tensor of ``input``.

        The inputs are cached for the next ``backward`` call.

        Returns
        -------
        tuple
            One output tensor per input tensor, in the same order.
        """
        self.input = input
        if self.bias:
            return tuple(self.w @ tensor + self.b for tensor in input)
        return tuple(self.w @ tensor for tensor in input)

    def backward(self, *gradwrtoutput):
        """Accumulate parameter gradients and return the input gradients.

        ``gradwrtoutput[i]`` is the gradient w.r.t. the i-th output of the
        latest ``forward`` call. The weight (and bias) gradients are added to
        the i-th accumulator of ``gradwrt_w`` (and ``gradwrt_b``).

        Returns
        -------
        tuple
            One gradient w.r.t. the input per element of ``gradwrtoutput``.
        """
        grads_wrt_input = []

        for i, grad_out in enumerate(gradwrtoutput):
            # with respect to the input: w^T g, written g @ w for a 1-D g
            grads_wrt_input.append(grad_out @ self.w)

            # with respect to the weight: outer product g x^T. It is kept
            # un-squeezed so that it always has the shape of self.w, even
            # when in_features or out_features is 1.
            grad_w = grad_out.view(-1, 1).mm(self.input[i].view(1, -1))
            self._accumulate(self.gradwrt_w, i, grad_w)

            # with respect to the bias: g itself, flattened like self.b
            if self.bias:
                self._accumulate(self.gradwrt_b, i, grad_out.view(-1))

        return tuple(grads_wrt_input)

    @staticmethod
    def _accumulate(buffers, index, grad):
        """Add ``grad`` to ``buffers[index]``, creating a zero slot if missing."""
        # Test for the slot itself rather than comparing list lengths, so a
        # call with fewer gradients than a previous one adds no extra slots.
        if index >= len(buffers):
            buffers.append(empty(grad.size()).fill_(0))
        buffers[index].add_(grad)

    def param(self):
        """Return the dict with the 'weight' and 'grad' lists of the layer."""
        return self.params

In [None]:
class Sequential(Module):
    """Chain of modules applied one after the other.

    Parameters
    ----------
    modules : list
        The modules, in forward order.
    types : list
        Names used as keys by ``param``; ``types[i]`` names ``modules[i]``.
    """

    def __init__(self, modules, types):
        self.modules = modules

        # reverse order of modules for backward pass:
        self.N = len(modules)
        self.reverse_modules = list(reversed(modules))
        # Inputs of the latest forward pass.
        self.input = None
        self.types = types

    def forward(self, *input):
        """Feed ``input`` through every module and return the final outputs.

        The output tuple of one module is unpacked as the input of the next.
        With no module at all, the inputs are returned unchanged.
        """
        self.input = input

        x = input
        for module in self.modules:
            # output becomes input to next layer
            x = module.forward(*x)

        return tuple(x)

    def backward(self, *gradwrtoutput):
        """Propagate ``gradwrtoutput`` from the last module back to the first.

        Returns the gradients produced by the first module of the chain. The
        inputs cached by ``forward`` are left untouched.
        """
        x = gradwrtoutput
        for module in self.reverse_modules:
            # gradient leaving a module is fed to the module before it
            x = module.backward(*x)

        return tuple(x)

    def param(self):
        """Return a dict mapping ``types[i]`` to ``modules[i].param()``."""
        return {name: self.modules[i].param()
                for i, name in enumerate(self.types)}

### Example

In [None]:
# ------------------------------------------------------------------
# Control the randomness
# ------------------------------------------------------------------
# `torch` is already imported by the first cell of the notebook.
torch.manual_seed(0)


def show_params(model, names):
    """Print the `param()` entry of every module of `model`.

    `names` lists the keys of the dict returned by `model.param()`, in module
    order. Each entry is printed under a "Module <index>,<name>:" header.
    """
    params = model.param()
    for i, name in enumerate(names):
        print("Module {},{}:\n".format(i, name))
        print(params[name])


# ------------------------------------------------------------------
# model
# ------------------------------------------------------------------
m = Linear(2, 3)
m2 = Linear(3, 4)

# Sequential takes the modules as a list, plus one name per module
types = ['linear1', 'linear2']
s = Sequential([m, m2], types)

sgd = SGD_Sequential(s.param(), types, 0.1)
print('# ------------------------------------------------------------------')
print('# Set up, weight initialization:')
print("Params of modules after initialization:\n")
show_params(s, types)

# ------------------------------------------------------------------
# input
# ------------------------------------------------------------------
input = empty(2)
input[0] = 1
input[1] = 2

# ------------------------------------------------------------------
# forward pass (the same tensor is fed three times)
# ------------------------------------------------------------------
x = s.forward(input, input, input)
print('# ------------------------------------------------------------------')
print('# Forward pass:')
print("Output after forward pass:\n {}".format(x))
print("Params of modules after forward:\n")
show_params(s, types)

# arbitrary error: one gradient per input given to forward() above
grad_loss = empty(4).fill_(10),  empty(4).fill_(5),  empty(4).fill_(1)

# zeroes the gradients as one would do in a training setting
sgd.zero_grad()

# ------------------------------------------------------------------
# backward pass
# ------------------------------------------------------------------
print('# ------------------------------------------------------------------')
output = s.backward(*grad_loss)
print('# Backward pass:')
print("Output after backward pass:\n {}".format(output))
print("Params of modules after backward:\n")
show_params(s, types)

# ------------------------------------------------------------------
# gradient step
# ------------------------------------------------------------------
print('# ------------------------------------------------------------------')
print('# Gradient step:')
sgd.step()
print("Params of modules after step:\n")
show_params(s, types)


# zeroes the gradients as one would do in a training setting
sgd.zero_grad()
print("Params of modules after zeroing the gradients:\n")
show_params(s, types)

### Test with ReLU:

In [None]:
class ReLU(Module):
    """Rectified linear unit applied element-wise to every input tensor."""
    # NOTE(review): this definition shadows the `ReLU` name imported from the
    # `relu` module in the first cell of the notebook.

    def __init__(self):
        # Inputs of the latest forward pass, needed by backward().
        self.input = None

    def relu(self, x):
        """Return the element-wise maximum of ``x`` and 0 as a new tensor."""
        zeros = empty(x.size()).fill_(0)
        return x.maximum(zeros)

    def d_relu(self, x):
        """Return the derivative mask of ``x``: 1 where ``x > 0``, else 0.

        The mask is a new tensor with the dtype of ``x``; ``x`` itself is not
        modified, so the inputs cached by ``forward`` stay intact.
        """
        return (x > 0).to(x.dtype)

    def forward(self, *input):
        """Apply ReLU to each tensor of ``input`` and cache the inputs.

        Returns a tuple with one output tensor per input tensor.
        """
        self.input = input

        return tuple(self.relu(tensor) for tensor in input)

    def backward(self, *gradwrtoutput):
        """Multiply each output gradient by the mask of the matching input.

        ``gradwrtoutput[i]`` is paired with the i-th tensor of the latest
        ``forward`` call. Returns one gradient per cached input.
        """
        return tuple(gradwrtoutput[i] * self.d_relu(x)
                     for i, x in enumerate(self.input))

In [None]:
# ------------------------------------------------------------------
# Reproducibility: fix the torch seed
# ------------------------------------------------------------------
import torch
torch.manual_seed(0)

# ------------------------------------------------------------------
# Network: linear -> activation -> linear
# ------------------------------------------------------------------
m = Linear(2, 3)
m2 = Linear(3, 4)
# NOTE(review): the activation is the imported LeakyReLU (built with 0.1),
# not the ReLU class defined in this notebook -- confirm this is intended.
relu = LeakyReLU(0.1)


# Sequential receives the modules as a list together with one name each
types = ['linear1', 'activation1','linear2']
s = Sequential([m, relu, m2],types)

sgd = SGD_Sequential(s.param(), types, 0.1)
print('# ------------------------------------------------------------------')
print('# Set up, weight initialization:')
print("Params of modules after initialization:\n")
for i, name in enumerate(types):
    print("Module {}, {}:\n".format(i, name))
    print(s.param()[name])

# ------------------------------------------------------------------
# Input tensor [1, 2]
# ------------------------------------------------------------------
input = empty(2)
input[0], input[1] = 1, 2

# ------------------------------------------------------------------
# Forward pass, feeding the same tensor three times
# ------------------------------------------------------------------
x = s.forward(input, input, input)
print('# ------------------------------------------------------------------')
print('# Forward pass:')
print("Output after forward pass:\n {}".format(x))
print("Params of modules after forward:\n")
for i, name in enumerate(types):
    print("Module {}, {}:\n".format(i, name))
    print(s.param()[name])

Compare with building the network by hand, without using `Sequential`, as Raphael did:

In [None]:
# ------------------------------------------------------------------
# Two linear layers chained by hand, each with its own optimizer
# ------------------------------------------------------------------
m = Linear(2, 3)
m2 = Linear(3, 4)


sgd = SGD(m.param(), 0.1)
sgd2 = SGD(m2.param(), 0.1)

# ------------------------------------------------------------------
# Input tensor [1, 2] and arbitrary loss gradients
# ------------------------------------------------------------------
input = empty(2)
input[0], input[1] = 1, 2

# one gradient per input tensor passed to the forward pass further down
grad_loss = tuple(empty(4).fill_(value) for value in (10, 5, 1))

# start from zeroed gradients, as in a training loop
for optimizer in (sgd, sgd2):
    optimizer.zero_grad()

# ------------------------------------------------------------------
# Forward pass: m first, then m2 on the outputs of m
# ------------------------------------------------------------------
x = m2.forward(*m.forward(input, input, input))
for template, layer in (("m params after forward {}\n", m),
                        ("m2 params after forward {}\n", m2)):
    print(template.format(layer.param()))
print("output after pass trough m and m2 {}\n".format(x))

# ------------------------------------------------------------------
# Backward pass: m2 first, then m on the gradients leaving m2
# ------------------------------------------------------------------
x = m2.backward(*grad_loss)
output = m.backward(*x)
print("output (error with respect to input) {}\n".format(output))
for template, layer in (("m params after backward {}\n", m),
                        ("m2 params after backward {}\n", m2)):
    print(template.format(layer.param()))

# ------------------------------------------------------------------
# Gradient step on both layers
# ------------------------------------------------------------------
for optimizer in (sgd, sgd2):
    optimizer.step()
for template, layer in (("m params after step {}\n", m),
                        ("m2 params after step {}\n", m2)):
    print(template.format(layer.param()))

# reset the gradients again, as in a training loop
for optimizer in (sgd, sgd2):
    optimizer.zero_grad()
for template, layer in (("m params after zeroing the gradients {}\n", m),
                        ("m2 params after zeroing the gradients {}\n", m2)):
    print(template.format(layer.param()))