In [28]:
import torch
import torch.nn as nn

In [29]:
torch.manual_seed(123)  # for reproducibility

<torch._C.Generator at 0x7ff90c077110>

In [30]:
# a nonlinear activation function: GELU (Gaussian Error Linear Unit)
# non linear means the output is not directly proportional to the input

# takes in a tensor of a certain shape and return a tensor of the same shape
class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) * 
            (x + 0.044715 * torch.pow(x, 3))
        ))

In [47]:
class DeepNetwork(nn.Module):
    def __init__(self, layer_sizes, use_shortcut):
        super().__init__()
        self.use_shortcut = use_shortcut
        self.layers = nn.ModuleList([
            # 5 layers with GELU activation
            nn.Sequential( nn.Linear(layer_sizes[0], layer_sizes[5]), nn.GELU() ),
            nn.Sequential( nn.Linear(layer_sizes[1], layer_sizes[4]), nn.GELU() ),
            nn.Sequential( nn.Linear(layer_sizes[2], layer_sizes[3]), nn.GELU() ),
            nn.Sequential( nn.Linear(layer_sizes[3], layer_sizes[2]), nn.GELU() ),
            nn.Sequential( nn.Linear(layer_sizes[4], layer_sizes[1]), nn.GELU() ),

        ])
    
    def forward(self, x):
        for layer in self.layers:
            layer_output = layer(x) # we use a different name so we can use the shortcut connection
            if self.use_shortcut and layer_output.shape == x.shape:
                x = x + layer_output  # shortcut connection
            else:
                x = layer_output
        return x

In [48]:
layer_sizes = [3,3,3,3,3,1]

In [49]:
dn = DeepNetwork( layer_sizes, use_shortcut=False )

In [50]:
input = torch.randn(2, 3)  # batch size of 2, 4 features
input

tensor([[ 0.7694,  0.3453,  1.8979],
        [-0.2357,  0.7885,  0.3208]])

In [51]:
output = dn( input )

RuntimeError: mat1 and mat2 shapes cannot be multiplied (2x1 and 3x3)

In [None]:
output

NameError: name 'output' is not defined

In [43]:
class ExampleDeepNeuralNetwork(nn.Module):
    def __init__(self, layer_sizes, use_shortcut):
        super().__init__()
        self.use_shortcut = use_shortcut
        self.layers = nn.ModuleList([
            nn.Sequential(nn.Linear(layer_sizes[0], layer_sizes[1]), GELU()),
            nn.Sequential(nn.Linear(layer_sizes[1], layer_sizes[2]), GELU()),
            nn.Sequential(nn.Linear(layer_sizes[2], layer_sizes[3]), GELU()),
            nn.Sequential(nn.Linear(layer_sizes[3], layer_sizes[4]), GELU()),
            nn.Sequential(nn.Linear(layer_sizes[4], layer_sizes[5]), GELU())
        ])

    def forward(self, x):
        for layer in self.layers:
            # Compute the output of the current layer
            layer_output = layer(x)
            # Check if shortcut can be applied
            if self.use_shortcut and x.shape == layer_output.shape:
                x = x + layer_output
            else:
                x = layer_output
        return x


def print_gradients(model, x):
    # Forward pass
    output = model(x)
    target = torch.tensor([[0.]])

    # Calculate loss based on how close the target
    # and output are
    loss = nn.MSELoss()
    loss = loss(output, target)
    
    # Backward pass to calculate the gradients
    loss.backward()

    for name, param in model.named_parameters():
        if 'weight' in name:
            # Print the mean absolute gradient of the weights
            print(f"{name} has gradient mean of {param.grad.abs().mean().item()}")

In [44]:
layer_sizes = [3, 3, 3, 3, 3, 1]  

sample_input = torch.tensor([[1., 0., -1.]])

torch.manual_seed(123)
model_without_shortcut = ExampleDeepNeuralNetwork(
    layer_sizes, use_shortcut=False
)
print_gradients(model_without_shortcut, sample_input)

layers.0.0.weight has gradient mean of 0.00020173587836325169
layers.1.0.weight has gradient mean of 0.00012011159560643137
layers.2.0.weight has gradient mean of 0.0007152040489017963
layers.3.0.weight has gradient mean of 0.0013988736318424344
layers.4.0.weight has gradient mean of 0.005049645435065031


In [45]:
torch.manual_seed(123)
model_with_shortcut = ExampleDeepNeuralNetwork(
    layer_sizes, use_shortcut=True
)
print_gradients(model_with_shortcut, sample_input)

layers.0.0.weight has gradient mean of 0.22169798612594604
layers.1.0.weight has gradient mean of 0.20694111287593842
layers.2.0.weight has gradient mean of 0.3289700150489807
layers.3.0.weight has gradient mean of 0.26657330989837646
layers.4.0.weight has gradient mean of 1.3258544206619263
