In [1]:
import torch

## nn.linear

equivalent to y = XW + b


In [None]:
D_in = 1
D_out = 1

# notice that what is printed is exactly the same as the manual implementation
linear_layer = torch.nn.Linear(D_in, D_out)
print(f"linear_layer.weight: {linear_layer.weight}")
print(f"linear_layer.bias: {linear_layer.bias}")


linear_layer.weight: Parameter containing:
tensor([[0.3240]], requires_grad=True)
linear_layer.bias: Parameter containing:
tensor([0.3829], requires_grad=True)


ReLU


In [4]:
relu = torch.nn.ReLU()
sample_data = torch.tensor([-1.0, 0.0, 1.0, -2.0, -0.5, 3.0])
activated_data = relu(sample_data)

print(f"original data: {sample_data}")
print(f"activated_data: {activated_data}")


original data: tensor([-1.0000,  0.0000,  1.0000, -2.0000, -0.5000,  3.0000])
activated_data: tensor([0., 0., 1., 0., 0., 3.])


# softmax

Note: since softmax isn't reductive like torch.mean, torch.sum, etc. doing something like dim=0, softmax would operate across the first dimension, in this case the rows, which is equivalent to going vertically.


In [None]:
softmax_dim_0 = torch.nn.Softmax(dim=0)
softmax_dim_1 = torch.nn.Softmax(dim=-1)  # -1 = last dimension

logits = torch.tensor([[1.0, 2.0, 3.0, -0.5], [-1.0, -2.0, -3.0, 0.5]])
probabilities_dim_0 = softmax_dim_0(logits)
probabilities_dim_1 = softmax_dim_1(logits)

print(f"probabilities_dim_0:\n{probabilities_dim_0}")
print(f"probabilities_dim_1:\n{probabilities_dim_1}")


probabilities_dim_0:
tensor([[0.8808, 0.9820, 0.9975, 0.2689],
        [0.1192, 0.0180, 0.0025, 0.7311]])
probabilities_dim_1:
tensor([[0.0883, 0.2399, 0.6521, 0.0197],
        [0.1671, 0.0615, 0.0226, 0.7488]])


# nn.layer_norm

prevents exploding/vanishing gradients by rescaling


In [18]:
norm_layer = torch.nn.LayerNorm(
    normalized_shape=3
)  # our word vectors have a feature dimension of 3
input_features = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
normalized_features = norm_layer(input_features)

print(f"input_features:\n{input_features}")
print(f"normalized_features:\n{normalized_features}")


input_features:
tensor([[1., 2., 3.],
        [4., 5., 6.]])
normalized_features:
tensor([[-1.2247,  0.0000,  1.2247],
        [-1.2247,  0.0000,  1.2247]], grad_fn=<NativeLayerNormBackward0>)


# nn.dropout

prevents overfitting by randomly dropping out some neurons (zeroing out some neurons)

Note that dropout is applied during training, not testing nor inference.


In [None]:
dropout_layer = torch.nn.Dropout(p=0.5)  # zero out 50% of the neurons
input_tensor = torch.ones(1, 10)

# activate dropout for training
dropout_layer.train()
output_during_train = dropout_layer(input_tensor)

# deactivate dropout for testing
# note that in eval mode, it's just an identity function since it just returns the input
dropout_layer.eval()
output_during_test = dropout_layer(input_tensor)

print(f"output_during_train:\n{output_during_train}")
print(f"output_during_test:\n{output_during_test}")


output_during_train:
tensor([[2., 2., 2., 0., 0., 0., 0., 2., 2., 0.]])
output_during_test:
tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])


# Simple Linear Regression using nn.Module


In [38]:
class LinearRegression(torch.nn.Module):
    def __init__(self, in_dim, out_dim):
        super().__init__()
        torch.manual_seed(42)
        self.linear = torch.nn.Linear(in_dim, out_dim)

    def forward(self, x):
        return self.linear(x)


lr_model = LinearRegression(1, 1)
print("---- model architecture: ")
print(lr_model)
print("---- model params: ")
for name, param in lr_model.named_parameters():
    print(f"{name}: {param.data}")

---- model architecture: 
LinearRegression(
  (linear): Linear(in_features=1, out_features=1, bias=True)
)
---- model params: 
linear.weight: tensor([[0.7645]])
linear.bias: tensor([0.8300])


In [39]:
torch.manual_seed(30)
X = torch.randn(10, 1)
print(f"X[:3]: {X[:3]}")

W_true = torch.tensor([[2.0]])
b_true = torch.tensor(1.0)
print(f"X.shape: {X.shape}")
print(f"W_true: {W_true}")
print(f"b_true: {b_true}")

y_true = X @ W_true + b_true

X[:3]: tensor([[0.4705],
        [1.6563],
        [0.5153]])
X.shape: torch.Size([10, 1])
W_true: tensor([[2.]])
b_true: 1.0


In [41]:
import torch.optim as optim

epochs, lr = 100, 0.05

optimizer = optim.Adam(lr_model.parameters(), lr=lr)
loss_fn = torch.nn.MSELoss()

for epoch in range(epochs):
    y_hat = lr_model(X)

    loss = loss_fn(y_hat, y_true)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    if epoch % 20 == 0:
        print(f"step: {epoch}, loss: {loss.item()}")
        print(f"lr_model.linear.weight: {lr_model.linear.weight.item()}")
        print(f"lr_model.linear.bias: {lr_model.linear.bias.item()}")

print(f"W_true: {W_true}")
print(f"b_true: {b_true}")
print("----------------")
print(f"lr_model.linear.weight: {lr_model.linear.weight}")
print(f"lr_model.linear.bias: {lr_model.linear.bias}")


step: 0, loss: 6.656859568465734e-06
lr_model.linear.weight: 1.951899766921997
lr_model.linear.bias: 0.9509152770042419
step: 20, loss: 0.000408816005801782
lr_model.linear.weight: 1.985743761062622
lr_model.linear.bias: 0.9859563112258911
step: 40, loss: 6.365532499330584e-06
lr_model.linear.weight: 1.9979428052902222
lr_model.linear.bias: 0.9989743232727051
step: 60, loss: 1.0216799637419172e-06
lr_model.linear.weight: 1.999178409576416
lr_model.linear.bias: 0.999602198600769
step: 80, loss: 7.304174118871742e-07
lr_model.linear.weight: 1.9992291927337646
lr_model.linear.bias: 0.9994306564331055
W_true: tensor([[2.]])
b_true: 1.0
----------------
lr_model.linear.weight: Parameter containing:
tensor([[1.9999]], requires_grad=True)
lr_model.linear.bias: Parameter containing:
tensor([0.9999], requires_grad=True)
