## Fully Connect Neural Network (FCN)

In [75]:
import torch
import torch.nn as nn

In [76]:
input_tensor = torch.Tensor([[2, 3, 6, 7, 9, 3, 2, 1]])

In [77]:
num_out = 2

In [78]:
linear = nn.Linear(in_features=input_tensor.shape[1], out_features=num_out)
linear(input_tensor)

tensor([[5.1283, 0.6496]], grad_fn=<AddmmBackward0>)

### Linear Layers

In [79]:
sequential = nn.Sequential(
    nn.Linear(in_features=input_tensor.shape[1], out_features=5),
    nn.Linear(in_features=5, out_features=3),
    nn.Linear(in_features=3, out_features=num_out)
)

sequential(input_tensor)

tensor([[-0.2842,  0.2831]], grad_fn=<AddmmBackward0>)

# Activation Functions

In [80]:
import torch.nn as nn

sigmoid = nn.Sigmoid()
softmax = nn.Softmax()
relu = nn.ReLU()
leakyRelu = nn.LeakyReLU(negative_slope=0.05)

### Sigmoid (Binary Classification)

In [81]:
input_tensor = torch.tensor([[2.0, 9.9, -1]])

sigmoid = nn.Sigmoid()
output = sigmoid(input_tensor)
output

tensor([[0.8808, 0.9999, 0.2689]])

In [82]:
sequential = nn.Sequential(
    nn.Linear(in_features=input_tensor.shape[1], out_features=5),
    nn.Linear(in_features=5, out_features=3),
    nn.Linear(in_features=3, out_features=1),
    nn.Sigmoid()
)

sequential(input_tensor)

tensor([[0.5724]], grad_fn=<SigmoidBackward0>)

### Softmax (Multiclass Classification)

In [83]:
input_tensor = torch.tensor([[2.0, 9.9, -1], [2.0, 9.9, -1], ])
softmax = nn.Softmax(dim=-1)
softmax(input_tensor)

tensor([[3.7060e-04, 9.9961e-01, 1.8451e-05],
        [3.7060e-04, 9.9961e-01, 1.8451e-05]])

In [84]:
sequential = nn.Sequential(
    nn.Linear(in_features=input_tensor.shape[1], out_features=5),
    nn.Linear(in_features=5, out_features=3),
    nn.Linear(in_features=3, out_features=num_out),
    nn.Softmax()
)

sequential(input_tensor)

tensor([[0.1182, 0.8818],
        [0.1182, 0.8818]], grad_fn=<SoftmaxBackward0>)

## ReLu, Elu
- ReLU most popular activation function but suffer dying neurons problems where a neuron dies when it outputs negative.
    - Prone to vanishing gradient.
- Elu solve this problem.
    - Less prone to vanishing gradient.

In [None]:
# ReLU Usage

# sequential = nn.Sequential(
#     nn.Linear(in_features=input_tensor.shape[1], out_features=5),
#     nn.Linear(in_features=5, out_features=3),
#     nn.Linear(in_features=3, out_features=num_out),
#     nn.ReLU()
# )
# sequential(input_tensor)

# (or)
# sequential = nn.Sequential(
#     nn.Linear(in_features=input_tensor.shape[1], out_features=5),
#     nn.Linear(in_features=5, out_features=3),
#     nn.Linear(in_features=3, out_features=num_out),
# )
# output = sequential(input_tensor)
# yhat = nn.functional.relu(output)


In [None]:
# ReLU Usage

# sequential = nn.Sequential(
#     nn.Linear(in_features=input_tensor.shape[1], out_features=5),
#     nn.Linear(in_features=5, out_features=3),
#     nn.Linear(in_features=3, out_features=num_out),
#     nn.ELU()
# )
# sequential(input_tensor)

# (or)
# sequential = nn.Sequential(
#     nn.Linear(in_features=input_tensor.shape[1], out_features=5),
#     nn.Linear(in_features=5, out_features=3),
#     nn.Linear(in_features=3, out_features=num_out),
# )
# output = sequential(input_tensor)
# yhat = nn.functional.elu(output)


## Weight Initialization
- Useful to handle vanishing and exploding gradient.
- Good Initialization ensure
    - Variance of layer inputs = variance of layer outputs
    - Variance of gradients the same before and after a layer.
- Methods are different for each activation function.

In [85]:
import torch.nn.init as init

In [86]:
sequential[0].weight

Parameter containing:
tensor([[-0.2751,  0.5342, -0.5666],
        [-0.0347, -0.4694, -0.2718],
        [ 0.3737, -0.0626, -0.1293],
        [ 0.3223, -0.0420, -0.2485],
        [-0.1106,  0.4756, -0.0402]], requires_grad=True)

In [93]:
# For ReLU and Semilar Activation Functions.
# He/Kaiming Initialization

init.kaiming_uniform_(sequential[0].weight)
print("Kaiming Init: ", sequential[0].weight)

Kaiming Init:  Parameter containing:
tensor([[-1.1691,  0.7021, -0.3884],
        [-0.4028,  0.8705, -0.3006],
        [-1.0375,  0.8404, -1.4055],
        [ 1.3897, -0.0479,  0.2500],
        [-0.1482,  0.6126, -0.1561]], requires_grad=True)


# Loss Functions

In [11]:
import torch.nn.functional as F

### Multiclass Classifictaion
- Require one hot encoding to the truth label y.
- use CrossEntropyLoss (torch.nn.CrossEntropy)

In [12]:
# Use Neural network to get y_hats

num_out = 3

input_tensor = torch.tensor([[2.0, 9.9, -1], [2.5, 9.5, -1.5], [1.5, 8, -4]])

model = nn.Sequential(
    nn.Linear(in_features=input_tensor.shape[1], out_features=5),
    nn.Linear(in_features=5, out_features=3),
    nn.Linear(in_features=3, out_features=num_out),
    nn.Softmax(dim=-1)
)

In [13]:
# One hot encoding
num_out = 3
y_true = torch.tensor([0, 1, 2])
F.one_hot(y_true, num_classes=num_out)

tensor([[1, 0, 0],
        [0, 1, 0],
        [0, 0, 1]])

In [14]:
y_hat = model(input_tensor)

In [15]:
print(model[0].weight.grad)
print(model[1].weight.grad)
print(model[2].weight.grad)

None
None
None


In [16]:
# Compute the loss
from torch.nn import CrossEntropyLoss

criterion = CrossEntropyLoss()
loss = criterion(y_hat, y_true)
loss

tensor(1.0879, grad_fn=<NllLossBackward0>)

In [17]:
# Compute the gradient.
loss.backward()

In [18]:
print(model[0].weight.grad)
print(model[1].weight.grad)
print(model[2].weight.grad)

tensor([[ 0.0217,  0.0638,  0.0208],
        [-0.0290, -0.0938, -0.0437],
        [-0.0132, -0.0491, -0.0318],
        [ 0.0333,  0.0890,  0.0151],
        [-0.0512, -0.1586, -0.0637]])
tensor([[-0.0592, -0.1516, -0.0087, -0.1133,  0.0446],
        [ 0.0696,  0.1856,  0.0273,  0.1347, -0.0510],
        [ 0.0007, -0.0163, -0.0414, -0.0024, -0.0039]])
tensor([[-0.0013, -0.0278,  0.0364],
        [-0.0488, -0.0190,  0.0527],
        [ 0.0501,  0.0468, -0.0891]])


# Updating Model Parameters (Gradient Descent)

Steps
- loss.
- optimizer will update the model parameters (weights) automatically. `optimizer.step()`

In [19]:
import torch.optim as optim

In [20]:
model.parameters()

<generator object Module.parameters at 0x7f9630a3b270>

In [22]:
# Optimizer lists
lr = 0.01 # Learning rate.
momentum = 0.09 # Mementum to reach the global momentum.

# Stochastic Gradient Descend, simple, rarely used in practice.
# optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)

# Adaptive Gradient
# Good for sparse data where some features are not often observed.
# But decreased learning too fast.
# Cannot use momentum.
optimizer = optim.Adagrad(model.parameters(), lr=lr)

# RMSprop - Root Mean Square Propagation.
# Adapt the learning based on the previous gradients.
optimizer = optim.RMSprop(model.parameters(), lr=lr, momentum=momentum)

##################################################
# The MOST POPULAR Method. (DEFAULT Method)
##################################################

# Adam - Adaptive moment estimation.
# RMSProp + Momentum - most recent gradients have more weights.
optimizer = optim.Adam(model.parameters(), lr=lr)

In [19]:
y_hat = model(input_tensor)

criterion = CrossEntropyLoss()
loss = criterion(y_hat, y_true)
loss.backward()

In [20]:
model.parameters

<bound method Module.parameters of Sequential(
  (0): Linear(in_features=3, out_features=5, bias=True)
  (1): Linear(in_features=5, out_features=3, bias=True)
  (2): Linear(in_features=3, out_features=3, bias=True)
  (3): Softmax(dim=-1)
)>

In [21]:
lr = 0.001
optimizer = optim.SGD(model.parameters(), lr=lr)

In [22]:
print(model[0].weight)
print(model[1].weight)
print(model[2].weight)

Parameter containing:
tensor([[-0.3764,  0.2672,  0.4446],
        [-0.0268,  0.1104,  0.2934],
        [-0.5184,  0.0213,  0.4332],
        [-0.3090,  0.5258, -0.1544],
        [ 0.5495, -0.4032, -0.0214]], requires_grad=True)
Parameter containing:
tensor([[ 0.0958, -0.1698, -0.3226,  0.1997, -0.3070],
        [ 0.3433, -0.0698, -0.2307,  0.0943, -0.3171],
        [ 0.3749,  0.3677,  0.2880, -0.2014, -0.0077]], requires_grad=True)
Parameter containing:
tensor([[-0.5041,  0.4766, -0.4686],
        [-0.5452, -0.5262,  0.1746],
        [-0.1246, -0.5688, -0.2138]], requires_grad=True)


In [23]:
optimizer.step()

In [24]:
print(model[0].weight)
print(model[1].weight)
print(model[2].weight)

Parameter containing:
tensor([[-0.3764,  0.2671,  0.4447],
        [-0.0268,  0.1105,  0.2934],
        [-0.5183,  0.0214,  0.4331],
        [-0.3090,  0.5257, -0.1544],
        [ 0.5496, -0.4031, -0.0215]], requires_grad=True)
Parameter containing:
tensor([[ 0.0958, -0.1698, -0.3227,  0.1998, -0.3071],
        [ 0.3434, -0.0697, -0.2304,  0.0939, -0.3170],
        [ 0.3749,  0.3677,  0.2879, -0.2012, -0.0077]], requires_grad=True)
Parameter containing:
tensor([[-0.5043,  0.4765, -0.4684],
        [-0.5452, -0.5262,  0.1746],
        [-0.1245, -0.5687, -0.2139]], requires_grad=True)


## Count the Number of Model Parameters

In [25]:
total = 0

for parameters in model.parameters():
    total += parameters.numel()

print(f"Total params: {total}")

Total params: 50


## Metrics

In [26]:
from torchmetrics import Accuracy

acc = Accuracy(task='binary')