In [1]:
import torch
import torchvision
import torch.nn as nn 
from torch.utils.data import DataLoader
from torch.functional import F  
from tqdm import trange
import numpy as np
from tqdm import trange

In [2]:
torch.set_printoptions(precision=2)

# Training Loop: 
Training a model is an iterative process; in each iteration the model makes a guess about the output, calculates the error in its guess (loss), collects the derivatives of the error with respect to its parameters, and optimizes these parameters using gradient descent. See also: https://www.youtube.com/watch?v=tIeHLnjs5U8


Loop through Data Samples: 

1. Forward Pass: data $\rightarrow$ Model $\rightarrow$ Output
2. Compute Loss: $\text{Loss} = \mathcal{L} (output, target)$
3. Compute Gradients of the loss w.r.t. the model parameters: $\frac{\partial \mathcal{L}}{\partial \theta}$
4. Update parameters of model (weights and biases) using their gradients: 
$\theta = \theta - \alpha \frac{\partial \mathcal{L}}{\partial \theta}$

```
Loop through Dataset:
    Forward pass:
        ...
    Compute Loss:
        ...
    Compute Gradients:
        ...
    Update Parameters:
        ...
```

# Pytorch Tensors: 
A PyTorch Tensor is conceptually identical to a numpy array: a Tensor is an n-dimensional array, and PyTorch provides many functions for operating on these Tensors. Behind the scenes, Tensors can keep track of a computational graph and gradients.
Unlike NumPy arrays, PyTorch Tensors can utilize GPUs to accelerate their numeric computations.


In [3]:
# Round trip: NumPy array -> torch tensor -> NumPy array.
demo_data = np.array([[0, 2], [1, 2]])
demo_tensor = torch.tensor(demo_data)
demo_numpy = demo_tensor.numpy()

# Report the container type at each stage, plus the tensor's shape.
print(type(demo_tensor))
print(demo_tensor.shape)
print(type(demo_numpy))

<class 'torch.Tensor'>
torch.Size([2, 2])
<class 'numpy.ndarray'>


In [4]:
# The factory functions below all accept the target shape as a tuple.
shape = (2, 3)

rand_tensor = torch.rand(shape)
ones_tensor = torch.ones(shape)
zeros_tensor = torch.zeros(shape)

print(f"Random Tensor: \n {rand_tensor} \n")
print(f"Ones Tensor: \n {ones_tensor} \n")
print(f"Zeros Tensor: \n {zeros_tensor}")

Random Tensor: 
 tensor([[0.48, 0.83, 0.41],
        [0.69, 0.91, 0.19]]) 

Ones Tensor: 
 tensor([[1., 1., 1.],
        [1., 1., 1.]]) 

Zeros Tensor: 
 tensor([[0., 0., 0.],
        [0., 0., 0.]])


In [5]:
tensor = torch.rand(3,4)

print(f"Shape of tensor: {tensor.shape}")
print(f"Datatype of tensor: {tensor.dtype}")
print(f"Device tensor is stored on: {tensor.device}")

Shape of tensor: torch.Size([3, 4])
Datatype of tensor: torch.float32
Device tensor is stored on: cpu


## Tensor Operations: 
https://pytorch.org/docs/stable/torch.html

In [6]:
# Indexing: select a row and a column, then write through a slice.
tensor = torch.ones(4, 4)

first_row = tensor[0]
first_column = tensor[:, 0]
print(f"First row: {first_row}")
print(f"First column: {first_column}")

# Assigning to a slice modifies `tensor` in place: column 1 becomes all zeros.
tensor[:, 1] = 0
print(tensor)

First row: tensor([1., 1., 1., 1.])
First column: tensor([1., 1., 1., 1.])
tensor([[1., 0., 1., 1.],
        [1., 0., 1., 1.],
        [1., 0., 1., 1.],
        [1., 0., 1., 1.]])


In [7]:
# Arithmetic operations on `tensor` (created in the indexing cell above).

# Matrix multiplication, written three equivalent ways: y1, y2 and y3 end up
# holding the same values.
# ``tensor.T`` returns the transpose of a tensor
y1 = tensor @ tensor.T
y2 = tensor.matmul(tensor.T)

# y3 is allocated first (its random contents are irrelevant) and then
# overwritten with the product through the `out=` argument.
y3 = torch.rand_like(y1)
print(torch.matmul(tensor, tensor.T, out=y3))

# Element-wise product, again three equivalent ways: z1, z2 and z3 end up
# holding the same values.
z1 = tensor * tensor
z2 = tensor.mul(tensor)

# Same preallocate-then-overwrite pattern; as the last expression of the cell,
# the result of this call is what the notebook displays.
z3 = torch.rand_like(tensor)
torch.mul(tensor, tensor, out=z3)

tensor([[3., 3., 3., 3.],
        [3., 3., 3., 3.],
        [3., 3., 3., 3.],
        [3., 3., 3., 3.]])


tensor([[1., 0., 1., 1.],
        [1., 0., 1., 1.],
        [1., 0., 1., 1.],
        [1., 0., 1., 1.]])

# Iterating Data:
## Loading Dataset:

MNIST dataset is in torchvision datasets 

In [8]:
# Both splits share one preprocessing pipeline: convert each image to a tensor,
# then normalize its single channel with mean 0.1307 and std 0.3081.
mnist_transform = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize((0.1307,), (0.3081,)),
])

# The data is downloaded into the local 'MNIST' directory if it is not already there.
trainset = torchvision.datasets.MNIST('MNIST', train=True, download=True, transform=mnist_transform)
testset = torchvision.datasets.MNIST('MNIST', train=False, download=True, transform=mnist_transform)

In [9]:
len(trainset), len(testset)

(60000, 10000)

## DataLoaders:
`DataLoader` wraps an iterable around the `Dataset`.
Using pytorch DataLoader module, we can iterate through different batches of data.

In [10]:
BATCH_SIZE = 64

# One loader per split, keyed by split name; only the training loader shuffles.
full_dataloaders = dict(
    train=DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True),
    test=DataLoader(testset, batch_size=BATCH_SIZE),
)

In [11]:
len(full_dataloaders['train'].dataset), len(full_dataloaders['test'].dataset)

(60000, 10000)

In [12]:
# Describe the first three batches. There is no `break`, so the loop still
# walks the entire loader and leaves `batch_indx`, `inputs` and `targets`
# bound to the last batch when the cell finishes.
for batch_indx, (inputs, targets) in enumerate(full_dataloaders['train']):
    if batch_indx >= 3:
        continue

    print('batch index: ', batch_indx)
    print('images shape: ', inputs.shape, 'labels shape: ', targets.shape)
    # Class labels present in this batch and how often each one occurs.
    print(np.unique(targets.numpy(), return_counts=True))
    print('type: ', type(inputs), type(targets))

batch index:  0
images shape:  torch.Size([64, 1, 28, 28]) labels shape:  torch.Size([64])
(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), array([ 8,  4,  7,  7,  4, 10,  3,  5,  7,  9]))
type:  <class 'torch.Tensor'> <class 'torch.Tensor'>
batch index:  1
images shape:  torch.Size([64, 1, 28, 28]) labels shape:  torch.Size([64])
(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), array([ 3,  7,  4,  8,  4,  6,  8, 10,  4, 10]))
type:  <class 'torch.Tensor'> <class 'torch.Tensor'>
batch index:  2
images shape:  torch.Size([64, 1, 28, 28]) labels shape:  torch.Size([64])
(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), array([ 5,  4,  5,  5,  8,  5, 10, 11,  4,  7]))
type:  <class 'torch.Tensor'> <class 'torch.Tensor'>


In [13]:
batch_indx, len(full_dataloaders['train'])

(937, 938)

**Training Loop**:
```
for batch_indx, (inputs, targets) in enumerate(full_dataloaders['train']):
    Forward pass:
        ...
    Compute Loss:
        ...
    Compute Gradients:
        ...
    Update Parameters:
        ...
```

# Forward pass: 
Forward pass is in the ```forward()``` function of model.

When building neural networks we frequently think of arranging the computation into layers, some of which have learnable parameters which will be optimized during learning.


PyTorch provides the elegantly designed modules and classes, including `torch.nn`, to help you create and train neural networks. The `nn` package defines a set of Modules, which are roughly equivalent to neural network layers. A Module receives input Tensors and computes output Tensors, but may also hold internal state such as Tensors containing learnable parameters.

To define a neural network in PyTorch, we create a class that inherits from nn.Module. We define the layers of the network in the `__init__` function and specify how data will pass through the network in the `forward` function (this is the Forward Pass).

### Defining Model

In [None]:
class demo_model(nn.Module):
    """Fully connected classifier for 28x28 single-channel images.

    Pipeline: flatten -> Linear(784, 1024) -> ReLU -> Linear(1024, 128) -> ReLU
    -> Linear(128, 10). The ten outputs are raw scores; no softmax is applied.
    """

    def __init__(self) -> None:
        super().__init__()

        # Layers are created here; assigning them as attributes registers
        # their weights and biases as parameters of this module.
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(28 * 28, 1024)
        self.fc2 = nn.Linear(1024, 128)
        self.fc3 = nn.Linear(128, 10)

    def forward(self, inputs, debug=False):
        """Forward pass: map a batch of images to per-class scores.

        Args:
            inputs: batch of images, shape [N, 1, 28, 28].
            debug: when true, print the shape of every intermediate tensor.

        Returns:
            Tensor of shape [N, 10] with one raw score per class.
        """
        flat = self.flatten(inputs)            # [N, 784]
        hidden1 = F.relu(self.fc1(flat))       # [N, 1024]
        hidden2 = F.relu(self.fc2(hidden1))    # [N, 128]
        outputs = self.fc3(hidden2)            # [N, 10]

        if debug:
            shape_report = (
                ('inputs shape: ', inputs),  # inputs in shape [N, C, H, W]
                ('after flattening: ', flat),
                ('Activations after 1st fully connected layer: ', hidden1),
                ('Activations after 2nd fully connected layer: ', hidden2),
                ('Output shape: ', outputs),
            )
            for label, stage in shape_report:
                print(label, stage.shape)

        return outputs

NameError: name 'nn' is not defined

In [15]:
model = demo_model()
print(model)

demo_model(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc1): Linear(in_features=784, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=10, bias=True)
)


One important behavior of `torch.nn.Module` is registering parameters. If a particular Module subclass has learning weights, these weights are expressed as instances of `torch.nn.Parameter`. The Parameter class is a subclass of `torch.Tensor`, with the special behavior that when they are assigned as attributes of a Module, they are added to the list of that modules parameters. These parameters may be accessed through the `parameters()` method on the Module class.

In [16]:
for name, param in model.named_parameters():
    print(name, 'Parameters Shape: ', param.shape, param.requires_grad)
    
    if name == 'fc3.bias':
        print(param)

fc1.weight Parameters Shape:  torch.Size([1024, 784]) True
fc1.bias Parameters Shape:  torch.Size([1024]) True
fc2.weight Parameters Shape:  torch.Size([128, 1024]) True
fc2.bias Parameters Shape:  torch.Size([128]) True
fc3.weight Parameters Shape:  torch.Size([10, 128]) True
fc3.bias Parameters Shape:  torch.Size([10]) True
Parameter containing:
tensor([ 0.08,  0.04, -0.07, -0.01,  0.07, -0.06,  0.05,  0.05,  0.05,  0.00],
       requires_grad=True)


When we check the weights/biases of a layer with `model.fc3.weight`/`model.fc3.bias`, it reports itself as a `Parameter` (which is a subclass of Tensor), and lets us know that it’s tracking gradients with autograd. This is a default behavior for Parameter that differs from Tensor.

To use the model, we pass it the input data. This executes the model’s forward, along with some background operations. Do not call `model.forward()` directly!

In [18]:
test_input = torch.rand((BATCH_SIZE, 1, 28, 28)) # [N, C, H, W]

In [19]:
outs = model(test_input, True)
outs.shape

inputs shape:  torch.Size([64, 1, 28, 28])
after flattening:  torch.Size([64, 784])
Activations after 1st fully connected layer:  torch.Size([64, 1024])
Activations after 2nd fully connected layer:  torch.Size([64, 128])
Output shape:  torch.Size([64, 10])


torch.Size([64, 10])

**Training Loop**:
```
for batch_indx, (inputs, targets) in enumerate(full_dataloaders['train']):
    Forward pass:
        outputs = model(inputs)
    Compute Loss:
        ...
    Compute Gradients:
        ...
    Update Parameters:
        ...
```

More on model building:
- https://pytorch.org/tutorials/beginner/basics/buildmodel_tutorial.html
- https://pytorch.org/tutorials/beginner/introyt/modelsyt_tutorial.html#convolutional-layers

More Layers, activation functions: https://pytorch.org/docs/stable/nn.html

In [20]:
# 3, 32, 32

## Compute Loss

In [21]:
loss_fn = nn.CrossEntropyLoss()

In [22]:
output = torch.tensor([
    [0.1, 1.3, 1.03, 0.17, 0.03, 0.09, 0.07, 0.02, 5.61, 0.01],
])
target = torch.tensor([8]) # target with class indices
l1 = loss_fn(output, target)
l1

tensor(0.05)

In [23]:
output = torch.tensor([
    [0.1, 1.3, 1.03, 0.17, 0.03, 0.09, 0.07, 0.02, 5.61, 0.01],
])
target = torch.tensor([0]) # target with class indices
l2 = loss_fn(output, target)
l2

tensor(5.56)

In [24]:
outputs = torch.tensor([
    [0.1, 1.3, 1.03, 0.17, 0.03, 0.09, 0.07, 0.02, 5.61, 0.01],
    [0.1, 1.3, 1.03, 0.17, 0.03, 0.09, 0.07, 0.02, 5.61, 0.01],
])
targets = torch.tensor([8, 0]) # target with class indices
l = loss_fn(outputs, targets)
l

tensor(2.80)

In [25]:
(l1 + l2) / 2

tensor(2.80)

**Training Loop**:
```
for batch_indx, (inputs, targets) in enumerate(full_dataloaders['train']):
    Forward pass:
        outputs = model(inputs)
    Compute Loss:
        loss = loss_fn(outputs, targets)
    Compute Gradients:
        ...
    Update Parameters:
        ...
```
More on Cross Entropy and softmax (Optional): 
- https://www.youtube.com/watch?v=6ArSys5qHAU
- https://www.youtube.com/watch?v=KpKog-L9veg&t=645s

# Compute Gradients:

Manually implementing the backward pass is not a big deal for a small two-layer network, but can quickly get very hairy for large complex networks.
The autograd package in PyTorch provides automatic differentiation. When using autograd, the forward pass of your network will define a computational graph; nodes in the graph will be Tensors, and edges will be functions that produce output Tensors from input Tensors. Backpropagating through this graph then allows you to easily compute gradients.

Each Tensor represents a node in a computational graph. If `x` is a Tensor that has `x.requires_grad=True` then `x.grad` is another Tensor holding the gradient of `x` with respect to some scalar value.

Under the hood, each primitive autograd operator is really two functions that operate on Tensors. The *forward* function computes output Tensors from input Tensors. The *backward* function receives the gradient of the output Tensors with respect to some scalar value, and computes the gradient of the input Tensors with respect to that same scalar value.

Use `backward()` function on loss.

In [26]:
# Toy computation for autograd: s = (x . W^T)^2 with a single row of weights.
x = torch.tensor([1.0, 2.0, 2.0, 1.0])  # [4]

# requires_grad=True makes autograd track W
# (tensors in Model.parameters have requires_grad set to True by default).
W = torch.tensor([[0.1, 0.1, 0.1, 0.1]], requires_grad=True)  # [1, 4]

y = x @ W.T  # [1]

scalar = y.pow(2)  # [1]

In [27]:
print(W.grad)

None


$$
y = x \times W_{1 \times 4}^T \\ 
\text{s} = y^2 \\
\frac{\partial s}{\partial W} = \frac{\partial s}{\partial y} \times \frac{\partial y}{\partial W} \\
\frac{\partial s}{\partial y}  = 2y \\
\frac{\partial y}{\partial W}  = x \\
\frac{\partial s}{\partial W} = 2xy
$$

In [28]:
scalar.backward()

In [29]:
W.grad

tensor([[1.20, 2.40, 2.40, 1.20]])

In [30]:
2* x * y

tensor([1.20, 2.40, 2.40, 1.20], grad_fn=<MulBackward0>)

Testing on model:

In [31]:
test_input = torch.rand((BATCH_SIZE, 1, 28, 28)) # [N, C, H, W]
dummy_target = torch.randint(0, 10, (BATCH_SIZE, ))

In [32]:
model = demo_model()
print(model.fc3.bias)

Parameter containing:
tensor([ 0.03,  0.01, -0.07,  0.07, -0.06,  0.03, -0.08,  0.05, -0.01,  0.06],
       requires_grad=True)


In [33]:
print(model.fc3.bias.grad)

None


In [34]:
demo_output = model(test_input)
print(model.fc3.bias.grad)

None


In [35]:
loss = loss_fn(demo_output, dummy_target)
print(model.fc3.bias.grad)
loss

None


tensor(2.31, grad_fn=<NllLossBackward0>)

In [36]:
loss.backward()

In [37]:
# model.fc3.bias.grad = d Loss / d fc3.bias
print(model.fc3.bias.grad)

tensor([ 0.01,  0.01,  0.00,  0.04, -0.08, -0.00, -0.05,  0.06,  0.03, -0.01])


**Training Loop**:
```
for batch_indx, (inputs, targets) in enumerate(full_dataloaders['train']):
    Forward pass:
        outputs = model(inputs)
    Compute Loss:
        loss = loss_fn(outputs, targets)
    Compute Gradients:
        loss.backward()
        Access the gradient of the Loss w.r.t. weight W using W.grad
    Update Parameters:
        ...
```

More on Autograd (Optional): https://pytorch.org/tutorials/beginner/basics/autogradqs_tutorial.html

# Update Parameters: 
The optimizer is what drives the learning. Here we create an optimizer that implements stochastic gradient descent. Besides parameters of the optimizing algorithm, like the learning rate (lr), we also pass in `model.parameters()`, which is a collection of all the learning weights in the model - which is what the optimizer adjusts.

In [38]:
demo_inputs = torch.rand((4, 1, 28, 28))
demo_targets = torch.randint(0, 10, (4, ))

model = demo_model()
print(model)

demo_model(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc1): Linear(in_features=784, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=10, bias=True)
)


In [39]:
# Define Optimizer, later usage for updating model.parameters
LR = 0.1
optimizer = torch.optim.SGD(model.parameters(), lr=LR)

In [40]:
for params in optimizer.param_groups[0]['params']:
    print(params.shape)

torch.Size([1024, 784])
torch.Size([1024])
torch.Size([128, 1024])
torch.Size([128])
torch.Size([10, 128])
torch.Size([10])


In [41]:
print(model.fc3.bias)

Parameter containing:
tensor([ 0.08, -0.05, -0.06,  0.03, -0.07,  0.05,  0.07,  0.00,  0.02, -0.08],
       requires_grad=True)


In [42]:
print(model.fc3.bias.grad)

None


In [43]:
# Forward Pass
demo_outputs = model(demo_inputs)
demo_outputs

tensor([[ 1.23e-01,  1.63e-03, -4.03e-02,  3.06e-02, -1.03e-01, -1.16e-02,
          2.25e-02, -1.79e-02, -2.34e-02, -5.24e-02],
        [ 1.16e-01, -3.33e-02, -5.94e-02,  3.24e-02, -1.43e-01,  2.70e-02,
          2.11e-02, -8.76e-03,  2.46e-02, -3.33e-02],
        [ 1.09e-01, -8.42e-02, -8.58e-02,  1.22e-04, -1.41e-01, -5.80e-03,
          2.54e-02, -3.58e-02, -1.01e-02, -8.33e-03],
        [ 8.82e-02, -4.82e-02, -2.32e-02,  8.57e-02, -1.22e-01,  1.08e-02,
         -2.51e-02, -5.70e-02,  3.33e-02, -3.97e-02]],
       grad_fn=<AddmmBackward0>)

In [44]:
print(model.fc3.bias.grad)

None


In [45]:
# Compute Loss
demo_loss = loss_fn(demo_outputs, demo_targets)

In [46]:
print(model.fc3.bias.grad)

None


In [47]:
# Compute Gradients
demo_loss.backward()

In [48]:
print(model.fc3.bias.grad)

tensor([-0.64,  0.10,  0.10,  0.10,  0.09,  0.10,  0.10, -0.15,  0.10,  0.10])


In [49]:
print(model.fc3.bias)

Parameter containing:
tensor([ 0.08, -0.05, -0.06,  0.03, -0.07,  0.05,  0.07,  0.00,  0.02, -0.08],
       requires_grad=True)


In [50]:
model.fc3.bias - LR * model.fc3.bias.grad

tensor([ 0.14, -0.06, -0.07,  0.02, -0.08,  0.04,  0.06,  0.02,  0.01, -0.09],
       grad_fn=<SubBackward0>)

The `step()` method updates the parameters that were passed to the optimizer (`model.parameters()`). It can be called once the gradients have been computed using `backward()`.

In [51]:
optimizer.step()

In [52]:
print(model.fc3.bias)

Parameter containing:
tensor([ 0.14, -0.06, -0.07,  0.02, -0.08,  0.04,  0.06,  0.02,  0.01, -0.09],
       requires_grad=True)


**Note**: use `optimizer.zero_grad()` after each `.step()` call (because PyTorch accumulates the gradients on subsequent backward passes). `zero_grad()` sets the gradients of all optimized `torch.Tensor`s to zero.

In [53]:
optimizer.zero_grad()

In [54]:
model.fc3.bias.grad

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

**Training Loop**:
```
optimizer = optim(model.parameters(), lr)

for batch_indx, (inputs, targets) in enumerate(full_dataloaders['train']):
    Forward pass:
        outputs = model(inputs)
    Compute Loss:
        loss = loss_fn(outputs, targets)
    Compute Gradients:
        loss.backward()
    Update Parameters:
        optimizer.step()
        optimizer.zero_grad()
```

# Putting it all together


## One epoch of Training Loop:
Below, we have a function that performs one training epoch. It enumerates data from the DataLoader, and on each pass of the loop does the following:

- Gets a batch of training data from the DataLoader
- Performs an inference - that is, gets predictions from the model for an input batch (Forward Pass)
- Calculates the loss for that set of predictions vs. the targets(labels) on the dataset (Computing Loss)
- Calculates the backward gradients over the learning weights (Compute Gradients)
- Tells the optimizer to perform one learning step - that is, adjust the model’s learning weights based on the observed gradients for this batch (stored in each tensor's `.grad`), according to the optimization algorithm we chose. (Update Parameters)
- Zeros the optimizer’s gradients (`optim.zero_grad()`)
- Finally, it accumulates the sum of the losses over all batches and keeps track of the number of correct predictions (to compute accuracy later)

In [55]:
def train_one_epoch(model: nn.Module, optim: torch.optim.Optimizer,
         dataloader: DataLoader, loss_fn):
    """Train `model` for one pass over `dataloader`.

    For every batch: forward pass, loss computation, backward pass, optimizer
    step, then the gradients are cleared so they do not accumulate into the
    next batch.

    Accuracy and loss are averaged over the samples and batches actually
    yielded by `dataloader`, so the result stays correct when the loader does
    not cover the whole dataset (e.g. `drop_last=True` or a subset sampler).

    Args:
        model: network to train; it is switched to training mode.
        optim: optimizer whose `step()` / `zero_grad()` are called per batch.
        dataloader: iterable yielding `(inputs, targets)` batches, where
            `targets` holds one class index per sample.
        loss_fn: callable `loss_fn(outputs, targets)` returning a scalar loss
            tensor.

    Returns:
        `(epoch_acc, epoch_loss)`: accuracy in percent (a 0-dim tensor) and the
        mean of the per-batch losses (a Python float).

    Raises:
        ZeroDivisionError: if `dataloader` yields no batches.
    """
    samples_seen = 0
    batches_seen = 0
    running_corrects = 0
    running_loss = 0.0

    model.train()  # enable training-mode behaviour of layers such as dropout
    for inputs, targets in dataloader:  # Get a batch of Data

        outputs = model(inputs)  # Forward Pass, [N, 10]
        loss = loss_fn(outputs, targets)  # Compute Loss

        loss.backward()  # Compute Gradients
        optim.step()  # Update parameters
        optim.zero_grad()  # zero the parameter's gradients

        # The predicted class is the index of the largest score in each row, [N]
        preds = outputs.argmax(dim=1)
        running_corrects += torch.sum(preds == targets)
        running_loss += loss.item()

        samples_seen += targets.size(0)
        batches_seen += 1

    epoch_acc = (running_corrects / samples_seen) * 100
    epoch_loss = running_loss / batches_seen

    return epoch_acc, epoch_loss

In [56]:
# Explain Order of Operations 

## Evaluating Model: 

In [57]:
def test_model(model: nn.Module,
         dataloader: DataLoader, loss_fn):
    
    # utils
    num_samples = len(dataloader.dataset)
    num_batches = len(dataloader)
    running_corrects = 0
    running_loss = 0.0 
    
    model.eval() # you must call `model.eval()` to set dropout and batch normalization layers to evaluation mode before running inference.
    with torch.no_grad(): # explain
        # more on torch.no_grad(): https://pytorch.org/tutorials/beginner/basics/autogradqs_tutorial.html#disabling-gradient-tracking
        
        for batch_indx, (inputs, targets) in enumerate(dataloader): # Get a batch of Data

            outputs = model(inputs) # Forward Pass
            loss = loss_fn(outputs, targets) # Compute Loss

            # loss.backward() # Compute Gradients
            # optim.step() # Update parameters
            # optim.zero_grad() # zero the parameter's gradients

            _, preds = torch.max(outputs, 1) # 
            running_corrects += torch.sum(preds == targets)
            running_loss += loss.item()

    test_acc = (running_corrects / num_samples) * 100
    test_loss = (running_loss / num_batches)
    
    return test_acc, test_loss

In [58]:
from utils import custom_plot_training_stats

def demo(batch_size=128, num_epochs=10, learning_rate=0.005):
    """Train and evaluate `demo_model` on MNIST, then plot the training curves.

    Each epoch runs `train_one_epoch` on the training split followed by
    `test_model` on the test split; the per-epoch accuracy and loss of both
    splits are collected and handed to `custom_plot_training_stats`.

    Args:
        batch_size: number of samples per batch for both dataloaders.
        num_epochs: number of passes over the training set.
        learning_rate: step size of the SGD optimizer.
    """
    # Both splits share the same preprocessing pipeline.
    transform = torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize((0.1307,), (0.3081,)),
    ])

    trainset = torchvision.datasets.MNIST('MNIST', download=True, train=True, transform=transform)
    testset = torchvision.datasets.MNIST('MNIST', train=False, download=True, transform=transform)

    full_dataloaders = {
        'train': DataLoader(trainset, batch_size=batch_size, shuffle=True),
        'test': DataLoader(testset, batch_size=batch_size)
    }

    model = demo_model()

    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    cross_entropy = nn.CrossEntropyLoss()

    # One entry per epoch and per split.
    acc_history = {'train': [], 'test': []}
    loss_history = {'train': [], 'test': []}

    for _ in trange(num_epochs):
        train_acc, train_loss = train_one_epoch(model=model, optim=optimizer, dataloader=full_dataloaders['train'], loss_fn=cross_entropy)
        test_acc, test_loss = test_model(model=model, dataloader=full_dataloaders['test'], loss_fn=cross_entropy)

        acc_history['train'].append(train_acc)
        acc_history['test'].append(test_acc)
        loss_history['train'].append(train_loss)
        loss_history['test'].append(test_loss)

    custom_plot_training_stats(acc_history, loss_history, ['train', 'test'], title='demo', dir='demo_plots')

In [59]:
demo()

100%|███████████████████████████████████████████| 10/10 [03:39<00:00, 21.90s/it]


<Figure size 1400x600 with 0 Axes>

More Detailed Examples: 
https://pytorch.org/tutorials/beginner/nn_tutorial.html