In [21]:
import numpy as np
from tqdm import tqdm, trange
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor
from torchvision.datasets.mnist import MNIST

np.random.seed(0)
torch.manual_seed(0)

<torch._C.Generator at 0x7ad872ac3070>

In [22]:
# VISION Transformer Implementation
def patchify(images, n_patches):
    n, c, h, w = images.shape

    assert h == w, "Patchify method is implemented for square images only"

    patches = torch.zeros(n, n_patches ** 2, h * w * c // n_patches ** 2)
    patch_size = h // n_patches

    for idx, image in enumerate(images):
        for i in range(n_patches):
            for j in range(n_patches):
                patch = image[:, i * patch_size: (i + 1) * patch_size, j * patch_size: (j + 1) * patch_size]
                patches[idx, i * n_patches + j] = patch.flatten()
    return patches


class MultiheadedSelfAttention(nn.Module):
    def __init__(self, d, n_heads=2):
        super(MultiheadedSelfAttention, self).__init__()
        self.d = d
        self.n_heads = n_heads

        assert d % n_heads == 0, f"Can't divide dimension {d} into {n_heads} heads"

        d_head = int(d / n_heads)
        self.q_mappings = nn.ModuleList([nn.Linear(d_head, d_head) for _ in range(self.n_heads)])
        self.k_mappings = nn.ModuleList([nn.Linear(d_head, d_head) for _ in range(self.n_heads)])
        self.v_mappings = nn.ModuleList([nn.Linear(d_head, d_head) for _ in range(self.n_heads)])
        self.d_head = d_head
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, sequences):
        # Sequences has shape (N, seq_length, token_dim)
        # We go into shape    (N, seq_length, n_heads, token_dim / n_heads)
        # And come back to    (N, seq_length, item_dim)  (through concatenation)
        result = []
        for sequence in sequences:
            seq_result = []
            for head in range(self.n_heads):
                q_mapping = self.q_mappings[head]
                k_mapping = self.k_mappings[head]
                v_mapping = self.v_mappings[head]

                seq = sequence[:, head * self.d_head: (head + 1) * self.d_head]
                q, k, v = q_mapping(seq), k_mapping(seq), v_mapping(seq)

                attention = self.softmax(q @ k.T / (self.d_head ** 0.5))
                seq_result.append(attention @ v)
            result.append(torch.hstack(seq_result))
        return torch.cat([torch.unsqueeze(r, dim=0) for r in result])

# Vision Transformer Block
class ViTBlock(nn.Module):
    def __init__(self, hidden_d, n_heads, mlp_ratio=4):
        super(ViTBlock, self).__init__()
        self.hidden_d = hidden_d
        self.n_heads = n_heads

        self.norm1 = nn.LayerNorm(hidden_d)
        self.mhsa = MultiheadedSelfAttention(hidden_d, n_heads)
        self.norm2 = nn.LayerNorm(hidden_d)
        self.mlp = nn.Sequential(
            nn.Linear(hidden_d, mlp_ratio * hidden_d),
            nn.GELU(),
            nn.Linear(mlp_ratio * hidden_d, hidden_d)
        )

    def forward(self, x):
        out = x + self.mhsa(self.norm1(x))
        out = out + self.mlp(self.norm2(out))
        return out


    
# Vision Transformer Class
class ViT(nn.Module):
    def __init__(self, chw, n_patches=7, n_blocks=2, hidden_d=8, n_heads=2, out_d=10):
        # Super constructor
        super(ViT, self).__init__()
        
        # Attributes
        self.chw = chw # ( C , H , W )
        self.n_patches = n_patches
        self.n_blocks = n_blocks
        self.n_heads = n_heads
        self.hidden_d = hidden_d
        
        # Input and patches sizes
        assert chw[1] % n_patches == 0, "Input shape not entirely divisible by number of patches"
        assert chw[2] % n_patches == 0, "Input shape not entirely divisible by number of patches"
        self.patch_size = (chw[1] / n_patches, chw[2] / n_patches)

        # 1) Linear mapper
        self.input_d = int(chw[0] * self.patch_size[0] * self.patch_size[1])
        self.linear_mapper = nn.Linear(self.input_d, self.hidden_d)
        
        # 2) Learnable classification token
        self.class_token = nn.Parameter(torch.rand(1, self.hidden_d))
        
        # 3) Positional embedding
        self.register_buffer('positional_embeddings', get_positional_embeddings(n_patches ** 2 + 1, hidden_d), persistent=False)
        
        # 4) Transformer encoder blocks
        self.blocks = nn.ModuleList([ViTBlock(hidden_d, n_heads) for _ in range(n_blocks)])
        
        # 5) Classification MLPk
        self.mlp = nn.Sequential(
            nn.Linear(self.hidden_d, out_d),
            nn.Softmax(dim=-1)
        )

    def forward(self, images):
        # Dividing images into patches
        n, c, h, w = images.shape
        patches = patchify(images, self.n_patches).to(self.positional_embeddings.device)
        
        # Running linear layer tokenization
        # Map the vector corresponding to each patch to the hidden size dimension
        tokens = self.linear_mapper(patches)
        
        # Adding classification token to the tokens
        tokens = torch.cat((self.class_token.expand(n, 1, -1), tokens), dim=1)
        
        # Adding positional embedding
        out = tokens + self.positional_embeddings.repeat(n, 1, 1)
        
        # Transformer Blocks
        for block in self.blocks:
            out = block(out)
            
        # Getting the classification token only
        out = out[:, 0]
        
        return self.mlp(out) # Map to output dimension, output category distribution
    

def get_positional_embeddings(sequence_length, d):
    result = torch.ones(sequence_length, d)
    for i in range(sequence_length):
        for j in range(d):
            result[i][j] = np.sin(i / (10000 ** (j / d))) if j % 2 == 0 else np.cos(i / (10000 ** ((j - 1) / d)))
    return result



In [23]:

def main():
    # Loading data
    transform = ToTensor()

    train_set = MNIST(root='./../datasets', train=True, download=True, transform=transform)
    test_set = MNIST(root='./../datasets', train=False, download=True, transform=transform)

    train_loader = DataLoader(train_set, shuffle=True, batch_size=128)
    test_loader = DataLoader(test_set, shuffle=False, batch_size=128)

    # Defining model and training options
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device: ", device, f"({torch.cuda.get_device_name(device)})" if torch.cuda.is_available() else "")
    model = ViT((1, 28, 28), n_patches=7, n_blocks=2, hidden_d=8, n_heads=2, out_d=10).to(device)
    N_EPOCHS = 20
    LR = 0.005

    # Training loop
    optimizer = Adam(model.parameters(), lr=LR)
    criterion = CrossEntropyLoss()
    for epoch in trange(N_EPOCHS, desc="Training"):
        train_loss = 0.0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1} in training", leave=False):
            x, y = batch
            x, y = x.to(device), y.to(device)
            y_hat = model(x)
            loss = criterion(y_hat, y)

            train_loss += loss.detach().cpu().item() / len(train_loader)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(f"Epoch {epoch + 1}/{N_EPOCHS} loss: {train_loss:.2f}")

    # Test loop
    with torch.no_grad():
        correct, total = 0, 0
        test_loss = 0.0
        for batch in tqdm(test_loader, desc="Testing"):
            x, y = batch
            x, y = x.to(device), y.to(device)
            y_hat = model(x)
            loss = criterion(y_hat, y)
            test_loss += loss.detach().cpu().item() / len(test_loader)

            correct += torch.sum(torch.argmax(y_hat, dim=1) == y).detach().cpu().item()
            total += len(x)
        print(f"Test loss: {test_loss:.2f}")
        print(f"Test accuracy: {correct / total * 100:.2f}%")


if __name__ == '__main__':
    main()

Using device:  cuda (Tesla T4)


Training:   0%|          | 0/20 [00:00<?, ?it/s]
Epoch 1 in training:   0%|          | 0/469 [00:00<?, ?it/s][A
Epoch 1 in training:   0%|          | 1/469 [00:00<04:26,  1.76it/s][A
Epoch 1 in training:   0%|          | 2/469 [00:01<04:32,  1.71it/s][A
Epoch 1 in training:   1%|          | 3/469 [00:01<04:27,  1.74it/s][A
Epoch 1 in training:   1%|          | 4/469 [00:02<04:30,  1.72it/s][A
Epoch 1 in training:   1%|          | 5/469 [00:02<04:30,  1.72it/s][A
Epoch 1 in training:   1%|▏         | 6/469 [00:03<04:28,  1.72it/s][A
Epoch 1 in training:   1%|▏         | 7/469 [00:04<04:27,  1.73it/s][A
Epoch 1 in training:   2%|▏         | 8/469 [00:04<04:30,  1.71it/s][A
Epoch 1 in training:   2%|▏         | 9/469 [00:05<04:27,  1.72it/s][A
Epoch 1 in training:   2%|▏         | 10/469 [00:05<04:28,  1.71it/s][A
Epoch 1 in training:   2%|▏         | 11/469 [00:06<04:27,  1.71it/s][A
Epoch 1 in training:   3%|▎         | 12/469 [00:06<04:27,  1.71it/s][A
Epoch 1 in training:

Epoch 1/20 loss: 2.11



Epoch 2 in training:   0%|          | 0/469 [00:00<?, ?it/s][A
Epoch 2 in training:   0%|          | 1/469 [00:00<04:38,  1.68it/s][A
Epoch 2 in training:   0%|          | 2/469 [00:01<05:04,  1.54it/s][A
Epoch 2 in training:   1%|          | 3/469 [00:01<04:47,  1.62it/s][A
Epoch 2 in training:   1%|          | 4/469 [00:02<04:41,  1.65it/s][A
Epoch 2 in training:   1%|          | 5/469 [00:03<04:45,  1.63it/s][A
Epoch 2 in training:   1%|▏         | 6/469 [00:03<04:40,  1.65it/s][A
Epoch 2 in training:   1%|▏         | 7/469 [00:04<04:37,  1.66it/s][A
Epoch 2 in training:   2%|▏         | 8/469 [00:04<04:35,  1.68it/s][A
Epoch 2 in training:   2%|▏         | 9/469 [00:05<04:34,  1.67it/s][A
Epoch 2 in training:   2%|▏         | 10/469 [00:06<04:32,  1.69it/s][A
Epoch 2 in training:   2%|▏         | 11/469 [00:06<04:31,  1.69it/s][A
Epoch 2 in training:   3%|▎         | 12/469 [00:07<04:27,  1.71it/s][A
Epoch 2 in training:   3%|▎         | 13/469 [00:07<04:29,  1.69it/s

Epoch 2/20 loss: 1.85



Epoch 3 in training:   0%|          | 0/469 [00:00<?, ?it/s][A
Epoch 3 in training:   0%|          | 1/469 [00:00<04:34,  1.70it/s][A
Epoch 3 in training:   0%|          | 2/469 [00:01<04:37,  1.68it/s][A
Epoch 3 in training:   1%|          | 3/469 [00:01<04:35,  1.69it/s][A
Epoch 3 in training:   1%|          | 4/469 [00:02<04:31,  1.71it/s][A
Epoch 3 in training:   1%|          | 5/469 [00:02<04:31,  1.71it/s][A
Epoch 3 in training:   1%|▏         | 6/469 [00:03<04:33,  1.69it/s][A
Epoch 3 in training:   1%|▏         | 7/469 [00:04<04:43,  1.63it/s][A
Epoch 3 in training:   2%|▏         | 8/469 [00:04<04:42,  1.63it/s][A
Epoch 3 in training:   2%|▏         | 9/469 [00:05<04:40,  1.64it/s][A
Epoch 3 in training:   2%|▏         | 10/469 [00:06<04:38,  1.65it/s][A
Epoch 3 in training:   2%|▏         | 11/469 [00:06<04:33,  1.67it/s][A
Epoch 3 in training:   3%|▎         | 12/469 [00:07<04:30,  1.69it/s][A
Epoch 3 in training:   3%|▎         | 13/469 [00:07<04:27,  1.70it/s

Epoch 3/20 loss: 1.75



Epoch 4 in training:   0%|          | 0/469 [00:00<?, ?it/s][A
Epoch 4 in training:   0%|          | 1/469 [00:00<04:35,  1.70it/s][A
Epoch 4 in training:   0%|          | 2/469 [00:01<04:31,  1.72it/s][A
Epoch 4 in training:   1%|          | 3/469 [00:01<04:30,  1.72it/s][A
Epoch 4 in training:   1%|          | 4/469 [00:02<04:30,  1.72it/s][A
Epoch 4 in training:   1%|          | 5/469 [00:02<04:28,  1.73it/s][A
Epoch 4 in training:   1%|▏         | 6/469 [00:03<04:26,  1.74it/s][A
Epoch 4 in training:   1%|▏         | 7/469 [00:04<04:26,  1.73it/s][A
Epoch 4 in training:   2%|▏         | 8/469 [00:04<04:26,  1.73it/s][A
Epoch 4 in training:   2%|▏         | 9/469 [00:05<04:25,  1.73it/s][A
Epoch 4 in training:   2%|▏         | 10/469 [00:05<04:25,  1.73it/s][A
Epoch 4 in training:   2%|▏         | 11/469 [00:06<04:24,  1.73it/s][A
Epoch 4 in training:   3%|▎         | 12/469 [00:06<04:23,  1.73it/s][A
Epoch 4 in training:   3%|▎         | 13/469 [00:07<04:22,  1.74it/s

Epoch 4/20 loss: 1.72



Epoch 5 in training:   0%|          | 0/469 [00:00<?, ?it/s][A
Epoch 5 in training:   0%|          | 1/469 [00:00<04:32,  1.72it/s][A
Epoch 5 in training:   0%|          | 2/469 [00:01<04:32,  1.71it/s][A
Epoch 5 in training:   1%|          | 3/469 [00:01<04:33,  1.70it/s][A
Epoch 5 in training:   1%|          | 4/469 [00:02<04:52,  1.59it/s][A
Epoch 5 in training:   1%|          | 5/469 [00:03<04:43,  1.63it/s][A
Epoch 5 in training:   1%|▏         | 6/469 [00:03<04:37,  1.67it/s][A
Epoch 5 in training:   1%|▏         | 7/469 [00:04<04:32,  1.69it/s][A
Epoch 5 in training:   2%|▏         | 8/469 [00:04<04:30,  1.71it/s][A
Epoch 5 in training:   2%|▏         | 9/469 [00:05<04:27,  1.72it/s][A
Epoch 5 in training:   2%|▏         | 10/469 [00:05<04:26,  1.72it/s][A
Epoch 5 in training:   2%|▏         | 11/469 [00:06<04:24,  1.73it/s][A
Epoch 5 in training:   3%|▎         | 12/469 [00:07<04:23,  1.73it/s][A
Epoch 5 in training:   3%|▎         | 13/469 [00:07<04:23,  1.73it/s

Epoch 5/20 loss: 1.70



Epoch 6 in training:   0%|          | 0/469 [00:00<?, ?it/s][A
Epoch 6 in training:   0%|          | 1/469 [00:00<04:33,  1.71it/s][A
Epoch 6 in training:   0%|          | 2/469 [00:01<04:32,  1.71it/s][A
Epoch 6 in training:   1%|          | 3/469 [00:01<04:31,  1.72it/s][A
Epoch 6 in training:   1%|          | 4/469 [00:02<04:31,  1.71it/s][A
Epoch 6 in training:   1%|          | 5/469 [00:02<04:30,  1.71it/s][A
Epoch 6 in training:   1%|▏         | 6/469 [00:03<04:28,  1.72it/s][A
Epoch 6 in training:   1%|▏         | 7/469 [00:04<04:31,  1.70it/s][A
Epoch 6 in training:   2%|▏         | 8/469 [00:04<04:29,  1.71it/s][A
Epoch 6 in training:   2%|▏         | 9/469 [00:05<04:28,  1.72it/s][A
Epoch 6 in training:   2%|▏         | 10/469 [00:05<04:26,  1.72it/s][A
Epoch 6 in training:   2%|▏         | 11/469 [00:06<04:25,  1.73it/s][A
Epoch 6 in training:   3%|▎         | 12/469 [00:06<04:24,  1.73it/s][A
Epoch 6 in training:   3%|▎         | 13/469 [00:07<04:24,  1.72it/s

Epoch 6/20 loss: 1.70



Epoch 7 in training:   0%|          | 0/469 [00:00<?, ?it/s][A
Epoch 7 in training:   0%|          | 1/469 [00:00<04:30,  1.73it/s][A
Epoch 7 in training:   0%|          | 2/469 [00:01<04:34,  1.70it/s][A
Epoch 7 in training:   1%|          | 3/469 [00:01<04:32,  1.71it/s][A
Epoch 7 in training:   1%|          | 4/469 [00:02<04:34,  1.69it/s][A
Epoch 7 in training:   1%|          | 5/469 [00:03<04:49,  1.60it/s][A
Epoch 7 in training:   1%|▏         | 6/469 [00:03<04:44,  1.62it/s][A
Epoch 7 in training:   1%|▏         | 7/469 [00:04<04:42,  1.64it/s][A
Epoch 7 in training:   2%|▏         | 8/469 [00:04<04:37,  1.66it/s][A
Epoch 7 in training:   2%|▏         | 9/469 [00:05<04:33,  1.68it/s][A
Epoch 7 in training:   2%|▏         | 10/469 [00:05<04:32,  1.68it/s][A
Epoch 7 in training:   2%|▏         | 11/469 [00:06<04:32,  1.68it/s][A
Epoch 7 in training:   3%|▎         | 12/469 [00:07<04:30,  1.69it/s][A
Epoch 7 in training:   3%|▎         | 13/469 [00:07<04:29,  1.69it/s

Epoch 7/20 loss: 1.70



Epoch 8 in training:   0%|          | 0/469 [00:00<?, ?it/s][A
Epoch 8 in training:   0%|          | 1/469 [00:00<04:43,  1.65it/s][A
Epoch 8 in training:   0%|          | 2/469 [00:01<04:38,  1.67it/s][A
Epoch 8 in training:   1%|          | 3/469 [00:01<04:35,  1.69it/s][A
Epoch 8 in training:   1%|          | 4/469 [00:02<04:35,  1.69it/s][A
Epoch 8 in training:   1%|          | 5/469 [00:02<04:35,  1.69it/s][A
Epoch 8 in training:   1%|▏         | 6/469 [00:03<04:36,  1.67it/s][A
Epoch 8 in training:   1%|▏         | 7/469 [00:04<04:36,  1.67it/s][A
Epoch 8 in training:   2%|▏         | 8/469 [00:04<04:47,  1.60it/s][A
Epoch 8 in training:   2%|▏         | 9/469 [00:05<04:41,  1.64it/s][A
Epoch 8 in training:   2%|▏         | 10/469 [00:06<04:38,  1.65it/s][A
Epoch 8 in training:   2%|▏         | 11/469 [00:06<04:35,  1.66it/s][A
Epoch 8 in training:   3%|▎         | 12/469 [00:07<04:35,  1.66it/s][A
Epoch 8 in training:   3%|▎         | 13/469 [00:07<04:32,  1.67it/s

Epoch 8/20 loss: 1.69



Epoch 9 in training:   0%|          | 0/469 [00:00<?, ?it/s][A
Epoch 9 in training:   0%|          | 1/469 [00:00<04:48,  1.62it/s][A
Epoch 9 in training:   0%|          | 2/469 [00:01<04:41,  1.66it/s][A
Epoch 9 in training:   1%|          | 3/469 [00:01<04:44,  1.64it/s][A
Epoch 9 in training:   1%|          | 4/469 [00:02<04:41,  1.65it/s][A
Epoch 9 in training:   1%|          | 5/469 [00:03<04:41,  1.65it/s][A
Epoch 9 in training:   1%|▏         | 6/469 [00:03<04:40,  1.65it/s][A
Epoch 9 in training:   1%|▏         | 7/469 [00:04<04:45,  1.62it/s][A
Epoch 9 in training:   2%|▏         | 8/469 [00:04<04:49,  1.59it/s][A
Epoch 9 in training:   2%|▏         | 9/469 [00:05<04:50,  1.59it/s][A
Epoch 9 in training:   2%|▏         | 10/469 [00:06<04:48,  1.59it/s][A
Epoch 9 in training:   2%|▏         | 11/469 [00:06<04:44,  1.61it/s][A
Epoch 9 in training:   3%|▎         | 12/469 [00:07<04:42,  1.62it/s][A
Epoch 9 in training:   3%|▎         | 13/469 [00:08<04:38,  1.64it/s

Epoch 9/20 loss: 1.69



Epoch 10 in training:   0%|          | 0/469 [00:00<?, ?it/s][A
Epoch 10 in training:   0%|          | 1/469 [00:00<04:38,  1.68it/s][A
Epoch 10 in training:   0%|          | 2/469 [00:01<04:38,  1.68it/s][A
Epoch 10 in training:   1%|          | 3/469 [00:01<04:35,  1.69it/s][A
Epoch 10 in training:   1%|          | 4/469 [00:02<04:33,  1.70it/s][A
Epoch 10 in training:   1%|          | 5/469 [00:02<04:30,  1.71it/s][A
Epoch 10 in training:   1%|▏         | 6/469 [00:03<04:30,  1.71it/s][A
Epoch 10 in training:   1%|▏         | 7/469 [00:04<04:43,  1.63it/s][A
Epoch 10 in training:   2%|▏         | 8/469 [00:04<04:37,  1.66it/s][A
Epoch 10 in training:   2%|▏         | 9/469 [00:05<04:32,  1.69it/s][A
Epoch 10 in training:   2%|▏         | 10/469 [00:05<04:29,  1.70it/s][A
Epoch 10 in training:   2%|▏         | 11/469 [00:06<04:28,  1.71it/s][A
Epoch 10 in training:   3%|▎         | 12/469 [00:07<04:26,  1.72it/s][A
Epoch 10 in training:   3%|▎         | 13/469 [00:07<04

Epoch 10/20 loss: 1.68



Epoch 11 in training:   0%|          | 0/469 [00:00<?, ?it/s][A
Epoch 11 in training:   0%|          | 1/469 [00:00<05:13,  1.49it/s][A
Epoch 11 in training:   0%|          | 2/469 [00:01<04:52,  1.60it/s][A
Epoch 11 in training:   1%|          | 3/469 [00:01<04:44,  1.64it/s][A
Epoch 11 in training:   1%|          | 4/469 [00:02<04:38,  1.67it/s][A
Epoch 11 in training:   1%|          | 5/469 [00:03<04:35,  1.68it/s][A
Epoch 11 in training:   1%|▏         | 6/469 [00:03<04:32,  1.70it/s][A
Epoch 11 in training:   1%|▏         | 7/469 [00:04<04:29,  1.71it/s][A
Epoch 11 in training:   2%|▏         | 8/469 [00:04<04:26,  1.73it/s][A
Epoch 11 in training:   2%|▏         | 9/469 [00:05<04:24,  1.74it/s][A
Epoch 11 in training:   2%|▏         | 10/469 [00:05<04:22,  1.75it/s][A
Epoch 11 in training:   2%|▏         | 11/469 [00:06<04:22,  1.75it/s][A
Epoch 11 in training:   3%|▎         | 12/469 [00:07<04:20,  1.75it/s][A
Epoch 11 in training:   3%|▎         | 13/469 [00:07<04

Epoch 11/20 loss: 1.69



Epoch 12 in training:   0%|          | 0/469 [00:00<?, ?it/s][A
Epoch 12 in training:   0%|          | 1/469 [00:00<04:25,  1.76it/s][A
Epoch 12 in training:   0%|          | 2/469 [00:01<04:25,  1.76it/s][A
Epoch 12 in training:   1%|          | 3/469 [00:01<04:25,  1.76it/s][A
Epoch 12 in training:   1%|          | 4/469 [00:02<04:22,  1.77it/s][A
Epoch 12 in training:   1%|          | 5/469 [00:02<04:22,  1.77it/s][A
Epoch 12 in training:   1%|▏         | 6/469 [00:03<04:21,  1.77it/s][A
Epoch 12 in training:   1%|▏         | 7/469 [00:03<04:20,  1.77it/s][A
Epoch 12 in training:   2%|▏         | 8/469 [00:04<04:20,  1.77it/s][A
Epoch 12 in training:   2%|▏         | 9/469 [00:05<04:23,  1.75it/s][A
Epoch 12 in training:   2%|▏         | 10/469 [00:05<04:21,  1.75it/s][A
Epoch 12 in training:   2%|▏         | 11/469 [00:06<04:21,  1.75it/s][A
Epoch 12 in training:   3%|▎         | 12/469 [00:06<04:19,  1.76it/s][A
Epoch 12 in training:   3%|▎         | 13/469 [00:07<04

Epoch 12/20 loss: 1.69



Epoch 13 in training:   0%|          | 0/469 [00:00<?, ?it/s][A
Epoch 13 in training:   0%|          | 1/469 [00:00<04:25,  1.76it/s][A
Epoch 13 in training:   0%|          | 2/469 [00:01<04:25,  1.76it/s][A
Epoch 13 in training:   1%|          | 3/469 [00:01<04:23,  1.77it/s][A
Epoch 13 in training:   1%|          | 4/469 [00:02<04:48,  1.61it/s][A
Epoch 13 in training:   1%|          | 5/469 [00:03<05:25,  1.43it/s][A
Epoch 13 in training:   1%|▏         | 6/469 [00:03<05:04,  1.52it/s][A
Epoch 13 in training:   1%|▏         | 7/469 [00:04<04:50,  1.59it/s][A
Epoch 13 in training:   2%|▏         | 8/469 [00:04<04:42,  1.63it/s][A
Epoch 13 in training:   2%|▏         | 9/469 [00:05<04:35,  1.67it/s][A
Epoch 13 in training:   2%|▏         | 10/469 [00:06<04:30,  1.70it/s][A
Epoch 13 in training:   2%|▏         | 11/469 [00:06<04:26,  1.72it/s][A
Epoch 13 in training:   3%|▎         | 12/469 [00:07<04:23,  1.73it/s][A
Epoch 13 in training:   3%|▎         | 13/469 [00:07<04

Epoch 14/20 loss: 1.68



Epoch 15 in training:   0%|          | 0/469 [00:00<?, ?it/s][A
Epoch 15 in training:   0%|          | 1/469 [00:00<05:11,  1.50it/s][A
Epoch 15 in training:   0%|          | 2/469 [00:01<04:44,  1.64it/s][A
Epoch 15 in training:   1%|          | 3/469 [00:01<04:43,  1.64it/s][A
Epoch 15 in training:   1%|          | 4/469 [00:02<04:42,  1.65it/s][A
Epoch 15 in training:   1%|          | 5/469 [00:03<04:38,  1.67it/s][A
Epoch 15 in training:   1%|▏         | 6/469 [00:03<04:33,  1.69it/s][A
Epoch 15 in training:   1%|▏         | 7/469 [00:04<04:31,  1.70it/s][A
Epoch 15 in training:   2%|▏         | 8/469 [00:04<04:28,  1.72it/s][A
Epoch 15 in training:   2%|▏         | 9/469 [00:05<04:26,  1.73it/s][A
Epoch 15 in training:   2%|▏         | 10/469 [00:05<04:24,  1.73it/s][A
Epoch 15 in training:   2%|▏         | 11/469 [00:06<04:22,  1.74it/s][A
Epoch 15 in training:   3%|▎         | 12/469 [00:07<04:21,  1.75it/s][A
Epoch 15 in training:   3%|▎         | 13/469 [00:07<04

Epoch 15/20 loss: 1.68



Epoch 16 in training:   0%|          | 0/469 [00:00<?, ?it/s][A
Epoch 16 in training:   0%|          | 1/469 [00:00<04:34,  1.71it/s][A
Epoch 16 in training:   0%|          | 2/469 [00:01<04:30,  1.72it/s][A
Epoch 16 in training:   1%|          | 3/469 [00:01<04:30,  1.72it/s][A
Epoch 16 in training:   1%|          | 4/469 [00:02<04:29,  1.72it/s][A
Epoch 16 in training:   1%|          | 5/469 [00:02<04:28,  1.73it/s][A
Epoch 16 in training:   1%|▏         | 6/469 [00:03<04:28,  1.72it/s][A
Epoch 16 in training:   1%|▏         | 7/469 [00:04<04:28,  1.72it/s][A
Epoch 16 in training:   2%|▏         | 8/469 [00:04<04:28,  1.72it/s][A
Epoch 16 in training:   2%|▏         | 9/469 [00:05<04:44,  1.62it/s][A
Epoch 16 in training:   2%|▏         | 10/469 [00:05<04:40,  1.64it/s][A
Epoch 16 in training:   2%|▏         | 11/469 [00:06<04:37,  1.65it/s][A
Epoch 16 in training:   3%|▎         | 12/469 [00:07<04:33,  1.67it/s][A
Epoch 16 in training:   3%|▎         | 13/469 [00:07<04

Epoch 16/20 loss: 1.68



Epoch 17 in training:   0%|          | 0/469 [00:00<?, ?it/s][A
Epoch 17 in training:   0%|          | 1/469 [00:00<04:31,  1.72it/s][A
Epoch 17 in training:   0%|          | 2/469 [00:01<04:35,  1.69it/s][A
Epoch 17 in training:   1%|          | 3/469 [00:01<04:32,  1.71it/s][A
Epoch 17 in training:   1%|          | 4/469 [00:02<04:32,  1.71it/s][A
Epoch 17 in training:   1%|          | 5/469 [00:02<04:31,  1.71it/s][A
Epoch 17 in training:   1%|▏         | 6/469 [00:03<04:29,  1.72it/s][A
Epoch 17 in training:   1%|▏         | 7/469 [00:04<04:28,  1.72it/s][A
Epoch 17 in training:   2%|▏         | 8/469 [00:04<04:27,  1.72it/s][A
Epoch 17 in training:   2%|▏         | 9/469 [00:05<04:27,  1.72it/s][A
Epoch 17 in training:   2%|▏         | 10/469 [00:05<04:29,  1.70it/s][A
Epoch 17 in training:   2%|▏         | 11/469 [00:06<04:26,  1.72it/s][A
Epoch 17 in training:   3%|▎         | 12/469 [00:07<04:26,  1.71it/s][A
Epoch 17 in training:   3%|▎         | 13/469 [00:07<04

Epoch 17/20 loss: 1.68



Epoch 18 in training:   0%|          | 0/469 [00:00<?, ?it/s][A
Epoch 18 in training:   0%|          | 1/469 [00:00<04:37,  1.69it/s][A
Epoch 18 in training:   0%|          | 2/469 [00:01<04:31,  1.72it/s][A
Epoch 18 in training:   1%|          | 3/469 [00:01<04:29,  1.73it/s][A
Epoch 18 in training:   1%|          | 4/469 [00:02<04:26,  1.74it/s][A
Epoch 18 in training:   1%|          | 5/469 [00:02<04:43,  1.64it/s][A
Epoch 18 in training:   1%|▏         | 6/469 [00:03<04:39,  1.66it/s][A
Epoch 18 in training:   1%|▏         | 7/469 [00:04<04:36,  1.67it/s][A
Epoch 18 in training:   2%|▏         | 8/469 [00:04<04:33,  1.68it/s][A
Epoch 18 in training:   2%|▏         | 9/469 [00:05<04:30,  1.70it/s][A
Epoch 18 in training:   2%|▏         | 10/469 [00:05<04:27,  1.71it/s][A
Epoch 18 in training:   2%|▏         | 11/469 [00:06<04:29,  1.70it/s][A
Epoch 18 in training:   3%|▎         | 12/469 [00:07<04:26,  1.71it/s][A
Epoch 18 in training:   3%|▎         | 13/469 [00:07<04

Epoch 18/20 loss: 1.68



Epoch 19 in training:   0%|          | 0/469 [00:00<?, ?it/s][A
Epoch 19 in training:   0%|          | 1/469 [00:00<04:23,  1.78it/s][A
Epoch 19 in training:   0%|          | 2/469 [00:01<04:25,  1.76it/s][A
Epoch 19 in training:   1%|          | 3/469 [00:01<04:26,  1.75it/s][A
Epoch 19 in training:   1%|          | 4/469 [00:02<04:26,  1.74it/s][A
Epoch 19 in training:   1%|          | 5/469 [00:02<04:27,  1.73it/s][A
Epoch 19 in training:   1%|▏         | 6/469 [00:03<04:29,  1.72it/s][A
Epoch 19 in training:   1%|▏         | 7/469 [00:04<04:27,  1.72it/s][A
Epoch 19 in training:   2%|▏         | 8/469 [00:04<04:24,  1.74it/s][A
Epoch 19 in training:   2%|▏         | 9/469 [00:05<04:24,  1.74it/s][A
Epoch 19 in training:   2%|▏         | 10/469 [00:05<04:23,  1.74it/s][A
Epoch 19 in training:   2%|▏         | 11/469 [00:06<04:24,  1.73it/s][A
Epoch 19 in training:   3%|▎         | 12/469 [00:06<04:24,  1.73it/s][A
Epoch 19 in training:   3%|▎         | 13/469 [00:07<04

Epoch 19/20 loss: 1.68



Epoch 20 in training:   0%|          | 0/469 [00:00<?, ?it/s][A
Epoch 20 in training:   0%|          | 1/469 [00:00<04:29,  1.74it/s][A
Epoch 20 in training:   0%|          | 2/469 [00:01<04:32,  1.71it/s][A
Epoch 20 in training:   1%|          | 3/469 [00:01<04:30,  1.72it/s][A
Epoch 20 in training:   1%|          | 4/469 [00:02<04:26,  1.74it/s][A
Epoch 20 in training:   1%|          | 5/469 [00:02<04:25,  1.75it/s][A
Epoch 20 in training:   1%|▏         | 6/469 [00:03<04:24,  1.75it/s][A
Epoch 20 in training:   1%|▏         | 7/469 [00:04<04:24,  1.75it/s][A
Epoch 20 in training:   2%|▏         | 8/469 [00:04<04:24,  1.74it/s][A
Epoch 20 in training:   2%|▏         | 9/469 [00:05<04:27,  1.72it/s][A
Epoch 20 in training:   2%|▏         | 10/469 [00:05<04:48,  1.59it/s][A
Epoch 20 in training:   2%|▏         | 11/469 [00:06<04:41,  1.63it/s][A
Epoch 20 in training:   3%|▎         | 12/469 [00:07<04:35,  1.66it/s][A
Epoch 20 in training:   3%|▎         | 13/469 [00:07<04

Epoch 20/20 loss: 1.69


Testing: 100%|██████████| 79/79 [00:27<00:00,  2.91it/s]

Test loss: 1.68
Test accuracy: 77.89%



