In [None]:
!pip install torchmetrics

Collecting torchmetrics
  Downloading torchmetrics-1.7.0-py3-none-any.whl.metadata (21 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.14.3-py3-none-any.whl.metadata (5.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->torchmetrics)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->torchmetrics)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->torchmetrics)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->torchmetrics)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->torchmetrics)
  D

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchvision
import torchvision.datasets as torch_datasets

from torchvision.transforms import transforms
from torch.utils.data import DataLoader, Dataset, SubsetRandomSampler

import matplotlib.pyplot as plt
import numpy as np

import math
from tqdm.auto import tqdm

from torchmetrics import Accuracy

from timeit import default_timer as timer



In [7]:
"""
 H(x) be stacked layers of Convolutions and BatchNorms, when these layers are stacked,
instead of learning this layers, which are stacked, which could be computationally expensive and also might lead to issues of vanishing or exploding gradients,
one method could be learn the underlying mapping of this stack or block; in this case, say if x is the input and H(x) is the output, the we could try to focus on the residual F(x), which is equal to H(x) - x.
And now, if H(x) outputs a value that is approximately close to x, meaning they are approximately equal then F(x) would be approximately close to 0 or if H(x)
despite having complex convolutional operations if it outputs some linear approximation of the input x, rather than trying to have the complicated stacked computations of H(x)
or even if it has some non-linear approximations, when there are deep layers, F(x) could still be proven to offer better way to handle,
where it captures a potential linear or non-linear mapping that has an optimal approximation, not if the exact an optimal approximation.
Where, if this method is carried out in multiple blocks, where in each block the function or mapping is trying to get F(x) r
ather than deep or complex notions of stacked layers of H(x), it would also help in capturing the global and most generalized representation of the mapping,
which not only addresses the issues of over-fitting but also potentially prevents from degradation.
################################################################################


residual_block:
    - goal: the goal of resnet is to find an optimal residual function that captures the underlying pattern
    - let this block with stacks of layers represent H(x), where x is the input and H(x) is a combined stack layer or composition of convolutions and batchnorm
    - the stack of layers or H(x) and it's input x can be uesd together to find a residual function F(x), which captures the underlying mapping or computations of the stack
                which could be represented as F(x) = H(x) -x, where here F(x) could be considered as an approximate representation of the underlying computtations of H applied on x

basic_residual_block: (num_in_channels, num_out_channels, stride)
    -in_channels: represents number of input_channels that this block takes
    -out_channels: represents number of output_channels that this block outputs
    - stride: stride of the filters that traverses using convolutions across the image

    if this particular block has two stacked layers, then they could be represented as follows:

        # assuming that the given stride is 1, however, changing stride could change the dimensions
        # block1:  this block changes the channel size as required,
        with filter size as per standard approach is taken as 3, and with padding and stride of 1, this layer doesn't change the spatial dimension but only the channel depth

        conv1 = (input_channels, output_channels, filter=3, stride=stride, padding=1)
        batchnorm(output_channels), # normalizes the values across the volume of the output from conv1
        --------------------------------------------------------------------------------------------------------------------

        # block2: this block also updates the channel size as required and following similar convention of block 1
        the output_channels from the previous block becomes the input_channels to this conv2 block,
        this block also ensures that by hardcoding stride as 1, it ensures that the height and width remains the same as the output block

        conv2 = (output_channels, output_channels, filter=3, stride=1, padding=1)
        batchnorm(output_channels), # normalizes the values across the volume of the output from conv1
        --------------------------------------------------------------------------------------------------------------------

        for the residual function F(x) = H(x) - x to work, they all must have same (channel_count, height, width)

        here, since H(x) has already learnt the weights that correctly map the input and output,
        rather than changing the dimension of the output of H(x) to match the input, an optimal way would be
        to project the input so that it matches the output dimension H(x) and also offering efficient way to comput F(x).

         basically, for the residual F(x) to work, the output of H(x) must have the same (channel_count, height, width) of the input x

        for example: if the input is (3, 224, 224)  and the output of H(x) is (64, 112, 112), then for F(x) to be able to learn the mapping,
        the input x must be projected in such a way that it has the same dimensions of H(x),
        where F(x) = H(x) - x, x is now projected into same spatial dimension as H(x)
        ------------------------------------------------------------------------------------------------------------------
        let x be an input of shape (3, 224, 224) and it goes through H(x) stack in the following manner:
        ### case1: stride = 1, padding =1, kernel = 3
        conv_1: input(3, 224, 224) --(in_channels=3, out_channels=64, kernel=3, stride=1, padding=1) --> output(64, 224, 224)
        batchnorm(64)

        # second convolution block has hardcoded stride and padding values of 1 with kernel size 3 so it outputs the same dimension as input
        conv_2: input(64, 224, 224) --(in_channels=3, out_channels=64, kernel=3, stride=1, padding=1)--> output(64, 224, 224)
        batchnorm(64)

        result: however, here shape of channels is not the same despite having retiaining the dimensions of height and width, which must  be adjusted
        ------------------------------------------------------------------------------------------------------------------
        ### case2: if stride != 1 assume it to be 2, then it reduces the output size by downsampling the dimensions
        conv_1: input(3, 224, 224) --(in_channels=3, out_channels=64, kernel=3, stride=2, padding=1) --> output(64, 112, 112)
        batchnorm(64)

        # second convolution block has hardcoded stride and padding values of 1 with kernel size 3 so it outputs the same dimension as input
        conv_2: input(64, 112, 112) --(in_channels=3, out_channels=64, kernel=3, stride=1, padding=1)--> output(64, 112, 112)
        batchnorm(64)

        result: here, shape of both height and width and channels are mismath, which must be adjusted
        ------------------------------------------------------------------------------------------------------------------

        ### case3: if stride == 1 and input_channels== output_channels = 64
        conv_1: input(64, 112, 112) --(in_channels=3, out_channels=64, kernel=3, stride=1, padding=1) --> output(64, 112, 112)
        batchnorm(64)

        # second convolution block has hardcoded stride and padding values of 1 with kernel size 3 so it outputs the same dimension as input
        conv_2: input(64, 112, 112) --(in_channels=3, out_channels=64, kernel=3, stride=1, padding=1)--> output(64, 112, 112)
        batchnorm(64)

        result: here, shape of both height and width and also color channels remain the same, so there is no need for adjustment

        shape(H(x)) == shape(x) = (64, 112, 112), then F(x) is also of shape (64, 112, 112) and F(x) = H(x) - x
                H(x) = F(x) + x; where now the model's goal is to learn and approximate F(x)
        ------------------------------------------------------------------------------------------------------------------

        considering case 1 and  2 with mismatched dimensions, they can be adjusted as follows:
        here, x must be adjusted by projecting it to have the same spatial dimension of H(x)

        for example, H(x),shape = (64, 112, 112) and x.shape = (3, 224, 224)
        adjusted_x = input(3, 224, 224) --(in_channels=3, out_channels=64, stride=2, padding=1)--> (64. 112. 112)
        in the above operation the stride must be equal to the same stride that was used in the convolution operation
        since the second convolution operation does not change or alter the dimension

        once done, H(x) and x have the same dimension, where the goal would be then to learn the residual F(x) + x
        which captrues the linear or non-linear transformation that was captrured by stacked layers in H(x)


"""
import torch
import torch.nn as nn
import torch.nn.functional as F

import math
class ResidualBasicBlock(nn.Module):

    """
    ResidualBasicBlock: to compute the residual, a learnable from stacks of layers, it may not capture the exact computation of the stacks of layers,
    however, it could help in capturing a more generalized approximation, which could be more helpful, especially with diverse datasets

    for a given input data x if the stacks of convolution and batchnorm operations are represented by H(x),
    the residual could be represented by F(x), where F(x) = H(x) - x => H(x) = F(x) + x
    """

    """
    in the following code, just as a representation, input_channels are assumed to be (3, 224, 224)
    bias is set to false as batchnormalization scales and adjusts the values, where using additional learning parameter of bias would become redundant
    and also adds computational costs, where the bias even if added would become negligble as batchnorm already scales and adjusts in an optimal manner,
    additionaly batchnorm's beta values in the equation y * (conv(x) - mean)/stand_dev + beta;
    if not exactly the same, but approximately captures/includes bias value which could also be optimal for more generalized predictions
    output_size = floor(((image_size_1 + 2*padding - filter)/stride) + 1)
    """
    def __init__(self, input_channels, output_channels, stride=1):
        """
        Args:
            input_channels: number of  channels in the input passed into the block
            output_channels: number of output channels of this block
            stride: stride of the filter that is used to traverse through the image, is only used in the first convolution layer
                    where if the stride is other than 1 then it is used again in the projecttion matrix to readjust the projected_x to match the dimension of the output
        """
        super(ResidualBasicBlock, self).__init__()

        # below coded inputs and outputs are just a conceptualization and not the exact representation, assuming height and width are of same dimension image_size_

        # if in_channels = 3 and image_size=224 and output_channels=64
        # stride=1: input(3, 224, 224) -> output(64, 224, 224)
        # stride=2: input(3, 224, 224) -> output(64, 112, 112), in which case x is projected to match the size of the output of this stack H(x)

        # in this first layer conv_1, stride can be adjusted
        self.conv_1 = nn.Conv2d(in_channels=input_channels, out_channels=output_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.batchnorm_1 = nn.BatchNorm2d(output_channels)

        # in this second layer conv_2, stride is hardcoded to be 1 so it doesn't downsample the output of conv_1
        # input(64, 224, 224) -> output(64, 224, 224) or input(64, 112, 112) -> output(64, 112, 112)
        self.conv_2 = nn.Conv2d(in_channels=output_channels, out_channels=output_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.batchnorm_2 = nn.BatchNorm2d(output_channels)


        """
        in the conv_1 layer, if stride != 1, the input and output dimensions of H(x) do not match, thus x must be projected to match the dimension of H(x)
        if in_channels != out_channels, then x must be projected to match the dimension of H(x),
        since the goal is to get F(x) = H(x) -x => H(x) = F(x) -x
        since the conv_2 layer doesn't change the spatial dimension, there is no need of checking or having any additional computations for that

        both the channel and height or width could be adjusted in the following manner,
        however, if stride == 1 and input_channels == output_channels, then x could be just sent as is without any update
        these representation is generally referred to as shortcut, so the same name is used here to keep it  consistent
        """
        self.shortcut = nn.Sequential() # when an input tensor is passed into this it returns the same without any changes => shortcut(x) == x

        if (stride != 1) or input_channels != output_channels:
            """
            since stride values are only dynamically added to conv_1 layer, the same stride value can be used here to project x to match the output dimension of H(x)
            and since there was one such operation, x can also be projected only once to match it
            """
            self.shortcut = nn.Sequential(
                                                            nn.Conv2d(in_channels=input_channels, out_channels=output_channels, kernel_size=1, stride=stride, padding=0), #x: input(3, 224, 224) -> projected_x: output(64, 112, 112)
                                                            nn.BatchNorm2d(output_channels)
            )

    # forward propagation
    def forward(self, x):

        # if stride != 1 or if input_channels != output_channels then the input is projected to match the output dimension and if not it outputs the input x without any transformation
        identity = self.shortcut(x)
        x = self.conv_1(x)
        x = self.batchnorm_1(x)
        x = self.conv_2(x)
        x = self.batchnorm_2(x)
        x += identity # H(x) = F(x) + x
        x = F.relu(x)

        return x

"""
in the residual network, a stack of layers could be represented by H(x), and if x is the input,
one of the ways to capture the computation is by capturing the change of the stacked layers represented by H(x), where if x is the input,
then such change could be mapped by a function F(x) = H(x) - x, where if x and H(x) have the same dimension then x could be used as is,
however, if H(x) and x do not have the same dimension then x is projected so that it matches the dimension of H(x),
where the goal would be to approximate the F(x) + x that could be close to H(x) where it need be exactly as H(x),
but approximately close to H(x), where such approximation might be missing the exact mapping,
 it could also be considered beneficial as it is a more generalized representation of the network,
  which could be beneficial in networks with deeper layers and also in data which could be diverse.
"""
# resnet architecture

class ResNetBasic(nn.Module):
    """
    input image (tensor representation) must be resized to 224, 224
    however, if the input_size is anything other than 224, then the adapativemaxpool adjusts so that first block always outputs 56
    """
    def __init__(self, num_classes=10):

        super(ResNetBasic, self).__init__()

        self.conv_1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) # input(3, 224, 224) - > output(64, 112, 112)
        self.batchnorm_1 = nn.BatchNorm2d(64)
        self.adaptive_maxpool_1 = nn.AdaptiveMaxPool2d(56) # input (64, 112, 112) -> output(64, 56, 56)

        self.stage_1 = self._make_stack(64, 64, stride=1, num_blocks=2) #input (64, 56, 56) -> output (64, 56, 56)
        self.stage_2 = self._make_stack(64, 128, stride=2, num_blocks=2) # input(64, 56, 56) -> output(128, 28, 28)
        self.stage_3 = self._make_stack(128, 256, stride=2, num_blocks=2) # input(128, 28, 28) -> output(256, 14, 14)
        self.stage_4 = self._make_stack(256, 512, stride=2, num_blocks=2) # input(256, 14, 14) -> output(512, 7, 7)

        self.adaptive_avgpool = nn.AdaptiveAvgPool2d((1, 1)) # input(512, 7, 7) -> output(512, 1, 1)
        self.fully_connected = nn.Linear(512, num_classes)

        self.flatten = nn.Flatten(start_dim=1) # considering that 0th dim is batch_number

    def _make_stack(self, input_channels, output_channels, stride, num_blocks):
        """
        this method uses the residual block class defined above to make stack of layers with x as input:
        conv->batchnorm->conv->batchnorm-> = y
        x_projected = project(x) to match the dimension of y
        output = relu(x_projected + y)
        """
        layers = []
        layers.append(ResidualBasicBlock(input_channels, output_channels, stride)) # may downsample if stride > 1

        for _ in range(1, num_blocks):
            #in this loop, residual_basic_blocks are added without changing the stride or output channels
            layers.append(ResidualBasicBlock(output_channels, output_channels, stride=1))

        return nn.Sequential(*layers)

    # forward propagation
    def forward(self, x):
        x = self.conv_1(x)
        x = self.batchnorm_1(x)
        x = self.adaptive_maxpool_1(x)
        x = self.stage_1(x)
        x = self.stage_2(x)
        x = self.stage_3(x)
        x = self.stage_4(x)
        x = self.adaptive_avgpool(x)
        x = self.flatten(x)
        x = self.fully_connected(x)
        return x



In [13]:
RANDOM_SEED = 32
BATCH_SIZE = 64
MODEL_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
torch.manual_seed(RANDOM_SEED)

accuracy_metrics = Accuracy(task="multiclass", num_classes=10).to(MODEL_DEVICE)

In [14]:
import os
NUM_CPUS = os.cpu_count()

# normalized mean and standard deviation values that are used in transforming data, set per CIFAR10 datset
NORMALIZED_MEAN=[0.4914, 0.4822, 0.4465]
NORMALIZED_STD = [0.2023, 0.1994, 0.2010]

torch.manual_seed(RANDOM_SEED)     # manual seed to ensure that the data can be reproducible

def cifar10data(
        data_dir = '/data/cifar10',
        batch_size=32,
        image_size=32,
        test=False,
        eval_size=0.1,
        normalized_mean=[0.4914, 0.4822, 0.4465],
        normalized_std=[0.2023, 0.1994, 0.2010],
        num_cpus = NUM_CPUS):
    """
    loads cifar10 dataset and returns dataloaders
    Args:
        - datadir (str): root directory or the folder to where the data must exist
        - image_size (int): adjusts the size of the image from the datasets to (image_size, image_size), ex: (224, 224)
        - train (bool): if True, will use the train split of the dataset
        - eval_size(float): size of the validation dataset from the training dataset, if the eval_size is 0.1, then the valid_dataset size would be 10% of the training dataset
        - batch_size= size of batch that DataLoader process  (number of samples per batch)
        - normalized_mean: values used to normalize the mean of tensor representation of data
        - normalized_std: values used to normalized the standard deviation of tensor representation of data
    Return:
        - if train is set to false, will return test DataLoader
        - if train is set to true, splits cifar10 dataset into train and test data and returns respective dataloaders along with class labels
    """

    # resize the data to the give image_size, transform to tensors, and normalize mean and standard deviation
    transform_data = transforms.Compose([
        transforms.Resize((image_size, image_size)),
        transforms.ToTensor(),
        transforms.Normalize(mean=normalized_mean, std=normalized_std)
    ])

    if test:
        test_dataset = torch_datasets.CIFAR10(root=data_dir, train=False, transform=transform_data, download=True)
        dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, num_workers=num_cpus)
        return dataloader
    else:
        """
        training dataset is split into train and evalation sections using randomsampler, and also shuffle is turned off as randomsampler already samples from random indicies without a strict order as defined in the dataset
        """
        train_dataset = torch_datasets.CIFAR10(root=data_dir, train=True, transform=transform_data, download=True)
        len_train_dataset = len(train_dataset)  # gets the length of the train_dataset which are then converted to indicies
        indicies = list(range(len_train_dataset))
        split = int(len_train_dataset*eval_size) # number that splits the training data into train and evaluation sections
        train_idx, eval_idx = indicies[:split], indicies[split:] # splits the indicies that are used in randomsampler method to get random indicies of the respective split size

        train_sampler = SubsetRandomSampler(train_idx)
        eval_sampler = SubsetRandomSampler(eval_idx)

        # train and eval dataloaders that uses randomsampler to split it respectively
        train_dataloader = DataLoader(train_dataset, batch_size, sampler=train_sampler, num_workers=num_cpus)
        eval_dataloader = DataLoader(train_dataset, batch_size, sampler=eval_sampler, num_workers=num_cpus)


        image, label = next(iter(train_dataloader))
        data_attributes = {
            #"class_labels": train_dataset.classes,
            "classes_to_index": train_dataset.class_to_idx,
            "image/tensor shape": image[0].shape,
            "dataloader_shape": image.shape,
            "len_train_data(dataloader)": len(train_dataloader) * batch_size,
            "len_evaal_data(dataloader)": len(eval_dataloader) * batch_size,
            "classes\target_labels": len(train_dataset.classes)
        }

        return train_dataloader, eval_dataloader, data_attributes

In [10]:
cifar10_train_data, cifar10_eval_data, cifar_data_attributes = cifar10data()
cifar10_test_data = cifar10data(test=True)
cifar_data_attributes

100%|██████████| 170M/170M [00:02<00:00, 70.2MB/s]


{'classes_to_index': {'airplane': 0,
  'automobile': 1,
  'bird': 2,
  'cat': 3,
  'deer': 4,
  'dog': 5,
  'frog': 6,
  'horse': 7,
  'ship': 8,
  'truck': 9},
 'image/tensor shape': torch.Size([3, 32, 32]),
 'dataloader_shape': torch.Size([32, 3, 32, 32]),
 'len_train_data(dataloader)': 5024,
 'len_evaal_data(dataloader)': 45024,
 'classes\target_labels': 10}

In [16]:
# initialize the ResNetBasic network
res_model_v0 = ResNetBasic().to(MODEL_DEVICE)

# optimizer and loss functions
optimizer = optim.Adam(params=res_model_v0.parameters())
criterion = nn.CrossEntropyLoss()


NUM_EPOCHS = 10 # number of training loops

In [17]:
torch.manual_seed(RANDOM_SEED)     # manual seed to ensure that the data can be reproducible

start_time= timer()
for epoch in tqdm(range(NUM_EPOCHS)):
    print(f"epoch:{epoch}______________")

    accuracy_metrics.reset() # reset the accuracy metrics to prevent accumulation

    # metrics for training
    train_loss_list = [] # append the training loss per batch, keeps track of individual losses, it could be summed up and divided to get the averge training loss per batch
    eval_loss_list = [] # similar to training loss, but used in evaluation loss which is done in evaluation method
    eval_accuracy_list = [] # evaluate predictions against the truth labels and append to the list

    # training loop
    for batch, (images, labels) in enumerate(cifar10_train_data): # traverse/iterates over each batch
        images, labels = images.to(MODEL_DEVICE), labels.to(MODEL_DEVICE) #move data to MODEL_DEVICE which is used both on model and also in evaluation

        res_model_v0.train() # set the model to train mode
        optimizer.zero_grad() # reset the optimizers to prevent gradient accumulation

        train_logits = res_model_v0(images) # model outputs logits as softmax is not applied in the model
        train_loss = criterion(train_logits, labels) # calculate the loss between predicted logits and the actual truth values of target

        train_loss_list.append(train_loss.item()) # append the loss to the list of loss

        train_loss.backward() # back propagation
        optimizer.step() # updates the weights and other learnable parameters (this training model is used for resnet, so it tries to learn F(x) + x or approximate values or the outputs of H(x))


    # evaluate the model at the end of each epoch to check the training and also evaluation
    res_model_v0.eval() # sets the model to evaluation mode
    for eval_batch, (eval_images, eval_labels) in enumerate(cifar10_eval_data): # traverses/iterates over each batch
        eval_images, eval_labels = eval_images.to(MODEL_DEVICE), eval_labels.to(MODEL_DEVICE)

        with torch.inference_mode(): #  disables gradient_descent / learning
            eval_logits = res_model_v0(eval_images)
            eval_loss = criterion(eval_logits, eval_labels)
            eval_loss_list.append(eval_loss.item())

            #torch.softmax(batch_size, num_classes) when dim=1, it takes probabilities for each class in the row
            # torch.argmax when dim=1 : takes the class index with highest probabilitiy for each sample
            eval_predictions = torch.argmax(torch.softmax(eval_logits, dim=1), dim=1)
            eval_accuracy = accuracy_metrics(eval_predictions, eval_labels).item() # evaluates the accuracy with true_positives/(true_positivies + false_positivies)
            eval_accuracy_list.append(eval_accuracy)

    # average out and print loss and accuracy
    avg_train_loss = sum(train_loss_list) / len(train_loss_list)
    avg_eval_loss = sum(eval_loss_list) / len(eval_loss_list)
    avg_eval_accuracy = sum(eval_accuracy_list) / len(eval_accuracy_list)
    print(f"avg_train_loss: {avg_train_loss} | avg_eval_loss:{avg_eval_loss} | avg_eval_accuracy:{avg_eval_accuracy}")

end_time = timer()

print(f"total_train_time: {end_time-start_time}")

  0%|          | 0/10 [00:00<?, ?it/s]

epoch:0______________
avg_train_loss: 2.004547108510497 | avg_eval_loss:2.1629204389976118 | avg_eval_accuracy:0.2564410092395167
epoch:1______________
avg_train_loss: 1.7953805065458748 | avg_eval_loss:1.8948389577289464 | avg_eval_accuracy:0.3013948116560057
epoch:2______________
avg_train_loss: 1.7090957627934256 | avg_eval_loss:1.7764679269749981 | avg_eval_accuracy:0.3659159559346126
epoch:3______________
avg_train_loss: 1.6088916169609992 | avg_eval_loss:1.7080095192177887 | avg_eval_accuracy:0.3747334754797441
epoch:4______________
avg_train_loss: 1.5406295729290909 | avg_eval_loss:1.865507679114379 | avg_eval_accuracy:0.371113184079602
epoch:5______________
avg_train_loss: 1.4823845829933313 | avg_eval_loss:1.65508154850101 | avg_eval_accuracy:0.3999200426439232
epoch:6______________
avg_train_loss: 1.419481732283428 | avg_eval_loss:1.5310461103450177 | avg_eval_accuracy:0.4330135039090263
epoch:7______________
avg_train_loss: 1.3482386716611825 | avg_eval_loss:1.45908253991019

In [None]:
moel_save_path