## EfficientNet Architecture

In [76]:
import torch
import torch.nn as nn

import torchvision

In [77]:
torch.cuda.manual_seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7d6265b78e90>

In [78]:
class CNNBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride, groups=1,act=True, bias=False):
        
        super().__init__()
        # same padding quick mapping:
        # k=1 -> p =0, k=3 -> p=1, k=5 -> p=2
        padding = kernel_size // 2
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding = padding, bias=bias, groups=groups)
        self.batch_norm = nn.BatchNorm2d(out_channels)
        self.activation = nn.SiLU() if act else nn.Identity()
        
    def forward(self, x):
        x= self.conv(x)
        x= self.batch_norm(x)
        return self.activation(x)

In [79]:
class SqueezeAndExcitationBlock(nn.Module):
    def __init__(self, in_channels,reduction_ratio=16): #reduction_ratio to reduce computation, a hyperparameter. take r=16 for balance in complexity and capacity as in SeNet paper
        super().__init__()
        self.squeeze = nn.AdaptiveAvgPool2d((1,1))
        self.fc1 = nn.Linear(in_channels , in_channels//reduction_ratio, bias=False)
        self.relu = nn.ReLU(inplace=True)
        self.fc2 = nn.Linear(in_channels//reduction_ratio, in_channels, bias =False)
        self.sigmoid = nn.Sigmoid()
        
        
    def forward(self,x):
        
        x_out = self.squeeze(x)
        x_out = torch.flatten(x_out,1)
        x_out = self.relu(self.fc1(x_out))
        x_out = self.sigmoid(self.fc2(x_out))
        
        x_out = x_out[:,:,None,None]
        
        scaled = x * x_out
        return scaled
        

In [80]:
# class SqueezeAndExcitationBlockUncheckedLikelyWrong(nn.Module):
#     def __init__(self, in_channels, reduction_ratio=16):
#         super().__init__()
#         self.squeeze = nn.AdaptiveAvgPool2d(1)
#         self.excitation = nn.Sequential(
#             nn.Linear(in_channels, in_channels // reduction_ratio, bias=False),
#             nn.ReLU(inplace=True),
#             nn.Linear(in_channels // reduction_ratio, in_channels, bias=False),
#             nn.Sigmoid()
#         )

#     def forward(self, x):
#         batch, channels, _, _ = x.size()
#         y = self.squeeze(x).view(batch, channels)
#         y = self.excitation(y).view(batch, channels, 1, 1)
#         return x * y


In [81]:
class MobileConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride, expansion_ratio, reduction_ratio):
        super().__init__()
        
        exp_out_channels = in_channels * expansion_ratio
        
        #residual connection only when it is not being downsampled in any way
        self.add_res = in_channels == out_channels and stride ==1
        self.conv1 = CNNBlock(in_channels, exp_out_channels, 1,1) if expansion_ratio > 1 else nn.Identity()
        
        #depthwise convolution
        self.conv2 = CNNBlock(exp_out_channels, exp_out_channels, kernel_size, stride, exp_out_channels)
        self.se = SqueezeAndExcitationBlock(exp_out_channels,reduction_ratio)
        self.conv3 = CNNBlock(exp_out_channels,out_channels, 1,1, act=False) #hatched line features means no activation
        
        self.sd = StochasticDepth(0.75)
        
    def forward(self, x):
        x_out = self.conv3(self.se(self.conv2(self.conv1(x))))
        
        if self.add_res:
            x_out = x + x_out
        
        x_out = self.sd(x_out)
        
        return x_out

In [82]:
class ClassificationBlock(nn.Module):
    def __init__(self,in_channels, n_classes, dropout_prob):
        super().__init__()
        self.avgpool = nn.AdaptiveAvgPool2d((1,1))
        self.dropout = nn.Dropout(dropout_prob)
        self.fc = nn.Linear(in_channels, n_classes)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self,x):
        x = self.avgpool(x)
        x = self.dropout(x)
        x= torch.flatten(x,1)
        x = self.fc(x)
        return self.sigmoid(x)
        

In [83]:
class StochasticDepth(nn.Module):
    def __init__(self, p=0.75):
        super().__init__()
        self.p = p
    
    def forward(self, x):
        
        rand_mask = torch.rand((x.shape[0], 1,1,1),  dtype=x.dtype, device=x.device)
        binary_mask = torch.floor(rand_mask) #TODO
      
        if self.training:   x = x/self.p * binary_mask
        
        return x

In [84]:
class EffNet(nn.Module):
    def __init__(self, model_name, in_channels=3, n_classes=25, show_output_dims=False):
        super().__init__()
        self.show = show_output_dims
        self.model_name = model_name
        self.config = Config()
        self.stages = self.config.stages
        self.phis = self.config.phis[model_name]
        
        #parameters
        phi, res, dropout_p = self.phis
        self.calc_coeffs(phi)
        
        #define network
        self.network = nn.ModuleList([])
        self.channels =[]
        
        #baseline stage 1
        operator, channels, layers, kernel_size, stride, expansion_ratio = self.config.stages[0]
        self.add_layers(3, operator, channels, layers, kernel_size, stride) #rgb input layer, TODO: check for errors
        print(operator)
        
        #remaining stages: 9 stages ko 7 stages (2-8) lai
        for i in range(1, len(self.stages)-1):
           
            if i==1:
                reduction_ratio=4
            else:
                reduction_ratio=24
                
            operator, channels, layers, kernel_size, stride, expansion_ratio = self.config.stages[i]
            self.add_layers(self.channels[-1], operator, channels, layers, kernel_size, stride, expansion_ratio, reduction_ratio)
            print(operator)
               

        #final stage: conv1x1 and classifier
        operator, channels, layers, kernel_size, stride, expansion_ratio = self.config.stages[-1]
        self.add_layers(self.channels[-1], operator, channels, layers, kernel_size, stride) #the conv layer
        print(operator)
        self.network.append(ClassificationBlock(self.channels[-1], n_classes,dropout_p)) #the classifier block

        
    
    def forward(self, x):
        
        for stage_num,module in enumerate(self.network):
            
            x= module(x)

            shape = x.shape
            if self.show: print(f"shape of stage{stage_num} : {shape}")
                
        return x
    
    
    def add_layers(self, in_channels, operator, channels, layers, kernel_size, stride, *args):
        
        channels, layers = self.update_dw(channels, layers)
        
        if layers == 1:
            self.network.append(operator(in_channels, channels, kernel_size, stride, *args))
        else:
            #the first 
            self.network.append(operator(in_channels, channels, kernel_size, 1, *args))
            
            #the remaining except first and last: works if there are >3 layers
            for _ in range(layers-2):
                self.network.append(operator(channels, channels,kernel_size, 1, *args))
            
            #final layer with stride dependent on the stage
            self.network.append(operator(channels,channels, kernel_size, stride, *args))
        
        self.channels.append(channels)

    # for models higher than the basseline:
    
    def calc_coeffs(self, phi, alpha=1.2, beta =1.1): #alpha and beta from EffNet paper, calculated through grid search. We dont use gamma but use the resolution from config
        # in every higher model, the channels is multiplied by width (beta^phi) layers is multiplied by depth (alpha^phi)
        self.depth = alpha ** phi
        self.width = beta ** phi
    
    def update_dw(self,channels, layers):
        return int(channels * self.width), int(layers * self.depth)
        
        

In [85]:
class Config:
    stages = [
            # [Operator(F), Channels, Layers, Kernel, Stride, Expansion Ratio]
            [CNNBlock, 32, 1, 3, 2, 1], 
            [MobileConvBlock, 16, 1, 3, 1, 1],
            [MobileConvBlock, 24, 2, 3, 2, 6],
            [MobileConvBlock, 40, 2, 5, 2, 6],
            [MobileConvBlock, 80, 3, 3, 2, 6],
            [MobileConvBlock, 112, 3, 5, 1, 6],
            [MobileConvBlock, 192, 4, 5, 2, 6],
            [MobileConvBlock, 320, 1, 3, 1, 6],
            [CNNBlock, 1280, 1, 1, 1, 0]
    ]

    phis = {
            # BX : (phi, resolution, dropout) 
            "B0" : (0, 224, 0.2),
            "B1" : (0.5, 240, 0.2),
            "B2" : (1, 260, 0.3),
            "B3" : (2, 300, 0.3),
            "B4" : (3, 380, 0.4),
            "B5" : (4, 456, 0.4),
            "B6" : (5, 528, 0.5),
            "B7" : (6, 600, 0.5)
    }

In [86]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name ="B0"
model = EffNet(model_name = model_name, in_channels = 3, n_classes = 25, show_output_dims=False).to(device)
res = Config().phis[model_name][1]
print(res)

x= torch.randn((5, 3, res, res)).to(device)
out= model(x)
print(out.shape)

<class '__main__.CNNBlock'>
<class '__main__.MobileConvBlock'>
<class '__main__.MobileConvBlock'>
<class '__main__.MobileConvBlock'>
<class '__main__.MobileConvBlock'>
<class '__main__.MobileConvBlock'>
<class '__main__.MobileConvBlock'>
<class '__main__.MobileConvBlock'>
<class '__main__.CNNBlock'>
224
torch.Size([5, 25])


In [87]:
!pip install torch-summary



In [88]:
from torchsummary import summary

In [89]:
_ =summary(model,x)

Layer (type:depth-idx)                             Output Shape              Param #
├─ModuleList: 1                                    []                        --
|    └─CNNBlock: 2-1                               [-1, 32, 112, 112]        --
|    |    └─Conv2d: 3-1                            [-1, 32, 112, 112]        864
|    |    └─BatchNorm2d: 3-2                       [-1, 32, 112, 112]        64
|    |    └─SiLU: 3-3                              [-1, 32, 112, 112]        --
|    └─MobileConvBlock: 2-2                        [-1, 16, 112, 112]        --
|    |    └─Identity: 3-4                          [-1, 32, 112, 112]        --
|    |    └─CNNBlock: 3-5                          [-1, 32, 112, 112]        352
|    |    └─SqueezeAndExcitationBlock: 3-6         [-1, 32, 112, 112]        512
|    |    └─CNNBlock: 3-7                          [-1, 16, 112, 112]        544
|    |    └─StochasticDepth: 3-8                   [-1, 16, 112, 112]        --
|    └─MobileConvBlock: 2-3    

## 2. Training, Validating and Saving the model

In [90]:
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import transforms, ToTensor
from tqdm import tqdm
from torchvision import models

In [91]:
train_dir = "/kaggle/input/seen-25-bird-dataset/Seen Datasets/train"
val_dir = "/kaggle/input/seen-25-bird-dataset/Seen Datasets/val"
test_dir =""

In [92]:
# #data loaders for mean calculation
# transforms_func = transforms.Compose([transforms.Resize((224,224)),
#                                       ToTensor()])
# train_ds= datasets.ImageFolder(train_dir, transform=transforms_func)
# batch_size = 32
# train_dataloader = DataLoader(train_ds, batch_size=batch_size,shuffle=False)


In [93]:
# def get_ds_mean_std(dataloader):
#     mean= 0.0 # torch.zeros(3)
#     var= 0.0 # torch.zeros(3)
#     n_imgs_total=0
#     i=0
#     for images, _ in dataloader:
        
#         n_imgs_per_batch = images.shape[0]
# #         print(images)
#         #b,c,w*h
#         channelwise_images = images.view(n_imgs_per_batch, images.shape[1], -1)
#         #mean and var per channel, summed every batch
#         mean += channelwise_images.mean(2).sum(0)
      
#         var += channelwise_images.var(2).sum(0)
       
#         n_imgs_total += n_imgs_per_batch
            
#     #means can simply be averaged
#     mean /=n_imgs_total

#     #std can't be averaged. but, for equal batch sizes (only one batch might have different size)
#     std = torch.sqrt(var/n_imgs_total)
#     print(n_imgs_total)
#     return mean,std
        

In [94]:
# get_ds_mean_std(train_dataloader)

In [95]:
#output of above
ds_mean, ds_std = [0.4731, 0.4819, 0.4018], [0.3544, 0.3544, 0.3544]

In [96]:
transforms_func = transforms.Compose([transforms.Resize((416,416)),
                                      transforms.RandomHorizontalFlip(),
                                      transforms.RandomRotation(10),
                                      ToTensor(),
                                      transforms.Normalize(mean=ds_mean, std=ds_std)])

train_ds= datasets.ImageFolder(train_dir, transform=transforms_func)
# test_ds= datasets.ImageFolder(test_dir, transform=transforms_func)
val_ds= datasets.ImageFolder(val_dir, transform=transforms_func)

#data loaders for training

batchsize = 64

train_dataloader = DataLoader(train_ds, batch_size=batchsize,shuffle=True, num_workers=4, pin_memory=True)
# test_dataloader = DataLoader(test_ds, batch_size=batch_size,shuffle=False)
val_dataloader = DataLoader(val_ds, batch_size=batchsize,shuffle=False,  num_workers=4, pin_memory=True)

In [97]:
len(train_ds), len(val_ds)

(22500, 7500)

### Training Loop

In [98]:
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001, alpha=0.9, momentum=0.9, weight_decay = 1e-5)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=2, eta_min=1e-5) 
criterion = nn.CrossEntropyLoss()

In [99]:
def get_lr(optimizer):
    #list of param groups
    return optimizer.param_groups[0]["lr"]
    
# def get_lr(optimizer):
#     for param_group in optimizer.param_groups:
#         return param_group["lr"]

print(get_lr(optimizer))

0.001


In [100]:
#calculate loss values per batch of data

def loss_batch(criterion, result, target, Training=True):
    
    #get loss
    loss=criterion(result,target)
    
    
    if Training is True:
        #backward propagation
        loss.backward()
    
    
#     print(f"batch loss:{loss} ")
    #get performance metric
#     print(f"result: {result}")
#     print(f"target: {target}")
#     n_correct_b = count_correct_batch(result,target)
    
#     print(f"n_correct per batch: {n_correct_b}")
    
    return loss.item()

In [101]:
from tqdm import tqdm

In [102]:
def display_batch(batch):
  """Displays images in a batch.
  Args:
    batch: A batch of images, either as a NumPy array or PyTorch tensor.
  """

  # Convert to NumPy array if necessary
  if isinstance(batch, torch.Tensor):
    batch = batch.numpy()

  # Assuming images are in CHW format, convert to HWC for display
  batch = np.transpose(batch, (0, 2, 3, 1))

  # Calculate the number of rows and columns for the grid
  num_images = batch.shape[0]
  cols = min(8, num_images)
  rows = (num_images + cols - 1) // cols

  # Create a figure and subplots
  fig, axes = plt.subplots(rows, cols, figsize=(cols * 3, rows * 3))

  # Iterate over images and display them
  for i in range(num_images):
    ax = axes.flatten()[i]
    ax.imshow(batch[i])
    ax.axis('off')

  plt.tight_layout()
  plt.show()


In [103]:
def loss_acc_epoch(model, criterion, dataloader,is_train, check_id=False, optimizer=None):
    
    epoch_loss =0.0
    epoch_acc =0.0
    batch_number =0
    
    for images, labels in tqdm(dataloader):
        
        
#         if batch_number % 100 ==0:
#             display_batch(images)
        
        images = images.to(device)
        labels = labels.to(device)
        
        result = model(images)
        
        if optimizer is not None:
            # zero the gradient of optimizer
            optimizer.zero_grad()
            
            loss_b = loss_batch(criterion,result,labels,Training=True)
            
            #step optimizer
            optimizer.step()
        else:
            loss_b = loss_batch(criterion,result,labels,Training=False)
            
        
        batch_number += 1
        epoch_loss += loss_b
        epoch_acc += torch.sum(result.argmax(dim=1).round() == labels.float()).item()
        
    
    loss_epoch = epoch_loss/batch_number
    acc_epoch = epoch_acc/len(dataloader.dataset)
    
    return loss_epoch, acc_epoch
    

In [104]:
import copy

In [105]:
def train(model,train_dataloader, val_dataloader, criterion, optimizer, n_epochs, device, scheduler,check_id, save_path, val_iter):
    
    loss_hist = {
        "train": [],
        "val": []
    }
    
    acc_hist={
        "train": [],
        "val": []
    }
    
    
#     #copy the current model wts as best model:
#     best_model_wts = copy.deepcopy(model.state_dict())
#     best_loss=float('inf')
#     #TODO:
    
    for epoch in range(n_epochs):
        

        print(f"Epoch {epoch+1} of {n_epochs}")
        current_lr = get_lr(optimizer)
        
        print("training")
        model.train()
        train_loss_epoch, train_acc_epoch = loss_acc_epoch(model, criterion, train_dataloader,True, check_id, optimizer)
        
        loss_hist["train"].append(train_loss_epoch)
        acc_hist["train"].append(train_acc_epoch)
        print(f"Tr.Loss: {train_loss_epoch:.5f}, Tr.Acc:{train_acc_epoch:.5f}")
        #validating now
        print("validation")
        model.eval()
        with torch.no_grad():
            val_loss_epoch, val_acc_epoch = loss_acc_epoch(model,criterion, val_dataloader, False, check_id)
            loss_hist["val"].append(val_loss_epoch)
            acc_hist["val"].append(val_acc_epoch)
        
#         #every epoch, keep on selecting the best model sofar
#         if val_loss_epoch < best_loss:
#             best_loss = val_loss_epoch
#             best_model_wts = copy.deepcopy(model.state_dict())
            
#             torch.save(model.state_dict(), "savemodel.pt")
#             print("best model saved")
            
        scheduler.step()
        print(f"Val.loss: {val_loss_epoch}, Val.Acc: {val_acc_epoch:.5f} ")
        
        
    return loss_hist, acc_hist

In [106]:
save_path = "/kaggle/working/"

In [107]:
# model2 = models.efficientnet_b5(pretrained=False)

# # Modify the classifier to have 6 output classes instead of 1000
# model2.classifier[1] = nn.Linear(model2.classifier[1].in_features, 25)

# def initialize_weights(m):
#     if isinstance(m, nn.Conv2d):
#         nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
#         if m.bias is not None:
#             nn.init.constant_(m.bias, 0)
#     elif isinstance(m, nn.BatchNorm2d):
#         nn.init.constant_(m.weight, 1)
#         nn.init.constant_(m.bias, 0)
#     elif isinstance(m, nn.Linear):
#         nn.init.normal_(m.weight, 0, 0.01)
#         nn.init.constant_(m.bias, 0)

# # Apply the weight initialization
# model2.apply(initialize_weights)
# model2.to(device)

In [108]:

    
class AlexNet(nn.Module):
    def __init__(self, num_classes=25):
        super().__init__()
        self.features_extraction = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),  
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(64),
            nn.MaxPool2d(kernel_size=3, stride=2),
              
            
            nn.Conv2d(64, 192, kernel_size=5, padding=2),  
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(192),
            nn.MaxPool2d(kernel_size=3, stride=2),
            
            
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(384),
            
            nn.Conv2d(384, 256, kernel_size=3, padding=1),  
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(256),
            
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(256),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.AdaptiveAvgPool2d((6, 6)),  
        )
        self.classifier = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(4096),
            
            nn.Dropout(0.1),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(4096),
            
            nn.Dropout(0.1),
            nn.Linear(4096, 512),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(512),
            
            nn.Linear(512, num_classes),
        )

    def forward(self, x):
        x = self.features_extraction(x)
        x = x.view(x.size(0), -1)
        return self.classifier(x)
    

In [109]:
model=AlexNet()

In [110]:
model = model.to(device)
model = nn.DataParallel(model,device_ids =[0,1]) # to use multiple gpu
n_epochs = 10

#scheduler ni chaine ho? for lr decay. edit: Hola, lets see. TODO: copy paper's
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001, alpha=0.9, momentum=0.9, weight_decay = 1e-5)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=2, eta_min=1e-5) 
criterion = nn.CrossEntropyLoss()

In [111]:
loss_hist, acc_hist = train(model,train_dataloader, val_dataloader, criterion, optimizer, n_epochs, device=device, scheduler=scheduler,check_id=False, save_path=save_path, val_iter=1)

Epoch 1 of 10
training


100%|██████████| 352/352 [03:13<00:00,  1.82it/s]


Tr.Loss: 2.99018, Tr.Acc:0.14316
validation


100%|██████████| 118/118 [01:02<00:00,  1.87it/s]


Val.loss: 2.656839860697924, Val.Acc: 0.18160 
Epoch 2 of 10
training


100%|██████████| 352/352 [03:08<00:00,  1.87it/s]


Tr.Loss: 2.17072, Tr.Acc:0.30724
validation


100%|██████████| 118/118 [01:01<00:00,  1.93it/s]


Val.loss: 2.0110171730235473, Val.Acc: 0.35213 
Epoch 3 of 10
training


100%|██████████| 352/352 [03:08<00:00,  1.87it/s]


Tr.Loss: 1.83413, Tr.Acc:0.41551
validation


100%|██████████| 118/118 [01:01<00:00,  1.91it/s]


Val.loss: 1.7553924109976171, Val.Acc: 0.44467 
Epoch 4 of 10
training


100%|██████████| 352/352 [03:08<00:00,  1.87it/s]


Tr.Loss: 1.81243, Tr.Acc:0.42907
validation


100%|██████████| 118/118 [01:01<00:00,  1.93it/s]


Val.loss: 1.7256363721217138, Val.Acc: 0.46360 
Epoch 5 of 10
training


100%|██████████| 352/352 [03:08<00:00,  1.87it/s]


Tr.Loss: 1.67053, Tr.Acc:0.48893
validation


100%|██████████| 118/118 [01:02<00:00,  1.90it/s]


Val.loss: 1.8073886521792009, Val.Acc: 0.49347 
Epoch 6 of 10
training


100%|██████████| 352/352 [03:08<00:00,  1.87it/s]


Tr.Loss: 1.19196, Tr.Acc:0.63658
validation


100%|██████████| 118/118 [01:01<00:00,  1.92it/s]


Val.loss: 1.1205631042436017, Val.Acc: 0.66480 
Epoch 7 of 10
training


100%|██████████| 352/352 [03:07<00:00,  1.88it/s]


Tr.Loss: 0.91888, Tr.Acc:0.72053
validation


100%|██████████| 118/118 [01:00<00:00,  1.94it/s]


Val.loss: 0.8586647944935297, Val.Acc: 0.73920 
Epoch 8 of 10
training


100%|██████████| 352/352 [03:09<00:00,  1.86it/s]


Tr.Loss: 0.99886, Tr.Acc:0.69667
validation


100%|██████████| 118/118 [01:01<00:00,  1.93it/s]


Val.loss: 0.99547142156605, Val.Acc: 0.69853 
Epoch 9 of 10
training


100%|██████████| 352/352 [03:07<00:00,  1.88it/s]


Tr.Loss: 1.10118, Tr.Acc:0.66733
validation


100%|██████████| 118/118 [01:00<00:00,  1.94it/s]


Val.loss: 1.2087937669228699, Val.Acc: 0.64173 
Epoch 10 of 10
training


100%|██████████| 352/352 [03:07<00:00,  1.87it/s]


Tr.Loss: 0.80696, Tr.Acc:0.75756
validation


100%|██████████| 118/118 [01:00<00:00,  1.94it/s]

Val.loss: 0.8227307216848357, Val.Acc: 0.75707 





In [127]:
checkpoint = {
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss_hist': loss_hist,
    'accuracy_hist': acc_hist,
}

torch.save(checkpoint, '/kaggle/working/checkpoint.pth')

In [128]:
test_model =AlexNet()

In [129]:
# Move the model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
test_model.to(device)

AlexNet(
  (features_extraction): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (5): ReLU(inplace=True)
    (6): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (8): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (12): ReLU(inplace=True)
    (13): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (

In [130]:
# Example of loading a checkpoint and extracting loss and accuracy history
checkpoint_path = '/kaggle/working/checkpoint.pth'
checkpoint = torch.load(checkpoint_path)

# Load the state dict, stripping the 'module.' prefix
state_dict = checkpoint['model_state_dict']
new_state_dict = {k.replace('module.', ''): v for k, v in state_dict.items()}
test_model.load_state_dict(new_state_dict)

optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001, alpha=0.9, momentum=0.9, weight_decay = 1e-5)
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# Extract loss and accuracy history
loss_hist = checkpoint['loss_hist']
acc_hist = checkpoint['accuracy_hist']

print("Loss History:", loss_hist)
print("Accuracy History:", acc_hist)

Loss History: {'train': [2.9901792014187034, 2.1707179356705057, 1.8341338248415426, 1.8124313435771249, 1.6705312573096969, 1.1919589941813187, 0.9188829821280458, 0.9988550403240052, 1.101175282658501, 0.8069616239517927], 'val': [2.656839860697924, 2.0110171730235473, 1.7553924109976171, 1.7256363721217138, 1.8073886521792009, 1.1205631042436017, 0.8586647944935297, 0.99547142156605, 1.2087937669228699, 0.8227307216848357]}
Accuracy History: {'train': [0.14315555555555556, 0.30724444444444443, 0.4155111111111111, 0.42906666666666665, 0.48893333333333333, 0.6365777777777778, 0.7205333333333334, 0.6966666666666667, 0.6673333333333333, 0.7575555555555555], 'val': [0.1816, 0.35213333333333335, 0.44466666666666665, 0.4636, 0.49346666666666666, 0.6648, 0.7392, 0.6985333333333333, 0.6417333333333334, 0.7570666666666667]}


In [131]:
def test_model(model, test_dataloader, device):
        #validating now
    print("validation")
    model.eval()
    with torch.no_grad():
        val_loss_epoch, val_acc_epoch = loss_acc_epoch(model,criterion, val_dataloader, False, check_id=False)

    #         #every epoch, keep on selecting the best model sofar
    #         if val_loss_epoch < best_loss:
    #             best_loss = val_loss_epoch
    #             best_model_wts = copy.deepcopy(model.state_dict())

    #             torch.save(model.state_dict(), "savemodel.pt")
    #             print("best model saved")
    print(f"Val.loss: {val_loss_epoch}, Val.Acc: {val_acc_epoch:.5f} ")

# Example usage
test_model(model, val_dataloader, device=device)


validation


100%|██████████| 118/118 [01:10<00:00,  1.68it/s]

Val.loss: 0.8207268435823716, Val.Acc: 0.75773 





In [None]:
# def train(model, criteria, optimizer, n_epochs, train_dataloader, val_dataloader, device, val_iter=10):
    
#     train_losses=[]
#     train_accs=[]
#     val_losses=[]
#     val_accs=[]
    
#     for epoch in range(1, n_epochs+1):
        
#         print(f"Epoch {epoch} of {n_epochs}:")
        
#         #------training------------
        
#         n_train_samples = len(train_dataloader.dataset)
        
#         #train mode activate
#         model.train()
#         epoch_train_loss = 0
#         correct_preds=0
#         epoch_train_acc = float(0)
#         step=0
        
        
#         #all the training that happens per epoch
#         for images, targets in tqdm(train_dataloader):
#             #iterates through the dataset in batches
            
#             images = images.to(device)
#             targets = targets.to(device)
#             optimizer.zero_grad()
            
#             #forward pass
#             results = model(images)
#             losses = criteria(results, targets.unsqueeze(1).float())
            
#             #backward pass
#             losses.backward()
#             optimizer.step()
            
#             epoch_train_loss += losses.item()
#             epoch_train_acc += torch.sum(results.round() == targets.unsqueeze(1).float())
            
#             step +=1
        
#         #after going through the entire dataset calculate the train loss and accuracy
#         epoch_train_loss /= step
#         train_losses.append(epoch_train_loss)
        
#         epoch_train_acc /= float(n_train_samples)
#         train_accs.append(epoch_train_acc)
        
#         print(f"Epoch {epoch} Training Loss: {epoch_train_loss:.4f}")
#         print(f"Epoch {epoch} Training Acc: {epoch_train_acc:.4f}")
        
        
#         #perform validation every few epochs
#         if epoch % val_iter ==0:
            
#             #------validation------------
#             n_val_samples = len(val_dataloader.dataset)
            
#             #evaluation mode activate
#             model.eval()
#             epoch_val_loss =0
#             epoch_val_acc = float(0)
#             val_step=0
            
#             with torch.no_grad():
#                 for images, targets in tqdm(val_dataloader):
#                     #iterates through the dataset in batches
                    
#                     images = images.to(device)
#                     targets = targets.to(device)
#                     optimizer.zero_grad()
                    
#                     #forward pass
#                     results = model(images)
#                     losses = criteria(results, targets.unsqueeze(1).float())
                    
#                     #no backward pass
                    
#                     epoch_val_loss += losses.item()
                    
#                     #rounding basically thresholds the output at 0.5
#                     epoch_val_acc += torch.sum(results.round() == targets.unsqueeze(1).float())
                    
#                     val_step +=1
        
#                 #after going through the entire dataset calculate the validation loss and accuracy
#                 epoch_val_loss /= val_step
#                 val_losses.append(epoch_val_loss)
#                 epoch_val_acc /= n_val_samples
#                 val_accs.append(epoch_val_acc)
        
#         print(f"Epoch {epoch} Validation Loss: {epoch_train_loss:.4f}")
#         print(f"Epoch {epoch} Validation Acc: {epoch_train_acc:.4f}")
    
#     return train_losses, val_losses, train_accs, val_accs