In [1]:
!pip install -qq timm

[?25l[K     |▊                               | 10 kB 26.8 MB/s eta 0:00:01[K     |█▌                              | 20 kB 14.3 MB/s eta 0:00:01[K     |██▎                             | 30 kB 7.8 MB/s eta 0:00:01[K     |███                             | 40 kB 3.4 MB/s eta 0:00:01[K     |███▉                            | 51 kB 3.4 MB/s eta 0:00:01[K     |████▋                           | 61 kB 4.1 MB/s eta 0:00:01[K     |█████▎                          | 71 kB 4.3 MB/s eta 0:00:01[K     |██████                          | 81 kB 4.8 MB/s eta 0:00:01[K     |██████▉                         | 92 kB 5.2 MB/s eta 0:00:01[K     |███████▋                        | 102 kB 4.1 MB/s eta 0:00:01[K     |████████▍                       | 112 kB 4.1 MB/s eta 0:00:01[K     |█████████▏                      | 122 kB 4.1 MB/s eta 0:00:01[K     |█████████▉                      | 133 kB 4.1 MB/s eta 0:00:01[K     |██████████▋                     | 143 kB 4.1 MB/s eta 0:00:01[K   

In [2]:
!pip install einops

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting einops
  Downloading einops-0.4.1-py3-none-any.whl (28 kB)
Installing collected packages: einops
Successfully installed einops-0.4.1


In [3]:
%cd drive/MyDrive/Practice/Exam/Round_2/fine_grained

/content/drive/MyDrive/Practice/Exam/Round_2/fine_grained


In [4]:
import os
import pandas as pd 
import numpy as np 
from PIL import Image

import torch 
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import timm
from einops.layers.torch import Reduce
from sklearn.metrics import accuracy_score

from tqdm.notebook import tqdm

torch.manual_seed(123)

<torch._C.Generator at 0x7f0c90314ef0>

In [5]:
def read_image(path):
    return Image.open(path).convert("RGB")

In [6]:
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Resize(size = (224,224)),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ]
)

In [7]:
path = "fgvc-aircraft-2013b/data"
image_path = os.path.join(path, "image_folder")

In [8]:
class AircraftDataset(Dataset):
    def __init__(self, csv_file, image_dir, transforms = None):
        super().__init__()
        self.df = pd.read_csv(csv_file)
        print(self.df.head())
        self.image_dir = image_dir
        self.transforms = transforms

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        item = self.df.iloc[idx]
        image_path = os.path.join(self.image_dir, item["image_file"])
        image = np.array(read_image(image_path))
        xmin, ymin, xmax, ymax = item["xmin"], item["ymin"], item["xmax"], item["ymax"]
        cropped_image = image[ymin : ymax, xmin : xmax, :]

        if self.transforms is not None: 
            image = self.transforms(cropped_image)

        return image, item["variant"]

In [9]:
class SpatialAttention(nn.Module):
    def __init__(self, kernel_size = 7):
        super().__init__()

        self.conv = nn.Conv2d(2, 1, kernel_size = kernel_size, stride = 1, padding = (kernel_size - 1) // 2)
        self.max_pooling = Reduce('b c h w -> b 1 h w', 'max')
        self.avg_pooling = Reduce('b c h w -> b 1 h w', 'mean')

    def forward(self, x):
        max_pooled_map = self.max_pooling(x)
        avg_pooled_map = self.avg_pooling(x)
        aggregation = torch.cat([max_pooled_map, avg_pooled_map], dim = 1)
        out = self.conv(aggregation)
        out = torch.sigmoid(out)

        return x * out, out

In [10]:
class ImageClassifier(nn.Module):
    def __init__(self, num_classes, device):
        super().__init__()
        self.resnet = timm.create_model("resnet50", pretrained = True, in_chans=3)
        self.resnet.fc = nn.Identity()

        self.resnet.to(device)

        self.spatial_attn1 = SpatialAttention()
        self.spatial_attn2 = SpatialAttention()
        self.spatial_attn3 = SpatialAttention()
        self.spatial_attn4 = SpatialAttention()

        self.fc = nn.Sequential(
            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, num_classes)
        )
    def forward(self, x):
        out = self.resnet.conv1(x)
        out = self.resnet.bn1(out)
        out = self.resnet.act1(out)
        out = self.resnet.maxpool(out)
      
        out = self.resnet.layer1(out)
        out, attn1 = self.spatial_attn1(out)

        out = self.resnet.layer2(out)
        out, attn2 = self.spatial_attn2(out)
        out = self.resnet.layer3(out)
        out, attn3 = self.spatial_attn3(out)

        out = self.resnet.layer4(out)
        out, attn4 = self.spatial_attn4(out)

        out = self.resnet.global_pool(out)
        return self.fc(out)

In [11]:
def train(model, loader, loss_fn, optimizer, device = "cpu"):
    model.train()
    total_loss = []
    for idx, (image, label) in tqdm(enumerate(loader), total = len(loader)):
        image = image.to(device) 
        label = label.long().to(device)
        optimizer.zero_grad()
        logit = model(image)
        loss = loss_fn(logit, label)
        loss.backward()
        optimizer.step() 
        total_loss.append(loss.detach().cpu().item())

    return np.array(total_loss).mean() 

def eval(model, loader, metrics, device = "cpu"):
    model.eval()
    total_acc = []

    for idx, (image, label) in tqdm(enumerate(loader), total = len(loader)):
        image = image.to(device)
        with torch.no_grad():
            logit = model(image)
        pred = F.softmax(logit, dim = -1)
        pred = torch.argmax(pred, dim = -1).detach().cpu().numpy()
        label = label.long().detach().cpu().numpy()
        acc = metrics["acc"](label, pred)
        total_acc.append(acc)
    
    return np.array(total_acc).mean()

In [16]:
train_dataset = AircraftDataset("train_08.csv", image_path, transform)
val_dataset = AircraftDataset("val_08.csv", image_path, transform)
test_dataset = AircraftDataset("test.csv", image_path, transform)

    image_file  xmin  ymin  xmax  ymax  manufacturer  variant
0  0979624.jpg    11   151   986   487           1.0     24.0
1  0275110.jpg    33   210  1290   591          13.0     67.0
2  1671731.jpg     7   244  1195   606           1.0     24.0
3  0167073.jpg    68   159   984   485           0.0     34.0
4  2259663.jpg    22   193  1166   597          24.0     62.0
    image_file  xmin  ymin  xmax  ymax  manufacturer  variant
0  1858302.jpg    50   194  1088   479           1.0     31.0
1  0134592.jpg    21   148  1003   435           6.0     39.0
2  2101945.jpg     6   244  1195   546           5.0     79.0
3  0321746.jpg    37   164  1015   494           6.0     37.0
4  2081787.jpg    28   174  1011   455           9.0     92.0
    image_file  xmin  ymin  xmax  ymax  manufacturer  variant
0  1514522.jpg     7   217  1196   551           4.0      0.0
1  0747566.jpg    48    69   998   348           4.0      0.0
2  1008575.jpg     6   133  1022   469           4.0      0.0
3  07174

In [17]:
train_loader = DataLoader(train_dataset, batch_size = 32, shuffle = True, num_workers = 2, pin_memory = True)
val_loader = DataLoader(val_dataset, batch_size = 32, shuffle = False, num_workers = 2, pin_memory = True)
test_loader = DataLoader(test_dataset, batch_size = 32, shuffle = False, num_workers = 2, pin_memory = True)

In [18]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [19]:
model = ImageClassifier(num_classes = 100, device = device).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-4)
loss_fn = nn.CrossEntropyLoss()

metrics = {
    "acc" : accuracy_score
}

Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/resnet50_a1_0-14fe96d1.pth" to /root/.cache/torch/hub/checkpoints/resnet50_a1_0-14fe96d1.pth


In [21]:
for e in range(10):
    print("Epoch: ", e)
    train_loss = train(model, train_loader, loss_fn, optimizer, device) 
    val_acc = eval(model, val_loader, metrics, device)

    print("Train Loss: {:.4f}, Val Accuracy: {:.4f}".format(train_loss, val_acc))

Epoch:  0


  0%|          | 0/167 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1372, Val Accuracy: 0.5907
Epoch:  1


  0%|          | 0/167 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.1126, Val Accuracy: 0.5977
Epoch:  2


  0%|          | 0/167 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0846, Val Accuracy: 0.6063
Epoch:  3


  0%|          | 0/167 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0814, Val Accuracy: 0.5926
Epoch:  4


  0%|          | 0/167 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0571, Val Accuracy: 0.6119
Epoch:  5


  0%|          | 0/167 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0469, Val Accuracy: 0.6104
Epoch:  6


  0%|          | 0/167 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0504, Val Accuracy: 0.5992
Epoch:  7


  0%|          | 0/167 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0539, Val Accuracy: 0.5993
Epoch:  8


  0%|          | 0/167 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0459, Val Accuracy: 0.6275
Epoch:  9


  0%|          | 0/167 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Train Loss: 0.0379, Val Accuracy: 0.6074


In [22]:
eval(model, test_loader, metrics, device)

  0%|          | 0/105 [00:00<?, ?it/s]

0.6101785714285713

In [23]:
torch.save(model.state_dict(), "model.pt")