In [2]:
from torch.utils.data import Dataset, DataLoader
from PIL import Image 
import cv2
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import transforms
import argparse
import sys
from tqdm import tqdm
import math
import random
import os

# Data Loader

In [3]:
train_images_path = []
train_images_label = []
label2id = {'Honda': 0, 'Others': 1, 'Suzuki': 2, 'VinFast': 3, 'Yamaha': 4}
id2label = {0: 'Honda', 1: 'Others', 2: 'Suzuki', 3: 'VinFast', 4: 'Yamaha'}
train_dir = '/kaggle/input/vn-moto-dataset'
for subfolder in os.listdir(train_dir):
    if not subfolder.endswith('.txt'):
        for file in os.listdir(os.path.join(train_dir,subfolder)):
            if file.lower().endswith('png') or file.lower().endswith('jpg') or file.lower().endswith('jpeg'):
                train_images_path.append(os.path.join(train_dir,subfolder, file))
                train_images_label.append(label2id[subfolder])

In [4]:
train_valid_path = []
train_valid_label = []
for idx, path in enumerate(train_images_path):
    img = Image.open(path)
    if img.mode != "P":  # Exclude palette mode
        train_valid_path.append(path)
        train_valid_label.append(train_images_label[idx])

In [5]:
val_images_path = []
val_images_label = []
val_dir = '/kaggle/input/tiny-moto-dataset'
for subfolder in os.listdir(val_dir):
    if not subfolder.endswith('.txt'):
        for file in os.listdir(os.path.join(val_dir,subfolder)):
            if file.lower().endswith('png') or file.lower().endswith('jpg') or file.lower().endswith('jpeg'):
                val_images_path.append(os.path.join(val_dir,subfolder, file))
                val_images_label.append(label2id[subfolder])

In [6]:
val_valid_path = []
val_valid_label = []
for idx, path in enumerate(val_images_path):
    img = Image.open(path)
    if img.mode != "P":  # Exclude palette mode
        val_valid_path.append(path)
        val_valid_label.append(val_images_label[idx])

In [7]:
class MyDataset(Dataset):
    def __init__(self, valid_images_path, images_label, transform = None):
        self.valid_images_path = valid_images_path
        self.images_label = images_label
        self.transform = transform
    def __len__(self):
        return len(self.valid_images_path)
    def __getitem__(self, item):
        img = Image.open(self.valid_images_path[item]).convert("RGB")
        label = self.images_label[item]
        if self.transform is not None:
            img = self.transform(img)
        return img, label
        
    @staticmethod
    def collate_fn(batch):
        images, labels = tuple(zip(*batch))
    
        images = torch.stack(images, dim=0)
        labels = torch.as_tensor(labels)
        return images, labels

# ConvNeXt V2

In [8]:
def drop_path(x, drop_prob: float = 0., training: bool = False):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
    This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
    changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
    'survival rate' as the argument.
    """
    if drop_prob == 0. or not training:
        return x
    keep_prob = 1 - drop_prob
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
    random_tensor.floor_()  # binarize
    output = x.div(keep_prob) * random_tensor
    return output


class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks).
    """
    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training)

In [9]:
class LayerNorm(nn.Module):
    r""" LayerNorm that supports two data formats: channels_last (default) or channels_first.
    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
    shape (batch_size, height, width, channels) while channels_first corresponds to inputs
    with shape (batch_size, channels, height, width).
    """

    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(normalized_shape), requires_grad=True)
        self.bias = nn.Parameter(torch.zeros(normalized_shape), requires_grad=True)
        self.eps = eps
        self.data_format = data_format
        if self.data_format not in ["channels_last", "channels_first"]:
            raise ValueError(f"not support data format '{self.data_format}'")
        self.normalized_shape = (normalized_shape,)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.data_format == "channels_last":
            return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
        elif self.data_format == "channels_first":
            # [batch_size, channels, height, width]
            mean = x.mean(1, keepdim=True)
            var = (x - mean).pow(2).mean(1, keepdim=True)
            x = (x - mean) / torch.sqrt(var + self.eps)
            x = self.weight[:, None, None] * x + self.bias[:, None, None]
            return x

In [10]:
class GRN(nn.Module):
    """ GRN (Global Response Normalization) layer
    """
    def __init__(self, dim):
        super().__init__()
        self.gamma = nn.Parameter(torch.zeros(1, 1, 1, dim))
        self.beta = nn.Parameter(torch.zeros(1, 1, 1, dim))

    def forward(self, x):
        Gx = torch.norm(x, p=2, dim=(1,2), keepdim=True)
        Nx = Gx / (Gx.mean(dim=-1, keepdim=True) + 1e-6)
        return self.gamma * (x * Nx) + self.beta + x

In [11]:
class Block(nn.Module):
    """ ConvNeXtV2 Block.
    
    Args:
        dim (int): Number of input channels.
        drop_path (float): Stochastic depth rate. Default: 0.0
    """
    def __init__(self, dim, drop_path=0.):
        super().__init__()
        self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv
        self.norm = LayerNorm(dim, eps=1e-6)
        self.pwconv1 = nn.Linear(dim, 4 * dim) # pointwise/1x1 convs, implemented with linear layers
        self.act = nn.GELU()
        self.grn = GRN(4 * dim)
        self.pwconv2 = nn.Linear(4 * dim, dim)
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()

    def forward(self, x):
        input = x
        x = self.dwconv(x)
        x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C)
        x = self.norm(x)
        x = self.pwconv1(x)
        x = self.act(x)
        x = self.grn(x)
        x = self.pwconv2(x)
        x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W)

        x = input + self.drop_path(x)
        return x


In [12]:
class ConvNeXtV2(nn.Module):
    """ ConvNeXt V2
        
    Args:
        in_chans (int): Number of input image channels. Default: 3
        num_classes (int): Number of classes for classification head. Default: 1000
        depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3]
        dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768]
        drop_path_rate (float): Stochastic depth rate. Default: 0.
        head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1.
    """
    def __init__(self, in_chans=3, num_classes=1000, 
                 depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], 
                 drop_path_rate=0., head_init_scale=1.
                 ):
        super().__init__()
        self.depths = depths
        self.downsample_layers = nn.ModuleList() # stem and 3 intermediate downsampling conv layers
        stem = nn.Sequential(
            nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4),
            LayerNorm(dims[0], eps=1e-6, data_format="channels_first")
        )
        self.downsample_layers.append(stem)
        for i in range(3):
            downsample_layer = nn.Sequential(
                    LayerNorm(dims[i], eps=1e-6, data_format="channels_first"),
                    nn.Conv2d(dims[i], dims[i+1], kernel_size=2, stride=2),
            )
            self.downsample_layers.append(downsample_layer)

        self.stages = nn.ModuleList() # 4 feature resolution stages, each consisting of multiple residual blocks
        dp_rates=[x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] 
        cur = 0
        for i in range(4):
            stage = nn.Sequential(
                *[Block(dim=dims[i], drop_path=dp_rates[cur + j]) for j in range(depths[i])]
            )
            self.stages.append(stage)
            cur += depths[i]

        self.norm = nn.LayerNorm(dims[-1], eps=1e-6) # final norm layer
        self.head = nn.Linear(dims[-1], num_classes)

        self.apply(self._init_weights)
        self.head.weight.data.mul_(head_init_scale)
        self.head.bias.data.mul_(head_init_scale)

    def _init_weights(self, m):
        if isinstance(m, (nn.Conv2d, nn.Linear)):
            nn.init.trunc_normal_(m.weight, std=.02)
            nn.init.constant_(m.bias, 0)

    def forward_features(self, x):
        for i in range(4):
            x = self.downsample_layers[i](x)
            x = self.stages[i](x)
        return self.norm(x.mean([-2, -1])) # global average pooling, (N, C, H, W) -> (N, C)

    def forward(self, x):
        x = self.forward_features(x)
        x = self.head(x)
        return x

In [13]:
def convnextv2_base(num_classes: int):
    model = ConvNeXtV2(depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024], num_classes=num_classes)
    return model

In [14]:
def convnextv2_tiny(num_classes: int):
    model = ConvNeXtV2(depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], num_classes=num_classes)
    return model

# Train

In [15]:
def train_one_epoch(model, optimizer, data_loader, device, epoch, lr_scheduler):
    model.train()
    loss_function = torch.nn.CrossEntropyLoss()
    accu_loss = torch.zeros(1).to(device)  
    accu_num = torch.zeros(1).to(device)   
    optimizer.zero_grad()

    sample_num = 0
    data_loader = tqdm(data_loader, file=sys.stdout)
    for step, data in enumerate(data_loader):
        images, labels = data
        sample_num += images.shape[0]

        pred = model(images.to(device))
        pred_classes = torch.max(pred, dim=1)[1]
        accu_num += torch.eq(pred_classes, labels.to(device)).sum()

        loss = loss_function(pred, labels.to(device))
        loss.backward()
        accu_loss += loss.detach()

        data_loader.desc = "[train epoch {}] loss: {:.3f}, acc: {:.3f}, lr: {:.5f}".format(
            epoch,
            accu_loss.item() / (step + 1),
            accu_num.item() / sample_num,
            optimizer.param_groups[0]["lr"]
        )

        if not torch.isfinite(loss):
            print('WARNING: non-finite loss, ending training ', loss)
            sys.exit(1)

        optimizer.step()
        optimizer.zero_grad()
        # update lr
        lr_scheduler.step()

    return accu_loss.item() / (step + 1), accu_num.item() / sample_num


In [16]:
def evaluate(model, data_loader, device, epoch):
    loss_function = torch.nn.CrossEntropyLoss()

    model.eval()

    accu_num = torch.zeros(1).to(device)   
    accu_loss = torch.zeros(1).to(device)  

    sample_num = 0
    data_loader = tqdm(data_loader, file=sys.stdout)
    with torch.no_grad():
        for step, data in enumerate(data_loader):
            images, labels = data
            sample_num += images.shape[0]

            pred = model(images.to(device))
            pred_classes = torch.max(pred, dim=1)[1]
            accu_num += torch.eq(pred_classes, labels.to(device)).sum()

            loss = loss_function(pred, labels.to(device))
            accu_loss += loss

            data_loader.desc = "[valid epoch {}] loss: {:.3f}, acc: {:.3f}".format(
                epoch,
                accu_loss.item() / (step + 1),
                accu_num.item() / sample_num
            )

    return accu_loss.item() / (step + 1), accu_num.item() / sample_num

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"using {device} device.")

using cuda device.


# Load pretrain FCMAE and fine tune for classification

In [18]:
model = convnextv2_base(5).to(device)
weight_dict = torch.load('/kaggle/input/fcmae-pretrained/pre_last_model_50.pth', map_location=device)

  weight_dict = torch.load('/kaggle/input/fcmae-pretrained/pre_last_model_50.pth', map_location=device)


In [19]:
weight_dict.keys()

odict_keys(['mask_token', 'encoder.downsample_layers.0.0.weight', 'encoder.downsample_layers.0.0.bias', 'encoder.downsample_layers.0.1.weight', 'encoder.downsample_layers.0.1.bias', 'encoder.downsample_layers.1.0.weight', 'encoder.downsample_layers.1.0.bias', 'encoder.downsample_layers.1.1.weight', 'encoder.downsample_layers.1.1.bias', 'encoder.downsample_layers.2.0.weight', 'encoder.downsample_layers.2.0.bias', 'encoder.downsample_layers.2.1.weight', 'encoder.downsample_layers.2.1.bias', 'encoder.downsample_layers.3.0.weight', 'encoder.downsample_layers.3.0.bias', 'encoder.downsample_layers.3.1.weight', 'encoder.downsample_layers.3.1.bias', 'encoder.stages.0.0.dwconv.weight', 'encoder.stages.0.0.dwconv.bias', 'encoder.stages.0.0.norm.weight', 'encoder.stages.0.0.norm.bias', 'encoder.stages.0.0.pwconv1.weight', 'encoder.stages.0.0.pwconv1.bias', 'encoder.stages.0.0.pwconv2.weight', 'encoder.stages.0.0.pwconv2.bias', 'encoder.stages.0.0.grn.gamma', 'encoder.stages.0.0.grn.beta', 'encode

In [20]:
for k in list(weight_dict.keys()):
    if 'decoder' in k or 'mask_token'in k or 'proj' in k or 'pred' in k:
        print(f"Removing key {k} from pretrained checkpoint")
        del weight_dict[k]

Removing key mask_token from pretrained checkpoint
Removing key proj.weight from pretrained checkpoint
Removing key proj.bias from pretrained checkpoint
Removing key decoder.0.dwconv.weight from pretrained checkpoint
Removing key decoder.0.dwconv.bias from pretrained checkpoint
Removing key decoder.0.norm.weight from pretrained checkpoint
Removing key decoder.0.norm.bias from pretrained checkpoint
Removing key decoder.0.pwconv1.weight from pretrained checkpoint
Removing key decoder.0.pwconv1.bias from pretrained checkpoint
Removing key decoder.0.grn.gamma from pretrained checkpoint
Removing key decoder.0.grn.beta from pretrained checkpoint
Removing key decoder.0.pwconv2.weight from pretrained checkpoint
Removing key decoder.0.pwconv2.bias from pretrained checkpoint
Removing key pred.weight from pretrained checkpoint
Removing key pred.bias from pretrained checkpoint


In [21]:
updated_weight_dict = {}

In [22]:
for k,v in weight_dict.items():
    if k.startswith("encoder."):
        updated_weight_dict[k[len("encoder."):]]= v
    else:
        updated_weight_dict[k] = v

In [23]:
model.load_state_dict(updated_weight_dict, strict=False)

_IncompatibleKeys(missing_keys=['norm.weight', 'norm.bias', 'head.weight', 'head.bias'], unexpected_keys=[])

In [24]:
for name, para in model.named_parameters():
    if 'head' not in name:
        para.requires_grad_(False)
    else:
        print("training {}".format(name))

training head.weight
training head.bias


In [25]:
if os.path.exists("./weights") is False:
    os.makedirs("./weights")

In [26]:
img_size = 224
train_transform = transforms.Compose([transforms.Resize((img_size,img_size)),
                                    transforms.CenterCrop(img_size),
                                    transforms.RandomHorizontalFlip(p=0.5),
                                    transforms.ToTensor(),
                                    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
val_transform = transforms.Compose([transforms.Resize((img_size,img_size)),
                                    transforms.ToTensor(),
                                    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
train_dataset = MyDataset(train_valid_path, train_valid_label, train_transform)
val_dataset = MyDataset(val_valid_path, val_valid_label, val_transform)

In [27]:
batch_size = 32
nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])  # number of workers
train_loader = DataLoader(train_dataset,
                       batch_size=batch_size,
                       shuffle=True,
                       pin_memory=True,
                       num_workers=nw,
                       collate_fn=train_dataset.collate_fn)
val_loader = DataLoader(val_dataset,
                       batch_size=batch_size,
                       shuffle=True,
                       pin_memory=True,
                       num_workers=nw,
                       collate_fn=val_dataset.collate_fn)

In [29]:
def create_lr_scheduler(optimizer,
                        num_step: int,
                        epochs: int,
                        warmup=True,
                        warmup_epochs=1,
                        warmup_factor=1e-3,
                        end_factor=1e-6):
    assert num_step > 0 and epochs > 0
    if warmup is False:
        warmup_epochs = 0

    def f(x):
        if warmup is True and x <= (warmup_epochs * num_step):
            alpha = float(x) / (warmup_epochs * num_step)
            
            return warmup_factor * (1 - alpha) + alpha
        else:
            current_step = (x - warmup_epochs * num_step)
            cosine_steps = (epochs - warmup_epochs) * num_step
            
            return ((1 + math.cos(current_step * math.pi / cosine_steps)) / 2) * (1 - end_factor) + end_factor

    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=f)

In [30]:
epochs = 30

optimizer = torch.optim.Adam(params=model.parameters(),
                            lr=1e-3,
                            weight_decay=1e-5)
lr_scheduler = create_lr_scheduler(optimizer, len(train_loader), epochs,
                                   warmup=True, warmup_epochs=1)

In [31]:
best_loss = 1e9
patience = 0
for epoch in range(epochs):
    # train
    train_loss, train_acc = train_one_epoch(model=model,
                                            optimizer=optimizer,
                                            data_loader=train_loader,
                                            device=device,
                                            epoch=epoch,
                                            lr_scheduler=lr_scheduler)

    # validate
    val_loss, val_acc = evaluate(model=model,
                               data_loader=val_loader,
                               device=device,
                               epoch=epoch)
    if best_loss >= val_loss:
        torch.save(model.state_dict(), "./weights/best_model.pth")
        best_loss = val_loss
        patience = 0
    else:
        patience += 1
        if patience >= 5:
            print("Loss does not improve in 5 epochs. Early Stopping!")
            break
        else:
            print(f"Loss does not improve in {patience} epochs!")

[train epoch 0] loss: 1.382, acc: 0.416, lr: 0.00100: 100%|██████████| 476/476 [02:06<00:00,  3.77it/s]
[valid epoch 0] loss: 1.435, acc: 0.344: 100%|██████████| 29/29 [00:08<00:00,  3.39it/s]
[train epoch 1] loss: 1.240, acc: 0.497, lr: 0.00100: 100%|██████████| 476/476 [02:06<00:00,  3.78it/s]
[valid epoch 1] loss: 1.233, acc: 0.510: 100%|██████████| 29/29 [00:08<00:00,  3.45it/s]
[train epoch 2] loss: 1.198, acc: 0.518, lr: 0.00099: 100%|██████████| 476/476 [02:05<00:00,  3.80it/s]
[valid epoch 2] loss: 1.302, acc: 0.496: 100%|██████████| 29/29 [00:08<00:00,  3.47it/s]
Loss does not improve in 1 epochs!
[train epoch 3] loss: 1.182, acc: 0.526, lr: 0.00097: 100%|██████████| 476/476 [02:05<00:00,  3.81it/s]
[valid epoch 3] loss: 1.243, acc: 0.504: 100%|██████████| 29/29 [00:08<00:00,  3.43it/s]
Loss does not improve in 2 epochs!
[train epoch 4] loss: 1.156, acc: 0.536, lr: 0.00095: 100%|██████████| 476/476 [02:05<00:00,  3.80it/s]
[valid epoch 4] loss: 1.218, acc: 0.512: 100%|████████