In [2]:
!pip install --quiet timm pytorch_lightning==1.7.7 torchmetrics==0.11.1

[33mDEPRECATION: pytorch-lightning 1.7.7 has a non-standard dependency specifier torch>=1.9.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pytorch-lightning or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [3]:
IS_TEST = False

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
import os
import gc
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import pytorch_lightning as L

from glob import glob
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from torchvision.io import read_image
from torchvision.transforms import v2 as  transforms
from torch.utils.data import Dataset, DataLoader
from transformers import Swinv2Config, Swinv2Model, AutoImageProcessor, AutoModelForImageClassification
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

2024-05-05 01:19:06.107584: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-05 01:19:06.107720: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-05 01:19:06.241976: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [6]:
class CustomDataset(Dataset):
    def __init__(self, df, path_col,  mode='train'):
        self.df = df
        self.path_col = path_col
        self.mode = mode

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        if self.mode == 'train':
            row = self.df.iloc[idx]
            image = read_image(row[self.path_col])/256.
            label = row['class']
            data = {
                'image':image,
                'label':label
            }
            return data
        elif self.mode == 'val':
            row = self.df.iloc[idx]
            image = read_image(row[self.path_col])/256.
            label = row['class']
            data = {
                'image':image,
                'label':label
            }
            return data
        elif self.mode == 'inference':
            row = self.df.iloc[idx]
            image = read_image(row[self.path_col])/256.
            data = {
                'image':image,
            }
            return data

    def train_transform(self, image):
        pass

In [7]:
class CustomCollateFn:
    def __init__(self, transform, mode):
        self.mode = mode
        self.transform = transform

    def __call__(self, batch):
        if self.mode=='train':
            pixel_values = torch.stack([self.transform(data['image']) for data in batch])
            label = torch.LongTensor([data['label'] for data in batch])
            return {
                'pixel_values':pixel_values,
                'label':label,
            }
        elif self.mode=='val':
            pixel_values = torch.stack([self.transform(data['image']) for data in batch])
            label = torch.LongTensor([data['label'] for data in batch])
            return {
                'pixel_values':pixel_values,
                'label':label,
            }
        elif self.mode=='inference':
            pixel_values = torch.stack([self.transform(data['image']) for data in batch])
            return {
                'pixel_values':pixel_values,
            }


In [8]:
class CustomModel(nn.Module):
    def __init__(self, model):
        super(CustomModel, self).__init__()
        self.model = model
    def forward(self, x, label=None):
        x = self.model(x)
        loss = None
        if label is not None:
            loss = nn.CrossEntropyLoss(label_smoothing=0.11)(x, label)
        probs = nn.LogSoftmax(dim=-1)(x)
        return probs, loss

class LitCustomModel(L.LightningModule):
    def __init__(self, model):
        super().__init__()
        self.model = CustomModel(model)
        self.validation_step_output = []

    def configure_optimizers(self):
        opt = torch.optim.AdamW(self.parameters(), lr=1e-5)
        return opt

    def training_step(self, batch, batch_idx=None):
        x = batch['pixel_values']
        label = batch['label']
        probs, loss = self.model(x, label)
        self.log(f"train_loss", loss, on_step=True, on_epoch=False)
        return loss

    def validation_step(self, batch, batch_idx=None):
        x = batch['pixel_values']
        label = batch['label']
        probs, loss = self.model(x, label)
        self.validation_step_output.append([probs,label])
        return loss

    def predict_step(self, batch, batch_idx=None):
        x = batch['pixel_values']
        probs, _ = self.model(x)
        return probs

    def validation_epoch_end(self, step_output):
        pred = torch.cat([x for x, _ in self.validation_step_output]).cpu().detach().numpy().argmax(1)
        label = torch.cat([label for _, label in self.validation_step_output]).cpu().detach().numpy()
        score = f1_score(label,pred, average='macro')
        self.log("val_score", score)
        print("val_score", score)
        self.validation_step_output.clear()
        return score

In [9]:
SEED = 42

if IS_TEST :
    N_SPLIT = 3
else:
    N_SPLIT = 5
    
BATCH_SIZE = 24

In [10]:
L.seed_everything(SEED)

42

In [11]:
train_df = pd.read_csv('/kaggle/input/dacon-image-bird/train.csv')
len(train_df)

15834

In [15]:
if IS_TEST :
    train_df = train_df[:400]
    
train_df['img_path'] = train_df['img_path'].apply(lambda x: x.replace("./", "/kaggle/input/dacon-image-bird/"))
train_df['upscale_img_path'] = train_df['upscale_img_path'].apply(lambda x: x.replace("./", "/kaggle/input/dacon-image-bird/"))
le = LabelEncoder()
train_df['class'] = le.fit_transform(train_df['label'])

In [16]:
train_df.head()

Unnamed: 0,img_path,upscale_img_path,label,class
0,/kaggle/input/dacon-image-bird/train/TRAIN_000...,/kaggle/input/dacon-image-bird/upscale_train/T...,Ruddy Shelduck,19
1,/kaggle/input/dacon-image-bird/train/TRAIN_000...,/kaggle/input/dacon-image-bird/upscale_train/T...,Gray Wagtail,9
2,/kaggle/input/dacon-image-bird/train/TRAIN_000...,/kaggle/input/dacon-image-bird/upscale_train/T...,Indian Peacock,13
3,/kaggle/input/dacon-image-bird/train/TRAIN_000...,/kaggle/input/dacon-image-bird/upscale_train/T...,Common Kingfisher,3
4,/kaggle/input/dacon-image-bird/train/TRAIN_000...,/kaggle/input/dacon-image-bird/upscale_train/T...,Common Kingfisher,3


In [22]:
skf = StratifiedKFold(n_splits=N_SPLIT, random_state=SEED, shuffle=True)

In [23]:
train_transform = transforms.Compose([
    transforms.Resize(size=(196,196), interpolation=transforms.InterpolationMode.BICUBIC),
    #transforms.RandomVerticalFlip(),  # Vertical 랜덤 플립
    transforms.RandomHorizontalFlip(),  # Horizontal 랜덤 플립
    transforms.RandomRotation(degrees=10),  # 랜덤 회전
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),  # 랜덤 이동
    transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0, hue=0),  # 랜덤 색상 조정
    transforms.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)),
])

val_transform = transforms.Compose([
    transforms.Resize(size=(196,196), interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)),
])

train_collate_fn = CustomCollateFn(train_transform, 'train')
val_collate_fn = CustomCollateFn(val_transform, 'val')

In [24]:
from torch.utils.data import RandomSampler
import timm

for fold_idx, (train_index, val_index) in enumerate(skf.split(train_df, train_df['class'])):
    train_fold_df = train_df.loc[train_index,:]
    val_fold_df = train_df.loc[val_index,:]

    train_dataset = CustomDataset(train_fold_df, 'img_path', mode='train')
    val_dataset = CustomDataset(val_fold_df, 'img_path', mode='val')
    
    train_random_sampler = RandomSampler(train_dataset)
    val_random_sampler = RandomSampler(val_dataset)

    train_dataloader = DataLoader(train_dataset, collate_fn=train_collate_fn, batch_size=BATCH_SIZE,num_workers=8,sampler = train_random_sampler)
    val_dataloader = DataLoader(val_dataset, collate_fn=val_collate_fn, batch_size=BATCH_SIZE*2,num_workers=8,sampler = val_random_sampler)

    model = timm.create_model("timm/eva_large_patch14_196.in22k_ft_in22k_in1k", pretrained=True, num_classes=25)
    #model = timm.create_model("timm/beitv2_large_patch16_224.in1k_ft_in22k_in1k", pretrained=True, num_classes=25)
    #model = Swinv2Model.from_pretrained("microsoft/swinv2-large-patch4-window12to16-192to256-22kto1k-ft")
    
    print(model)
    lit_model = LitCustomModel(model)

    checkpoint_callback = ModelCheckpoint(
        monitor='val_score',
        mode='max',
        dirpath='./checkpoints/',
        filename=f'swinv2-large-resize-fold_idx={fold_idx}'+'-{epoch:02d}-{train_loss:.4f}-{val_score:.4f}',
        save_top_k=1,
        save_weights_only=True,
        verbose=True
    )
    earlystopping_callback = EarlyStopping(monitor="val_score", mode="max", patience=3)
    trainer = L.Trainer(max_epochs=100, accelerator='auto', precision=32, callbacks=[checkpoint_callback, earlystopping_callback], val_check_interval=0.5)
    trainer.fit(lit_model, train_dataloader, val_dataloader)

    model.cpu()
    lit_model.cpu()
    del model, lit_model, checkpoint_callback, earlystopping_callback, trainer
    gc.collect()
    torch.cuda.empty_cache()


model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (patch_drop): Identity()
  (norm_pre): Identity()
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=1024, out_features=3072, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=1024, out_features=1024, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=1024, out_features=4096, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False)
        (norm): Id

Sanity Checking: 0it [00:00, ?it/s]

val_score 0.07972161172161173


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

val_score 0.9568352301587225


Validation: 0it [00:00, ?it/s]

val_score 0.9688067762735817


Validation: 0it [00:00, ?it/s]

val_score 0.975279359517468


Validation: 0it [00:00, ?it/s]

val_score 0.9733770034747115


Validation: 0it [00:00, ?it/s]

val_score 0.9748923212115925


Validation: 0it [00:00, ?it/s]

val_score 0.9760121474213538


Validation: 0it [00:00, ?it/s]

val_score 0.9765429945980091


Validation: 0it [00:00, ?it/s]

val_score 0.9687497206395022


Validation: 0it [00:00, ?it/s]

val_score 0.9734970284025651


Validation: 0it [00:00, ?it/s]

val_score 0.9744868435531316
VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (patch_drop): Identity()
  (norm_pre): Identity()
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=1024, out_features=3072, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=1024, out_features=1024, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=1024, out_features=4096, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inpl

Sanity Checking: 0it [00:00, ?it/s]

val_score 0.00924731182795699


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

val_score 0.9501038701650498


Validation: 0it [00:00, ?it/s]

val_score 0.963979580987278


Validation: 0it [00:00, ?it/s]

val_score 0.9674018634591003


Validation: 0it [00:00, ?it/s]

val_score 0.9674224332386074


Validation: 0it [00:00, ?it/s]

val_score 0.9680903547629666


Validation: 0it [00:00, ?it/s]

val_score 0.9720608969768445


Validation: 0it [00:00, ?it/s]

val_score 0.9657921798162925


Validation: 0it [00:00, ?it/s]

val_score 0.9652679046196585


Validation: 0it [00:00, ?it/s]

val_score 0.9693213856480987
VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (patch_drop): Identity()
  (norm_pre): Identity()
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=1024, out_features=3072, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=1024, out_features=1024, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=1024, out_features=4096, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inpl

Sanity Checking: 0it [00:00, ?it/s]

val_score 0.02


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

val_score 0.9512909867636329


Validation: 0it [00:00, ?it/s]

val_score 0.9659505778834192


Validation: 0it [00:00, ?it/s]

val_score 0.9674213954664634


Validation: 0it [00:00, ?it/s]

val_score 0.9694737845187847


Validation: 0it [00:00, ?it/s]

val_score 0.9707208155760823


Validation: 0it [00:00, ?it/s]

val_score 0.9725108367203776


Validation: 0it [00:00, ?it/s]

val_score 0.9739457663841686


Validation: 0it [00:00, ?it/s]

val_score 0.9756743094365211


Validation: 0it [00:00, ?it/s]

val_score 0.974219520440178


Validation: 0it [00:00, ?it/s]

val_score 0.9699708877884344


Validation: 0it [00:00, ?it/s]

val_score 0.970295782437773
VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (patch_drop): Identity()
  (norm_pre): Identity()
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=1024, out_features=3072, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=1024, out_features=1024, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=1024, out_features=4096, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inpla

Sanity Checking: 0it [00:00, ?it/s]

val_score 0.026654456654456652


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

val_score 0.9601608585350467


Validation: 0it [00:00, ?it/s]

val_score 0.9678559904416533


Validation: 0it [00:00, ?it/s]

val_score 0.9691229671636614


Validation: 0it [00:00, ?it/s]

val_score 0.9749414437335032


Validation: 0it [00:00, ?it/s]

val_score 0.971935171878329


Validation: 0it [00:00, ?it/s]

val_score 0.975174410781861


Validation: 0it [00:00, ?it/s]

val_score 0.976678935526906


Validation: 0it [00:00, ?it/s]

val_score 0.9761528141035112


Validation: 0it [00:00, ?it/s]

val_score 0.9771915698756382


Validation: 0it [00:00, ?it/s]

val_score 0.9720088852515133


Validation: 0it [00:00, ?it/s]

val_score 0.9728241376422868


Validation: 0it [00:00, ?it/s]

val_score 0.9728936625041062
VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (patch_drop): Identity()
  (norm_pre): Identity()
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=1024, out_features=3072, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=1024, out_features=1024, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=1024, out_features=4096, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inpl

Sanity Checking: 0it [00:00, ?it/s]

val_score 0.0


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

val_score 0.9521915372751433


Validation: 0it [00:00, ?it/s]

val_score 0.9674231704844902


Validation: 0it [00:00, ?it/s]

val_score 0.9704129282257249


Validation: 0it [00:00, ?it/s]

val_score 0.9682501551746165


Validation: 0it [00:00, ?it/s]

val_score 0.9743328688820435


Validation: 0it [00:00, ?it/s]

val_score 0.972181440869518


Validation: 0it [00:00, ?it/s]

val_score 0.9777637912632283


Validation: 0it [00:00, ?it/s]

val_score 0.9713798846734918


Validation: 0it [00:00, ?it/s]

val_score 0.9760988295341678


Validation: 0it [00:00, ?it/s]

val_score 0.9727284391206109


In [25]:
test_df = pd.read_csv('/kaggle/input/dacon-image-bird/test.csv')
if IS_TEST :
    test_df = test_df[:400]
test_df['img_path'] = test_df['img_path'].apply(lambda x: x.replace("./", "/kaggle/input/dacon-image-bird/"))

In [27]:
test_transform = transforms.Compose([
    transforms.Resize(size=(196,196), interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)),
])

test_collate_fn = CustomCollateFn(test_transform, 'inference')
test_dataset = CustomDataset(test_df, 'img_path', mode='inference')
test_dataloader = DataLoader(test_dataset, collate_fn=test_collate_fn, batch_size=BATCH_SIZE*2, num_workers=8)

In [28]:
fold_preds = []
for checkpoint_path in glob('/kaggle/working/checkpoints/swinv2-large-resize*.ckpt'):
    #deit3_large_patch16_224.fb_in22k_ft_in1k
    model = timm.create_model("timm/eva_large_patch14_196.in22k_ft_in22k_in1k", pretrained=True, num_classes=25)
    #model = timm.create_model("timm/beitv2_large_patch16_224.in1k_ft_in22k_in1k", pretrained=True, num_classes=25)
    #model = Swinv2Model.from_pretrained("microsoft/swinv2-large-patch4-window12to16-192to256-22kto1k-ft")
    lit_model = LitCustomModel.load_from_checkpoint(checkpoint_path, model=model)
    trainer = L.Trainer( accelerator='auto', precision=32)
    preds = trainer.predict(lit_model, test_dataloader)
    preds = torch.cat(preds,dim=0).detach().cpu().numpy().argmax(1)
    fold_preds.append(preds)
pred_ensemble = list(map(lambda x: np.bincount(x).argmax(),np.stack(fold_preds,axis=1)))

Predicting: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

In [29]:
submission = pd.read_csv('/kaggle/input/dacon-image-bird/sample_submission.csv')

In [30]:
if IS_TEST :
    submission = submission[:400]

In [31]:
submission['label'] = le.inverse_transform(pred_ensemble)

In [32]:
submission.to_csv('swinv2_baseline_large_resize.csv',index=False)

In [33]:
submission.head()

Unnamed: 0,id,label
0,TEST_00000,Indian Roller
1,TEST_00001,Asian Green Bee-Eater
2,TEST_00002,Jungle Babbler
3,TEST_00003,Sarus Crane
4,TEST_00004,Northern Lapwing


In [34]:
len(submission["label"].value_counts())

25

In [35]:
submission["label"].value_counts()

label
Asian Green Bee-Eater        314
Common Myna                  304
House Crow                   301
Forest Wagtail               300
Brown-Headed Barbet          291
Hoopoe                       288
Cattle Egret                 280
Indian Peacock               280
White-Breasted Waterhen      279
Sarus Crane                  277
Indian Roller                277
Common Tailorbird            276
Indian Grey Hornbill         274
White Wagtail                269
Northern Lapwing             269
Red-Wattled Lapwing          267
Gray Wagtail                 266
Jungle Babbler               264
Common Rosefinch             264
Common Kingfisher            264
White-Breasted Kingfisher    261
Ruddy Shelduck               250
Rufous Treepie               246
Coppersmith Barbet           230
Indian Pitta                 195
Name: count, dtype: int64