[[Pytorch] Hybrid Swin Transformer + CNN](https://www.kaggle.com/debarshichanda/pytorch-hybrid-swin-transformer-cnn/notebook)

# Install Required Libraries

In [1]:
# ! pip install git+https://github.com/rwightman/pytorch-image-models
! pip install -q -U wandb albumentations timm

In [2]:
import os
import gc
import cv2
import copy
import time
import random
from PIL import Image

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp

# Utils
import joblib
from tqdm import tqdm
from collections import defaultdict

# Sklearn Imports
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, KFold

# Pytorch Image Model Library
import timm

import albumentations as A
from albumentations.pytorch import ToTensorV2

import warnings
warnings.filterwarnings('ignore')

# For descriptive error messages
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [3]:
import wandb

# On Kaggle: read the W&B API key from the notebook's secret store.
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    api_key = user_secrets.get_secret("wandb_api")
    wandb.login(key=api_key)
    anony = None
except Exception:
    # Not on Kaggle (or the secret is missing): fall back to the
    # WANDB_API_KEY environment variable instead of a key stored in source.
    # SECURITY: an API key used to be hard-coded in this cell; that key
    # should be revoked at https://wandb.ai/authorize.
    api_key = os.environ.get('WANDB_API_KEY')
    if api_key:
        wandb.login(key=api_key)
        anony = None
    else:
        anony = "must"
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')

# Keep a handle on the run so later cells can write to its summary.
run = wandb.init(project="petfinder-pawpularity-score", entity="jiwon7258")

If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. 
Get your W&B access token from here: https://wandb.ai/authorize


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjiwon7258[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [4]:
# Locations of the competition data, relative to the notebook.
ROOT_DIR = './input/'
TRAIN_DIR = './input/train'
# Previously './input//test'; the doubled slash was a typo.
TEST_DIR = './input/test'

In [5]:
# Central experiment configuration read by the cells below.
CONFIG = {
    # reproducibility
    'seed': 42,
    # timm model names
    'backbone': 'swin_base_patch4_window7_224',
    'embedder': 'tf_efficientnet_b4_ns',
    # data loading
    'train_batch_size': 16,
    'valid_batch_size': 32,
    'img_size': 448,
    # optimisation
    'epochs': 5,
    'learning_rate': 1e-4,
    'scheduler': 'CosineAnnealingLR',
    'min_lr': 1e-6,
    'T_max': 100,
    'weight_decay': 1e-6,
    'n_accumulate': 1,
    # cross-validation / output head
    'n_fold': 5,
    'num_classes': 1,
    # first CUDA device when available, otherwise CPU
    'device': torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    # run metadata
    'competition': 'PetFinder',
    '_wandb_kernel_': 'deb',
}

# Set Seed for Reproducibility

In [6]:
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs and pin the cuDNN flags.

    Args:
        seed: Integer seed applied to every generator (default 42).
    """
    # Python-side generators and the hash-seed environment variable
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

    # PyTorch generators
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # cuDNN: turn off autotuning and request deterministic kernels
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True
    
set_seed(CONFIG['seed'])

# Read the Data

In [7]:
def get_train_file_path(id):
    """Return the path of the training image `<TRAIN_DIR>/<id>.jpg` for an image id."""
    return '{}/{}.jpg'.format(TRAIN_DIR, id)

In [8]:
# Load the training table from ROOT_DIR.
df = pd.read_csv(f'{ROOT_DIR}/train.csv')
# Add a file_path column holding the image path built from each Id
df['file_path'] = df['Id'].apply(get_train_file_path)

In [9]:
df.head()

Unnamed: 0,Id,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur,Pawpularity,file_path
0,0007de18844b0dbbb5e1f607da0606e0,0,1,1,1,0,0,1,0,0,0,0,0,63,./input/train/0007de18844b0dbbb5e1f607da0606e0...
1,0009c66b9439883ba2750fb825e1d7db,0,1,1,0,0,0,0,0,0,0,0,0,42,./input/train/0009c66b9439883ba2750fb825e1d7db...
2,0013fd999caf9a3efe1352ca1b0d937e,0,1,1,1,0,0,0,0,1,1,0,0,28,./input/train/0013fd999caf9a3efe1352ca1b0d937e...
3,0018df346ac9c1d8413cfcc888ca8246,0,1,1,1,0,0,0,0,0,0,0,0,15,./input/train/0018df346ac9c1d8413cfcc888ca8246...
4,001dc955e10590d3ca4673f034feeef2,0,0,0,1,0,0,1,0,0,0,0,0,72,./input/train/001dc955e10590d3ca4673f034feeef2...


In [10]:
# feature_cols lists the feature columns to use:
# every column except the id, the target and the derived image path
feature_cols = [col for col in df.columns if col not in ['Id', 'Pawpularity', 'file_path']]

# Create Folds

Pawpularity는 0~100 사이의 정수 값을 가진다. 이 값을 구간으로 나눈 뒤 StratifiedKFold로 fold를 만든다.

In [11]:
def create_folds(df, n_s=5, n_grp=None):
    """Assign a cross-validation fold number to every row of `df`.

    Args:
        df: DataFrame with a 'Pawpularity' column. A 'kfold' column is
            written onto this frame in place.
        n_s: Number of folds.
        n_grp: If None, rows are split with a shuffled KFold. Otherwise
            'Pawpularity' is cut into `n_grp` equal-width bins and the bins
            are used as strata for a shuffled StratifiedKFold.

    Returns:
        `df` with an integer 'kfold' column holding, for each row, the index
        of the fold in which that row is used for validation.
    """
    if n_grp is None:
        # random_state is only accepted together with shuffle=True
        splitter = KFold(n_splits=n_s, shuffle=True, random_state=CONFIG['seed'])
        target = df['Pawpularity'].to_numpy()
    else:
        splitter = StratifiedKFold(n_splits=n_s, shuffle=True, random_state=CONFIG['seed'])
        # Bin the target into n_grp equal-width intervals; the bin index is
        # the stratification label. It stays local, so no helper column
        # has to be dropped afterwards.
        target = pd.cut(df['Pawpularity'], n_grp, labels=False).to_numpy()

    # split() yields positional indices, so collect the folds in a positional
    # array rather than using label-based df.loc (which is only correct
    # when the index is 0..n-1).
    folds = np.full(len(df), -1, dtype=np.int64)
    for fold_no, (_, valid_idx) in enumerate(splitter.split(target, target)):
        folds[valid_idx] = fold_no

    df['kfold'] = folds
    return df

In [12]:
# Build CONFIG['n_fold'] folds, stratified on Pawpularity cut into 14 bins
df = create_folds(df, n_s=CONFIG['n_fold'], n_grp=14)
df.head()

Unnamed: 0,Id,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur,Pawpularity,file_path,kfold
0,0007de18844b0dbbb5e1f607da0606e0,0,1,1,1,0,0,1,0,0,0,0,0,63,./input/train/0007de18844b0dbbb5e1f607da0606e0...,0
1,0009c66b9439883ba2750fb825e1d7db,0,1,1,0,0,0,0,0,0,0,0,0,42,./input/train/0009c66b9439883ba2750fb825e1d7db...,2
2,0013fd999caf9a3efe1352ca1b0d937e,0,1,1,1,0,0,0,0,1,1,0,0,28,./input/train/0013fd999caf9a3efe1352ca1b0d937e...,0
3,0018df346ac9c1d8413cfcc888ca8246,0,1,1,1,0,0,0,0,0,0,0,0,15,./input/train/0018df346ac9c1d8413cfcc888ca8246...,3
4,001dc955e10590d3ca4673f034feeef2,0,0,0,1,0,0,1,0,0,0,0,0,72,./input/train/001dc955e10590d3ca4673f034feeef2...,4


Pawpularity는 0~100까지의 정수로 이루어져 있다. 이 분포 그대로 KFold를 진행하지 않는다. 대신 n_grp개 만큼, 일정한 길이별로 구간을 나눈 후, 이 분포를 이용하여 KFold를 진행한다.

```df.Pawpularity.hist(bins=14) == df.grp.hist(bins=14)```

![Pawpularity Histogram](./img/paupularity.png)

![이미지](./img/pawpularity_and_grp_hist.png)



# Dataset Class 

In [13]:
class PawpularityDataset(Dataset):
    """Dataset yielding (image, Pawpularity target) pairs.

    Args:
        root_dir: Stored on the instance as `root_dir`; not used when loading,
            because the paths come from `df['file_path']`.
        df: DataFrame with 'file_path' and 'Pawpularity' columns.
        transforms: Optional callable invoked as `transforms(image=img)` whose
            result is indexed with 'image' (the albumentations convention).
    """

    def __init__(self, root_dir, df, transforms=None):
        self.root_dir = root_dir
        self.df = df
        self.file_names = df['file_path'].values    # numpy array
        self.targets = df['Pawpularity'].values     # numpy array
        self.transforms = transforms

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Load the image at `index` and return `(image, target)`.

        Raises:
            FileNotFoundError: If the image cannot be read from disk.
        """
        img_path = self.file_names[index]
        img = cv2.imread(img_path)                  # numpy array, BGR order
        if img is None:
            # cv2.imread signals failure by returning None instead of raising;
            # fail here with the offending path rather than inside cvtColor.
            raise FileNotFoundError(f'Could not read image: {img_path}')
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        target = self.targets[index]

        if self.transforms:
            img = self.transforms(image=img)['image']

        # image data, target label
        return img, target



# Augmentations

In [14]:
# Albumentations pipelines keyed by split name.
data_transforms = dict(
    # training: resize -> random horizontal flip -> normalize -> tensor
    train=A.Compose(
        [
            A.Resize(CONFIG['img_size'], CONFIG['img_size']),
            A.HorizontalFlip(p=0.5),
            A.Normalize(),
            ToTensorV2(),
        ],
        p=1.0,
    ),
    # validation: the same pipeline without the flip
    valid=A.Compose(
        [
            A.Resize(CONFIG['img_size'], CONFIG['img_size']),
            A.Normalize(),
            ToTensorV2(),
        ]
    ),
)

# Create Model

In [15]:
class HybridEmbed(nn.Module):
    """CNN feature-map embedding.

    Runs a CNN backbone, projects its feature map to `embed_dim` channels
    with a strided convolution, and flattens the spatial grid into a
    sequence of patch embeddings.
    """

    def __init__(self, backbone, img_size=224, patch_size=1, feature_size=None, in_chans=3, embed_dim=768):
        super().__init__()
        assert isinstance(backbone, nn.Module)

        self.img_size = (img_size, img_size)
        self.patch_size = (patch_size, patch_size)
        self.backbone = backbone

        if feature_size is not None:
            # Caller supplied the spatial size; take the channel count from
            # the backbone's metadata.
            feature_size = (feature_size, feature_size)
            if hasattr(self.backbone, 'feature_info'):
                feature_dim = self.backbone.feature_info.channels()[-1]
            else:
                feature_dim = self.backbone.num_features
        else:
            # Output size unknown: the most reliable way to find it is to push
            # a zero tensor through the backbone and inspect the result.
            with torch.no_grad():
                was_training = backbone.training
                if was_training:
                    backbone.eval()
                dummy = torch.zeros(1, in_chans, self.img_size[0], self.img_size[1])
                probe = self.backbone(dummy)
                if isinstance(probe, (list, tuple)):
                    # keep the last entry when the backbone returns several
                    probe = probe[-1]
                feature_size = probe.shape[-2:]
                feature_dim = probe.shape[1]
                # put the backbone back in the mode it was in
                backbone.train(was_training)

        patch_h, patch_w = self.patch_size
        assert feature_size[0] % patch_h == 0 and feature_size[1] % patch_w == 0
        self.grid_size = (feature_size[0] // patch_h, feature_size[1] // patch_w)
        self.num_patches = self.grid_size[0] * self.grid_size[1]
        self.proj = nn.Conv2d(
            in_channels=feature_dim,
            out_channels=embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
        )

    def forward(self, x):
        """Return the projected backbone features with spatial dims flattened into dim 1."""
        feats = self.backbone(x)
        if isinstance(feats, (list, tuple)):
            feats = feats[-1]
        projected = self.proj(feats)
        # flatten everything from dim 2 on, then swap dims 1 and 2
        tokens = projected.flatten(start_dim=2)
        return tokens.transpose(1, 2)
        
    

In [16]:
class PawpularityModel(nn.Module):
    """timm backbone with its patch embedding swapped for a CNN-based
    HybridEmbed, followed by a linear head with CONFIG['num_classes'] outputs.

    Args:
        backbone: timm model name for the main network.
        embedder: timm model name for the feature extractor wrapped by HybridEmbed.
        pretrained: Passed to `timm.create_model` for both models.
    """
    def __init__ (self, backbone, embedder, pretrained=True):
        super().__init__()
        self.backbone = timm.create_model(backbone, pretrained=pretrained)
        self.embedder = timm.create_model(embedder, features_only=True, out_indices=[2], pretrained=pretrained)
        # Replace the patch_embed block inside the backbone model
        # NOTE(review): embed_dim=128 is hard-coded; presumably it has to match
        # the embedding width the chosen backbone expects — confirm if the backbone changes.
        self.backbone.patch_embed=HybridEmbed(backbone = self.embedder, img_size=CONFIG['img_size'], embed_dim=128)
        # Read the head's input width before reset_classifier(0) is called on the backbone
        self.n_features = self.backbone.head.in_features
        self.backbone.reset_classifier(0)
        self.fc = nn.Linear(self.n_features, CONFIG['num_classes'])

    def forward (self, images):
        """Run the backbone on `images` and apply the linear head to its output."""
        features = self.backbone(images)            # features = (batch size, embedding_size)
        output = self.fc(features)                  # outputs = (batch_size, num_classes)
        return output

# Build the model from the configured timm model names and move it to the configured device
model = PawpularityModel(CONFIG['backbone'], CONFIG['embedder'])
model.to(CONFIG['device'])

PawpularityModel(
  (backbone): SwinTransformer(
    (patch_embed): HybridEmbed(
      (backbone): EfficientNetFeatures(
        (conv_stem): Conv2dSame(3, 48, kernel_size=(3, 3), stride=(2, 2), bias=False)
        (bn1): BatchNorm2d(48, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
        (act1): SiLU(inplace=True)
        (blocks): Sequential(
          (0): Sequential(
            (0): DepthwiseSeparableConv(
              (conv_dw): Conv2d(48, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=48, bias=False)
              (bn1): BatchNorm2d(48, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
              (act1): SiLU(inplace=True)
              (se): SqueezeExcite(
                (conv_reduce): Conv2d(48, 12, kernel_size=(1, 1), stride=(1, 1))
                (act1): SiLU(inplace=True)
                (conv_expand): Conv2d(12, 48, kernel_size=(1, 1), stride=(1, 1))
                (gate): Sigmoid()
              )
              (conv

In [17]:
from torchsummary import summary

summary(model, (3,448,448))


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
        Conv2dSame-1         [-1, 48, 224, 224]           1,296
        Conv2dSame-2         [-1, 48, 224, 224]           1,296
       BatchNorm2d-3         [-1, 48, 224, 224]              96
       BatchNorm2d-4         [-1, 48, 224, 224]              96
              SiLU-5         [-1, 48, 224, 224]               0
              SiLU-6         [-1, 48, 224, 224]               0
            Conv2d-7         [-1, 48, 224, 224]             432
            Conv2d-8         [-1, 48, 224, 224]             432
       BatchNorm2d-9         [-1, 48, 224, 224]              96
      BatchNorm2d-10         [-1, 48, 224, 224]              96
             SiLU-11         [-1, 48, 224, 224]               0
             SiLU-12         [-1, 48, 224, 224]               0
           Conv2d-13             [-1, 12, 1, 1]             588
           Conv2d-14             [-1, 1

In [19]:
# test
img = torch.randn(5, 3, CONFIG['img_size'], CONFIG['img_size']).to(CONFIG['device'])
print(
model(img)
)

tensor([[-0.1843],
        [-0.1658],
        [-0.2310],
        [-0.2037],
        [-0.2477]], grad_fn=<AddmmBackward0>)


# Loss Function

In [None]:
def criterion(outputs, targets):
    """Root-mean-squared error between `outputs` and `targets`, both flattened to 1-D."""
    flat_pred = outputs.view(-1)
    flat_true = targets.view(-1)
    mse = nn.functional.mse_loss(flat_pred, flat_true)
    return torch.sqrt(mse)

# Training Function

In [None]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    """Train `model` for one pass over `dataloader` with mixed precision.

    Gradients are accumulated over CONFIG['n_accumulate'] batches before each
    optimizer step. The scheduler, when given, is stepped once per optimizer step.

    Args:
        model: Network to train; switched to train mode.
        optimizer: Optimizer updating the model's parameters.
        scheduler: Learning-rate scheduler, or None.
        dataloader: Iterable of (images, targets) batches; must support len().
        device: Device the batches are moved to.
        epoch: Epoch number, shown in the progress bar only.

    Returns:
        The batch-size-weighted mean of the per-batch losses (0.0 when the
        dataloader yields no batches).
    """
    model.train()

    # Mixed precision, following
    # https://pytorch.org/docs/stable/notes/amp_examples.html#amp-examples
    scaler = amp.GradScaler()

    n_accumulate = CONFIG['n_accumulate']
    n_steps = len(dataloader)

    dataset_size = 0
    running_loss = 0.0
    epoch_loss = 0.0   # defined up front so an empty dataloader still returns a value

    # Start from clean gradients, whatever happened before this epoch.
    optimizer.zero_grad()

    bar = tqdm(enumerate(dataloader), total=n_steps)

    for step, (images, targets) in bar:
        images = images.to(device, dtype=torch.float)
        targets = targets.to(device, dtype=torch.float)

        batch_size = images.size(0)

        with amp.autocast(enabled=True):
            outputs = model(images)
            loss = criterion(outputs, targets)

        # Only the value sent to backward() is divided by n_accumulate, so the
        # loss reported below stays the real per-batch loss.
        scaler.scale(loss / n_accumulate).backward()

        # Step at every accumulation boundary and also on the last batch, so a
        # trailing partial group is applied instead of leaking into the next epoch.
        if (step + 1) % n_accumulate == 0 or (step + 1) == n_steps:
            scaler.step(optimizer)
            scaler.update()

            # zero the parameter gradients
            optimizer.zero_grad()

            if scheduler is not None:
                scheduler.step()

        # loss.item() is the mean loss of the batch as a Python float;
        # multiply by batch_size to accumulate a sum over samples.
        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        bar.set_postfix(Epoch = epoch, Train_Loss = epoch_loss, LR = optimizer.param_groups[0]['lr'])

    # Garbage Collector
    gc.collect()

    return epoch_loss

    

# Validation Function

In [None]:
@torch.no_grad()
def valid_one_epoch(model, dataloader, device, epoch, optimizer=None):
    """Evaluate `model` on `dataloader` without tracking gradients.

    Args:
        model: Network to evaluate; switched to eval mode.
        dataloader: Iterable of (images, targets) batches; must support len().
        device: Device the batches are moved to.
        epoch: Epoch number, shown in the progress bar only.
        optimizer: Optional optimizer whose current learning rate is shown in
            the progress bar. When omitted, a module-level `optimizer` is used
            if one exists (the previous implicit behaviour); otherwise the
            learning rate is not shown.

    Returns:
        (epoch_loss, val_rmse): the batch-size-weighted mean of the per-batch
        losses, and the RMSE computed over all predictions at once.

    Raises:
        ValueError: If the dataloader yields no batches.
    """
    model.eval()

    # The original body read a global named `optimizer`; keep that working
    # without failing with NameError when no such global is defined.
    if optimizer is None:
        optimizer = globals().get('optimizer')

    dataset_size = 0
    running_loss = 0.0
    epoch_loss = 0.0

    TARGETS = []
    PREDS = []

    bar = tqdm(enumerate(dataloader), total=len(dataloader))

    for step, (images, targets) in bar:
        images = images.to(device, dtype=torch.float)
        targets = targets.to(device, dtype=torch.float)

        batch_size = images.size(0)

        outputs = model(images)
        loss = criterion(outputs, targets)

        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        PREDS.append(outputs.view(-1).cpu().detach().numpy())
        TARGETS.append(targets.view(-1).cpu().detach().numpy())

        postfix = dict(Epoch=epoch, Valid_Loss=epoch_loss)
        if optimizer is not None:
            postfix['LR'] = optimizer.param_groups[0]['lr']
        bar.set_postfix(**postfix)

    if not PREDS:
        raise ValueError('valid_one_epoch received an empty dataloader')

    TARGETS = np.concatenate(TARGETS)
    PREDS = np.concatenate(PREDS)
    # Take the square root explicitly: the `squared=False` argument of
    # mean_squared_error is deprecated in recent scikit-learn releases.
    val_rmse = np.sqrt(mean_squared_error(TARGETS, PREDS))

    gc.collect()

    return epoch_loss, val_rmse

# Training

In [None]:
def run_training(model, optimzier, scheduler, device, num_epochs):
    """Train and validate `model` for `num_epochs`, keeping the best weights.

    Uses the module-level `train_loader` and `valid_loader`, which must be
    defined before this function is called.

    Args:
        model: Network to train.
        optimzier: Optimizer for the model. The misspelled parameter name is
            kept so existing keyword callers keep working.
        scheduler: Learning-rate scheduler passed to train_one_epoch, or None.
        device: Device the batches are moved to.
        num_epochs: Number of epochs to run.

    Returns:
        (model, history): the model loaded with the weights of the epoch with
        the lowest validation RMSE, and a dict of per-epoch metric lists
        ('Train Loss', 'Valid Loss', 'Valid RMSE').
    """
    # Use the optimizer that was passed in instead of an identically named global.
    optimizer = optimzier

    # To automatically log gradients
    wandb.watch(model, log_freq=100)

    if torch.cuda.is_available():
        print("[INFO] Using GPU:{}\n".format(torch.cuda.get_device_name()))

    start = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_epoch_rmse = np.inf
    history = defaultdict(list)

    # run train and validation num_epochs times
    for epoch in range(1, num_epochs + 1):
        gc.collect()

        # `device=` was previously misspelled `devcie=`, which raised TypeError
        train_epoch_loss = train_one_epoch(model, optimizer, scheduler, dataloader=train_loader, device=device, epoch=epoch)
        val_epoch_loss, val_epoch_rmse = valid_one_epoch(model, valid_loader, device=device, epoch=epoch)

        history['Train Loss'].append(train_epoch_loss)
        history['Valid Loss'].append(val_epoch_loss)
        history['Valid RMSE'].append(val_epoch_rmse)

        # Log the metrics in one call so they share a single W&B step
        wandb.log({
            'Train Loss': train_epoch_loss,
            'Valid Loss': val_epoch_loss,
            'Valid RMSE': val_epoch_rmse,
        })

        print(f'Valid RMSE : {val_epoch_rmse}')

        # deep copy the model
        if val_epoch_rmse <= best_epoch_rmse:
            print(f'Validation Loss improved( {best_epoch_rmse} ---> {val_epoch_rmse}  )')
            best_epoch_rmse = val_epoch_rmse
            # wandb.run is the currently active run (None when no run was
            # started), so no separate global `run` variable is required.
            if wandb.run is not None:
                wandb.run.summary['Best RMSE'] = best_epoch_rmse
            best_model_wts = copy.deepcopy(model.state_dict())
            PATH = 'RMSE{:.4f}_epoch{:.0f}.bin'.format(best_epoch_rmse, epoch)
            torch.save(model.state_dict(), PATH)
            # Save a model file from the current directory
            wandb.save(PATH)
            print(f'Model Saved')

        print()

    end = time.time()
    time_elapsed = end - start
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(time_elapsed // 3600, (time_elapsed % 3600) // 60, (time_elapsed % 3600) % 60))
    print('Best RMSE: {:.4f}'.format(best_epoch_rmse))

    # load best model weights
    model.load_state_dict(best_model_wts)

    return model, history

In [None]:
def prepare_loaders(fold):
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)