In [None]:
import os, sys, timeit, math, copy, random
sys.path.append("../input/timmmaster/")

import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

import torch
import torchvision
import cv2
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from torch import nn, optim
from torch.cuda import amp
import torch.nn.functional as F

import timm

from sklearn.model_selection import StratifiedKFold

import albumentations as A
from albumentations.pytorch import ToTensorV2

import optuna

import transformers

In [None]:
# Config and seed

class Config:
    """Notebook-wide settings, read everywhere through the ``cfg`` alias."""

    # Reproducibility
    seed = 42

    # Cross-validation
    n_splits = 10
    n_bins = 20  # the target is cut into this many bins before the stratified split

    # Data loading
    img_size = 224  # size handed to the resize and crop transforms
    batch_size = 32
    num_workers = 2

    # Hardware
    device = "cuda"


# The class object itself (not an instance) serves as the settings namespace.
cfg = Config

def seed_everything(seed):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator, PyTorch's
    default generator and the CUDA generator, and writes the seed to the
    ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)
    # cuDNN determinism is left switched off:
    #torch.backends.cudnn.deterministic = True

# Apply the configured seed before any dataset, split or model is created below.
seed_everything(cfg.seed)

In [None]:
# Dataset

class ImgDataset(Dataset):
    """Images paired with their ``Pawpularity`` target.

    Rows are read from ``{root}/{folder}.csv`` and every image from
    ``{root}/{folder}/{Id}.jpg``.

    Args:
        transform: optional callable invoked as ``transform(image=...)`` that
            returns a mapping with an ``"image"`` entry.
        folder: split name; selects both the CSV file and the image directory.
        root: directory containing the CSV files and the image folders.
    """

    def __init__(self, transform=None, folder = "train", root = "../input/petfinder-pawpularity-score"):
        self.transform = transform
        self.folder = folder
        self.root = root

        # Load labels
        self.df = pd.read_csv(f"{root}/{folder}.csv")
        # Tolerate CSVs that lack a "Pawpularity" column (e.g. an unlabeled split).
        self.has_labels = "Pawpularity" in self.df.columns

    def __len__(self):
        return len(self.df)

    def _image_path(self, id_):
        """Return the path of the JPEG for ``id_`` inside this split's folder."""
        return f"{self.root}/{self.folder}/{id_}.jpg"

    def __getitem__(self, idx):
        """Return ``(id, image, label)`` for row ``idx``.

        The label is a float tensor; it is NaN when the CSV has no
        ``Pawpularity`` column.

        Raises:
            FileNotFoundError: if the image file cannot be read.
        """
        row = self.df.iloc[idx]
        id_ = row["Id"]

        path = self._image_path(id_)
        image = cv2.imread(path)
        if image is None:
            # cv2.imread reports a missing or unreadable file by returning None.
            raise FileNotFoundError(f"Could not read image: {path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float64)

        label = row['Pawpularity'] if self.has_labels else float("nan")

        if self.transform is not None:
            image = self.transform(image = image)["image"]

        return id_, image, torch.from_numpy(np.array(label)).float()

def _normalize():
    """Build a fresh Normalize step (the usual ImageNet channel mean/std, 0-255 pixel scale)."""
    return A.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        max_pixel_value=255.0,
    )


# Training pipeline: resize the short side, random square crop, random horizontal flip.
train_transform = A.Compose([
    A.SmallestMaxSize(max_size=cfg.img_size),
    A.RandomCrop(height=cfg.img_size, width=cfg.img_size),
    A.HorizontalFlip(p=0.5),
    _normalize(),
    ToTensorV2(),
])

# Validation pipeline: same resize, but a fixed center crop and no flip.
val_transform = A.Compose([
    A.SmallestMaxSize(max_size=cfg.img_size),
    A.CenterCrop(height=cfg.img_size, width=cfg.img_size),
    _normalize(),
    ToTensorV2(),
])

# Both datasets read the "train" split; they differ only in the transform applied.
train_dataset = ImgDataset(folder="train", transform=train_transform)
val_dataset = ImgDataset(folder="train", transform=val_transform)

In [None]:
# Validation

df = pd.read_csv("../input/petfinder-pawpularity-score/train.csv")
# Cut the target into cfg.n_bins bins; the bin index is what the folds are stratified on.
labels = pd.cut(df["Pawpularity"].to_numpy(), bins=cfg.n_bins, labels=False)

splitter = StratifiedKFold(n_splits=cfg.n_splits, shuffle=True, random_state=cfg.seed)
splits = splitter.split(labels, labels)

# Check mean: target mean of the training part vs. the held-out part, one line per fold.
pawpularity = df["Pawpularity"]
for fold, (train_idx, val_idx) in enumerate(splits):
    print(pawpularity.iloc[train_idx].mean(), pawpularity.iloc[val_idx].mean())

In [None]:
# Modelling

class BaseSwin(nn.Module):
    """Pretrained timm model whose ``head`` is replaced by a single-output linear layer."""

    def __init__(self, name = "swin_large_patch4_window7_224"):
        super().__init__()
        backbone = timm.create_model(name, pretrained=True)
        # The new head takes its input width from the head it replaces.
        # Disabled variant with dropout in front of the linear layer:
        #backbone.head = nn.Sequential(nn.Dropout(p =  dropout), nn.Linear(backbone.head.in_features, 1))
        backbone.head = nn.Linear(backbone.head.in_features, 1)
        self.model = backbone

    def forward(self, x):
        """Run the wrapped model and flatten its output to one dimension."""
        return self.model(x).ravel()
    
def train_epoch(model, device, dataloader, loss_fn, optimizer, scheduler = None, metric = None, mixup = 0):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        model: module to optimize; switched to training mode here.
        device: device every batch is moved to.
        dataloader: iterable yielding ``(id, X, target)`` batches.
        loss_fn: callable ``loss_fn(output, target)`` whose result is backpropagated.
        optimizer: optimizer, stepped once per batch.
        scheduler: optional LR scheduler, stepped once per batch after the optimizer.
        metric: optional callable ``metric(output, target)`` evaluated without gradients.
        mixup: alpha passed to ``mixup_data``; any falsy value disables mixup.

    Returns:
        The list of per-batch loss values, or the tuple ``(losses, metrics)``
        when ``metric`` is given.
    """
    train_loss = []
    train_metric = []

    model.train()

    for _, X, target in tqdm(dataloader):
        X, target = X.to(device), target.to(device)

        optimizer.zero_grad()

        if mixup:
            # Tell mixup_data where the batch actually lives instead of relying on
            # its CUDA default, so a CPU run does not call .cuda().
            X, target, _ = mixup_data(X, target, alpha = mixup, use_cuda = X.is_cuda)

        output = model(X)
        loss = loss_fn(output, target)

        loss.backward()
        optimizer.step()

        if scheduler is not None:
            scheduler.step()

        if metric is not None:
            with torch.no_grad():
                # With mixup on, `target` is the mixed target: the metric is scored
                # against the same target the output was produced for.
                train_metric.append(metric(output, target).item())

        train_loss.append(loss.item())

    if metric is not None:
        return train_loss, train_metric
    else:
        return train_loss

def valid_epoch(model, device, dataloader, loss_fn):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Args:
        model: module to evaluate; switched to eval mode here.
        device: device every batch is moved to.
        dataloader: iterable yielding ``(id, X, target)`` batches.
        loss_fn: callable ``loss_fn(output, target)`` returning a scalar tensor.

    Returns:
        The list of per-batch loss values as Python floats.
    """
    model.eval()

    batch_losses = []
    with torch.no_grad():
        for _, X, target in tqdm(dataloader):
            X = X.to(device)
            target = target.to(device)
            batch_losses.append(loss_fn(model(X), target).item())

    return batch_losses

def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))`` of a tensor.

    Delegates to ``torch.sigmoid``. Writing the formula out by hand overflows
    ``exp(-x)`` to ``inf`` for very negative inputs, and backpropagating through
    that yields a NaN gradient (``0 * inf``).

    Args:
        x: input tensor.

    Returns:
        Tensor of the same shape as ``x`` with values in ``[0, 1]``.
    """
    return torch.sigmoid(x)

# https://github.com/facebookresearch/mixup-cifar10/blob/main/train.py
def mixup_data(x, y, alpha=1.0, use_cuda=True):
    '''Returns mixed inputs, mixed targets, and lambda.

    Each sample is blended with another sample of the same batch, chosen by a
    random permutation: ``lam * x + (1 - lam) * x[perm]``, and likewise for ``y``.

    Args:
        x: batch of inputs; the first dimension is the batch dimension.
        y: batch of targets, indexed along its first dimension like ``x``.
        alpha: ``lam`` is drawn from ``Beta(alpha, alpha)``; ``alpha <= 0`` fixes
            ``lam`` to 1, which leaves the batch unchanged.
        use_cuda: kept for backward compatibility and ignored; the permutation
            index is always placed on the device of ``x``.

    Returns:
        Tuple ``(mixed_x, mixed_y, lam)``.
    '''
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1

    batch_size = x.size()[0]
    # Draw the permutation with the default (CPU) generator, then move it to the
    # batch's device; an unconditional .cuda() fails on machines without a GPU.
    index = torch.randperm(batch_size).to(x.device)

    mixed_x = lam * x + (1 - lam) * x[index]
    mixed_y = lam * y + (1 - lam) * y[index]
    return mixed_x, mixed_y, lam

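# Disabled: worker-seeding helper and a dedicated generator. Nothing in this
# notebook passes them to a DataLoader (presumably meant for its
# worker_init_fn / generator arguments -- confirm before enabling).
#
# def seed_worker(worker_id):
#     worker_seed = torch.initial_seed() % 2**32
#     np.random.seed(worker_seed)
#     random.seed(worker_seed)
#
# g = torch.Generator()
# g.manual_seed(0)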

class BCE_scaled(nn.Module):
    """Binary cross-entropy on logits against the target divided by 100."""

    def forward(self, preds, target):
        """Return the loss between the logits ``preds`` and ``target / 100``."""
        scaled_target = torch.div(target, 100)
        return F.binary_cross_entropy_with_logits(preds, scaled_target)
    
class MSE_scaled(nn.Module):
    """Mean squared error between ``sigmoid(preds) * 100`` and the raw target."""

    def __init__(self):
        super().__init__()

    def forward(self, preds, target):
        """Return the MSE between the logits ``preds``, mapped to 0-100, and ``target``."""
        # torch.sigmoid keeps the gradient finite for strongly negative logits;
        # a hand-written 1 / (1 + exp(-x)) overflows there and backpropagates NaN.
        return F.mse_loss(torch.sigmoid(preds) * 100, target)

def train_swin(lr = 1e-4, weight_decay = 1e-04, epochs = 1, scheduler_epochs = 1, mixup = 0):
    """Cross-validate a ``BaseSwin`` model and save one checkpoint per fold.

    The target is binned into ``cfg.n_bins`` bins and split with a stratified
    K-fold. Each fold trains a fresh model with the BCE loss on the scaled
    target, reports RMSE figures after every epoch and writes the final weights
    to ``fold_{fold}.pth``.

    Args:
        lr: learning rate for AdamW.
        weight_decay: weight decay for AdamW.
        epochs: number of training epochs per fold.
        scheduler_epochs: number of epochs the cosine schedule is sized for
            (one tenth of its steps are warmup).
        mixup: mixup alpha forwarded to ``train_epoch``; 0 disables mixup.

    Returns:
        Mean over folds of the last epoch's validation RMSE.
    """
    frame = pd.read_csv("../input/petfinder-pawpularity-score/train.csv")
    binned = pd.cut(np.array(frame["Pawpularity"]), bins=cfg.n_bins, labels=False)
    kfold = StratifiedKFold(n_splits = cfg.n_splits, shuffle = True, random_state = cfg.seed)

    opt_criterion = BCE_scaled()
    val_criterion = MSE_scaled()

    device = cfg.device
    loader_kwargs = dict(batch_size=cfg.batch_size, num_workers=cfg.num_workers, pin_memory=True)
    val_losses = []

    for fold, (train_idx, val_idx) in enumerate(kfold.split(binned, binned)):
        print(f"Fold {fold + 1}", "\n")

        # Both loaders wrap the full dataset; the samplers restrict them to the fold.
        train_loader = DataLoader(train_dataset, sampler=SubsetRandomSampler(train_idx), **loader_kwargs)
        val_loader = DataLoader(val_dataset, sampler=SubsetRandomSampler(val_idx), **loader_kwargs)

        model = BaseSwin()
        model.to(device)
        optimizer = transformers.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
        # Alternative AdamW settings (FastAI params; larger eps -> closer to sgd, can tune):
        #optimizer = transformers.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay, betas = (0.9, 0.99), eps = 1e-05)

        # The scheduler is stepped once per batch, so its length is counted in batches.
        total_steps = scheduler_epochs * len(train_loader)
        scheduler = transformers.get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps = total_steps // 10,
            num_training_steps = total_steps,
        )
        #scheduler = transformers.get_constant_schedule(optimizer)

        for epoch in range(epochs):
            tic = timeit.default_timer()
            batch_losses, batch_metrics = train_epoch(
                model, device, train_loader, opt_criterion, optimizer,
                scheduler = scheduler,
                metric = MSE_scaled(),
                mixup = mixup,
            )
            val_batch_losses = valid_epoch(model, device, val_loader, val_criterion)
            total = timeit.default_timer() - tic

            train_loss = np.mean(np.array(batch_losses))
            # Square root of the mean per-batch MSE, reported as RMSE.
            val_loss = np.sqrt(np.mean(np.array(val_batch_losses)))
            train_metric = np.sqrt(np.mean(np.array(batch_metrics)))

            print(f"Epoch: {epoch + 1} | T loss: {train_loss:.4f} T rmse: {train_metric:.4f} V rmse: {val_loss:.4f} Time: {total:.4f}")

        # Only the last epoch's validation score counts for the fold.
        val_losses.append(val_loss)
        torch.save(model.state_dict(), f"fold_{fold}.pth")

    print("Avg final val loss", np.sqrt(np.mean(np.array(val_losses)**2)), np.std(val_losses))
    return np.mean(val_losses)

In [None]:
# Optuna

def objective(trial):
    """Optuna objective: sample training hyperparameters and score them with ``train_swin``.

    Args:
        trial: Optuna trial used to draw ``lr``, ``weight_decay`` (both on a log
            scale) and ``epochs``.

    Returns:
        The value returned by ``train_swin`` for the sampled parameters.
    """
    training_params = {
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'lr': trial.suggest_float('lr', 1e-07, 1e-03, log=True),
        'weight_decay': trial.suggest_float('weight_decay', 1e-07, 0.1, log=True),
        'epochs': trial.suggest_categorical('epochs', [1, 2, 3])
    }
    # Size the LR schedule for the sampled run length. Left at train_swin's default
    # of 1, the cosine schedule would be built for one epoch while training runs
    # for `epochs`.
    training_params['scheduler_epochs'] = training_params['epochs']

    return train_swin(**training_params)

def optune_params(study_file = None, n_trials = 2):
    """Run an Optuna search over ``objective`` and show the result plots.

    Args:
        study_file: when None, a new minimizing study is created. Otherwise the
            value is used directly as the study object (loading a study from a
            file is still a TODO).
        n_trials: number of trials to run.
    """
    study = optuna.create_study(direction="minimize") if study_file is None else study_file  ### TODO
    study.optimize(objective, n_trials=n_trials)

    for summary in (study.best_params, study.best_value):
        print(summary)

    # Optimization history first, then parameter importances.
    for make_figure in (
        optuna.visualization.plot_optimization_history,
        optuna.visualization.plot_param_importances,
    ):
        make_figure(study).show()
    
#optune_params()