In [1]:
# Set to True when running on Google Colab; the COLAB-guarded cells below then
# mount Drive, unpack the project archive and install dependencies.
COLAB = False

In [2]:
if COLAB:
    # Colab only: expose Google Drive under /content/gdrive.
    from google.colab import drive
    drive.mount("/content/gdrive")

In [3]:
if COLAB:
    # Colab only: unpack the project archive stored on the mounted Drive
    # into the current working directory.
    !unzip gdrive/MyDrive/idao_data/IDAO_2021_oski.zip

In [4]:
if COLAB:
    # Colab only: install third-party packages.
    # NOTE(review): transformers and timm are not version-pinned, so reruns
    # may pick up different releases; only albumentations is pinned here.
    !pip install transformers
    !pip install timm
    !pip install albumentations==0.4.6


In [5]:
# List the parent directory; the local (non-Colab) paths used below expect
# `src`, `config` and `data` to live there.
!ls ..

config	data  experiments  notebooks  README.md  src


In [6]:
import sys

if COLAB:
    sys.path.append("IDAO_2021_oski/src")
else:
    sys.path.append("../src")

from collections import defaultdict
import os
import random

import numpy as np
import pandas as pd
import yaml
import shutil
from sklearn.model_selection import train_test_split, StratifiedKFold

from torch.utils.data import DataLoader
from torch import nn
import torch

from transformers import get_linear_schedule_with_warmup

from datasets import SimpleDataset
from models import Wrapper, MixUp
from pipeline_utils import training, pseudolabeling
from models import ENCODER_PARAMS



In [7]:
# Location of the YAML config for the current environment.
PATH_TO_CFG = "IDAO_2021_oski/config/config.yaml" if COLAB else "../config/config.yaml"

# yaml.load() without an explicit Loader is unsafe on untrusted input and is a
# TypeError on PyYAML >= 6; safe_load parses plain YAML (mappings, lists,
# scalars) without constructing arbitrary Python objects.
with open(PATH_TO_CFG, "r") as file:
    config = yaml.safe_load(file)

# Default data root from the config file.
DATA_ROOT = config["general"]["data_root"]

def fix_seed(seed):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    # Also covers additional GPUs; safe to call when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    # benchmark=True lets cuDNN auto-tune and pick possibly different
    # algorithms from run to run, which undermines the determinism requested
    # above, so it must be disabled for reproducible results.
    torch.backends.cudnn.benchmark = False  # type: ignore

fix_seed(config["general"]["seed"])

# Configured GPU ordinals as strings, plus their comma-joined form.
device_ids = [str(device_id) for device_id in config["general"]["device_ids"]]
ids = ",".join(device_ids)

# torch.device accepts a single ordinal ("cuda:0"); the joined form
# ("cuda:0,1") is rejected when more than one id is configured, so the first
# id is used as the primary device. Fall back to CPU when no GPU is usable.
if device_ids and torch.cuda.is_available():
    DEVICE = torch.device(f"cuda:{device_ids[0]}")
else:
    DEVICE = torch.device("cpu")

  


In [8]:
# Override the config-provided data root with the environment-specific
# location of the track 1 dataset.
DATA_ROOT = (
    "IDAO_2021_oski/data/track_1/idao_dataset"
    if COLAB
    else "../data/track_1/idao_dataset"
)
    

In [9]:
# Load the training and private-validation manifests; the first CSV column
# becomes the index of each frame.
train, val_private = (
    pd.read_csv(os.path.join(DATA_ROOT, csv_name), index_col=0)
    for csv_name in ("train.csv", "val_private.csv")
)

In [10]:
if COLAB:
    # The CSVs store absolute paths from the original machine; point them at
    # the Colab checkout instead.
    for frame in (train, val_private):
        frame["file_path"] = frame["file_path"].str.replace(
            "/media/paniquex/samsung_2tb/IDAO_2021_oski", "IDAO_2021_oski"
        )

In [11]:
# Preview ten random training rows.
train.sample(10)

Unnamed: 0,0,1,file_path,target
4030,NR,20,/media/paniquex/samsung_2tb/IDAO_2021_oski/dat...,8
12593,ER,30,/media/paniquex/samsung_2tb/IDAO_2021_oski/dat...,4
7515,ER,3,/media/paniquex/samsung_2tb/IDAO_2021_oski/dat...,3
1608,NR,6,/media/paniquex/samsung_2tb/IDAO_2021_oski/dat...,11
9627,ER,30,/media/paniquex/samsung_2tb/IDAO_2021_oski/dat...,4
7460,ER,30,/media/paniquex/samsung_2tb/IDAO_2021_oski/dat...,4
8568,ER,10,/media/paniquex/samsung_2tb/IDAO_2021_oski/dat...,1
7526,ER,10,/media/paniquex/samsung_2tb/IDAO_2021_oski/dat...,1
35,NR,20,/media/paniquex/samsung_2tb/IDAO_2021_oski/dat...,8
1749,NR,1,/media/paniquex/samsung_2tb/IDAO_2021_oski/dat...,6


In [12]:
# Load the test manifest; the first CSV column becomes the index.
test = pd.read_csv(os.path.join(DATA_ROOT, "test.csv"), index_col=0)
if COLAB:
    # Rewrite the absolute paths stored in the CSV to the Colab checkout.
    test["file_path"] = test["file_path"].str.replace("/media/paniquex/samsung_2tb/IDAO_2021_oski", "IDAO_2021_oski")
# Preview five random test rows.
test.sample(5)

Unnamed: 0,file_path,type
7498,/media/paniquex/samsung_2tb/IDAO_2021_oski/dat...,private
4411,/media/paniquex/samsung_2tb/IDAO_2021_oski/dat...,private
2246,/media/paniquex/samsung_2tb/IDAO_2021_oski/dat...,private
16149,/media/paniquex/samsung_2tb/IDAO_2021_oski/dat...,private
10109,/media/paniquex/samsung_2tb/IDAO_2021_oski/dat...,private


In [13]:
# Directory with the public_test and private_test image folders.
if COLAB:
    path = "IDAO_2021_oski/data/track_1/idao_dataset"
else:
    path = "../data/track_1/idao_dataset"

public_dir = os.path.join(path, "public_test")
private_dir = os.path.join(path, "private_test")

# Keep only entries whose name contains ".png".
file_names_public = [x for x in os.listdir(public_dir) if ".png" in x]
file_names_private = [x for x in os.listdir(private_dir) if ".png" in x]

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat produces the same stacked frame. reset_index() (without drop)
# keeps the per-split row number in an "index" column, as before.
test_csv = pd.concat([
    pd.DataFrame({"file_path": file_names_public, "type": "public"}),
    pd.DataFrame({"file_path": file_names_private, "type": "private"}),
]).reset_index()

# Turn bare file names into paths under the matching split directory.
test_csv.loc[test_csv["type"] == "public", "file_path"] = public_dir + "/" + test_csv["file_path"]
test_csv.loc[test_csv["type"] == "private", "file_path"] = private_dir + "/" + test_csv["file_path"]

In [14]:
# Alias for the test manifest built from the directory listing above.
TEST = test_csv

In [15]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Keep only rows whose (column "0", column "1") pair is one of the six
# combinations below.
mask_NR = (train["0"] == "NR") & train["1"].isin([1, 6, 20])
mask_ER = (train["0"] == "ER") & train["1"].isin([3, 10, 30])
# reset_index returns a new frame with a fresh 0..n-1 index, so the column
# assignments below do not write into a slice of the original frame.
train = train[mask_NR | mask_ER].reset_index(drop=True)

if config["general"]["task_type"] == "regression":
    train["target"] = train["1"]
    val_private["target"] = val_private["1"]
elif config["general"]["task_type"] == "classification":
    train["target"] = le.fit_transform(train["target"])
    # NOTE(review): val_private["target"] is left un-encoded in this mode.
elif config["general"]["task_type"] == "joint":
    train["target_regression"] = train["1"]
    train["target_classification"] = le.fit_transform(train["0"])
    # Combined label, used only to stratify the folds below.
    train["target"] = train["target_regression"].astype(str) + "_" + train["target_classification"].astype(str)

    val_private["target_regression"] = val_private["1"]
    # Reuse the encoder fitted on train (transform, not fit_transform) so the
    # class -> integer mapping is guaranteed to be the same for validation.
    val_private["target_classification"] = le.transform(val_private["0"])


# Assign every training row to a validation fold, stratified on "target".
kfold = StratifiedKFold(n_splits=config["training"]["n_folds"], shuffle=True,
                        random_state=config["general"]["seed"])
for fold, (t_idx, v_idx) in enumerate(kfold.split(train, train["target"])):
    # Positional indices equal index labels because the index was reset above.
    train.loc[v_idx, "kfold"] = fold


# Persist the fold assignment next to the training data.
train.to_csv(os.path.join(DATA_ROOT, "train", "train_folds.csv"))



In [16]:
# Display the private validation frame with the target columns added above.
val_private

Unnamed: 0,0,1,file_path,target,target_regression,target_classification
2719,NR,30,/media/paniquex/samsung_2tb/IDAO_2021_oski/dat...,10,30,1
3783,NR,3,/media/paniquex/samsung_2tb/IDAO_2021_oski/dat...,9,3,1
4874,NR,10,/media/paniquex/samsung_2tb/IDAO_2021_oski/dat...,7,10,1
4915,NR,30,/media/paniquex/samsung_2tb/IDAO_2021_oski/dat...,10,30,1
5485,NR,3,/media/paniquex/samsung_2tb/IDAO_2021_oski/dat...,9,3,1
5781,NR,10,/media/paniquex/samsung_2tb/IDAO_2021_oski/dat...,7,10,1
7132,ER,20,/media/paniquex/samsung_2tb/IDAO_2021_oski/dat...,2,20,0
7397,ER,20,/media/paniquex/samsung_2tb/IDAO_2021_oski/dat...,2,20,0
8012,ER,1,/media/paniquex/samsung_2tb/IDAO_2021_oski/dat...,0,1,0
8142,ER,6,/media/paniquex/samsung_2tb/IDAO_2021_oski/dat...,5,6,0


In [17]:
if COLAB:
    # Colab only: force-reinstall the latest albumentations without touching
    # its dependencies.
    # NOTE(review): this replaces the albumentations==0.4.6 pin installed by
    # the earlier pip cell and is itself unpinned.
    !pip install --upgrade --force-reinstall --no-deps albumentations


In [18]:
import albumentations
from albumentations import *
from albumentations.pytorch import ToTensorV2


def _deterministic_steps():
    """Return fresh crop / resize / normalise / to-tensor steps shared by both pipelines."""
    # NOTE(review): the image size is read from config at this point; a later
    # change to config["preprocessing"]["img_size"] does not affect pipelines
    # that were already built.
    return [
        albumentations.CenterCrop(height=400, width=400),
        albumentations.Resize(*config["preprocessing"]["img_size"]),
        albumentations.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
        ToTensorV2(),
    ]


# Training pipeline: two random augmentations (each applied with p=0.5)
# followed by the deterministic steps.
transforms_train = albumentations.Compose([
    albumentations.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2, always_apply=False, p=0.5),
    albumentations.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.05, rotate_limit=15, p=0.5),
    *_deterministic_steps(),
])

# Validation pipeline: deterministic steps only.
transforms_val = albumentations.Compose(_deterministic_steps())

In [19]:
import torch
import torch.nn as nn
import torch.nn.functional as F
    
    
def focal_loss(input, target, focus=2.0, raw=False):
    """Binary focal loss, averaged over all elements.

    Args:
        input: predicted probabilities, or logits when ``raw`` is True.
        target: tensor combined element-wise with ``input``
            (presumably 0/1 labels -- confirm against the caller).
        focus: exponent applied to ``1 - p_true``.
        raw: when True, ``input`` is passed through a sigmoid first.

    Returns:
        Scalar tensor holding the mean loss.
    """
    probs = torch.sigmoid(input) if raw else input
    tiny = 1e-7

    # Probability assigned to the true outcome, clamped so log() stays finite.
    p_true = torch.clamp(probs * target + (1 - probs) * (1 - target), tiny, 1 - tiny)
    down_weight = (1.0 - p_true).pow(focus)
    per_element = -down_weight * p_true.log()

    return per_element.mean()


from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss


class AngularPenaltySMLoss(nn.Module):
    """Angular-penalty softmax loss with a learnable class-weight matrix.

    Three ``loss_type`` values are available: 'arcface', 'sphereface', 'cosface'.
    These losses are described in the following papers:

    ArcFace: https://arxiv.org/abs/1801.07698
    SphereFace: https://arxiv.org/abs/1704.08063
    CosFace/Ad Margin: https://arxiv.org/abs/1801.05599
    """

    # Default (scale, margin) per loss type.
    _DEFAULTS = {
        "arcface": (64.0, 0.5),
        "sphereface": (64.0, 1.35),
        "cosface": (30.0, 0.4),
    }

    def __init__(self, in_features, out_features, loss_type="cosface", eps=1e-7, s=None, m=None):
        """
        Args:
            in_features: size of each input embedding.
            out_features: number of classes.
            loss_type: 'arcface', 'sphereface' or 'cosface' (case-insensitive).
            eps: margin used when clamping cosines before ``acos``.
            s: scale; a falsy value selects the per-type default.
            m: margin; a falsy value selects the per-type default.
        """
        super(AngularPenaltySMLoss, self).__init__()
        loss_type = loss_type.lower()
        assert loss_type in ['arcface', 'sphereface', 'cosface']
        default_s, default_m = self._DEFAULTS[loss_type]
        self.s = default_s if not s else s
        self.m = default_m if not m else m
        self.loss_type = loss_type
        self.in_features = in_features
        self.out_features = out_features
        self.fc = nn.Linear(in_features, out_features, bias=False)
        self.eps = eps

    def forward(self, x, labels):
        """Compute the loss for a batch.

        Args:
            x: embeddings of shape (N, in_features).
            labels: N integer class indices in ``[0, out_features)``.

        Returns:
            Tuple ``(loss, wf)``: the scalar mean loss and the (N, out_features)
            matrix of cosines between embeddings and class weights.
        """
        assert len(x) == len(labels)
        assert torch.min(labels) >= 0
        assert torch.max(labels) < self.out_features

        # Normalise the class weights out-of-place. The previous loop only
        # rebound a local variable, so the weights were never normalised and
        # `wf` was not a cosine. Computing it here keeps gradients flowing to
        # self.fc.weight and leaves the stored parameter untouched.
        weight = F.normalize(self.fc.weight, p=2, dim=1)
        x = F.normalize(x, p=2, dim=1)
        wf = F.linear(x, weight)

        index = labels.long()
        # Cosine between each sample and its own class weight.
        target_cos = wf.gather(1, index.view(-1, 1)).squeeze(1)

        if self.loss_type == 'cosface':
            numerator = self.s * (target_cos - self.m)
        if self.loss_type == 'arcface':
            theta = torch.acos(torch.clamp(target_cos, -1. + self.eps, 1 - self.eps))
            numerator = self.s * torch.cos(theta + self.m)
        if self.loss_type == 'sphereface':
            theta = torch.acos(torch.clamp(target_cos, -1. + self.eps, 1 - self.eps))
            numerator = self.s * torch.cos(self.m * theta)

        # Sum exp(s * cos) over every class except the target one, using a
        # mask instead of a per-sample Python loop.
        target_mask = F.one_hot(index, num_classes=self.out_features).bool()
        others = torch.exp(self.s * wf).masked_fill(target_mask, 0.0).sum(dim=1)

        denominator = torch.exp(numerator) + others
        L = numerator - torch.log(denominator)
        return -torch.mean(L), wf

In [20]:
# Override the configured loss: L1 for the regression part, BCE for the
# classification part. The criterion-selection cell reads these keys.
config["training"]["loss"] = {"reg": "L1", "clf": "BCE"}

In [21]:
EPOCHS = config["training"]["n_epochs"]


def _clf_criterion(name):
    """Map a classification loss name to ``(criterion, criterion_aam)``.

    Raises:
        ValueError: if ``name`` is not one of 'FOCAL', 'AAM', 'BCE'.
    """
    if name == "FOCAL":
        return focal_loss, None
    if name == "AAM":
        # The string is a marker; the loss class is passed on separately.
        return "AAM", AngularPenaltySMLoss
    if name == "BCE":
        # NOTE(review): BCELoss expects probabilities, so the model presumably
        # applies the final activation itself -- confirm.
        return nn.BCELoss(), None
    raise ValueError(f"Unknown classification loss: {name!r}")


def _reg_criterion(name):
    """Map a regression loss name to a criterion.

    Raises:
        ValueError: if ``name`` is not 'L1' or 'L2'.
    """
    if name == "L1":
        return nn.L1Loss()
    if name == "L2":
        return nn.MSELoss()
    raise ValueError(f"Unknown regression loss: {name!r}")


criterion_aam = None
if config["general"]["task_type"] == "classification":
    criterion, criterion_aam = _clf_criterion(config["training"]["loss"]["clf"])
elif config["general"]["task_type"] == "regression":
    criterion = _reg_criterion(config["training"]["loss"]["reg"])
elif config["training"]["loss"] == "L1":
    # Form where the loss is given as a bare string instead of a mapping.
    criterion = nn.L1Loss()
elif config["general"]["task_type"] == "joint":
    _clf, criterion_aam = _clf_criterion(config["training"]["loss"]["clf"])
    criterion = {
        "clf": _clf,
        "reg": _reg_criterion(config["training"]["loss"]["reg"]),
    }
else:
    # Previously this case left `criterion` undefined (or stale from an
    # earlier run); fail loudly instead.
    raise ValueError(f"Unknown task_type: {config['general']['task_type']!r}")

In [22]:
# When fine-tuning, collect the per-fold checkpoint file names in sorted order.
model_names = None
if config["training"]["finetune"]:
    model_names = sorted(
        entry
        for entry in os.listdir(config['training']['models_dir'])
        if "best_model_fold" in entry
    )

In [23]:
# Display the selected loss object(s).
criterion

{'clf': BCELoss(), 'reg': L1Loss()}

In [24]:
# Output directory for this experiment, depending on the environment.
config["general"]["out_path"] = (
    "IDAO_2021_oski/experiments/resnest14d_1e-4_joint_BCE_L1/"
    if COLAB
    else "../experiments/resnest14d_1e-4_joint_BCE_L1/"
)

# NOTE(review): these overrides are applied after the augmentation pipelines
# were built from config["preprocessing"]["img_size"] and after the "kfold"
# column was assigned using config["training"]["n_folds"], so they do not
# change those earlier results -- confirm this ordering is intended.
config["preprocessing"].update(img_size=[224, 224])
config["training"]["dataloader"].update(batch_size=4)
config["training"].update(n_folds=3)

In [25]:
# Start from an empty output directory. ignore_errors covers the "does not
# exist yet" case; makedirs also creates missing parents and surfaces real
# failures (e.g. permissions) instead of silently swallowing them.
shutil.rmtree(config["general"]["out_path"], ignore_errors=True)
os.makedirs(config["general"]["out_path"], exist_ok=True)


# Number of model outputs implied by the task type; other task types keep
# the value from the config.
if config["general"]["task_type"] == "regression":
    config["general"]["classes_num"] = 1
elif config["general"]["task_type"] == "joint":
    config["general"]["classes_num"] = 2


samples2preds_all = {}
samples2trues_all = {}

models = []
# Train one model per fold.
# NOTE(review): the "kfold" column was assigned in an earlier cell; if
# n_folds was changed since then, only folds 0..n_folds-1 are used for
# validation here.
for i in range(config["training"]["n_folds"]):
    model_name = config["general"]["model_name"]
    model = ENCODER_PARAMS[model_name]["init_op"]()
    model = Wrapper(model, feat_module=None, classes_num=config["general"]["classes_num"],
                    model_name=model_name,
                    spec_augmenter=None,
                    mixup_module=None,
                    task_type=config["general"]["task_type"],
                    activation_func=config["training"]["activation_func"],
                    criterion_aam=criterion_aam)
    model.to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=config["training"]["lr"])

    # Split on the loop variable `i`. The previous code compared against the
    # stale global `fold` left over from the fold-assignment loop, so every
    # model was trained and validated on the same split.
    train_dataset = SimpleDataset(df=train[train["kfold"] != i], mode="train",
                                  transform=transforms_train, classes_num=config["general"]["classes_num"],
                                  task_type=config["general"]["task_type"])

    val_dataset = SimpleDataset(df=train[train["kfold"] == i], mode="val",
                                transform=transforms_val, classes_num=config["general"]["classes_num"],
                                task_type=config["general"]["task_type"])
    val_private_dataset = SimpleDataset(df=val_private, mode="val",
                                        transform=transforms_val, classes_num=config["general"]["classes_num"],
                                        task_type=config["general"]["task_type"])

    train_dataloader = DataLoader(train_dataset,
                                  **config["training"]["dataloader"])
    val_dataloader = DataLoader(val_dataset,
                                **config["validation"]["dataloader"])
    val_private_dataloader = DataLoader(val_private_dataset,
                                        **config["validation"]["dataloader"])
    # Cosine annealing over the non-flat epochs, counted in batches.
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                      T_max=(config["training"]["n_epochs"] - config["training"]["n_epochs_flat"]) * len(train_dataloader),
                                                      eta_min=1e-8)
    samples2preds, samples2trues, model = training(EPOCHS=EPOCHS, model=model,
                                            train_dataloader=train_dataloader,
                                            val_dataloaders_dct={"val_dataloader": val_dataloader,
                                                                  "val_private_dataloader": val_private_dataloader},
                                            DEVICE=DEVICE, criterion=criterion,
                                            optimizer=optimizer, scheduler=scheduler,
                                            config=config, fold=i,
                                            task_type=config["general"]["task_type"], CONFIG_PATH=PATH_TO_CFG)
    models.append(model)
    samples2preds_all.update(samples2preds)
    samples2trues_all.update(samples2trues)

5.7503:   7%|▋         | 166/2232 [00:29<06:10,  5.58it/s] 
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



ERROR in loss list appending
Traceback (most recent call last):
  File "/home/paniquex/anaconda3/envs/kaggle/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3343, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-25-4156a8dd7984>", line 65, in <module>
    task_type=config["general"]["task_type"], CONFIG_PATH=PATH_TO_CFG)
  File "../src/pipeline_utils.py", line 63, in training
    for batch in t:
  File "/home/paniquex/anaconda3/envs/kaggle/lib/python3.7/site-packages/tqdm/std.py", line 1158, in __iter__
    for obj in iterable:
  File "/home/paniquex/anaconda3/envs/kaggle/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 517, in __next__
    data = self._next_data()
  File "/home/paniquex/anaconda3/envs/kaggle/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1182, in _next_data
    idx, data = self._get_data()
  File "/home/paniquex/anaconda3/envs/kaggle/lib/python3.7/site-packages/torch/utils/

TypeError: object of type 'NoneType' has no len()

In [None]:
# import imp
# #imp.reload(pipeline_utils)
# imp.reload(datasets)
# test = test_csv
#import datasets

In [None]:


samples2preds_all = {}
samples2trues_all = {}
LR = config["training"]["lr"]
flag_LR = True

# Alternate between pseudo-labelling with the current models and fine-tuning
# every fold model on the updated training frame.
for j in range(config["pseudo"]["iter"]):
    with torch.no_grad():
        # Only the function is imported (`from pipeline_utils import ...`), so
        # the module-qualified call used before raised NameError.
        train, test = pseudolabeling(models, train, test, config, DEVICE, transforms_val)
        private = (train["type"].values == "private").sum()
        public = (train["type"].values == "public").sum()
        print("Pseudo labeling epoch", j)
        print("Private ratio", private / (public + private))
        print("Public ratio", public / (public + private))
        # Scale the learning rate once, on the first pseudo-labelling iteration.
        if flag_LR:
            LR *= config["pseudo"]["lr_coef"]
            flag_LR = False

    for i in range(config["training"]["n_folds"]):
        model = models[i]
        model.to(DEVICE)
        optimizer = torch.optim.Adam(model.parameters(), lr=LR)

        # Split on the loop variable `i`; the previous code used the stale
        # global `fold`, giving every model the same split.
        train_dataset = SimpleDataset(df=train[train["kfold"] != i], mode="train",
                                      transform=transforms_train, classes_num=config["general"]["classes_num"],
                                      task_type=config["general"]["task_type"])

        val_dataset = SimpleDataset(df=train[train["kfold"] == i], mode="val",
                                    transform=transforms_val, classes_num=config["general"]["classes_num"],
                                    task_type=config["general"]["task_type"])
        val_private_dataset = SimpleDataset(df=val_private, mode="val",
                                            transform=transforms_val, classes_num=config["general"]["classes_num"],
                                            task_type=config["general"]["task_type"])

        train_dataloader = DataLoader(train_dataset,
                                      **config["training"]["dataloader"])
        val_dataloader = DataLoader(val_dataset,
                                    **config["validation"]["dataloader"])
        val_private_dataloader = DataLoader(val_private_dataset,
                                            **config["validation"]["dataloader"])
        # Cosine annealing over the non-flat pseudo epochs, counted in batches.
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                          T_max=(config["pseudo"]["n_epochs"] - config["pseudo"]["n_epochs_flat"]) * len(train_dataloader),
                                                          eta_min=1e-8)
        samples2preds, samples2trues, model = training(EPOCHS=config["pseudo"]["n_epochs"], model=model,
                                                train_dataloader=train_dataloader,
                                                val_dataloaders_dct={"val_dataloader": val_dataloader,
                                                                      "val_private_dataloader": val_private_dataloader},
                                                DEVICE=DEVICE, criterion=criterion,
                                                optimizer=optimizer, scheduler=scheduler,
                                                config=config, fold=i, pseudo_iter=j+1,
                                                task_type=config["general"]["task_type"], CONFIG_PATH=PATH_TO_CFG)
        models[i] = model
        samples2preds_all.update(samples2preds)
        samples2trues_all.update(samples2trues)
