# About this notebook
- Deberta-v3-base starter code
- pip wheels is [here](https://www.kaggle.com/code/yasufuminakama/fb3-pip-wheels)
- Inference notebook is [here](https://www.kaggle.com/yasufuminakama/fb3-deberta-v3-base-baseline-inference)

If this notebook is helpful, feel free to upvote :)

# Directory settings

In [1]:
# ====================================================
# Directory settings
# ====================================================
import os

OUTPUT_DIR = './'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# CFG

In [2]:
# ====================================================
# CFG
# ====================================================
class CFG:
    wandb=True
    competition='FB3'
    _wandb_kernel='harshit-FB3'
    debug=False
    apex=True
    print_freq=20
    num_workers=4
    model="microsoft/deberta-v3-base"
#     model = "../input/v3base-mlm-fb3/"
    gradient_checkpointing=True
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=4
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=8
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    target_cols=['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    seed=42
    n_fold=4
    trn_fold=[0, 1, 2, 3]
    train=True
    
if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0]

In [3]:
# ====================================================
# wandb
# ====================================================
if CFG.wandb:
    
    import wandb

    try:
        from kaggle_secrets import UserSecretsClient
        user_secrets = UserSecretsClient()
        secret_value_0 = user_secrets.get_secret("wandb_api")
        wandb.login(key=secret_value_0)
        anony = None
    except:
        anony = "must"
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')


    def class2dict(f):
        return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))

    run = wandb.init(project='FB3-Public', 
                     name=CFG.model,
                     config=class2dict(CFG),
                     group=CFG.model,
                     job_type="train_lower",
                     anonymous=anony)

If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. 
Get your W&B access token from here: https://wandb.ai/authorize


[34m[1mwandb[0m: Currently logged in as: [33manony-moose-400094[0m. Use [1m`wandb login --relogin`[0m to force relogin


# Library

In [4]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

os.system('pip install iterative-stratification==0.1.7')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

# os.system('pip uninstall -y transformers')
# os.system('pip uninstall -y tokenizers')
# os.system('python -m pip install --no-index --find-links=../input/fb3-pip-wheels transformers')
# os.system('python -m pip install --no-index --find-links=../input/fb3-pip-wheels tokenizers')
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')






[notice] A new release of pip available: 22.1.2 -> 22.2.2
[notice] To update, run: pip install --upgrade pip
tokenizers.__version__: 0.12.1
transformers.__version__: 4.19.2
env: TOKENIZERS_PARALLELISM=true


# Utils

In [5]:
# ====================================================
# Utils
# ====================================================
def MCRMSE(y_trues, y_preds):
    scores = []
    idxes = y_trues.shape[1]
    for i in range(idxes):
        y_true = y_trues[:,i] # to compensate for scaling
        y_pred = y_preds[:,i] # 
        score = mean_squared_error(y_true, y_pred, squared=False) # RMSE
        scores.append(score)
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores


def get_score(y_trues, y_preds):
    mcrmse_score, scores = MCRMSE(y_trues, y_preds)
    return mcrmse_score, scores


def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()


def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

# Data Loading

In [6]:
# ====================================================
# Data Loading
# ====================================================
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# train['full_text'] = train['full_text'].apply(lambda x: x.replace("\n\n"," [SEP] "))
# test['full_text'] = test['full_text'].apply(lambda x: x.replace("\n\n"," [SEP] "))
submission = pd.read_csv('sample_submission.csv')

print(f"train.shape: {train.shape}")
display(train.head())
print(f"test.shape: {test.shape}")
display(test.head())
print(f"submission.shape: {submission.shape}")
display(submission.head())

train.shape: (3911, 8)


Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


test.shape: (3, 2)


Unnamed: 0,text_id,full_text
0,0000C359D63E,when a person has no experience on a job their...
1,000BAD50D026,Do you think students would benefit from being...
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde..."


submission.shape: (3, 7)


Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,3.0,3.0,3.0,3.0,3.0,3.0
1,000BAD50D026,3.0,3.0,3.0,3.0,3.0,3.0
2,00367BB2546B,3.0,3.0,3.0,3.0,3.0,3.0


In [7]:
!pip install inflect

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.1.2[0m[39;49m -> [0m[32;49m22.2.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [8]:
import inflect
p = inflect.engine()

In [9]:
train['A'] = train['full_text'].apply(lambda x: x.split("\n\n"))
train.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions,A
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0,[I think that students would benefit from lear...
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5,[When a problem is a change you have to let it...
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5,"[Dear, Principal, If u change the school polic..."
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0,[The best time in life is when you become your...
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5,[Small act of kindness can impact in other peo...


In [10]:
train['A'] = train['A'].apply(lambda x: [" ".join([p.number_to_words(f),st]) for \
                                                  f, st in enumerate(x)])
train.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions,A
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0,[zero I think that students would benefit from...
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5,[zero When a problem is a change you have to l...
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5,"[zero Dear, Principal, one If u change the sch..."
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0,[zero The best time in life is when you become...
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5,[zero Small act of kindness can impact in othe...


In [11]:
train = train.explode('A')

In [12]:
train['full_text'] = train.apply(lambda x: x['full_text'] + "[SEP]" +\
                x['A'] if x['full_text'] != x['A'] else x['full_text'], axis = 1)
# train.drop(['full_text'], axis = 1, inplace = True)
# train.rename(columns = {'A':'full_text'}, inplace = True)

In [13]:
train.shape

(21510, 9)

In [14]:
train.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions,A
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0,zero I think that students would benefit from ...
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0,one The hardest part of school is getting read...
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0,two most students usually take showers before ...
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0,three when your home your comfortable and you ...
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5,zero When a problem is a change you have to le...


In [15]:
labels = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]
# map = {1:0,1.5:1,2:2,2.5:3,3:4,3.5:5,4:6,4.5:7,5:8}
# for label in labels:
# #     max1 = train[label].max()
#     train[label] = train[label].map(map)
train.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions,A
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0,zero I think that students would benefit from ...
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0,one The hardest part of school is getting read...
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0,two most students usually take showers before ...
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0,three when your home your comfortable and you ...
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5,zero When a problem is a change you have to le...


In [16]:
train.reset_index(drop = True, inplace = True)

# CV split

In [17]:
Fold = GroupKFold(n_splits=CFG.n_fold)
for n, (_, val_index) in enumerate(Fold.split(train, groups=train['text_id'])):
    train.loc[val_index, 'fold'] = int(n)
    
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())

fold
0    5378
1    5378
2    5377
3    5377
dtype: int64

In [18]:
# # ====================================================
# # CV split
# # ====================================================
# Fold = MultilabelStratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
# for n, (train_index, val_index) in enumerate(Fold.split(train, train[CFG.target_cols])):
#     train.loc[val_index, 'fold'] = int(n)
# train['fold'] = train['fold'].astype(int)
# display(train.groupby('fold').size())

In [19]:
if CFG.debug:
    display(train.groupby('fold').size())
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    display(train.groupby('fold').size())

# tokenizer

In [20]:
# ====================================================
# tokenizer
# ====================================================
from tokenizers import AddedToken
# new_tokens = [" [NL] "]
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
print(f"tokenizer len: {len(tokenizer)}")
# print("adding new tokens...")
# tokens_to_add = []
# for this_tok in new_tokens:
#     tokens_to_add.append(AddedToken(this_tok.lower(), lstrip=True, rstrip=False)) # lowercase for MLM


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


tokenizer len: 128001


In [21]:
# tokenizer.add_tokens(tokens_to_add)
# print(f"tokenizer len: {len(tokenizer)}")

tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

# Dataset

In [22]:
# ====================================================
# Define max_len
# ====================================================
lengths = []
tk0 = tqdm(train['full_text'].fillna("").values, total=len(train))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths) + 3 # cls & sep & sep
LOGGER.info(f"max_len: {CFG.max_len}")

  0%|          | 0/21510 [00:00<?, ?it/s]

max_len: 2723


In [23]:
train['full_text'][0]

"I think that students would benefit from learning at home,because they wont have to change and get up early in the morning to shower and do there hair. taking only classes helps them because at there house they'll be pay more attention. they will be comfortable at home.\n\nThe hardest part of school is getting ready. you wake up go brush your teeth and go to your closet and look at your cloths. after you think you picked a outfit u go look in the mirror and youll either not like it or you look and see a stain. Then you'll have to change. with the online classes you can wear anything and stay home and you wont need to stress about what to wear.\n\nmost students usually take showers before school. they either take it before they sleep or when they wake up. some students do both to smell good. that causes them do miss the bus and effects on there lesson time cause they come late to school. when u have online classes u wont need to miss lessons cause you can get everything set up and go t

In [24]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer.encode_plus(
        text, 
        return_tensors=None, 
        add_special_tokens=True, 
        max_length=CFG.max_len,
        pad_to_max_length=True,
        truncation=True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['full_text'].values
        self.labels = df[cfg.target_cols].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.texts[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label
    

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

# Model

In [25]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
            
        self.model.resize_token_embeddings(len(tokenizer))
        
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, 6)
        self.sigmoid = nn.Sigmoid()
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output

# Loss

In [26]:
# ====================================================
# Loss
# ====================================================
class RMSELoss(nn.Module):
    def __init__(self, reduction='mean', eps=1e-9):
        super().__init__()
        self.mse = nn.MSELoss(reduction='none')
        self.reduction = reduction
        self.eps = eps

    def forward(self, y_pred, y_true):
        loss = torch.sqrt(self.mse(y_pred, y_true) + self.eps)
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss

# Helpler functions

In [27]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
        if CFG.wandb:
            wandb.log({f"[fold{fold}] loss": losses.val,
                       f"[fold{fold}] lr": scheduler.get_lr()[0]})
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    return losses.avg, predictions

# train loop

In [28]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target_cols].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.SmoothL1Loss(reduction='mean') #RMSELoss(reduction="mean")
#     criterion = nn.BCEWithLogitsLoss(reduction="mean")
    
    best_score = np.inf

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score, scores = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {scores}')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1, 
                       f"[fold{fold}] avg_train_loss": avg_loss, 
                       f"[fold{fold}] avg_val_loss": avg_val_loss,
                       f"[fold{fold}] score": score})
        
        if best_score > score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
#     predictions = np.clip(predictions, 1, 5) # checking
    valid_folds[[f"pred_{c}" for c in CFG.target_cols]] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [29]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df[CFG.target_cols].values
        preds = oof_df[[f"pred_{c}" for c in CFG.target_cols]].values
        score, scores = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}  Scores: {scores}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold > 2:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_DIR+'oof_df.pkl')
        
    if CFG.wandb:
        wandb.finish()

DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.19.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Epoch: [1][0/2016] Elapsed 0m 1s (remain 59m 10s) Loss: 2.1913(2.1913) Grad: inf  LR: 0.00002000  
Epoch: [1][20/2016] Elapsed 0m 17s (remain 28m 17s) Loss: 0.1776(1.1557) Grad: 146252.6250  LR: 0.00002000  
Epoch: [1][40/2016] Elapsed 0m 35s (remain 28m 6s) Loss: 0.1660(0.7060) Grad: 82055.0938  LR: 0.00002000  
Epoch: [1][60/2016] Elapsed 0m 54s (remain 29m 16s) Loss: 0.1852(0.5358) Grad: 73813.0312  LR: 0.00002000  
Epoch: [1][80/2016] Elapsed 1m 16s (remain 30m 34s) Loss: 0.0956(0.4400) Grad: 120102.1016  LR: 0.00002000  
Epoch: [1][100/2016] Elapsed 1m 35s (remain 30m 5s) Loss: 0.1293(0.3789) Grad: 57602.1250  LR: 0.00001999  
Epoch: [1][120/2016] Elapsed 1m 54s (remain 30m 0s) Loss: 0.1440(0.3382) Grad: 93558.5781  LR: 0.00001999  
Epoch: [1][140/2016] Elapsed 2m 13s (remain 29m 37s) Loss: 0.0869(0.3080) Grad: 89741.2188  LR: 0.00001998  
Epoch: [1][160/2016] Elapsed 2m 32s (remain 29m 15s) Loss: 0.1140(0.2869) Grad: 88240.6953  LR: 0.00001998  
Epoch: [1][180/2016] Elapsed 2m 57

Epoch 1 - avg_train_loss: 0.1169  avg_val_loss: 0.1136  time: 2210s
Epoch 1 - Score: 0.4778  Scores: [0.5231824780110864, 0.4667939676335875, 0.4195210651614346, 0.4995337082082646, 0.5059015065971707, 0.4519881816124189]
Epoch 1 - Save Best Score: 0.4778 Model


EVAL: [336/337] Elapsed 2m 54s (remain 0m 0s) Loss: 0.1233(0.1136) 
Epoch: [2][0/2016] Elapsed 0m 1s (remain 47m 7s) Loss: 0.0663(0.0663) Grad: 208242.2656  LR: 0.00001707  
Epoch: [2][20/2016] Elapsed 0m 20s (remain 32m 46s) Loss: 0.0479(0.0721) Grad: 54107.5938  LR: 0.00001701  
Epoch: [2][40/2016] Elapsed 0m 40s (remain 32m 28s) Loss: 0.0495(0.0677) Grad: 35296.0156  LR: 0.00001696  
Epoch: [2][60/2016] Elapsed 1m 1s (remain 32m 41s) Loss: 0.0487(0.0679) Grad: 86547.6094  LR: 0.00001690  
Epoch: [2][80/2016] Elapsed 1m 21s (remain 32m 22s) Loss: 0.0475(0.0656) Grad: 62252.9727  LR: 0.00001685  
Epoch: [2][100/2016] Elapsed 1m 40s (remain 31m 36s) Loss: 0.0626(0.0650) Grad: 79979.8359  LR: 0.00001679  
Epoch: [2][120/2016] Elapsed 2m 1s (remain 31m 45s) Loss: 0.0510(0.0644) Grad: 73997.2188  LR: 0.00001673  
Epoch: [2][140/2016] Elapsed 2m 17s (remain 30m 27s) Loss: 0.0685(0.0638) Grad: 118839.4219  LR: 0.00001667  
Epoch: [2][160/2016] Elapsed 2m 36s (remain 30m 3s) Loss: 0.0575(0.0

Epoch 2 - avg_train_loss: 0.0518  avg_val_loss: 0.1165  time: 2210s
Epoch 2 - Score: 0.4841  Scores: [0.5380824524206657, 0.4780078632167015, 0.41584839200413687, 0.48317392559189526, 0.5159191043520955, 0.4736802390454806]


EVAL: [336/337] Elapsed 2m 54s (remain 0m 0s) Loss: 0.1115(0.1165) 
Epoch: [3][0/2016] Elapsed 0m 1s (remain 53m 25s) Loss: 0.0403(0.0403) Grad: 144878.3750  LR: 0.00001000  
Epoch: [3][20/2016] Elapsed 0m 22s (remain 34m 56s) Loss: 0.0425(0.0371) Grad: 157088.1094  LR: 0.00000992  
Epoch: [3][40/2016] Elapsed 0m 41s (remain 33m 25s) Loss: 0.0260(0.0336) Grad: 81088.5547  LR: 0.00000984  
Epoch: [3][60/2016] Elapsed 1m 4s (remain 34m 39s) Loss: 0.0162(0.0338) Grad: 62346.1055  LR: 0.00000977  
Epoch: [3][80/2016] Elapsed 1m 25s (remain 33m 59s) Loss: 0.0320(0.0332) Grad: 124945.5391  LR: 0.00000969  
Epoch: [3][100/2016] Elapsed 1m 45s (remain 33m 19s) Loss: 0.0298(0.0333) Grad: 64351.6289  LR: 0.00000961  
Epoch: [3][120/2016] Elapsed 2m 5s (remain 32m 42s) Loss: 0.0277(0.0327) Grad: 87092.9219  LR: 0.00000953  
Epoch: [3][140/2016] Elapsed 2m 30s (remain 33m 16s) Loss: 0.0337(0.0329) Grad: 116422.7422  LR: 0.00000945  
Epoch: [3][160/2016] Elapsed 2m 46s (remain 31m 54s) Loss: 0.0386

Epoch 3 - avg_train_loss: 0.0267  avg_val_loss: 0.1232  time: 2216s
Epoch 3 - Score: 0.4986  Scores: [0.5377578359661003, 0.5011124827370012, 0.433716898418676, 0.4974320063408475, 0.5315397129567057, 0.4901744398577935]


EVAL: [336/337] Elapsed 2m 54s (remain 0m 0s) Loss: 0.0896(0.1232) 
Epoch: [4][0/2016] Elapsed 0m 1s (remain 64m 43s) Loss: 0.0175(0.0175) Grad: 81201.4375  LR: 0.00000293  
Epoch: [4][20/2016] Elapsed 0m 26s (remain 41m 20s) Loss: 0.0117(0.0189) Grad: 79637.0547  LR: 0.00000288  
Epoch: [4][40/2016] Elapsed 0m 45s (remain 36m 8s) Loss: 0.0138(0.0185) Grad: 78122.8281  LR: 0.00000282  
Epoch: [4][60/2016] Elapsed 1m 10s (remain 37m 48s) Loss: 0.0222(0.0179) Grad: 94329.0859  LR: 0.00000277  
Epoch: [4][80/2016] Elapsed 1m 33s (remain 37m 14s) Loss: 0.0216(0.0186) Grad: 98685.0000  LR: 0.00000271  
Epoch: [4][100/2016] Elapsed 1m 51s (remain 35m 17s) Loss: 0.0133(0.0182) Grad: 73805.1719  LR: 0.00000266  
Epoch: [4][120/2016] Elapsed 2m 10s (remain 34m 1s) Loss: 0.0136(0.0184) Grad: 71129.8828  LR: 0.00000261  
Epoch: [4][140/2016] Elapsed 2m 28s (remain 32m 52s) Loss: 0.0166(0.0185) Grad: 93219.0469  LR: 0.00000256  
Epoch: [4][160/2016] Elapsed 2m 46s (remain 31m 54s) Loss: 0.0221(0.0

Epoch 4 - avg_train_loss: 0.0169  avg_val_loss: 0.1256  time: 2205s
Epoch 4 - Score: 0.5037  Scores: [0.5439362525734599, 0.508205968321211, 0.4397558991013181, 0.5056011527987956, 0.5310216133791357, 0.4934146443796618]


EVAL: [336/337] Elapsed 2m 55s (remain 0m 0s) Loss: 0.1034(0.1256) 


Score: 0.4778  Scores: [0.5231824780110864, 0.4667939676335875, 0.4195210651614346, 0.4995337082082646, 0.5059015065971707, 0.4519881816124189]
Score: 0.4778  Scores: [0.5231824780110864, 0.4667939676335875, 0.4195210651614346, 0.4995337082082646, 0.5059015065971707, 0.4519881816124189]


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
[fold3] avg_train_loss,█▃▂▁
[fold3] avg_val_loss,▁▃▇█
[fold3] epoch,▁▃▆█
[fold3] loss,▆▄█▄▃▅▆▃▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▁▂▁▂▂▂▁▁▂▁▂▁▁▂▂
[fold3] lr,███████▇▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
[fold3] score,▁▃▇█

0,1
[fold3] avg_train_loss,0.01685
[fold3] avg_val_loss,0.12562
[fold3] epoch,4.0
[fold3] loss,0.0205
[fold3] lr,0.0
[fold3] score,0.50366


In [30]:
labels = CFG.target_cols
pred_labels = [f"pred_{c}" for c in CFG.target_cols]
agg_labels = {k:'mean' for k in labels}
agg_pred_labels = {k:'mean' for k in pred_labels}

In [31]:
agg_labels.update(agg_pred_labels)
agg_labels

{'cohesion': 'mean',
 'syntax': 'mean',
 'vocabulary': 'mean',
 'phraseology': 'mean',
 'grammar': 'mean',
 'conventions': 'mean',
 'pred_cohesion': 'mean',
 'pred_syntax': 'mean',
 'pred_vocabulary': 'mean',
 'pred_phraseology': 'mean',
 'pred_grammar': 'mean',
 'pred_conventions': 'mean'}

In [34]:
oof_df

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions,A,fold,pred_cohesion,pred_syntax,pred_vocabulary,pred_phraseology,pred_grammar,pred_conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0,zero I think that students would benefit from ...,3,2.916249,2.726161,3.021538,2.818232,2.891804,2.935116
1,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0,one The hardest part of school is getting read...,3,2.960047,2.754701,3.022443,2.827655,2.893153,3.017515
2,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0,two most students usually take showers before ...,3,2.895300,2.731591,3.038600,2.833760,2.886196,2.915610
3,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0,three when your home your comfortable and you ...,3,2.845835,2.695977,2.960149,2.735655,2.825235,2.854933
4,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5,zero When a problem is a change you have to le...,3,2.723149,2.497188,2.735950,2.341263,2.307486,2.535380
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5372,FEDCB06D743F,"Dear Generic_Name,\n\nIt has come to my attent...",3.5,3.5,3.0,3.5,3.5,2.5,nine Generic_Name,3,3.814322,3.477579,3.605341,3.551111,3.434705,3.189371
5373,FFA6690BC309,Having activities after school are good ideas ...,4.0,3.5,4.0,3.5,3.5,4.0,zero Having activities after school are good i...,3,3.463807,3.134905,3.241024,3.042943,2.905559,3.214255
5374,FFA6690BC309,Having activities after school are good ideas ...,4.0,3.5,4.0,3.5,3.5,4.0,"one First, students need to have the freedom t...",3,3.566515,3.228259,3.343150,3.169655,2.995138,3.324851
5375,FFA6690BC309,Having activities after school are good ideas ...,4.0,3.5,4.0,3.5,3.5,4.0,"two Second, students need this time after scho...",3,3.525712,3.199621,3.325330,3.144158,3.028299,3.291754


In [32]:
oof_df1 = oof_df.groupby('text_id').agg(agg_labels).reset_index()

In [33]:
get_result(oof_df1)

Score: 0.4686  Scores: [0.5024046614083704, 0.4600307755333685, 0.41373461847665133, 0.4946484250130086, 0.4968070543348217, 0.44392775462063755]
