## Log
* use 512 instead of 1024 as max_seq_len
* turn discriminative learning rates on

In [1]:
import argparse
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedGroupKFold
import shutil
import time
import gc
import random
import math
import torch
from torch.utils.data import DataLoader, Dataset
import transformers
from transformers import TrainingArguments, Trainer, DataCollatorForWholeWordMask
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, AutoModel
from transformers.modeling_outputs  import BaseModelOutput,SequenceClassifierOutput
from torch import nn
from torch.optim import Adam, SGD, AdamW
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup, DataCollatorWithPadding
# imports the torch_xla package
import wandb
from torch.nn.parameter import Parameter
#os.environ["WANDB_DISABLED"] = "true"

In [2]:
def parse_args():
    """Parse the command-line options of a training run.

    Returns:
        argparse.Namespace: carries ``fold`` and ``batch_size``; each is an
        ``int`` when given on the command line and ``None`` otherwise.
    """
    cli = argparse.ArgumentParser(description='Model training')
    # params of training: (flag, destination, help text), all optional ints
    int_options = (
        ("--fold", "fold", "Train fold"),
        ('--batch_size', 'batch_size', 'Mini batch size of one gpu or cpu'),
    )
    for flag, dest, help_text in int_options:
        cli.add_argument(flag, dest=dest, help=help_text, default=None, type=int)
    return cli.parse_args()


# Config

In [3]:
class CFG:
    """Static configuration shared by every cell of this notebook."""
    # When True, train_loop trains on 'tmp_pessudo.csv' and validates on the full frame.
    pretraining = False
    # When True, build_model loads a saved state dict into the 'pool' model.
    load_pretrained = False
    input_path = './input/'
    # '1': prompt = title + text + question joined by the sep token; anything else: question only.
    input_type = '2'
    model_path = 'microsoft/deberta-v3-large' #  nghuyong/ernie-2.0-large-en studio-ousia/luke-large
    # Architecture selector read by build_model: 'base', 'simple' or 'pool'.
    model_type = 'pool'
    scheduler = 'cosine'  # ['linear', 'cosine']
    batch_scheduler = True
    num_cycles = 0.5  # 1.5
    num_warmup_steps = 0
    # Sequences are padded/truncated to this many tokens in TrainDataset.tokenize.
    max_input_length = 512
    max_position_embeddings = 512
    # Validation folds iterated by train_loop.
    folds = [2]
    epochs = 4  # 5
    # layer-wise learning rate
    discriminative_learning_rate = True
    discriminative_learning_rate_num_groups = 1
    discriminative_learning_rate_decay_rate = 0.99
    # number of top encoder layers re-initialised by build_model (0 disables it)
    reinit_layers = 0

#     encoder_lr = 5e-6
#     head_lr = 5e-6
    # Base learning rates for the encoder and for the parameters outside `base`.
    encoder_lr = 20e-6
    head_lr = 10e-5

    min_lr = 1e-7
    eps = 1e-7
    betas = (0.9, 0.999)
    weight_decay = 0
    # Written into hidden_dropout_prob / attention_probs_dropout_prob of the model config.
    dropout = 0
    # NOTE(review): not referenced in the visible code; id2fold defines 4 folds — confirm.
    num_fold = 5
    batch_size = 8
    seed = 42
    # Directory that receives the log file and the saved checkpoints.
    OUTPUT_DIR = './pretrain/'
    num_workers = 2
    device='cuda'
    # train_fn prints progress and runs validation every `print_freq` steps.
    print_freq = 100
    
    

## Logger

In [4]:
def get_logger(filename=CFG.OUTPUT_DIR+ 'train'):
    """Return the module logger, writing to the console and to ``<filename>.log``.

    The logger is looked up by ``__name__``, so every call returns the same
    object. Handlers left over from a previous call (for example when this
    notebook cell is re-run) are removed and closed first; otherwise each
    message would be emitted once per accumulated handler.

    Args:
        filename: Log file path without the ``.log`` extension.

    Returns:
        logging.Logger: logger at INFO level with one stream handler and one
        file handler, both formatting records as the bare message.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(CFG.OUTPUT_DIR, exist_ok=True)
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # Drop stale handlers so repeated calls do not duplicate every log line.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

# Create the shared logger and record the key hyper-parameters of this run
# (encoder learning rate, seed, epoch count, warm-up steps) at the top of the log.
LOGGER = get_logger()
LOGGER.info('===============lr_{}==============='.format(CFG.encoder_lr))
LOGGER.info('===============seed_{}==============='.format(CFG.seed))
LOGGER.info('===============total_epochs_{}==============='.format(CFG.epochs))
LOGGER.info('===============num_warmup_steps_{}==============='.format(CFG.num_warmup_steps))



# Preprocessing

In [5]:
# Load the prompt table and the student-summary table from CFG.input_path.
pdf = pd.read_csv(f"{CFG.input_path}/prompts_train.csv")
sdf = pd.read_csv(f"{CFG.input_path}/summaries_train.csv")

# Attach the prompt columns to every summary (join on prompt_id).
df = pdf.merge(sdf, on="prompt_id")

# 4 prompt ids, 4 folds: each prompt is its own fold, so validation is always
# done on a prompt that is absent from the training split.
id2fold = {
    "814d6b": 0,
    "39c16e": 1,
    "3b9047": 2,
    "ebad26": 3,
}

# A prompt_id missing from id2fold maps to NaN here.
df["fold"] = df["prompt_id"].map(id2fold)

In [6]:
# Display the merged frame to check the join and the fold column.
df

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,student_id,text,content,wording,fold
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00791789cc1f,1 element of an ideal tragedy is that it shoul...,-0.210614,-0.471415,1
1,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0086ef22de8f,The three elements of an ideal tragedy are: H...,-0.970237,-0.417058,1
2,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0094589c7a22,Aristotle states that an ideal tragedy should ...,-0.387791,-0.584181,1
3,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00cd5736026a,One element of an Ideal tragedy is having a co...,0.088882,-0.594710,1
4,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00d98b8ff756,The 3 ideal of tragedy is how complex you need...,-0.687288,-0.460886,1
...,...,...,...,...,...,...,...,...,...
7160,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",ff37545b2805,"In paragraph two, they would use pickle meat a...",1.520355,-0.292990,3
7161,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",ff4ed38ef099,"in the first paragraph it says ""either can it...",-1.204574,-1.169784,3
7162,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",ff53b94f7ce0,They would have piles of filthy meat on the fl...,0.328739,-1.053294,3
7163,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",ff7c7e70df07,They used all sorts of chemical concoctions to...,0.205683,0.380538,3


In [7]:
class AverageMeter(object):
    """Keeps the latest value together with a running weighted average.

    Attributes are ``val`` (last value seen), ``sum`` (weighted total),
    ``count`` (total weight) and ``avg`` (``sum / count``).
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every tracked statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as seen ``n`` times and refresh the average."""
        weighted = val * n
        self.val = val
        self.sum += weighted
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration given in seconds as ``'<minutes>m <seconds>s'``."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Describe the time elapsed since ``since`` and the estimated time left.

    Args:
        since: Start timestamp as returned by ``time.time()``.
        percent: Fraction of the work already done; the total duration is
            extrapolated as ``elapsed / percent``.

    Returns:
        str: ``'<elapsed> (remain <remaining>)'`` with both parts formatted
        by ``asMinutes``.
    """
    elapsed = time.time() - since
    projected_total = elapsed / (percent)
    remaining = projected_total - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))

# Tokenizer

In [8]:
# Tokenizer matching CFG.model_path; shared by the dataset and the MLM collator below.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)

'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /microsoft/deberta-v3-large/resolve/main/tokenizer_config.json (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fa4487f1640>, 'Connection to huggingface.co timed out. (connect timeout=10)'))"), '(Request ID: 0fe4381b-773e-4bcd-ab1d-ba270440c69b)')' thrown while requesting HEAD https://huggingface.co/microsoft/deberta-v3-large/resolve/main/tokenizer_config.json
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Dataset

In [9]:
mask_lm_datacollator = DataCollatorForWholeWordMask(tokenizer)
def data_collator(batch):
    """Collate positional samples and apply whole-word masking to the input ids.

    Every sample is read by position: ``[0]`` input ids, ``[1]`` token type
    ids, ``[2]`` attention mask, ``[3]`` labels.

    NOTE(review): TrainDataset.__getitem__ returns a dict keyed by name, which
    does not match this positional layout — confirm which dataset is meant to
    feed this collator.

    Returns:
        tuple: (masked input ids, stacked token type ids, stacked attention
        masks, stacked labels). Only the ``'input_ids'`` entry of the masking
        collator's output is kept.
    """
    id_features = [{'input_ids': sample[0]} for sample in batch]
    type_id_list = [sample[1] for sample in batch]
    mask_list = [sample[2] for sample in batch]
    label_list = [sample[3] for sample in batch]
    masked_ids = mask_lm_datacollator(id_features)['input_ids']
    stacked_type_ids = torch.stack(type_id_list)
    stacked_masks = torch.stack(mask_list)
    stacked_labels = torch.stack(label_list)
    return masked_ids, stacked_type_ids, stacked_masks, stacked_labels

In [10]:
class TrainDataset(Dataset):
    """Dataset yielding tokenized (summary, prompt) pairs with two regression targets.

    Args:
        df: Frame with the columns ``prompt_title``, ``prompt_text``,
            ``prompt_question``, ``text``, ``content`` and ``wording``.
        tokenizer: Callable tokenizer exposing ``sep_token``; it is the one
            used for every sample (the module-level ``tokenizer`` is no longer
            consulted).
        max_length: Padded/truncated sequence length. ``None`` (default)
            falls back to ``CFG.max_input_length``.
        input_type: ``'1'`` builds the prompt from title, text and question;
            any other value uses the question only. ``None`` (default) falls
            back to ``CFG.input_type``.
    """

    def __init__(self, df, tokenizer, max_length=None, input_type=None):
        self.prompt_title = df['prompt_title'].values.astype(str)
        self.prompt_text = df['prompt_text'].values.astype(str)
        self.prompt_question = df['prompt_question'].values.astype(str)
        self.text = df['text'].values.astype(str)
        self.content = df['content'].values
        self.wording = df['wording'].values
        self.tokenizer = tokenizer
        # CFG is only read when the caller did not pass an explicit value.
        self.max_length = CFG.max_input_length if max_length is None else max_length
        self.input_type = CFG.input_type if input_type is None else input_type

    def __len__(self):
        return len(self.prompt_title)

    def tokenize(self, example):
        """Tokenize one example dict and attach its ``[content, wording]`` labels."""
        sep = self.tokenizer.sep_token
        if self.input_type == '1':
            prompt = sep.join([example["prompt_title"], example["prompt_text"], example["prompt_question"]])
        else:
            prompt = example["prompt_question"]

        labels = [float(example["content"]), float(example["wording"])]

        # Use the tokenizer given to the constructor, not the global one.
        tokenized = self.tokenizer(
            example["text"],
            prompt,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors=None,
        )

        return {
            **tokenized,
            "labels": labels,
        }

    def __getitem__(self, item):
        """Return the tensors of sample ``item`` as a dict ready for ``model(**batch)``."""
        example = {
            "prompt_title": self.prompt_title[item],
            "prompt_text": self.prompt_text[item],
            "prompt_question": self.prompt_question[item],
            "text": self.text[item],
            "content": self.content[item],
            "wording": self.wording[item],
        }

        out = self.tokenize(example)

        return {
            'input_ids': torch.as_tensor(out['input_ids'], dtype=torch.long),
            'token_type_ids': torch.as_tensor(out['token_type_ids'], dtype=torch.long),
            'attention_mask': torch.as_tensor(out['attention_mask'], dtype=torch.long),
            'labels': torch.as_tensor(out['labels'], dtype=torch.float),
        }
        
        
        

## Model

In [11]:
def init_params(module_lst):
    """Xavier-uniform initialise every parameter with more than one dimension.

    Parameters with a single dimension (such as biases) are left as they are.
    The modules are modified in place and nothing is returned.
    """
    multi_dim_params = (
        weight
        for block in module_lst
        for weight in block.parameters()
        if weight.dim() > 1
    )
    for weight in multi_dim_params:
        torch.nn.init.xavier_uniform_(weight)

class Custom_Bert(nn.Module):
    """Encoder with learned layer mixing, attention pooling and a 1-output head.

    The last 24 hidden states are combined with softmax-normalised learned
    weights, pooled with an attention module and passed to a linear layer
    producing one value per sample.
    """
    def __init__(self):
        super().__init__()

        config = AutoConfig.from_pretrained(CFG.model_path)
        # forward() reads base_output['hidden_states'], so they must be returned.
        config.update({"output_hidden_states":True})

        self.base = AutoModel.from_pretrained(CFG.model_path, config=config)

        dim = config.hidden_size

        self.dropout = nn.Dropout(p=0.2)
        self.high_dropout = nn.Dropout(p=0.5)

        # One mixing weight per hidden state used in forward() (hard-coded to 24).
        n_weights = 24
        weights_init = torch.zeros(n_weights).float()
        # All weights start at -3 except the last one (0), so after the softmax
        # the last hidden state carries most of the initial mass.
        weights_init.data[:-1] = -3
        self.layer_weights = torch.nn.Parameter(weights_init)

        # Scores each position, then normalises with a softmax over dim 1
        # (presumably the sequence axis of a (batch, seq, hidden) input — confirm).
        self.attention = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size),
            nn.Tanh(),
            nn.Linear(config.hidden_size, 1),
            nn.Softmax(dim=1)
        )
        self.cls = nn.Sequential(
            nn.Linear(dim,1)
        )
        # Xavier-initialise the newly created head and attention weights.
        init_params([self.cls,self.attention])

    def forward(self, input_ids, attention_mask, labels=None):
        """Return the prediction, or ``(MSE loss, prediction)`` when ``labels`` is given."""
        base_output = self.base(input_ids=input_ids,
                                attention_mask=attention_mask,
                                )

        # Stack the (dropped-out) last 24 hidden states along a new leading axis.
        cls_outputs = torch.stack(
            [self.dropout(layer) for layer in base_output['hidden_states'][-24:]], dim=0
        )
        # Weighted sum over that axis using the softmax of layer_weights.
        cls_output = (torch.softmax(self.layer_weights, dim=0).unsqueeze(1).unsqueeze(1).unsqueeze(1) * cls_outputs).sum(0)

        # Average of 5 attention-pooled vectors, each computed with a fresh
        # high_dropout mask on the attention input.
        logits = torch.mean(
            torch.stack(
                [torch.sum(self.attention(self.high_dropout(cls_output)) * cls_output, dim=1) for _ in range(5)],
                dim=0,
            ),
            dim=0,
        )

        output = self.cls(logits)
        if labels is None:
            return output

        else:
            return (nn.MSELoss()(torch.squeeze(output,1),labels), output)


class Custom_Bert_Simple(nn.Module):
    """Encoder followed by a plain mean over the sequence and a 2-output linear head."""

    def __init__(self):
        super().__init__()

        config = AutoConfig.from_pretrained(CFG.model_path)
        config.update({
            "hidden_dropout_prob": CFG.dropout,
            "attention_probs_dropout_prob": CFG.dropout,
            "num_labels": 2,
            "problem_type": "regression",
            "max_position_embeddings": CFG.max_position_embeddings
        })
        self.base = AutoModel.from_pretrained(CFG.model_path, config=config)
        dim = config.hidden_size
        # Kept for attribute compatibility; not applied in forward().
        self.dropout = nn.Dropout(p=0)
        self.cls = nn.Linear(dim,2)

    def forward(self, input_ids, attention_mask, token_type_ids, labels=None):
        """Run the encoder and regress two values per sample.

        Args:
            input_ids, attention_mask, token_type_ids: Forwarded to the encoder.
            labels: Optional targets. When omitted (inference) no loss is
                computed, matching the other model classes in this file that
                accept ``labels=None``.

        Returns:
            SequenceClassifierOutput: ``logits`` holds the predictions and
            ``loss`` the MSE against ``labels`` (``None`` without labels).
        """
        base_output = self.base(input_ids=input_ids,
                                attention_mask=attention_mask,
                                token_type_ids=token_type_ids
                               )
        output = base_output.last_hidden_state
        # Mean over dim 1 of the last hidden state; the attention mask is not used here.
        output = self.cls(torch.mean(output, dim=1))
        loss = None if labels is None else nn.MSELoss()(output,labels)
        return SequenceClassifierOutput(
            loss=loss,
            logits=output,
            hidden_states=None,
            attentions=None
        )

class GeMText(nn.Module):
    """Generalised-mean pooling over one axis with a learnable exponent ``p``.

    Args:
        dim: Axis that is reduced.
        p: Initial value of the learnable exponent.
        eps: Lower clamp applied to the activations and to the mask count.
    """

    def __init__(self, dim = 1, p=3, eps=1e-6):
        super().__init__()
        self.dim = dim
        self.p = Parameter(torch.ones(1) * p)
        self.eps = eps
        self.feat_mult = 1

    def forward(self, last_hidden_state, attention_mask):
        """Pool ``last_hidden_state`` along ``self.dim``, zeroing masked positions."""
        # Broadcast the mask to the shape of the hidden states.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.shape)
        clamped = last_hidden_state.clamp(min=self.eps)
        powered_sum = (clamped * mask).pow(self.p).sum(self.dim)
        # Number of unmasked positions, clipped to avoid dividing by zero.
        valid_count = mask.sum(self.dim).clip(min=self.eps)
        mean_of_powers = powered_sum / valid_count
        return mean_of_powers.pow(1 / self.p)

class Custom_Bert_Pool(nn.Module):
    """Encoder with GeM pooling and a 2-output linear head.

    The encoder weights are loaded from the local directory
    ``./input/pretrain/pretrained_model_1009`` while the configuration comes
    from ``CFG.model_path``.
    """

    def __init__(self):
        super().__init__()

        self.config = AutoConfig.from_pretrained(CFG.model_path)
        self.config.update({
            "hidden_dropout_prob": CFG.dropout,
            "attention_probs_dropout_prob": CFG.dropout,
            "num_labels": 2,
            "problem_type": "regression",
            "max_position_embeddings": CFG.max_position_embeddings
        })
        #self.base = AutoModel.from_pretrained(CFG.model_path, config=self.config)
        print('load pretrained model ...')
        self.base = AutoModel.from_pretrained('./input/pretrain/pretrained_model_1009', config = self.config)

        self.pool = GeMText()
        self.cls = nn.Linear(self.config.hidden_size,2)

    def forward(self, input_ids, attention_mask, token_type_ids, labels=None):
        """Run the encoder, pool with GeMText and regress two values per sample.

        Args:
            input_ids, attention_mask, token_type_ids: Forwarded to the encoder;
                the attention mask is also used by the pooling layer.
            labels: Optional targets. When omitted (inference) no loss is
                computed, matching the other model classes in this file that
                accept ``labels=None``.

        Returns:
            SequenceClassifierOutput: ``logits`` holds the predictions and
            ``loss`` the SmoothL1 loss against ``labels`` (``None`` without
            labels).
        """
        base_output = self.base(input_ids=input_ids,
                                attention_mask=attention_mask,
                                token_type_ids=token_type_ids
                               )
        output = base_output.last_hidden_state
        output = self.cls(self.pool(output, attention_mask))
        loss = None if labels is None else nn.SmoothL1Loss()(output,labels)
        return SequenceClassifierOutput(
            loss=loss,
            logits=output,
            hidden_states=None,
            attentions=None
        )

    def _init_weights(self, module):
        """Re-initialise one Linear, Embedding or LayerNorm module in place.

        Intended for ``Module.apply``; other module types are left untouched.
        Weights are drawn from a normal distribution with the standard
        deviation ``config.initializer_range``.
        """
        if isinstance(module, nn.Linear):
            print(f'Re-initialize {module}')
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            print(f'Re-initialize {module}')
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            # Keep the padding row at zero.
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            print(f'Re-initialize {module}')
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

class Custom_Bert_Mean(nn.Module):
    """Encoder whose last hidden state is averaged over dim 1 and mapped to one output."""

    def __init__(self):
        super().__init__()

        config = AutoConfig.from_pretrained(CFG.model_path)
        # forward() reads ``hidden_states`` from the encoder output.
        config.output_hidden_states=True
        self.base = AutoModel.from_pretrained(CFG.model_path, config=config)
        self.dropout = nn.Dropout(p=0)
        self.cls = nn.Linear(config.hidden_size, 1)

    def forward(self, input_ids, attention_mask,labels=None):
        """Return the prediction, or ``(MSE loss, prediction)`` when ``labels`` is given."""
        encoded = self.base(input_ids=input_ids, attention_mask=attention_mask)
        pooled = torch.mean(encoded.hidden_states[-1], dim=1)
        output = self.cls(self.dropout(pooled))
        if labels is None:
            return output
        return (nn.MSELoss()(torch.squeeze(output,1),labels), output)

class Custom_Bert_M(nn.Module):
    """Variant of the layer-mixing/attention-pooling model with two heads.

    ``cls_0`` regresses one value; ``cls_1`` produces 5 class scores. The
    training loss is ``0.8 * MSE + 0.2 * cross-entropy``.
    """
    def __init__(self):
        super().__init__()

        config = AutoConfig.from_pretrained(CFG.model_path)
        # forward() reads base_output['hidden_states'], so they must be returned.
        config.update({"output_hidden_states":True})

        self.base = AutoModel.from_pretrained(CFG.model_path, config=config)

        dim = config.hidden_size

        self.dropout = nn.Dropout(p=0.2)
        self.high_dropout = nn.Dropout(p=0.5)

        # One mixing weight per hidden state used in forward() (hard-coded to 24).
        n_weights = 24
        weights_init = torch.zeros(n_weights).float()
        # All weights start at -3 except the last one (0).
        weights_init.data[:-1] = -3
        self.layer_weights = torch.nn.Parameter(weights_init)

        self.attention = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size),
            nn.Tanh(),
            nn.Linear(config.hidden_size, 1),
            nn.Softmax(dim=1)
        )
        # Regression head (one output).
        self.cls_0 = nn.Sequential(
            nn.Linear(dim,1)
        )

        # Classification head (5 outputs).
        self.cls_1 = nn.Linear(dim,5)
        init_params([self.cls_0,self.cls_1,self.attention])

    # NOTE(review): `labels` has no default although the body handles None — confirm intent.
    def forward(self, input_ids, attention_mask, labels):
        """Return the regression output, or ``(combined loss, regression output)`` with labels."""
        base_output = self.base(input_ids=input_ids,
                    attention_mask=attention_mask,
                             )

        # Stack the (dropped-out) last 24 hidden states along a new leading axis.
        cls_outputs = torch.stack(
            [self.dropout(layer) for layer in base_output['hidden_states'][-24:]], dim=0
        )
        # Weighted sum over that axis using the softmax of layer_weights.
        cls_output = (torch.softmax(self.layer_weights, dim=0).unsqueeze(1).unsqueeze(1).unsqueeze(1) * cls_outputs).sum(0)

        # Average of 5 attention-pooled vectors, each with a fresh high_dropout mask.
        logits = torch.mean(
            torch.stack(
                [torch.sum(self.attention(self.high_dropout(cls_output)) * cls_output, dim=1) for _ in range(5)],
                dim=0,
            ),
            dim=0,
        )

        output_0 = self.cls_0(logits)
        output_1 = self.cls_1(logits)
        if labels is None:
            return output_0

        else:
            regression_loss = nn.MSELoss()(torch.squeeze(output_0,1),labels)
            # Map the label values 0.25 / 0.5 / 0.75 / 1.0 to the class ids
            # 1 / 2 / 3 / 4; other values are only truncated by .long().
            # The comparisons are exact float equality.
            labels = labels.double()
            cls_labels = torch.where(labels==1.,4.0,labels)
            cls_labels = torch.where(cls_labels==0.25,1.0,cls_labels)
            cls_labels = torch.where(cls_labels==0.5,2.0,cls_labels)
            cls_labels = torch.where(cls_labels==0.75,3.0,cls_labels)
            cls_labels = cls_labels.long()
            cls_loss = nn.CrossEntropyLoss()(output_1, cls_labels)
            return ( 0.8 * regression_loss + 0.2 * cls_loss, output_0)

In [12]:
def build_model():
    """Create the model selected by ``CFG.model_type``.

    Supported values:
        * ``'base'``   — ``AutoModelForSequenceClassification`` with a
          2-label regression config.
        * ``'simple'`` — ``Custom_Bert_Simple``.
        * ``'pool'``   — ``Custom_Bert_Pool``; optionally re-initialises the
          last ``CFG.reinit_layers`` encoder layers and, when
          ``CFG.load_pretrained`` is set, loads a saved state dict.

    Returns:
        torch.nn.Module: the constructed model (not yet moved to a device).

    Raises:
        ValueError: if ``CFG.model_type`` is not one of the supported values
            (previously this surfaced as an ``UnboundLocalError``).
    """
    if CFG.model_type == 'base':
        model_config = AutoConfig.from_pretrained(CFG.model_path)
        model_config.update({
            "hidden_dropout_prob": CFG.dropout,
            "attention_probs_dropout_prob": CFG.dropout,
            "num_labels": 2,
            "problem_type": "regression",
            "max_position_embeddings": CFG.max_position_embeddings
        })

        #print(model_config)
        model = AutoModelForSequenceClassification.from_pretrained(
            CFG.model_path, config=model_config
        )
    elif CFG.model_type == 'simple':
        model = Custom_Bert_Simple()
    elif CFG.model_type == 'pool':
        model = Custom_Bert_Pool()
        if CFG.reinit_layers > 0:
            print("=="*40)
            print(f"Reinitialize the last {CFG.reinit_layers} layer(s).")
            for layer in model.base.encoder.layer[-CFG.reinit_layers:]:
                print("===")
                layer.apply(model._init_weights)
            print("=="*40)
        if CFG.load_pretrained:
            # Load onto the CPU first; load_state_dict copies the values into
            # the model's own tensors, so no GPU is needed at this point.
            checkpoint = torch.load('./pretrained/microsoft_deberta-v3-base_best_ema.pth', map_location='cpu')
            model.load_state_dict(checkpoint['model'])
    else:
        raise ValueError(
            f"Unknown CFG.model_type {CFG.model_type!r}; expected 'base', 'simple' or 'pool'."
        )
    return model

# Train

In [13]:
from copy import deepcopy
class ModelEMA:
    """Exponential moving average of a model's state_dict.

    Adapted from https://github.com/rwightman/pytorch-image-models. Every
    floating-point entry of the state_dict (parameters and buffers) is
    averaged, similar to
    https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage.

    A smoothed copy of the weights is needed by some training schemes to
    perform well. The averaged copy is created with ``deepcopy`` at
    construction time, so the result depends on where in the sequence of
    model init, GPU assignment and distributed wrapping this class is
    instantiated.
    """

    def __init__(self, model, decay=0.9999, updates=0):
        """
        Args:
            model (nn.Module): model to apply EMA.
            decay (float): ema decay rate.
            updates (int): counter of EMA updates.
        """
        # Create EMA(FP32)
        self.ema_model = deepcopy(model).eval()
        self.ema = self.ema_model
        self.updates = updates

        def ramped_decay(step):
            # decay exponential ramp (to help early epochs)
            return decay * (1 - math.exp(-step / 2000))

        self.decay = ramped_decay
        # The averaged copy is never trained directly.
        for frozen in self.ema.parameters():
            frozen.requires_grad_(False)

    def update(self, model):
        """Blend the current state of ``model`` into the averaged copy."""
        # Update EMA parameters
        with torch.no_grad():
            self.updates += 1
            keep = self.decay(self.updates)
            live_state = model.state_dict()  # model state_dict
            for key, averaged in self.ema.state_dict().items():
                # Integer entries are left untouched.
                if not averaged.dtype.is_floating_point:
                    continue
                averaged.mul_(keep)
                averaged.add_((1.0 - keep) * live_state[key].detach())

class EMAHook:
    """EMAHook used in BEVDepth.

    Modified from https://github.com/Megvii-Base
    Detection/BEVDepth/blob/main/callbacks/ema.py.

    Wraps a ``ModelEMA`` and updates it after every training iteration.

    Args:
        model: Model whose weights are averaged.
        init_updates: Initial value of the EMA update counter.
        decay: EMA decay rate passed to ``ModelEMA``.
        resume: Optional path of an EMA checkpoint holding the keys
            ``'state_dict'`` and ``'updates'``.
        logger: Optional logger used to report the resume.
    """

    def __init__(self, model, init_updates=0, decay=0.9990, resume=None, logger=None):
        super().__init__()
        self.init_updates = init_updates
        self.resume = resume
        self.decay = decay
        # Must be assigned before before_run(), which logs when resuming.
        self.logger = logger
        self.ema_model = self.before_run(model)

    def before_run(self, model):
        """Build the ``ModelEMA`` for ``model`` and optionally restore a checkpoint."""
        from torch.nn.modules.batchnorm import SyncBatchNorm

        # Temporarily clear the process group of every SyncBatchNorm while
        # ModelEMA deep-copies the model, then put the groups back.
        bn_model_list = list()
        bn_model_dist_group_list = list()
        for model_ref in model.modules():
            if isinstance(model_ref, SyncBatchNorm):
                bn_model_list.append(model_ref)
                bn_model_dist_group_list.append(model_ref.process_group)
                model_ref.process_group = None
        ema_model = ModelEMA(model, self.decay)

        for bn_model, dist_group in zip(bn_model_list,
                                        bn_model_dist_group_list):
            bn_model.process_group = dist_group
        ema_model.updates = self.init_updates

        if self.resume is not None:
            if self.logger is not None:
                self.logger.info(f'resume ema checkpoint from {self.resume}')
            cpt = torch.load(self.resume, map_location='cpu')
            # Restore through the module's own method; no free function named
            # load_state_dict is imported in this file.
            ema_model.ema.load_state_dict(cpt['state_dict'])
            ema_model.updates = cpt['updates']

        return ema_model

    def after_train_iter(self, model):
        """Fold the current weights of ``model`` into the moving average."""
        self.ema_model.update(model)

In [14]:
def compute_mcrmse(eval_pred):
    """
    Mean columnwise root mean squared error.
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation

    Args:
        eval_pred: ``(preds, labels)`` pair of arrays; the RMSE is taken along
            axis 0, column 0 being reported as content and column 1 as wording.

    Returns:
        dict with the keys ``content_rmse``, ``wording_rmse`` and ``mcrmse``
        (in that order).
    """
    preds, labels = eval_pred

    squared_error = (preds - labels) ** 2
    per_column_rmse = np.sqrt(np.mean(squared_error, axis=0))

    metrics = {}
    metrics["content_rmse"] = per_column_rmse[0]
    metrics["wording_rmse"] = per_column_rmse[1]
    metrics["mcrmse"] = np.mean(per_column_rmse)
    return metrics

In [15]:
def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch (CPU and current CUDA device) RNGs.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)
    torch.backends.cudnn.deterministic = True
# Seed every RNG with the configured seed.
seed_everything(CFG.seed)

In [16]:
# Echo the number of learning-rate groups used by get_optimizer_llr_params.
CFG.discriminative_learning_rate_num_groups

1

In [17]:
def get_optimizer_llr_params(model, type='s'):
    """
    Build the optimizer parameter groups for ``model``.

    Modified from the HuggingFace Trainer default to support differential
    (layer-wise) learning rates.

    With ``CFG.discriminative_learning_rate`` enabled the groups are, in
    order: embeddings, each encoder layer, encoder parameters outside
    ``encoder.layer``, and every parameter whose name does not contain
    ``'base'`` (trained with ``CFG.head_lr``). Otherwise all parameters share
    ``CFG.encoder_lr``. Each set is split in two: parameters matching
    ``no_decay`` get ``weight_decay`` 0.0, the rest ``CFG.weight_decay``.

    ``type`` is accepted but not used.

    reference: https://github.com/huggingface/transformers/blob/05fa1a7ac17bb7aa07b9e0c1e138ecb31a28bbfe/src/transformers/trainer.py#L804
    """

    no_decay = ["bias", "LayerNorm.weight", "LayerNorm.bias"]

    def _exempt(name):
        # True when the parameter must not receive weight decay.
        return any(nd in name for nd in no_decay)

    def _decay_split(named_params, lr):
        # Two groups sharing `lr`: decayed parameters first, exempt ones second.
        named = list(named_params)
        return [
            {
                "params": [p for n, p in named if not _exempt(n)],
                "lr": lr,
                "weight_decay": CFG.weight_decay,
            },
            {
                "params": [p for n, p in named if _exempt(n)],
                "lr": lr,
                "weight_decay": 0.0,
            },
        ]

    if not CFG.discriminative_learning_rate:
        # group parameters for the entire network
        return _decay_split(model.named_parameters(), CFG.encoder_lr)

    ### ADDED
    num_layers = model.config.num_hidden_layers
    group_size = num_layers//CFG.discriminative_learning_rate_num_groups

    # One rate per layer: each power is repeated `group_size` times, then the
    # list is reversed so the highest powers (smallest rates) come first.
    layer_wise_learning_rates = []
    for power in range(0, num_layers, group_size):
        group_rate = pow(CFG.discriminative_learning_rate_decay_rate, power) * CFG.encoder_lr
        layer_wise_learning_rates.extend(group_rate for _ in range(group_size))
    layer_wise_learning_rates.reverse()
    print('Layer-wise learning rates:', layer_wise_learning_rates)

    # group embedding parameters from the transformer encoder
    embedding_lr = pow(CFG.discriminative_learning_rate_decay_rate, num_layers) * CFG.encoder_lr
    optimizer_grouped_parameters = _decay_split(model.base.embeddings.named_parameters(), embedding_lr)

    # group encoding parameters from the transformer encoder
    for position, encoder_layer in enumerate(list(model.base.encoder.layer)):
        optimizer_grouped_parameters += _decay_split(
            encoder_layer.named_parameters(), layer_wise_learning_rates[position]
        )

    print(f"Detected unattached modules in model.encoder: {[n for n, p in model.base.encoder.named_parameters() if not n.startswith('layer')]}")
    unattached = [(n, p) for n, p in model.base.encoder.named_parameters() if not n.startswith('layer')]
    optimizer_grouped_parameters += _decay_split(unattached, layer_wise_learning_rates[-1])

    # group parameters from the task specific head
    head = [(n, p) for n, p in model.named_parameters() if 'base' not in n]
    optimizer_grouped_parameters += _decay_split(head, CFG.head_lr)
    ### END ADDED
    return optimizer_grouped_parameters

In [18]:
def valid_fn(valid_loader, model, device):
    """Evaluate ``model`` on ``valid_loader`` without tracking gradients.

    The model is switched to eval mode and left there; callers that continue
    training must call ``model.train()`` themselves.

    Returns:
        tuple: (sample-weighted mean loss, concatenated predictions,
        concatenated labels), the last two as NumPy arrays.
    """
    loss_meter = AverageMeter()
    model.eval()
    pred_chunks = []
    label_chunks = []
    for batch in valid_loader:
        # Move every tensor of the batch to the target device (in place).
        for name in list(batch.keys()):
            batch[name] = batch[name].to(device)
        n_samples = batch['labels'].size(0)
        with torch.no_grad():
            result = model(**batch)
        loss_meter.update(result.loss.item(), n_samples)
        pred_chunks.append(result.logits.to('cpu').numpy())
        label_chunks.append(batch['labels'].to('cpu').numpy())
    predictions = np.concatenate(pred_chunks)
    labels = np.concatenate(label_chunks)
    return loss_meter.avg, predictions, labels

def train_fn(train_loader, model, optimizer, epoch, scheduler, device, valid_loader, start_time, best_score, best_score_ema,ema_hook,wandb, fold):
    """Train for one epoch, validating and checkpointing every ``CFG.print_freq`` steps.

    After each optimizer step the EMA copy is updated and the scheduler is
    stepped. On every ``CFG.print_freq``-th step (and on the last step) both
    the live model and the EMA model are validated; whenever one of them
    improves on its best MCRMSE so far, its previous checkpoint is deleted and
    a new one is written to ``CFG.OUTPUT_DIR``.

    Args:
        train_loader, valid_loader: Loaders yielding dict batches with a
            ``'labels'`` entry.
        model: Model being trained; must return an object with ``.loss``.
        optimizer, scheduler: Stepped once per batch.
        epoch: Zero-based epoch index (used for logging only).
        device: Device the batches are moved to and validation runs on.
        start_time: Reference timestamp for the elapsed time in the log.
        best_score, best_score_ema: Best MCRMSE so far for the live and the
            EMA model (``float('inf')`` before the first evaluation).
        ema_hook: ``EMAHook`` tracking the averaged weights.
        wandb: Object exposing ``log(dict)``.
        fold: Fold identifier used in log entries and checkpoint names.

    Returns:
        tuple: (mean training loss, updated best_score, updated best_score_ema).

    Note:
        ``global_step`` restarts at 0 on every call, so the logged ``'step'``
        is the step within the current epoch.
    """
    model.train()
    losses = AverageMeter()
    start = time.time()
    global_step = 0
    model_tag = CFG.model_path.replace('/', '_')
    for step, batch in enumerate(train_loader):
        for key, value in batch.items():
            batch[key] = value.to(device)
        batch_size = batch['labels'].size(0)
        loss = model(**batch).loss
        losses.update(loss.item(), batch_size)

        optimizer.zero_grad()
        loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 10)
        optimizer.step()
        ema_hook.after_train_iter(model)
        global_step += 1
        scheduler.step()

        wandb.log({
                'train loss': loss.item(),
                'step': global_step,
                'epoch': epoch,
                'fold': fold,
                'batch_size':CFG.batch_size
            })

        if step % CFG.print_freq == 0 or step == (len(train_loader) - 1):
            # get_last_lr() is the supported way to read the rate the
            # scheduler computed last; get_lr() is meant for internal use.
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch + 1, step, len(train_loader),
                          remain=timeSince(start, float(step + 1) / len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_last_lr()[0]))

            # eval (on the same device the training batches are moved to)
            avg_val_loss, predictions, valid_labels = valid_fn(valid_loader, model, device)

            # scoring
            score = compute_mcrmse((predictions, valid_labels))

            content_rmse, wording_rmse, mcrmse = list(score.values())
            elapsed = time.time() - start_time

            LOGGER.info(
                f'Epoch {epoch + 1} avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
            LOGGER.info(f'Epoch {epoch + 1} - content_rmse: {content_rmse:.4f} - wording_rmse: {wording_rmse:.4f} - mcrmse: {mcrmse:.4f}')

            if best_score > score['mcrmse']:
                if best_score != float('inf'):
                    # Remove the superseded checkpoint; tolerate it being
                    # absent (e.g. deleted by hand) instead of aborting.
                    stale_path = CFG.OUTPUT_DIR + "{}_best{}_{}.pth".format(model_tag, fold, best_score)
                    if os.path.exists(stale_path):
                        os.remove(stale_path)
                best_score = score['mcrmse']
                LOGGER.info(f'Epoch {epoch + 1} - Save Best Score: {best_score:.4f} Model')
                torch.save({'model': model.state_dict(),
                            'predictions': predictions},
                           CFG.OUTPUT_DIR + "{}_best{}_{}.pth".format(model_tag, fold, best_score))

            avg_val_loss, predictions, valid_labels = valid_fn(valid_loader, ema_hook.ema_model.ema, device)
            # ema scoring
            ema_score = compute_mcrmse((predictions, valid_labels))

            content_rmse, wording_rmse, mcrmse = list(ema_score.values())
            elapsed = time.time() - start_time

            LOGGER.info(
                f'Epoch {epoch + 1} avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
            LOGGER.info(f'Epoch {epoch + 1} - ema_content_rmse: {content_rmse:.4f} - ema_wording_rmse: {wording_rmse:.4f} - ema_mcrmse: {mcrmse:.4f}')

            if best_score_ema > ema_score['mcrmse']:
                if best_score_ema != float('inf'):
                    stale_path = CFG.OUTPUT_DIR + "{}_best_ema{}_{}.pth".format(model_tag, fold, best_score_ema)
                    if os.path.exists(stale_path):
                        os.remove(stale_path)
                best_score_ema = ema_score['mcrmse']
                LOGGER.info(f'Epoch {epoch + 1} - ema_Save Best Score: {best_score_ema:.4f} Model')
                torch.save({'model': ema_hook.ema_model.ema.state_dict(),
                            'predictions': predictions},
                           CFG.OUTPUT_DIR + "{}_best_ema{}_{}.pth".format(model_tag, fold, best_score_ema))

            wandb.log({
                'learning rate': optimizer.param_groups[0]['lr'],
                'validation mcrmse': score['mcrmse'],
                'validation ema mcrmse': ema_score['mcrmse'],
                'step': global_step,
                'epoch': epoch,
            })

            # valid_fn leaves the model in eval mode; resume training mode.
            model.train()
    return losses.avg, best_score, best_score_ema



def train_loop():
    """Train one model per fold listed in ``CFG.folds``.

    For every fold this builds the train/validation loaders, a fresh model,
    an EMA hook, an AdamW optimizer and a warmup scheduler, then calls
    ``train_fn`` once per epoch. ``train_fn`` receives and returns the running
    best scores (plain and EMA) so they carry over between epochs.

    A single wandb run is opened for the whole call and the main
    hyperparameters are attached to it.

    Returns:
        None
    """
    LOGGER.info("========== training ==========")

    # ====================================================
    # experiment tracking
    # ====================================================
    # Hyperparameters are passed through `config=`: re-binding `wandb.config`
    # to a plain dict after init() does not attach them to the run.
    wandb.init(project='kaggle-commonlit-eval-student-summaries-1009',
               config=dict(epochs=CFG.epochs,
                           batch_size=CFG.batch_size,
                           learning_rate=CFG.encoder_lr,
                           save_checkpoint=True,
                           ))

    def get_scheduler(cfg, optimizer, num_train_steps):
        """Build the LR scheduler selected by ``cfg.scheduler``.

        ``cfg.num_warmup_steps`` is multiplied by ``num_train_steps`` (i.e. it
        is used as a ratio of the total step count). The product is kept in a
        local variable so the shared config is not modified and the warmup
        does not compound from one fold to the next.

        Raises:
            ValueError: if ``cfg.scheduler`` is neither 'linear' nor 'cosine'.
        """
        num_warmup_steps = cfg.num_warmup_steps * num_train_steps
        if cfg.scheduler == 'linear':
            return get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps
            )
        if cfg.scheduler == 'cosine':
            return get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps,
                num_cycles=cfg.num_cycles
            )
        raise ValueError(f"Unsupported scheduler: {cfg.scheduler!r} (expected 'linear' or 'cosine')")

    for fold in CFG.folds:

        # ====================================================
        # loader
        # ====================================================
        if CFG.pretraining:
            # Pretraining: train on the pseudo-labelled file, dropping rows whose
            # prompt_question also appears in `pdf`, and validate on the full `df`.
            tr_data = pd.read_csv('tmp_pessudo.csv')
            tr_data['prompt_title'] = ''
            tr_data = tr_data[~tr_data['prompt_question'].isin(pdf['prompt_question'].tolist())]
            va_data = df #df[df['fold']==fold].reset_index(drop=True)
        else:
            tr_data = df[df['fold']!=fold].reset_index(drop=True)
            va_data = df[df['fold']==fold].reset_index(drop=True)
        train_dataset = TrainDataset(tr_data, tokenizer)
        valid_dataset = TrainDataset(va_data, tokenizer)
        train_loader = DataLoader(train_dataset,
                                  batch_size=CFG.batch_size,
                                  shuffle=True,
                                  num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
        valid_loader = DataLoader(valid_dataset,
                                  batch_size=CFG.batch_size * 2,
                                  shuffle=False,
                                  num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

        # ====================================================
        # model & optimizer
        # ====================================================
        model = build_model()
        model.to(CFG.device)
        ema_hook = EMAHook(model, init_updates=3000, logger=LOGGER)

        optimizer_parameters = get_optimizer_llr_params(model)
        optimizer = AdamW(optimizer_parameters, eps=CFG.eps, betas=CFG.betas)

        # ====================================================
        # scheduler
        # ====================================================
        # NOTE(review): the train loader uses drop_last=True, so the real number
        # of batches per epoch can be slightly below len(train_dataset) / batch_size
        # -- confirm whether len(train_loader) * CFG.epochs is what is intended.
        num_train_steps = int(len(train_dataset) / CFG.batch_size * CFG.epochs)
        scheduler = get_scheduler(CFG, optimizer, num_train_steps)

        # ====================================================
        # loop
        # ====================================================
        best_score = float('inf')
        best_score_ema = float('inf')
        for epoch in range(CFG.epochs):

            start_time = time.time()

            # train (the average training loss returned first is not used here)
            _avg_loss, best_score, best_score_ema = train_fn(train_loader, model, optimizer, epoch, scheduler, CFG.device, valid_loader, start_time, best_score, best_score_ema ,ema_hook, wandb,fold)

        # Drop the references first; collecting garbage / emptying the CUDA cache
        # while they are still alive cannot release the memory they hold.
        del scheduler, optimizer, model, ema_hook
        gc.collect()
        torch.cuda.empty_cache()
    return


In [19]:
# Run the full training procedure for every fold listed in CFG.folds.
train_loop()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mpeng_sun[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016670207927624383, max=1.0…

'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /microsoft/deberta-v3-large/resolve/main/config.json (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fa40e7d96d0>, 'Connection to huggingface.co timed out. (connect timeout=10)'))"), '(Request ID: bed2b014-8ef4-4869-aba3-ab7a26bc0cf9)')' thrown while requesting HEAD https://huggingface.co/microsoft/deberta-v3-large/resolve/main/config.json


load pretrained model ...
Layer-wise learning rates: [2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05]
Detected unattached modules in model.encoder: ['rel_embeddings.weight', 'LayerNorm.weight', 'LayerNorm.bias']
Epoch: [1][0/644] Elapsed 0m 1s (remain 20m 18s) Loss: 0.4694(0.4694) Grad: 5.4579  LR: 0.00001571  


Epoch 1 avg_val_loss: 0.4582  time: 94s
Epoch 1 - content_rmse: 1.1186 - wording_rmse: 0.9349 - mcrmse: 1.0268
Epoch 1 - Save Best Score: 1.0268 Model
Epoch 1 avg_val_loss: 0.5039  time: 188s
Epoch 1 - ema_content_rmse: 1.2110 - ema_wording_rmse: 0.9506 - ema_mcrmse: 1.0808
Epoch 1 - ema_Save Best Score: 1.0808 Model


Epoch: [1][100/644] Elapsed 4m 55s (remain 26m 30s) Loss: 0.1368(0.2101) Grad: 3.8765  LR: 0.00001565  


Epoch 1 avg_val_loss: 0.4406  time: 388s
Epoch 1 - content_rmse: 0.9068 - wording_rmse: 1.1241 - mcrmse: 1.0155
Epoch 1 - Save Best Score: 1.0155 Model
Epoch 1 avg_val_loss: 0.3393  time: 482s
Epoch 1 - ema_content_rmse: 0.7642 - ema_wording_rmse: 0.9939 - ema_mcrmse: 0.8791
Epoch 1 - ema_Save Best Score: 0.8791 Model


Epoch: [1][200/644] Elapsed 9m 50s (remain 21m 41s) Loss: 0.1874(0.1788) Grad: 5.6455  LR: 0.00001548  


Epoch 1 avg_val_loss: 0.2597  time: 682s
Epoch 1 - content_rmse: 0.6265 - wording_rmse: 0.8960 - mcrmse: 0.7612
Epoch 1 - Save Best Score: 0.7612 Model
Epoch 1 avg_val_loss: 0.2645  time: 777s
Epoch 1 - ema_content_rmse: 0.6340 - ema_wording_rmse: 0.8987 - ema_mcrmse: 0.7663
Epoch 1 - ema_Save Best Score: 0.7663 Model


Epoch: [1][300/644] Elapsed 14m 44s (remain 16m 48s) Loss: 0.1879(0.1625) Grad: 8.7110  LR: 0.00001519  


Epoch 1 avg_val_loss: 0.3700  time: 976s
Epoch 1 - content_rmse: 0.6492 - wording_rmse: 1.2090 - mcrmse: 0.9291
Epoch 1 avg_val_loss: 0.3654  time: 1068s
Epoch 1 - ema_content_rmse: 0.7346 - ema_wording_rmse: 1.1245 - ema_mcrmse: 0.9296


Epoch: [1][400/644] Elapsed 19m 32s (remain 11m 50s) Loss: 0.0588(0.1531) Grad: 2.4139  LR: 0.00001479  


Epoch 1 avg_val_loss: 0.2310  time: 1265s
Epoch 1 - content_rmse: 0.5632 - wording_rmse: 0.8885 - mcrmse: 0.7259
Epoch 1 - Save Best Score: 0.7259 Model
Epoch 1 avg_val_loss: 0.2210  time: 1360s
Epoch 1 - ema_content_rmse: 0.5376 - ema_wording_rmse: 0.8676 - ema_mcrmse: 0.7026
Epoch 1 - ema_Save Best Score: 0.7026 Model


Epoch: [1][500/644] Elapsed 24m 27s (remain 6m 58s) Loss: 0.0978(0.1466) Grad: 4.4891  LR: 0.00001429  


Epoch 1 avg_val_loss: 0.1894  time: 1559s
Epoch 1 - content_rmse: 0.5189 - wording_rmse: 0.7729 - mcrmse: 0.6459
Epoch 1 - Save Best Score: 0.6459 Model
Epoch 1 avg_val_loss: 0.2385  time: 1654s
Epoch 1 - ema_content_rmse: 0.5620 - ema_wording_rmse: 0.9075 - ema_mcrmse: 0.7347


Epoch: [1][600/644] Elapsed 29m 18s (remain 2m 5s) Loss: 0.0626(0.1405) Grad: 1.5299  LR: 0.00001370  


Epoch 1 avg_val_loss: 0.2071  time: 1851s
Epoch 1 - content_rmse: 0.5404 - wording_rmse: 0.8129 - mcrmse: 0.6766
Epoch 1 avg_val_loss: 0.2133  time: 1942s
Epoch 1 - ema_content_rmse: 0.5557 - ema_wording_rmse: 0.8200 - ema_mcrmse: 0.6879
Epoch 1 - ema_Save Best Score: 0.6879 Model


Epoch: [1][643/644] Elapsed 33m 10s (remain 0m 0s) Loss: 0.0483(0.1386) Grad: 2.2024  LR: 0.00001342  


Epoch 1 avg_val_loss: 0.1961  time: 2082s
Epoch 1 - content_rmse: 0.5316 - wording_rmse: 0.7818 - mcrmse: 0.6567
Epoch 1 avg_val_loss: 0.2382  time: 2174s
Epoch 1 - ema_content_rmse: 0.5716 - ema_wording_rmse: 0.8928 - ema_mcrmse: 0.7322


Epoch: [2][0/644] Elapsed 0m 1s (remain 13m 12s) Loss: 0.0647(0.0647) Grad: 1.8939  LR: 0.00001341  


Epoch 2 avg_val_loss: 0.1924  time: 93s
Epoch 2 - content_rmse: 0.5308 - wording_rmse: 0.7678 - mcrmse: 0.6493
Epoch 2 avg_val_loss: 0.2276  time: 185s
Epoch 2 - ema_content_rmse: 0.5627 - ema_wording_rmse: 0.8664 - ema_mcrmse: 0.7145


Epoch: [2][100/644] Elapsed 4m 49s (remain 25m 56s) Loss: 0.2000(0.0842) Grad: 3.9675  LR: 0.00001269  


Epoch 2 avg_val_loss: 0.1807  time: 381s
Epoch 2 - content_rmse: 0.5145 - wording_rmse: 0.7323 - mcrmse: 0.6234
Epoch 2 - Save Best Score: 0.6234 Model
Epoch 2 avg_val_loss: 0.2172  time: 476s
Epoch 2 - ema_content_rmse: 0.5481 - ema_wording_rmse: 0.8584 - ema_mcrmse: 0.7033


Epoch: [2][200/644] Elapsed 9m 41s (remain 21m 20s) Loss: 0.1313(0.0812) Grad: 1.9869  LR: 0.00001190  


Epoch 2 avg_val_loss: 0.1853  time: 673s
Epoch 2 - content_rmse: 0.5279 - wording_rmse: 0.7491 - mcrmse: 0.6385
Epoch 2 avg_val_loss: 0.1915  time: 765s
Epoch 2 - ema_content_rmse: 0.5290 - ema_wording_rmse: 0.7745 - ema_mcrmse: 0.6518
Epoch 2 - ema_Save Best Score: 0.6518 Model


Epoch: [2][300/644] Elapsed 14m 32s (remain 16m 34s) Loss: 0.1036(0.0792) Grad: 3.8730  LR: 0.00001105  


Epoch 2 avg_val_loss: 0.2458  time: 964s
Epoch 2 - content_rmse: 0.5504 - wording_rmse: 0.9333 - mcrmse: 0.7419
Epoch 2 avg_val_loss: 0.2270  time: 1056s
Epoch 2 - ema_content_rmse: 0.5415 - ema_wording_rmse: 0.8884 - ema_mcrmse: 0.7150


Epoch: [2][400/644] Elapsed 19m 20s (remain 11m 43s) Loss: 0.0737(0.0783) Grad: 3.9376  LR: 0.00001016  


Epoch 2 avg_val_loss: 0.2011  time: 1252s
Epoch 2 - content_rmse: 0.5122 - wording_rmse: 0.8321 - mcrmse: 0.6721
Epoch 2 avg_val_loss: 0.2101  time: 1344s
Epoch 2 - ema_content_rmse: 0.5199 - ema_wording_rmse: 0.8549 - ema_mcrmse: 0.6874


Epoch: [2][500/644] Elapsed 24m 9s (remain 6m 53s) Loss: 0.0830(0.0767) Grad: 3.0782  LR: 0.00000923  


Epoch 2 avg_val_loss: 0.2158  time: 1541s
Epoch 2 - content_rmse: 0.5136 - wording_rmse: 0.8786 - mcrmse: 0.6961
Epoch 2 avg_val_loss: 0.2074  time: 1632s
Epoch 2 - ema_content_rmse: 0.5159 - ema_wording_rmse: 0.8431 - ema_mcrmse: 0.6795


Epoch: [2][600/644] Elapsed 28m 57s (remain 2m 4s) Loss: 0.0533(0.0772) Grad: 3.9632  LR: 0.00000828  


Epoch 2 avg_val_loss: 0.2024  time: 1829s
Epoch 2 - content_rmse: 0.5128 - wording_rmse: 0.8350 - mcrmse: 0.6739
Epoch 2 avg_val_loss: 0.2034  time: 1921s
Epoch 2 - ema_content_rmse: 0.5071 - ema_wording_rmse: 0.8415 - ema_mcrmse: 0.6743


Epoch: [2][643/644] Elapsed 32m 45s (remain 0m 0s) Loss: 0.0538(0.0762) Grad: 1.7266  LR: 0.00000787  


Epoch 2 avg_val_loss: 0.2004  time: 2058s
Epoch 2 - content_rmse: 0.5036 - wording_rmse: 0.8334 - mcrmse: 0.6685
Epoch 2 avg_val_loss: 0.2079  time: 2149s
Epoch 2 - ema_content_rmse: 0.5057 - ema_wording_rmse: 0.8562 - ema_mcrmse: 0.6809


Epoch: [3][0/644] Elapsed 0m 1s (remain 13m 1s) Loss: 0.0376(0.0376) Grad: 1.7843  LR: 0.00000786  


Epoch 3 avg_val_loss: 0.2015  time: 93s
Epoch 3 - content_rmse: 0.5072 - wording_rmse: 0.8349 - mcrmse: 0.6711
Epoch 3 avg_val_loss: 0.2071  time: 185s
Epoch 3 - ema_content_rmse: 0.5057 - ema_wording_rmse: 0.8536 - ema_mcrmse: 0.6796


Epoch: [3][100/644] Elapsed 4m 49s (remain 25m 56s) Loss: 0.0597(0.0364) Grad: 1.6509  LR: 0.00000690  


Epoch 3 avg_val_loss: 0.2346  time: 381s
Epoch 3 - content_rmse: 0.5200 - wording_rmse: 0.9369 - mcrmse: 0.7285
Epoch 3 avg_val_loss: 0.2173  time: 473s
Epoch 3 - ema_content_rmse: 0.5034 - ema_wording_rmse: 0.8937 - ema_mcrmse: 0.6986


Epoch: [3][200/644] Elapsed 9m 38s (remain 21m 14s) Loss: 0.0457(0.0364) Grad: 2.0964  LR: 0.00000596  


Epoch 3 avg_val_loss: 0.1921  time: 670s
Epoch 3 - content_rmse: 0.5192 - wording_rmse: 0.7896 - mcrmse: 0.6544
Epoch 3 avg_val_loss: 0.2068  time: 762s
Epoch 3 - ema_content_rmse: 0.5190 - ema_wording_rmse: 0.8442 - ema_mcrmse: 0.6816


Epoch: [3][300/644] Elapsed 14m 26s (remain 16m 27s) Loss: 0.0079(0.0363) Grad: 0.5247  LR: 0.00000505  


Epoch 3 avg_val_loss: 0.2071  time: 958s
Epoch 3 - content_rmse: 0.5209 - wording_rmse: 0.8463 - mcrmse: 0.6836
Epoch 3 avg_val_loss: 0.2062  time: 1050s
Epoch 3 - ema_content_rmse: 0.5072 - ema_wording_rmse: 0.8537 - ema_mcrmse: 0.6804


Epoch: [3][400/644] Elapsed 19m 14s (remain 11m 39s) Loss: 0.0306(0.0360) Grad: 1.1184  LR: 0.00000418  


Epoch 3 avg_val_loss: 0.1980  time: 1247s
Epoch 3 - content_rmse: 0.4988 - wording_rmse: 0.8314 - mcrmse: 0.6651
Epoch 3 avg_val_loss: 0.2043  time: 1338s
Epoch 3 - ema_content_rmse: 0.5005 - ema_wording_rmse: 0.8520 - ema_mcrmse: 0.6762


Epoch: [3][500/644] Elapsed 24m 3s (remain 6m 52s) Loss: 0.0491(0.0359) Grad: 2.1145  LR: 0.00000336  


Epoch 3 avg_val_loss: 0.2376  time: 1535s
Epoch 3 - content_rmse: 0.5752 - wording_rmse: 0.9098 - mcrmse: 0.7425
Epoch 3 avg_val_loss: 0.2082  time: 1627s
Epoch 3 - ema_content_rmse: 0.5168 - ema_wording_rmse: 0.8567 - ema_mcrmse: 0.6867


Epoch: [3][600/644] Elapsed 28m 51s (remain 2m 3s) Loss: 0.0306(0.0361) Grad: 1.7256  LR: 0.00000261  


Epoch 3 avg_val_loss: 0.1977  time: 1824s
Epoch 3 - content_rmse: 0.5021 - wording_rmse: 0.8276 - mcrmse: 0.6648
Epoch 3 avg_val_loss: 0.2008  time: 1915s
Epoch 3 - ema_content_rmse: 0.5043 - ema_wording_rmse: 0.8385 - ema_mcrmse: 0.6714


Epoch: [3][643/644] Elapsed 32m 40s (remain 0m 0s) Loss: 0.0214(0.0363) Grad: 0.7533  LR: 0.00000231  


Epoch 3 avg_val_loss: 0.2115  time: 2052s
Epoch 3 - content_rmse: 0.5165 - wording_rmse: 0.8681 - mcrmse: 0.6923
Epoch 3 avg_val_loss: 0.2113  time: 2144s
Epoch 3 - ema_content_rmse: 0.5094 - ema_wording_rmse: 0.8703 - ema_mcrmse: 0.6898


Epoch: [4][0/644] Elapsed 0m 1s (remain 13m 3s) Loss: 0.0430(0.0430) Grad: 1.3232  LR: 0.00000230  


Epoch 4 avg_val_loss: 0.2095  time: 93s
Epoch 4 - content_rmse: 0.5156 - wording_rmse: 0.8623 - mcrmse: 0.6889
Epoch 4 avg_val_loss: 0.2111  time: 185s
Epoch 4 - ema_content_rmse: 0.5099 - ema_wording_rmse: 0.8695 - ema_mcrmse: 0.6897


Epoch: [4][100/644] Elapsed 4m 49s (remain 25m 56s) Loss: 0.0099(0.0187) Grad: 0.9582  LR: 0.00000167  


Epoch 4 avg_val_loss: 0.2074  time: 381s
Epoch 4 - content_rmse: 0.5033 - wording_rmse: 0.8609 - mcrmse: 0.6821
Epoch 4 avg_val_loss: 0.2032  time: 473s
Epoch 4 - ema_content_rmse: 0.5067 - ema_wording_rmse: 0.8445 - ema_mcrmse: 0.6756


Epoch: [4][200/644] Elapsed 9m 37s (remain 21m 13s) Loss: 0.0159(0.0188) Grad: 1.0341  LR: 0.00000113  


Epoch 4 avg_val_loss: 0.2067  time: 670s
Epoch 4 - content_rmse: 0.5055 - wording_rmse: 0.8569 - mcrmse: 0.6812
Epoch 4 avg_val_loss: 0.2091  time: 761s
Epoch 4 - ema_content_rmse: 0.5088 - ema_wording_rmse: 0.8633 - ema_mcrmse: 0.6861


Epoch: [4][300/644] Elapsed 14m 26s (remain 16m 27s) Loss: 0.0247(0.0181) Grad: 1.4809  LR: 0.00000068  


Epoch 4 avg_val_loss: 0.2054  time: 958s
Epoch 4 - content_rmse: 0.5037 - wording_rmse: 0.8535 - mcrmse: 0.6786
Epoch 4 avg_val_loss: 0.2062  time: 1050s
Epoch 4 - ema_content_rmse: 0.5055 - ema_wording_rmse: 0.8550 - ema_mcrmse: 0.6802


Epoch: [4][400/644] Elapsed 19m 14s (remain 11m 39s) Loss: 0.0156(0.0178) Grad: 1.2359  LR: 0.00000035  


Epoch 4 avg_val_loss: 0.2082  time: 1246s
Epoch 4 - content_rmse: 0.5051 - wording_rmse: 0.8626 - mcrmse: 0.6838
Epoch 4 avg_val_loss: 0.2092  time: 1338s
Epoch 4 - ema_content_rmse: 0.5047 - ema_wording_rmse: 0.8662 - ema_mcrmse: 0.6855


Epoch: [4][500/644] Elapsed 24m 3s (remain 6m 51s) Loss: 0.0136(0.0179) Grad: 1.5525  LR: 0.00000012  


Epoch 4 avg_val_loss: 0.2045  time: 1535s
Epoch 4 - content_rmse: 0.5042 - wording_rmse: 0.8509 - mcrmse: 0.6775
Epoch 4 avg_val_loss: 0.2050  time: 1627s
Epoch 4 - ema_content_rmse: 0.5043 - ema_wording_rmse: 0.8524 - ema_mcrmse: 0.6784


Epoch: [4][600/644] Elapsed 28m 51s (remain 2m 3s) Loss: 0.0122(0.0177) Grad: 0.7535  LR: 0.00000001  


Epoch 4 avg_val_loss: 0.2054  time: 1823s
Epoch 4 - content_rmse: 0.5040 - wording_rmse: 0.8539 - mcrmse: 0.6790
Epoch 4 avg_val_loss: 0.2054  time: 1915s
Epoch 4 - ema_content_rmse: 0.5041 - ema_wording_rmse: 0.8538 - ema_mcrmse: 0.6790


Epoch: [4][643/644] Elapsed 32m 40s (remain 0m 0s) Loss: 0.0084(0.0178) Grad: 0.9220  LR: 0.00000000  


Epoch 4 avg_val_loss: 0.2055  time: 2052s
Epoch 4 - content_rmse: 0.5041 - wording_rmse: 0.8543 - mcrmse: 0.6792
Epoch 4 avg_val_loss: 0.2055  time: 2144s
Epoch 4 - ema_content_rmse: 0.5041 - ema_wording_rmse: 0.8542 - ema_mcrmse: 0.6791


In [20]:
## total_complex = []
# for fold in range(4):
#     va_data = train_df[train_df['fold'] == fold]
#     preds = torch.load('/content/drive/MyDrive/deb_simple/microsoft_deberta-v3-large_best{}.pth'.format(fold))['predictions']
#     va_data['preds'] = preds
#     va_data = va_data[['id', 'preds', 'score']]
#     print(compute_metrics(va_data['preds'].values.reshape(-1,1), va_data['score'].values))
#     total_complex.append(va_data)
# total_complex = pd.concat(total_complex)
# compute_metrics(total_complex['preds'].values.reshape(-1,1), total_complex['score'].values)

In [21]:
# !mkdir -p /root/.kaggle
# !cp /content/drive/MyDrive/kaggle/kaggle.json /root/.kaggle/
# !chmod 600 /root/.kaggle/kaggle.json
# !kaggle datasets init -p /content/drive/MyDrive/elc_mean/

In [22]:
#!kaggle datasets create -p /content/drive/MyDrive/elc_mean/

In [23]:
# deberta v3 large
# 1.5 0.8228
# 2 0.8197

# 1.5  8137
#2 8175
#2.5 8181
#3 8181
#3.5 8175

