## Log
* Use a maximum sequence length of 512 tokens instead of 1024 (`CFG.max_input_length`).

In [1]:
import argparse
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedGroupKFold
import shutil
import time
import gc
import random
import math
import torch
from torch.utils.data import DataLoader, Dataset
import transformers
from transformers import TrainingArguments, Trainer, DataCollatorForWholeWordMask
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, AutoModel
from transformers.modeling_outputs  import BaseModelOutput,SequenceClassifierOutput
from torch import nn
from torch.optim import Adam, SGD, AdamW
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup, DataCollatorWithPadding
# imports the torch_xla package
import wandb
from torch.nn.parameter import Parameter
#os.environ["WANDB_DISABLED"] = "true"

In [2]:
def parse_args():
    """Parse the command-line options of a training run.

    Returns:
        argparse.Namespace: has the attributes ``fold`` and ``batch_size``;
        each is an ``int`` when given on the command line and ``None`` otherwise.
    """
    cli = argparse.ArgumentParser(description='Model training')
    # params of training, declared as (flags, keyword arguments) pairs
    option_specs = (
        (("--fold",),
         dict(dest="fold", help="Train fold", default=None, type=int)),
        (('--batch_size',),
         dict(dest='batch_size', help='Mini batch size of one gpu or cpu', type=int, default=None)),
    )
    for flags, options in option_specs:
        cli.add_argument(*flags, **options)
    return cli.parse_args()


# Config

In [3]:
class CFG:
    # Central, class-level configuration read by the rest of the notebook.
    # When True, train_loop trains on the pseudo-labelled CSV and validates on the full labelled frame.
    pretraining = False
    # When True, build_model loads a saved state dict into the 'pool' model.
    load_pretrained = False
    # Directory that holds prompts_train.csv and summaries_train.csv.
    input_path = './input/'
    # '1': prompt = title + text + question joined by the separator token; anything else: question only.
    input_type = '2'
    model_path = 'microsoft/deberta-v3-large' # other checkpoints noted by the author: nghuyong/ernie-2.0-large-en, studio-ousia/luke-large
    # Architecture selector understood by build_model: 'base', 'simple' or 'pool'.
    model_type = 'pool'
    # The scheduler settings below are presumably consumed by the scheduler setup further down (not shown here).
    scheduler = 'cosine'  # ['linear', 'cosine']
    batch_scheduler = True
    num_cycles = 0.5  # 1.5
    num_warmup_steps = 0
    # Token budget used by the tokenizer (padding and truncation length).
    max_input_length = 512
    # Written into the backbone config as max_position_embeddings.
    max_position_embeddings = 512
    # Validation folds that train_loop iterates over.
    folds = [2,3]
    epochs = 4  # 5
    # layer-wise learning rate (see get_optimizer_llr_params)
    discriminative_learning_rate = False
    discriminative_learning_rate_num_groups = 1
    discriminative_learning_rate_decay_rate = 0.99
    # number of last encoder layers that build_model re-initialises (0 disables it)
    reinit_layers = 0
    
    encoder_lr = 5e-6
    head_lr = 5e-6
    # min_lr, eps and betas are not referenced in the visible part of the notebook;
    # presumably optimizer/scheduler settings -- TODO confirm.
    min_lr = 1e-7
    eps = 1e-7
    betas = (0.9, 0.999)
    weight_decay = 0
    # Used for both hidden_dropout_prob and attention_probs_dropout_prob of the backbone config.
    dropout = 0
    num_fold = 5
    # Training batch size; the validation loader uses twice this value.
    batch_size = 4
    seed = 42
    # Destination of the log file and of the saved checkpoints.
    OUTPUT_DIR = './pretrain/'
    num_workers = 2
    device='cuda'
    # train_fn prints progress and runs a validation pass every print_freq steps.
    print_freq = 100
    
    

## logger

In [4]:
def get_logger(filename=CFG.OUTPUT_DIR+ 'train'):
    """Return the module logger, writing bare messages to the console and to ``<filename>.log``.

    The function is idempotent: calling it again (for example when the notebook
    cell is re-run) does not attach a second console handler or a second file
    handler for the same log file, so messages are never duplicated.

    Args:
        filename: Log file path without the ``.log`` suffix. The default is
            computed once, when the function is defined, from ``CFG.OUTPUT_DIR``.

    Returns:
        logging.Logger: the logger named ``__name__`` at level ``INFO``.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    # exist_ok avoids the check-then-create race of the exists()/makedirs() pair.
    os.makedirs(CFG.OUTPUT_DIR, exist_ok=True)
    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # FileHandler stores its target as an absolute path in ``baseFilename``.
    log_path = os.path.abspath(f"{filename}.log")
    # FileHandler subclasses StreamHandler, hence the exact type check for the console handler.
    has_console = any(type(h) is StreamHandler for h in logger.handlers)
    has_file = any(
        isinstance(h, FileHandler) and h.baseFilename == log_path
        for h in logger.handlers
    )

    if not has_console:
        console_handler = StreamHandler()
        console_handler.setFormatter(Formatter("%(message)s"))
        logger.addHandler(console_handler)
    if not has_file:
        file_handler = FileHandler(filename=f"{filename}.log")
        file_handler.setFormatter(Formatter("%(message)s"))
        logger.addHandler(file_handler)
    return logger

# Create the shared logger and record the key hyper-parameters of this run.
LOGGER = get_logger()
LOGGER.info('===============lr_{}==============='.format(CFG.encoder_lr))
LOGGER.info('===============seed_{}==============='.format(CFG.seed))
LOGGER.info('===============total_epochs_{}==============='.format(CFG.epochs))
LOGGER.info('===============num_warmup_steps_{}==============='.format(CFG.num_warmup_steps))



# Preproc

In [5]:
# Read the prompt table and the summary table from CFG.input_path.
pdf, sdf = (
    pd.read_csv(csv_path)
    for csv_path in (
        f"{CFG.input_path}/prompts_train.csv",
        f"{CFG.input_path}/summaries_train.csv",
    )
)

# Join the two tables on prompt_id.
df = pd.merge(pdf, sdf, on="prompt_id")

# 4 prompt ids, 4 folds: each prompt id gets its own fold index.
id2fold = {
    prompt_id: fold_index
    for fold_index, prompt_id in enumerate(("814d6b", "39c16e", "3b9047", "ebad26"))
}

df["fold"] = df["prompt_id"].map(id2fold)

In [6]:
# Display the merged frame, including the new `fold` column.
df 

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,student_id,text,content,wording,fold
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00791789cc1f,1 element of an ideal tragedy is that it shoul...,-0.210614,-0.471415,1
1,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0086ef22de8f,The three elements of an ideal tragedy are: H...,-0.970237,-0.417058,1
2,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0094589c7a22,Aristotle states that an ideal tragedy should ...,-0.387791,-0.584181,1
3,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00cd5736026a,One element of an Ideal tragedy is having a co...,0.088882,-0.594710,1
4,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00d98b8ff756,The 3 ideal of tragedy is how complex you need...,-0.687288,-0.460886,1
...,...,...,...,...,...,...,...,...,...
7160,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",ff37545b2805,"In paragraph two, they would use pickle meat a...",1.520355,-0.292990,3
7161,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",ff4ed38ef099,"in the first paragraph it says ""either can it...",-1.204574,-1.169784,3
7162,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",ff53b94f7ce0,They would have piles of filthy meat on the fl...,0.328739,-1.053294,3
7163,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",ff7c7e70df07,They used all sorts of chemical concoctions to...,0.205683,0.380538,3


In [7]:
class AverageMeter(object):
    """Track the most recent value together with the weighted running mean.

    Attributes:
        val: the value passed to the latest ``update`` call.
        sum: weighted sum of all values seen since the last ``reset``.
        count: total weight seen since the last ``reset``.
        avg: ``sum / count``.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running mean."""
        running_sum = self.sum + val * n
        running_count = self.count + n
        self.val = val
        self.sum = running_sum
        self.count = running_count
        self.avg = running_sum / running_count


def asMinutes(s):
    """Format a duration of ``s`` seconds as ``'<minutes>m <seconds>s'``.

    Minutes are the floor of ``s / 60``; the remaining seconds are rendered
    with ``%d``, so any fractional part is dropped.
    """
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Describe the time elapsed since ``since`` and the estimated time remaining.

    Args:
        since: start timestamp as returned by ``time.time()``.
        percent: fraction of the work completed so far; must be non-zero
            because the total duration is extrapolated as ``elapsed / percent``.

    Returns:
        str: ``'<elapsed> (remain <remaining>)'`` with both parts formatted by ``asMinutes``.
    """
    elapsed = time.time() - since
    projected_total = elapsed / (percent)
    remaining = projected_total - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))

# Tokenizer

In [8]:
# Load the tokenizer that belongs to the checkpoint named in CFG.model_path.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)

'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /microsoft/deberta-v3-large/resolve/main/tokenizer_config.json (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f054c453b80>, 'Connection to huggingface.co timed out. (connect timeout=10)'))"), '(Request ID: 068386ae-1672-4ba6-bacd-8a3ad46acbc7)')' thrown while requesting HEAD https://huggingface.co/microsoft/deberta-v3-large/resolve/main/tokenizer_config.json
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Dataset

In [9]:
# Masking collator from transformers, built on the notebook's tokenizer.
mask_lm_datacollator = DataCollatorForWholeWordMask(tokenizer)
def data_collator(batch):
    """Collate indexable samples into a 4-tuple whose input ids have been masked.

    Each sample is read by position: 0 is wrapped as ``{'input_ids': ...}`` and
    passed to ``mask_lm_datacollator``; positions 1, 2 and 3 are stacked with
    ``torch.stack`` (judging by the original local names they are the token
    type ids, the attention mask and the labels -- TODO confirm with the caller).

    Returns:
        tuple: (masked input ids, stacked[1], stacked[2], stacked[3]).
    """
    def column(position):
        # Gather one positional field across the whole batch.
        return [sample[position] for sample in batch]

    features = [{'input_ids': ids} for ids in column(0)]
    segment_ids = column(1)
    masks = column(2)
    targets = column(3)
    masked_ids = mask_lm_datacollator(features)['input_ids']
    return (
        masked_ids,
        torch.stack(segment_ids),
        torch.stack(masks),
        torch.stack(targets),
    )

In [10]:
class TrainDataset(Dataset):
    """Dataset yielding one tokenized (summary, prompt) pair with its two regression targets.

    Args:
        df: frame with the columns ``prompt_title``, ``prompt_text``,
            ``prompt_question``, ``text``, ``content`` and ``wording``.
        tokenizer: callable tokenizer exposing ``sep_token``; it is the only
            tokenizer this dataset uses.
    """

    def __init__(self, df, tokenizer):
        self.prompt_title = df['prompt_title'].values.astype(str)
        self.prompt_text = df['prompt_text'].values.astype(str)
        self.prompt_question = df['prompt_question'].values.astype(str)
        self.text = df['text'].values.astype(str)
        self.content = df['content'].values
        self.wording = df['wording'].values
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.prompt_title)

    def tokenize(self, example):
        """Tokenize one example dict and attach its ``[content, wording]`` labels.

        The student text is the first segment and the prompt the second. With
        ``CFG.input_type == '1'`` the prompt is title, text and question joined
        by the separator token; otherwise it is the question alone. Sequences
        are padded and truncated to ``CFG.max_input_length``.

        Returns:
            dict: the tokenizer output plus a ``"labels"`` list of two floats.
        """
        sep = self.tokenizer.sep_token
        if CFG.input_type == '1':
            prompt = sep.join([example["prompt_title"], example["prompt_text"], example["prompt_question"]])
        else:
            prompt = example["prompt_question"]

        labels = [float(example["content"]), float(example["wording"])]

        # Use the injected tokenizer, not a module-level global, so the dataset
        # works with whatever tokenizer it was constructed with.
        tokenized = self.tokenizer(
            example["text"],
            prompt,
            padding='max_length',
            truncation=True,
            max_length=CFG.max_input_length,
            return_tensors=None,
        )

        return {
            **tokenized,
            "labels": labels,
        }

    def __getitem__(self, item):
        """Return the tensors of row ``item``: input ids, token type ids, attention mask (long) and labels (float)."""
        example = {
            "prompt_title": self.prompt_title[item],
            "prompt_text": self.prompt_text[item],
            "prompt_question": self.prompt_question[item],
            "text": self.text[item],
            "content": self.content[item],
            "wording": self.wording[item],
        }

        out = self.tokenize(example)

        return {
            'input_ids': torch.as_tensor(out['input_ids'], dtype=torch.long),
            'token_type_ids': torch.as_tensor(out['token_type_ids'], dtype=torch.long),
            'attention_mask': torch.as_tensor(out['attention_mask'], dtype=torch.long),
            'labels': torch.as_tensor(out['labels'], dtype=torch.float),
        }
        
        
        

## Model

In [11]:
def init_params(module_lst):
    """Apply Xavier-uniform initialisation, in place, to every parameter with more than one dimension.

    Parameters with a single dimension (e.g. bias vectors) are left untouched.

    Args:
        module_lst: iterable of ``nn.Module`` objects.
    """
    for module in module_lst:
        matrices = (weight for weight in module.parameters() if weight.dim() > 1)
        for weight in matrices:
            torch.nn.init.xavier_uniform_(weight)

class Custom_Bert(nn.Module):
    """Backbone with a learned mix of its last 24 hidden states, attention pooling and a 1-output head.

    ``forward`` returns the head output when ``labels`` is None, otherwise the
    tuple ``(MSE loss, output)``.
    """
    def __init__(self):
        super().__init__()

        config = AutoConfig.from_pretrained(CFG.model_path)
        # The forward pass reads every layer's hidden state, so they must be returned.
        config.update({"output_hidden_states":True})

        self.base = AutoModel.from_pretrained(CFG.model_path, config=config)

        dim = config.hidden_size

        self.dropout = nn.Dropout(p=0.2)
        self.high_dropout = nn.Dropout(p=0.5)

        # One learnable mixing logit per hidden state; all but the last start at -3,
        # so after the softmax the final layer dominates initially.
        # NOTE(review): 24 is hard-coded -- assumes the backbone returns at least 24 hidden states.
        n_weights = 24
        weights_init = torch.zeros(n_weights).float()
        weights_init.data[:-1] = -3
        self.layer_weights = torch.nn.Parameter(weights_init)

        # Scores each token; the softmax over dim=1 turns the scores into pooling weights.
        self.attention = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size),
            nn.Tanh(),
            nn.Linear(config.hidden_size, 1),
            nn.Softmax(dim=1)
        )
        self.cls = nn.Sequential(
            nn.Linear(dim,1)
        )
        init_params([self.cls,self.attention])

    def forward(self, input_ids, attention_mask, labels=None):
        base_output = self.base(input_ids=input_ids,
                                attention_mask=attention_mask,
                                )

        # Stack the (dropout-regularised) last 24 hidden states along a new leading axis.
        cls_outputs = torch.stack(
            [self.dropout(layer) for layer in base_output['hidden_states'][-24:]], dim=0
        )
        # Weighted sum over the layer axis using the softmaxed mixing logits.
        cls_output = (torch.softmax(self.layer_weights, dim=0).unsqueeze(1).unsqueeze(1).unsqueeze(1) * cls_outputs).sum(0)

        # Multi-sample dropout: attention-pool five times with independent dropout masks and average.
        logits = torch.mean(
            torch.stack(
                [torch.sum(self.attention(self.high_dropout(cls_output)) * cls_output, dim=1) for _ in range(5)],
                dim=0,
            ),
            dim=0,
        )

        output = self.cls(logits)
        if labels is None:
            return output

        else:
            return (nn.MSELoss()(torch.squeeze(output,1),labels), output)


class Custom_Bert_Simple(nn.Module):
    """Backbone + mean pooling over the sequence axis + a linear head with two outputs.

    The backbone is loaded from ``CFG.model_path`` with dropout probabilities and
    ``max_position_embeddings`` taken from ``CFG``.
    """
    def __init__(self):
        super().__init__()

        config = AutoConfig.from_pretrained(CFG.model_path)
        config.update({
            "hidden_dropout_prob": CFG.dropout,
            "attention_probs_dropout_prob": CFG.dropout,
            "num_labels": 2,
            "problem_type": "regression",
            "max_position_embeddings": CFG.max_position_embeddings
        })
        self.base = AutoModel.from_pretrained(CFG.model_path, config=config)
        dim = config.hidden_size
        # Kept for attribute compatibility; it is not applied in forward.
        self.dropout = nn.Dropout(p=0)
        self.cls = nn.Linear(dim,2)

    def forward(self, input_ids, attention_mask, token_type_ids, labels=None):
        """Run the model and return a ``SequenceClassifierOutput``.

        Args:
            input_ids, attention_mask, token_type_ids: forwarded to the backbone.
            labels: optional regression targets. When omitted (inference) the
                returned ``loss`` is ``None`` instead of the call failing.

        Returns:
            SequenceClassifierOutput with ``loss`` (MSE, or ``None`` without
            labels) and ``logits`` (the head output).
        """
        base_output = self.base(input_ids=input_ids,
                                attention_mask=attention_mask,
                                token_type_ids=token_type_ids
                               )
        # NOTE(review): the mean runs over every position, padding included
        # (attention_mask is not applied here) -- confirm this is intended.
        pooled = torch.mean(base_output.last_hidden_state, dim=1)
        output = self.cls(pooled)
        loss = nn.MSELoss()(output, labels) if labels is not None else None
        return SequenceClassifierOutput(
            loss=loss,
            logits=output,
            hidden_states=None,
            attentions=None
        )

class GeMText(nn.Module):
    """Generalised-mean pooling over one axis of a hidden-state tensor, restricted by a mask.

    Args:
        dim: axis that is reduced.
        p: initial value of the learnable exponent.
        eps: lower clamp for the hidden values and for the mask sum.
    """
    def __init__(self, dim = 1, p=3, eps=1e-6):
        super().__init__()
        self.dim = dim
        self.p = Parameter(torch.ones(1) * p)
        self.eps = eps
        self.feat_mult = 1

    def forward(self, last_hidden_state, attention_mask):
        """Return ``(sum((clamp(h) * mask) ** p) / sum(mask)) ** (1 / p)`` reduced along ``self.dim``."""
        # Broadcast the mask to the shape of the hidden states.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.shape)
        powered = (last_hidden_state.clamp(min=self.eps) * mask).pow(self.p)
        mask_total = mask.sum(self.dim).clip(min=self.eps)
        pooled = powered.sum(self.dim) / mask_total
        return pooled.pow(1 / self.p)

class Custom_Bert_Pool(nn.Module):
    """Backbone + GeM pooling + a linear head with two outputs, trained with Smooth-L1 loss.

    The backbone configuration comes from ``CFG.model_path`` (with dropout and
    ``max_position_embeddings`` overridden from ``CFG``); the weights are read
    from a local pre-trained directory.
    """
    def __init__(self):
        super().__init__()

        self.config = AutoConfig.from_pretrained(CFG.model_path)
        self.config.update({
            "hidden_dropout_prob": CFG.dropout,
            "attention_probs_dropout_prob": CFG.dropout,
            "num_labels": 2,
            "problem_type": "regression",
            "max_position_embeddings": CFG.max_position_embeddings
        })
        #self.base = AutoModel.from_pretrained(CFG.model_path, config=self.config)
        print('load pretrained model ...')
        # NOTE(review): the weight directory is hard-coded and independent of
        # CFG.model_path -- confirm it matches the configured architecture.
        self.base = AutoModel.from_pretrained('./input/pretrain/pretrained_model', config = self.config)

        self.pool = GeMText()
        self.cls = nn.Linear(self.config.hidden_size,2)

    def forward(self, input_ids, attention_mask, token_type_ids, labels=None):
        """Run the model and return a ``SequenceClassifierOutput``.

        Args:
            input_ids, attention_mask, token_type_ids: forwarded to the backbone;
                ``attention_mask`` is also used by the pooling layer.
            labels: optional regression targets. When omitted (inference) the
                returned ``loss`` is ``None`` instead of the call failing.

        Returns:
            SequenceClassifierOutput with ``loss`` (Smooth-L1, or ``None``
            without labels) and ``logits`` (the head output).
        """
        base_output = self.base(input_ids=input_ids,
                                attention_mask=attention_mask,
                                token_type_ids=token_type_ids
                               )
        output = base_output.last_hidden_state
        output = self.cls(self.pool(output, attention_mask))
        loss = nn.SmoothL1Loss()(output, labels) if labels is not None else None
        return SequenceClassifierOutput(
            loss=loss,
            logits=output,
            hidden_states=None,
            attentions=None
        )

    def _init_weights(self, module):
        """Re-initialise one sub-module in place; intended for ``Module.apply``.

        Linear and Embedding weights are drawn from a normal distribution with
        std ``config.initializer_range`` (biases and the padding row are zeroed);
        LayerNorm is reset to weight 1 and bias 0. Other module types are ignored.
        """
        if isinstance(module, nn.Linear):
            print(f'Re-initialize {module}')
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            print(f'Re-initialize {module}')
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            print(f'Re-initialize {module}')
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

class Custom_Bert_Mean(nn.Module):
    """Backbone + mean over the sequence axis of the last hidden state + a 1-output linear head."""
    def __init__(self):
        super().__init__()

        backbone_config = AutoConfig.from_pretrained(CFG.model_path)
        backbone_config.output_hidden_states=True
        self.base = AutoModel.from_pretrained(CFG.model_path, config=backbone_config)
        self.dropout = nn.Dropout(p=0)
        self.cls = nn.Linear(backbone_config.hidden_size,1)

    def forward(self, input_ids, attention_mask,labels=None):
        """Return the head output, or ``(MSE loss, output)`` when ``labels`` is given."""
        encoded = self.base(input_ids=input_ids, attention_mask=attention_mask)

        final_layer = encoded.hidden_states[-1]
        pooled = torch.mean(final_layer, dim=1)
        output = self.cls(self.dropout(pooled))
        if labels is not None:
            return (nn.MSELoss()(torch.squeeze(output,1),labels), output)
        return output

class Custom_Bert_M(nn.Module):
    """Variant of the layer-mixing / attention-pooling model with two heads.

    ``cls_0`` is a 1-output regression head and ``cls_1`` a 5-way classification
    head; the training loss is ``0.8 * MSE + 0.2 * cross-entropy``.
    """
    def __init__(self):
        super().__init__()

        config = AutoConfig.from_pretrained(CFG.model_path)
        # The forward pass reads every layer's hidden state, so they must be returned.
        config.update({"output_hidden_states":True})

        self.base = AutoModel.from_pretrained(CFG.model_path, config=config)

        dim = config.hidden_size

        self.dropout = nn.Dropout(p=0.2)
        self.high_dropout = nn.Dropout(p=0.5)

        # One learnable mixing logit per hidden state; all but the last start at -3.
        # NOTE(review): 24 is hard-coded -- assumes the backbone returns at least 24 hidden states.
        n_weights = 24
        weights_init = torch.zeros(n_weights).float()
        weights_init.data[:-1] = -3
        self.layer_weights = torch.nn.Parameter(weights_init)

        # Scores each token; the softmax over dim=1 turns the scores into pooling weights.
        self.attention = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size),
            nn.Tanh(),
            nn.Linear(config.hidden_size, 1),
            nn.Softmax(dim=1)
        )
        self.cls_0 = nn.Sequential(
            nn.Linear(dim,1)
        )

        self.cls_1 = nn.Linear(dim,5)
        init_params([self.cls_0,self.cls_1,self.attention])

    def forward(self, input_ids, attention_mask, labels):
        # `labels` has no default, but None is still accepted and handled below.
        base_output = self.base(input_ids=input_ids,
                    attention_mask=attention_mask,
                             )

        # Stack the (dropout-regularised) last 24 hidden states along a new leading axis.
        cls_outputs = torch.stack(
            [self.dropout(layer) for layer in base_output['hidden_states'][-24:]], dim=0
        )
        # Weighted sum over the layer axis using the softmaxed mixing logits.
        cls_output = (torch.softmax(self.layer_weights, dim=0).unsqueeze(1).unsqueeze(1).unsqueeze(1) * cls_outputs).sum(0)

        # Multi-sample dropout: attention-pool five times with independent dropout masks and average.
        logits = torch.mean(
            torch.stack(
                [torch.sum(self.attention(self.high_dropout(cls_output)) * cls_output, dim=1) for _ in range(5)],
                dim=0,
            ),
            dim=0,
        )

        output_0 = self.cls_0(logits)
        output_1 = self.cls_1(logits)
        if labels is None:
            return output_0

        else:
            regression_loss = nn.MSELoss()(torch.squeeze(output_0,1),labels)
            # Convert the targets to class indices: 0.25 -> 1, 0.5 -> 2, 0.75 -> 3, 1.0 -> 4;
            # every other value is only truncated by .long().
            # NOTE(review): presumably the targets take exactly the values 0, 0.25, 0.5, 0.75, 1 -- confirm.
            labels = labels.double()
            cls_labels = torch.where(labels==1.,4.0,labels)
            cls_labels = torch.where(cls_labels==0.25,1.0,cls_labels)
            cls_labels = torch.where(cls_labels==0.5,2.0,cls_labels)
            cls_labels = torch.where(cls_labels==0.75,3.0,cls_labels)
            cls_labels = cls_labels.long()
            cls_loss = nn.CrossEntropyLoss()(output_1, cls_labels)
            return ( 0.8 * regression_loss + 0.2 * cls_loss, output_0)

In [12]:
def build_model():
    """Instantiate the model selected by ``CFG.model_type``.

    * ``'base'``   -- ``AutoModelForSequenceClassification`` with two regression outputs.
    * ``'simple'`` -- ``Custom_Bert_Simple``.
    * ``'pool'``   -- ``Custom_Bert_Pool``; optionally re-initialises the last
      ``CFG.reinit_layers`` encoder layers and, when ``CFG.load_pretrained`` is
      set, loads a saved state dict.

    Returns:
        The constructed model (not yet moved to a device).

    Raises:
        ValueError: if ``CFG.model_type`` is not one of the supported values
            (previously this surfaced as an ``UnboundLocalError`` at ``return``).
    """
    if CFG.model_type == 'base':
        model_config = AutoConfig.from_pretrained(CFG.model_path)
        model_config.update({
            "hidden_dropout_prob": CFG.dropout,
            "attention_probs_dropout_prob": CFG.dropout,
            "num_labels": 2,
            "problem_type": "regression",
            "max_position_embeddings": CFG.max_position_embeddings
        })

        #print(model_config)
        model = AutoModelForSequenceClassification.from_pretrained(
            CFG.model_path, config=model_config
        )
    elif CFG.model_type == 'simple':
        model = Custom_Bert_Simple()
    elif CFG.model_type == 'pool':
        model = Custom_Bert_Pool()
        if CFG.reinit_layers > 0:
            print("=="*40)
            print(f"Reinitialize the last {CFG.reinit_layers} layer(s).")
            for layer in model.base.encoder.layer[-CFG.reinit_layers:]:
                print("===")
                layer.apply(model._init_weights)
            print("=="*40)
        if CFG.load_pretrained:
            # map_location='cpu' lets a checkpoint saved on a GPU load on any machine;
            # load_state_dict then copies the tensors into the model's own parameters.
            # NOTE(review): the file name refers to deberta-v3-base -- confirm it matches CFG.model_path.
            checkpoint = torch.load('./pretrained/microsoft_deberta-v3-base_best_ema.pth', map_location='cpu')
            model.load_state_dict(checkpoint['model'])
    else:
        raise ValueError(
            f"Unsupported CFG.model_type {CFG.model_type!r}; expected 'base', 'simple' or 'pool'."
        )
    return model

# Train

In [13]:
from copy import deepcopy
class ModelEMA:
    """Exponential moving average of a model's ``state_dict`` (parameters and buffers).

    Adapted from https://github.com/rwightman/pytorch-image-models; it mirrors
    the idea of TensorFlow's
    https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage.
    Some training schemes need such a smoothed copy of the weights to perform
    well. The averaged copy is taken with ``deepcopy`` at construction time, so
    where this object is created relative to model init, GPU assignment and
    distributed wrappers matters.
    """

    def __init__(self, model, decay=0.9999, updates=0):
        """
        Args:
            model (nn.Module): model whose weights are averaged.
            decay (float): asymptotic EMA decay rate.
            updates (int): number of EMA updates already performed.
        """
        # Create EMA(FP32): an eval-mode deep copy reachable under two names.
        shadow = deepcopy(model).eval()
        self.ema_model = shadow
        self.ema = shadow
        self.updates = updates

        def ramped_decay(step):
            # Exponential ramp towards `decay` (to help early epochs).
            return decay * (1 - math.exp(-step / 2000))

        self.decay = ramped_decay
        # The averaged copy is never optimised directly.
        for frozen in shadow.parameters():
            frozen.requires_grad_(False)

    def update(self, model):
        """Blend the current weights of ``model`` into the averaged copy, in place."""
        # Update EMA parameters
        with torch.no_grad():
            self.updates += 1
            d = self.decay(self.updates)
            live_state = model.state_dict()
            for key, averaged in self.ema.state_dict().items():
                # Integer entries are left as they are.
                if not averaged.dtype.is_floating_point:
                    continue
                averaged *= d
                averaged += (1.0 - d) * live_state[key].detach()

class EMAHook:
    """EMAHook used in BEVDepth.

    Modified from https://github.com/Megvii-Base
    Detection/BEVDepth/blob/main/callbacks/ema.py.

    Wraps a ``ModelEMA`` and updates it after every training iteration.

    Args:
        model: model whose weights are averaged.
        init_updates: initial value of the EMA update counter.
        decay: decay passed to ``ModelEMA``.
        resume: optional path of an EMA checkpoint with the keys
            ``'state_dict'`` and ``'updates'``.
        logger: optional logger used to announce the resume.
    """

    def __init__(self, model, init_updates=0, decay=0.9990, resume=None, logger=None):
        super().__init__()
        self.init_updates = init_updates
        self.resume = resume
        self.decay = decay
        # Must be set before before_run(), which logs when resuming.
        self.logger = logger
        self.ema_model = self.before_run(model)

    def before_run(self, model):
        """Create the ``ModelEMA`` for ``model`` and, if requested, restore it from ``self.resume``."""
        from torch.nn.modules.batchnorm import SyncBatchNorm

        # Detach the process groups of SyncBatchNorm layers while ModelEMA
        # deep-copies the model, then put them back.
        bn_model_list = list()
        bn_model_dist_group_list = list()
        for model_ref in model.modules():
            if isinstance(model_ref, SyncBatchNorm):
                bn_model_list.append(model_ref)
                bn_model_dist_group_list.append(model_ref.process_group)
                model_ref.process_group = None
        ema_model = ModelEMA(model, self.decay)

        for bn_model, dist_group in zip(bn_model_list,
                                        bn_model_dist_group_list):
            bn_model.process_group = dist_group
        ema_model.updates = self.init_updates

        if self.resume is not None:
            if self.logger is not None:
                self.logger.info(f'resume ema checkpoint from {self.resume}')
            cpt = torch.load(self.resume, map_location='cpu')
            # Use the module's own loader; no free function `load_state_dict` exists here.
            ema_model.ema.load_state_dict(cpt['state_dict'])
            ema_model.updates = cpt['updates']

        return ema_model

    def after_train_iter(self, model):
        """Fold the current weights of ``model`` into the moving average."""
        self.ema_model.update(model)

In [14]:
def compute_mcrmse(eval_pred):
    """
    Mean columnwise root mean squared error (MCRMSE) of a ``(preds, labels)`` pair.
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation

    The RMSE is taken along axis 0; column 0 is reported as ``content_rmse``,
    column 1 as ``wording_rmse`` and their mean as ``mcrmse`` (in that key order).
    """
    predictions, targets = eval_pred

    per_column_mse = np.mean((predictions - targets) ** 2, axis=0)
    per_column_rmse = np.sqrt(per_column_mse)

    return dict(
        content_rmse=per_column_rmse[0],
        wording_rmse=per_column_rmse[1],
        mcrmse=np.mean(per_column_rmse),
    )

In [15]:
def seed_everything(seed=42):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and CUDA) and request deterministic cuDNN.

    Also exports ``PYTHONHASHSEED`` with the same value.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True
# Seed all random number generators with the configured seed.
seed_everything(CFG.seed)

In [16]:
# Show the configured number of layer-wise learning-rate groups.
CFG.discriminative_learning_rate_num_groups

1

In [17]:
def get_optimizer_llr_params(model, type='s'):
    """
    Build the list of optimizer parameter groups for ``model``.

    Every group is a dict with ``params``, ``lr`` and ``weight_decay``. Parameters
    whose name contains ``bias``, ``LayerNorm.weight`` or ``LayerNorm.bias`` are
    placed in groups with ``weight_decay`` 0.0; all others use ``CFG.weight_decay``.

    * ``CFG.discriminative_learning_rate`` is True: separate groups for the
      embeddings, for each encoder layer, for encoder parameters outside
      ``layer*`` and for the head (names without ``'base'``), with learning
      rates scaled by powers of ``CFG.discriminative_learning_rate_decay_rate``.
      This branch reads ``model.config``, ``model.base.embeddings`` and
      ``model.base.encoder.layer``.
    * otherwise: two groups covering the whole network at ``CFG.encoder_lr``.

    The ``type`` argument is not used by the body.

    reference: https://github.com/huggingface/transformers/blob/05fa1a7ac17bb7aa07b9e0c1e138ecb31a28bbfe/src/transformers/trainer.py#L804
    """

    no_decay = ["bias", "LayerNorm.weight", "LayerNorm.bias"]
    ### ADDED
    if CFG.discriminative_learning_rate:

        num_layers = model.config.num_hidden_layers

        # One power per group, repeated for every layer of that group.
        learning_rate_powers = range(0, num_layers, num_layers//CFG.discriminative_learning_rate_num_groups)
        layer_wise_learning_rates = [
            pow(CFG.discriminative_learning_rate_decay_rate, power) * CFG.encoder_lr 
            for power in learning_rate_powers 
            for _ in range(num_layers//CFG.discriminative_learning_rate_num_groups)
          ]
        # Reverse so that index 0 (the first encoder layer) carries the highest power.
        layer_wise_learning_rates = layer_wise_learning_rates[::-1]
        print('Layer-wise learning rates:', layer_wise_learning_rates)

        # group embedding parameters from the transformer encoder;
        # their learning rate uses the power num_layers
        embedding_layer = model.base.embeddings
        optimizer_grouped_parameters = [
          {
              "params": [p for n, p in embedding_layer.named_parameters() if not any(nd in n for nd in no_decay)],
              "lr": pow(CFG.discriminative_learning_rate_decay_rate, num_layers) * CFG.encoder_lr ,
              "weight_decay": CFG.weight_decay,
          },
          {
              "params": [p for n, p in embedding_layer.named_parameters() if any(nd in n for nd in no_decay)],
              "lr": pow(CFG.discriminative_learning_rate_decay_rate, num_layers) * CFG.encoder_lr ,
              "weight_decay": 0.0,
          },
        ]

        # group encoding parameters from the transformer encoder, one decay/no-decay pair per layer
        encoding_layers = [layer for layer in model.base.encoder.layer]
        for i, layer in enumerate(encoding_layers):
            optimizer_grouped_parameters += [
                {
                    "params": [p for n, p in layer.named_parameters() if not any(nd in n for nd in no_decay)],
                    "lr": layer_wise_learning_rates[i],
                    "weight_decay": CFG.weight_decay,
                },
                {
                    "params": [p for n, p in layer.named_parameters() if any(nd in n for nd in no_decay)],
                    "lr": layer_wise_learning_rates[i],
                    "weight_decay": 0.0,
                },
            ]    
        # Encoder parameters that live outside encoder.layer get the last entry of the list.
        print(f"Detected unattached modules in model.encoder: {[n for n, p in model.base.encoder.named_parameters() if not n.startswith('layer')]}")
        optimizer_grouped_parameters += [
            {
                "params": [p for n, p in model.base.encoder.named_parameters() if not n.startswith('layer') and not any(nd in n for nd in no_decay)],
                "lr": layer_wise_learning_rates[-1],
                "weight_decay": CFG.weight_decay,
            },
            {
                "params": [p for n, p in model.base.encoder.named_parameters() if not n.startswith('layer') and any(nd in n for nd in no_decay)],
                "lr": layer_wise_learning_rates[-1],
                "weight_decay": 0.0,
            },
        ]

        # group parameters from the task specific head (every name that does not contain 'base')
        optimizer_grouped_parameters += [
            {
                "params": [p for n, p in model.named_parameters() if 'base' not in n and not any(nd in n for nd in no_decay)],
                "lr": CFG.head_lr,
                "weight_decay": CFG.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if 'base' not in n and any(nd in n for nd in no_decay)],
                "lr": CFG.head_lr,
                "weight_decay": 0.0,
            },
        ]
    ### END ADDED
    else:
        # group parameters for the entire network; CFG.head_lr is not used in this branch
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "lr": CFG.encoder_lr,
                "weight_decay": CFG.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "lr": CFG.encoder_lr,
                "weight_decay": 0.0,
            },
        ]
    return optimizer_grouped_parameters

In [18]:
def valid_fn(valid_loader, model, device):
    """Run one evaluation pass over ``valid_loader`` without gradients.

    Each batch is a dict of tensors that is moved to ``device`` in place and
    passed to the model as keyword arguments; the model output must expose
    ``loss`` and ``logits``.

    Returns:
        tuple: (sample-weighted average loss, concatenated predictions as a
        NumPy array, concatenated labels as a NumPy array).
    """
    loss_meter = AverageMeter()
    model.eval()
    pred_chunks, label_chunks = [], []
    for batch in valid_loader:
        for key in list(batch):
            batch[key] = batch[key].to(device)
        n_samples = batch['labels'].size(0)
        with torch.no_grad():
            result = model(**batch)
        loss_meter.update(result.loss.item(), n_samples)
        pred_chunks.append(result.logits.to('cpu').numpy())
        label_chunks.append(batch['labels'].to('cpu').numpy())
    return loss_meter.avg, np.concatenate(pred_chunks), np.concatenate(label_chunks)

def _checkpoint_path(tag, fold, score):
    """Return the checkpoint path for ``tag`` ('best' or 'best_ema'), ``fold`` and ``score``."""
    return CFG.OUTPUT_DIR + "{}_{}{}_{}.pth".format(CFG.model_path.replace('/', '_'), tag, fold, score)


def _evaluate_and_checkpoint(eval_model, valid_loader, epoch, start_time, fold, best_so_far, tag, log_prefix):
    """Validate ``eval_model``, log its scores and save it when it beats ``best_so_far``.

    Returns:
        tuple: (score dict from ``compute_mcrmse``, possibly updated best MCRMSE).
    """
    avg_val_loss, predictions, valid_labels = valid_fn(valid_loader, eval_model, CFG.device)
    score = compute_mcrmse((predictions, valid_labels))
    elapsed = time.time() - start_time

    LOGGER.info(
        f'Epoch {epoch + 1} avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
    LOGGER.info(
        f'Epoch {epoch + 1} - {log_prefix}content_rmse: {score["content_rmse"]:.4f}'
        f' - {log_prefix}wording_rmse: {score["wording_rmse"]:.4f}'
        f' - {log_prefix}mcrmse: {score["mcrmse"]:.4f}')

    if best_so_far > score['mcrmse']:
        previous_best = best_so_far
        best_so_far = score['mcrmse']
        LOGGER.info(f'Epoch {epoch + 1} - {log_prefix}Save Best Score: {best_so_far:.4f} Model')
        new_path = _checkpoint_path(tag, fold, best_so_far)
        torch.save({'model': eval_model.state_dict(),
                    'predictions': predictions},
                   new_path)
        # Delete the superseded checkpoint only after the new one is on disk,
        # and tolerate it having been removed already.
        if previous_best != float('inf'):
            old_path = _checkpoint_path(tag, fold, previous_best)
            if old_path != new_path and os.path.exists(old_path):
                os.remove(old_path)
    return score, best_so_far


def train_fn(train_loader, model, optimizer, epoch, scheduler, device, valid_loader, start_time, best_score, best_score_ema,ema_hook,wandb, fold):
    """Train ``model`` for one epoch, validating the model and its EMA copy periodically.

    Every ``CFG.print_freq`` steps (and on the last step) progress is printed,
    both the live model and the EMA model are validated, and whichever improves
    on its best MCRMSE is checkpointed under ``CFG.OUTPUT_DIR``.

    Args:
        train_loader, valid_loader: loaders yielding dicts of tensors with a ``'labels'`` entry.
        model: model returning an object with a ``loss`` attribute.
        optimizer, scheduler: stepped once per batch.
        epoch: epoch index; it is logged as ``epoch + 1``, so it is presumably
            0-based -- it also offsets the logged global step.
        device: device the training batches are moved to.
        start_time: reference timestamp for the elapsed-time log lines.
        best_score, best_score_ema: best MCRMSE so far for the model / EMA model
            (``float('inf')`` before the first evaluation).
        ema_hook: object with ``after_train_iter(model)`` and ``ema_model.ema``.
        wandb: logger object exposing ``log(dict)``.
        fold: fold identifier used in log entries and checkpoint names.

    Returns:
        tuple: (average training loss, best_score, best_score_ema).
    """
    model.train()
    losses = AverageMeter()
    start = time.time()
    steps_per_epoch = len(train_loader)
    # Continue the step counter across epochs instead of restarting at 0 on every call.
    global_step = epoch * steps_per_epoch
    for step, batch in enumerate(train_loader):
        for key, value in batch.items():
            batch[key] = value.to(device)
        batch_size = batch['labels'].size(0)
        loss = model(**batch).loss
        losses.update(loss.item(), batch_size)

        optimizer.zero_grad()
        loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 10)
        optimizer.step()
        ema_hook.after_train_iter(model)
        global_step += 1
        scheduler.step()

        wandb.log({
                'train loss': loss.item(),
                'step': global_step,
                'epoch': epoch,
                'fold': fold,
                'batch_size':CFG.batch_size
            })

        if step % CFG.print_freq == 0 or step == (steps_per_epoch - 1):
            # get_last_lr() is the supported way to read the LR outside scheduler.step().
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch + 1, step, steps_per_epoch,
                          remain=timeSince(start, float(step + 1) / steps_per_epoch),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_last_lr()[0]))

            # eval + scoring of the live model
            score, best_score = _evaluate_and_checkpoint(
                model, valid_loader, epoch, start_time, fold, best_score,
                tag='best', log_prefix='')

            # eval + scoring of the EMA model
            ema_score, best_score_ema = _evaluate_and_checkpoint(
                ema_hook.ema_model.ema, valid_loader, epoch, start_time, fold, best_score_ema,
                tag='best_ema', log_prefix='ema_')

            wandb.log({
                'learning rate': optimizer.param_groups[0]['lr'],
                'validation mcrmse': score['mcrmse'],
                'validation ema mcrmse': ema_score['mcrmse'],
                'step': global_step,
                'epoch': epoch,
            })

            # valid_fn switched the live model to eval mode; switch back.
            model.train()
    return losses.avg, best_score, best_score_ema



def train_loop():
    """Train and validate one model per fold listed in ``CFG.folds``.

    For every fold this function:
      1. builds the train / validation ``TrainDataset`` objects and loaders,
      2. creates a fresh model, an ``EMAHook`` around it, an ``AdamW``
         optimizer and a warmup LR scheduler,
      3. calls ``train_fn`` once per epoch, threading the best raw and EMA
         validation scores through consecutive epochs,
      4. releases the per-fold objects before the next fold starts.

    The function reads module-level state (``CFG``, ``df``, ``pdf``,
    ``tokenizer``, ``LOGGER``) and returns ``None``.

    Raises:
        ValueError: if ``CFG.scheduler`` is neither ``'linear'`` nor ``'cosine'``.
    """
    LOGGER.info("========== training ==========")

    def get_scheduler(cfg, optimizer, num_train_steps):
        """Build the LR scheduler selected by ``cfg.scheduler``.

        ``cfg.num_warmup_steps`` is treated as a fraction of
        ``num_train_steps`` (it is multiplied by it). The product is kept in
        a local variable: writing it back to ``cfg`` would compound the value
        from fold to fold because ``cfg`` is shared.
        """
        num_warmup_steps = int(cfg.num_warmup_steps * num_train_steps)
        if cfg.scheduler == 'linear':
            return get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps
            )
        if cfg.scheduler == 'cosine':
            return get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps,
                num_cycles=cfg.num_cycles
            )
        raise ValueError(f"Unsupported scheduler: {cfg.scheduler!r} (expected 'linear' or 'cosine')")

    # ====================================================
    # experiment tracking
    # ====================================================
    # Pass the hyper-parameters through `config=` so they are attached to the
    # run; rebinding `wandb.config` after init would only replace the module
    # attribute and record nothing.
    wandb.init(project='kaggle-commonlit-eval-student-summaries-0809',
               config=dict(epochs=CFG.epochs,
                           batch_size=CFG.batch_size,
                           learning_rate=CFG.encoder_lr,
                           save_checkpoint=True,
                           ))

    for fold in CFG.folds:

        # ====================================================
        # loader
        # ====================================================
        if CFG.pretraining:
            tr_data = pd.read_csv('tmp_pessudo.csv')
            tr_data['prompt_title'] = ''
            # Drop rows whose prompt question also occurs in `pdf`.
            seen_questions = pdf['prompt_question'].tolist()
            tr_data = tr_data[~tr_data['prompt_question'].isin(seen_questions)]
            # In pretraining mode validation runs on the whole labelled frame.
            va_data = df
        else:
            tr_data = df[df['fold'] != fold].reset_index(drop=True)
            va_data = df[df['fold'] == fold].reset_index(drop=True)

        train_dataset = TrainDataset(tr_data, tokenizer)
        valid_dataset = TrainDataset(va_data, tokenizer)
        train_loader = DataLoader(train_dataset,
                                  batch_size=CFG.batch_size,
                                  shuffle=True,
                                  num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
        valid_loader = DataLoader(valid_dataset,
                                  batch_size=CFG.batch_size * 2,
                                  shuffle=False,
                                  num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

        # ====================================================
        # model & optimizer
        # ====================================================
        model = build_model()
        model.to(CFG.device)
        ema_hook = EMAHook(model, init_updates=3000, logger=LOGGER)

        optimizer_parameters = get_optimizer_llr_params(model)
        optimizer = AdamW(optimizer_parameters, eps=CFG.eps, betas=CFG.betas)

        # ====================================================
        # scheduler
        # ====================================================
        # NOTE(review): the train loader uses drop_last=True, so this estimate
        # can exceed len(train_loader) * CFG.epochs by a few steps — confirm
        # whether the schedule should be sized from the loader instead.
        num_train_steps = int(len(train_dataset) / CFG.batch_size * CFG.epochs)
        scheduler = get_scheduler(CFG, optimizer, num_train_steps)

        # ====================================================
        # loop
        # ====================================================
        best_score = float('inf')
        best_score_ema = float('inf')
        for epoch in range(CFG.epochs):
            start_time = time.time()

            # train_fn returns the updated best scores so they carry over to
            # the next epoch of the same fold.
            avg_loss, best_score, best_score_ema = train_fn(
                train_loader, model, optimizer, epoch, scheduler, CFG.device,
                valid_loader, start_time, best_score, best_score_ema,
                ema_hook, wandb, fold)

        # Drop every reference first, then collect and empty the CUDA cache;
        # emptying the cache while the objects are still referenced frees nothing.
        del scheduler, optimizer, ema_hook, model
        gc.collect()
        torch.cuda.empty_cache()
    return


In [19]:
# Run the full training loop over every fold listed in CFG.folds.
train_loop()

[34m[1mwandb[0m: Currently logged in as: [33mpeng_sun[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016669741932613155, max=1.0…

'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /microsoft/deberta-v3-large/resolve/main/config.json (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f0512bd0790>, 'Connection to huggingface.co timed out. (connect timeout=10)'))"), '(Request ID: 9ade698c-942f-4420-8831-7f9e5766553f)')' thrown while requesting HEAD https://huggingface.co/microsoft/deberta-v3-large/resolve/main/config.json


load pretrained model ...
Epoch: [1][0/1289] Elapsed 0m 1s (remain 29m 11s) Loss: 0.2912(0.2912) Grad: 8.8301  LR: 0.00000500  


Epoch 1 avg_val_loss: 0.5114  time: 94s
Epoch 1 - content_rmse: 1.2331 - wording_rmse: 0.9461 - mcrmse: 1.0896
Epoch 1 - Save Best Score: 1.0896 Model
Epoch 1 avg_val_loss: 0.5233  time: 191s
Epoch 1 - ema_content_rmse: 1.2552 - ema_wording_rmse: 0.9529 - ema_mcrmse: 1.1041
Epoch 1 - ema_Save Best Score: 1.1041 Model


Epoch: [1][100/1289] Elapsed 4m 12s (remain 49m 24s) Loss: 0.1044(0.2803) Grad: 5.9134  LR: 0.00000500  


Epoch 1 avg_val_loss: 0.2557  time: 345s
Epoch 1 - content_rmse: 0.6624 - wording_rmse: 0.8254 - mcrmse: 0.7439
Epoch 1 - Save Best Score: 0.7439 Model
Epoch 1 avg_val_loss: 0.2767  time: 441s
Epoch 1 - ema_content_rmse: 0.6982 - ema_wording_rmse: 0.8530 - ema_mcrmse: 0.7756
Epoch 1 - ema_Save Best Score: 0.7756 Model


Epoch: [1][200/1289] Elapsed 8m 23s (remain 45m 24s) Loss: 0.1550(0.2195) Grad: 6.6780  LR: 0.00000498  


Epoch 1 avg_val_loss: 0.2274  time: 597s
Epoch 1 - content_rmse: 0.5760 - wording_rmse: 0.8254 - mcrmse: 0.7007
Epoch 1 - Save Best Score: 0.7007 Model
Epoch 1 avg_val_loss: 0.2569  time: 692s
Epoch 1 - ema_content_rmse: 0.6322 - ema_wording_rmse: 0.8681 - ema_mcrmse: 0.7502
Epoch 1 - ema_Save Best Score: 0.7502 Model


Epoch: [1][300/1289] Elapsed 12m 33s (remain 41m 14s) Loss: 0.0834(0.1942) Grad: 3.8660  LR: 0.00000496  


Epoch 1 avg_val_loss: 0.2622  time: 847s
Epoch 1 - content_rmse: 0.5817 - wording_rmse: 0.9270 - mcrmse: 0.7544
Epoch 1 avg_val_loss: 0.3027  time: 940s
Epoch 1 - ema_content_rmse: 0.6415 - ema_wording_rmse: 0.9914 - ema_mcrmse: 0.8164


Epoch: [1][400/1289] Elapsed 16m 38s (remain 36m 51s) Loss: 0.1736(0.1798) Grad: 5.0381  LR: 0.00000493  


Epoch 1 avg_val_loss: 0.2336  time: 1092s
Epoch 1 - content_rmse: 0.5852 - wording_rmse: 0.8437 - mcrmse: 0.7144
Epoch 1 avg_val_loss: 0.2334  time: 1185s
Epoch 1 - ema_content_rmse: 0.5892 - ema_wording_rmse: 0.8377 - ema_mcrmse: 0.7135
Epoch 1 - ema_Save Best Score: 0.7135 Model


Epoch: [1][500/1289] Elapsed 20m 46s (remain 32m 40s) Loss: 0.4088(0.1695) Grad: 12.1389  LR: 0.00000488  


Epoch 1 avg_val_loss: 0.1979  time: 1340s
Epoch 1 - content_rmse: 0.5171 - wording_rmse: 0.7834 - mcrmse: 0.6502
Epoch 1 - Save Best Score: 0.6502 Model
Epoch 1 avg_val_loss: 0.1948  time: 1436s
Epoch 1 - ema_content_rmse: 0.5159 - ema_wording_rmse: 0.7713 - ema_mcrmse: 0.6436
Epoch 1 - ema_Save Best Score: 0.6436 Model


Epoch: [1][600/1289] Elapsed 24m 57s (remain 28m 34s) Loss: 0.0755(0.1614) Grad: 4.0893  LR: 0.00000483  


Epoch 1 avg_val_loss: 0.2523  time: 1590s
Epoch 1 - content_rmse: 0.5381 - wording_rmse: 0.9592 - mcrmse: 0.7486
Epoch 1 avg_val_loss: 0.2903  time: 1684s
Epoch 1 - ema_content_rmse: 0.6422 - ema_wording_rmse: 0.9896 - ema_mcrmse: 0.8159


Epoch: [1][700/1289] Elapsed 29m 2s (remain 24m 21s) Loss: 0.1455(0.1564) Grad: 4.1415  LR: 0.00000478  


Epoch 1 avg_val_loss: 0.2391  time: 1835s
Epoch 1 - content_rmse: 0.6170 - wording_rmse: 0.8403 - mcrmse: 0.7286
Epoch 1 avg_val_loss: 0.2316  time: 1928s
Epoch 1 - ema_content_rmse: 0.5940 - ema_wording_rmse: 0.8358 - ema_mcrmse: 0.7149


Epoch: [1][800/1289] Elapsed 33m 7s (remain 20m 10s) Loss: 0.0195(0.1517) Grad: 3.5119  LR: 0.00000471  


Epoch 1 avg_val_loss: 0.2194  time: 2081s
Epoch 1 - content_rmse: 0.5230 - wording_rmse: 0.8756 - mcrmse: 0.6993
Epoch 1 avg_val_loss: 0.2339  time: 2174s
Epoch 1 - ema_content_rmse: 0.5441 - ema_wording_rmse: 0.9050 - ema_mcrmse: 0.7245


Epoch: [1][900/1289] Elapsed 37m 12s (remain 16m 1s) Loss: 0.0293(0.1491) Grad: 1.5222  LR: 0.00000463  


Epoch 1 avg_val_loss: 0.2338  time: 2325s
Epoch 1 - content_rmse: 0.5991 - wording_rmse: 0.8500 - mcrmse: 0.7245
Epoch 1 avg_val_loss: 0.2239  time: 2419s
Epoch 1 - ema_content_rmse: 0.5721 - ema_wording_rmse: 0.8403 - ema_mcrmse: 0.7062


Epoch: [1][1000/1289] Elapsed 41m 17s (remain 11m 52s) Loss: 0.1400(0.1448) Grad: 8.4428  LR: 0.00000455  


Epoch 1 avg_val_loss: 0.2086  time: 2570s
Epoch 1 - content_rmse: 0.5301 - wording_rmse: 0.8257 - mcrmse: 0.6779
Epoch 1 avg_val_loss: 0.2561  time: 2664s
Epoch 1 - ema_content_rmse: 0.5685 - ema_wording_rmse: 0.9460 - ema_mcrmse: 0.7573


Epoch: [1][1100/1289] Elapsed 45m 22s (remain 7m 44s) Loss: 0.1973(0.1414) Grad: 9.4114  LR: 0.00000446  


Epoch 1 avg_val_loss: 0.1748  time: 2816s
Epoch 1 - content_rmse: 0.4965 - wording_rmse: 0.7247 - mcrmse: 0.6106
Epoch 1 - Save Best Score: 0.6106 Model
Epoch 1 avg_val_loss: 0.1791  time: 2912s
Epoch 1 - ema_content_rmse: 0.5019 - ema_wording_rmse: 0.7401 - ema_mcrmse: 0.6210
Epoch 1 - ema_Save Best Score: 0.6210 Model


Epoch: [1][1200/1289] Elapsed 49m 33s (remain 3m 37s) Loss: 0.0915(0.1385) Grad: 4.8777  LR: 0.00000436  


Epoch 1 avg_val_loss: 0.1900  time: 3067s
Epoch 1 - content_rmse: 0.4996 - wording_rmse: 0.7822 - mcrmse: 0.6409
Epoch 1 avg_val_loss: 0.1953  time: 3160s
Epoch 1 - ema_content_rmse: 0.5140 - ema_wording_rmse: 0.7880 - ema_mcrmse: 0.6510


Epoch: [1][1288/1289] Elapsed 53m 31s (remain 0m 0s) Loss: 0.0167(0.1361) Grad: 1.3190  LR: 0.00000427  


Epoch 1 avg_val_loss: 0.1792  time: 3305s
Epoch 1 - content_rmse: 0.5162 - wording_rmse: 0.7213 - mcrmse: 0.6188
Epoch 1 avg_val_loss: 0.1957  time: 3398s
Epoch 1 - ema_content_rmse: 0.5230 - ema_wording_rmse: 0.7783 - ema_mcrmse: 0.6506


Epoch: [2][0/1289] Elapsed 0m 0s (remain 16m 52s) Loss: 0.0877(0.0877) Grad: 4.1368  LR: 0.00000427  


Epoch 2 avg_val_loss: 0.1804  time: 94s
Epoch 2 - content_rmse: 0.5189 - wording_rmse: 0.7232 - mcrmse: 0.6210
Epoch 2 avg_val_loss: 0.1934  time: 187s
Epoch 2 - ema_content_rmse: 0.5223 - ema_wording_rmse: 0.7706 - ema_mcrmse: 0.6465


Epoch: [2][100/1289] Elapsed 4m 6s (remain 48m 14s) Loss: 0.0650(0.0851) Grad: 3.9025  LR: 0.00000416  


Epoch 2 avg_val_loss: 0.2194  time: 339s
Epoch 2 - content_rmse: 0.5136 - wording_rmse: 0.8677 - mcrmse: 0.6906
Epoch 2 avg_val_loss: 0.2348  time: 432s
Epoch 2 - ema_content_rmse: 0.5727 - ema_wording_rmse: 0.8630 - ema_mcrmse: 0.7178


Epoch: [2][200/1289] Elapsed 8m 11s (remain 44m 18s) Loss: 0.1204(0.0883) Grad: 8.4377  LR: 0.00000404  


Epoch 2 avg_val_loss: 0.2327  time: 584s
Epoch 2 - content_rmse: 0.5531 - wording_rmse: 0.8916 - mcrmse: 0.7223
Epoch 2 avg_val_loss: 0.2235  time: 677s
Epoch 2 - ema_content_rmse: 0.5396 - ema_wording_rmse: 0.8757 - ema_mcrmse: 0.7077


Epoch: [2][300/1289] Elapsed 12m 16s (remain 40m 15s) Loss: 0.0431(0.0907) Grad: 2.7832  LR: 0.00000392  


Epoch 2 avg_val_loss: 0.1823  time: 829s
Epoch 2 - content_rmse: 0.5110 - wording_rmse: 0.7437 - mcrmse: 0.6273
Epoch 2 avg_val_loss: 0.1961  time: 922s
Epoch 2 - ema_content_rmse: 0.5179 - ema_wording_rmse: 0.7958 - ema_mcrmse: 0.6569


Epoch: [2][400/1289] Elapsed 16m 21s (remain 36m 12s) Loss: 0.0767(0.0898) Grad: 3.2480  LR: 0.00000379  


Epoch 2 avg_val_loss: 0.2244  time: 1074s
Epoch 2 - content_rmse: 0.5439 - wording_rmse: 0.8605 - mcrmse: 0.7022
Epoch 2 avg_val_loss: 0.1882  time: 1167s
Epoch 2 - ema_content_rmse: 0.5039 - ema_wording_rmse: 0.7698 - ema_mcrmse: 0.6368


Epoch: [2][500/1289] Elapsed 20m 26s (remain 32m 8s) Loss: 0.0918(0.0871) Grad: 3.3536  LR: 0.00000365  


Epoch 2 avg_val_loss: 0.2143  time: 1319s
Epoch 2 - content_rmse: 0.5320 - wording_rmse: 0.8556 - mcrmse: 0.6938
Epoch 2 avg_val_loss: 0.2093  time: 1412s
Epoch 2 - ema_content_rmse: 0.5251 - ema_wording_rmse: 0.8439 - ema_mcrmse: 0.6845


Epoch: [2][600/1289] Elapsed 24m 31s (remain 28m 3s) Loss: 0.1111(0.0852) Grad: 6.6944  LR: 0.00000352  


Epoch 2 avg_val_loss: 0.2876  time: 1564s
Epoch 2 - content_rmse: 0.6234 - wording_rmse: 0.9952 - mcrmse: 0.8093
Epoch 2 avg_val_loss: 0.2414  time: 1657s
Epoch 2 - ema_content_rmse: 0.5697 - ema_wording_rmse: 0.9057 - ema_mcrmse: 0.7377


Epoch: [2][700/1289] Elapsed 28m 36s (remain 23m 59s) Loss: 0.1247(0.0857) Grad: 4.5414  LR: 0.00000338  


Epoch 2 avg_val_loss: 0.2067  time: 1809s
Epoch 2 - content_rmse: 0.4915 - wording_rmse: 0.8587 - mcrmse: 0.6751
Epoch 2 avg_val_loss: 0.2050  time: 1902s
Epoch 2 - ema_content_rmse: 0.4935 - ema_wording_rmse: 0.8509 - ema_mcrmse: 0.6722


Epoch: [2][800/1289] Elapsed 32m 40s (remain 19m 54s) Loss: 0.0723(0.0859) Grad: 2.4786  LR: 0.00000323  


Epoch 2 avg_val_loss: 0.2245  time: 2054s
Epoch 2 - content_rmse: 0.5278 - wording_rmse: 0.8902 - mcrmse: 0.7090
Epoch 2 avg_val_loss: 0.2122  time: 2147s
Epoch 2 - ema_content_rmse: 0.5135 - ema_wording_rmse: 0.8624 - ema_mcrmse: 0.6879


Epoch: [2][900/1289] Elapsed 36m 45s (remain 15m 49s) Loss: 0.1031(0.0843) Grad: 5.8189  LR: 0.00000309  


Epoch 2 avg_val_loss: 0.2146  time: 2299s
Epoch 2 - content_rmse: 0.5012 - wording_rmse: 0.8709 - mcrmse: 0.6861
Epoch 2 avg_val_loss: 0.2045  time: 2392s
Epoch 2 - ema_content_rmse: 0.5075 - ema_wording_rmse: 0.8379 - ema_mcrmse: 0.6727


Epoch: [2][1000/1289] Elapsed 40m 50s (remain 11m 45s) Loss: 0.0555(0.0833) Grad: 3.7869  LR: 0.00000294  


Epoch 2 avg_val_loss: 0.1926  time: 2544s
Epoch 2 - content_rmse: 0.4993 - wording_rmse: 0.8011 - mcrmse: 0.6502
Epoch 2 avg_val_loss: 0.2029  time: 2637s
Epoch 2 - ema_content_rmse: 0.5006 - ema_wording_rmse: 0.8386 - ema_mcrmse: 0.6696


Epoch: [2][1100/1289] Elapsed 44m 55s (remain 7m 40s) Loss: 0.0631(0.0834) Grad: 3.3673  LR: 0.00000279  


Epoch 2 avg_val_loss: 0.2282  time: 2789s
Epoch 2 - content_rmse: 0.5228 - wording_rmse: 0.9122 - mcrmse: 0.7175
Epoch 2 avg_val_loss: 0.2115  time: 2882s
Epoch 2 - ema_content_rmse: 0.5080 - ema_wording_rmse: 0.8653 - ema_mcrmse: 0.6867


Epoch: [2][1200/1289] Elapsed 49m 0s (remain 3m 35s) Loss: 0.0283(0.0833) Grad: 2.8944  LR: 0.00000263  


Epoch 2 avg_val_loss: 0.1976  time: 3034s
Epoch 2 - content_rmse: 0.5084 - wording_rmse: 0.8150 - mcrmse: 0.6617
Epoch 2 avg_val_loss: 0.2012  time: 3127s
Epoch 2 - ema_content_rmse: 0.5080 - ema_wording_rmse: 0.8290 - ema_mcrmse: 0.6685


Epoch: [2][1288/1289] Elapsed 52m 58s (remain 0m 0s) Loss: 0.2433(0.0845) Grad: 7.2836  LR: 0.00000250  


Epoch 2 avg_val_loss: 0.1887  time: 3272s
Epoch 2 - content_rmse: 0.5133 - wording_rmse: 0.7736 - mcrmse: 0.6435
Epoch 2 avg_val_loss: 0.1932  time: 3365s
Epoch 2 - ema_content_rmse: 0.5090 - ema_wording_rmse: 0.7965 - ema_mcrmse: 0.6527


Epoch: [3][0/1289] Elapsed 0m 0s (remain 16m 58s) Loss: 0.2052(0.2052) Grad: 8.2187  LR: 0.00000250  


Epoch 3 avg_val_loss: 0.1902  time: 94s
Epoch 3 - content_rmse: 0.5072 - wording_rmse: 0.7860 - mcrmse: 0.6466
Epoch 3 avg_val_loss: 0.1930  time: 187s
Epoch 3 - ema_content_rmse: 0.5089 - ema_wording_rmse: 0.7958 - ema_mcrmse: 0.6524


Epoch: [3][100/1289] Elapsed 4m 5s (remain 48m 11s) Loss: 0.0080(0.0548) Grad: 1.3659  LR: 0.00000235  


Epoch 3 avg_val_loss: 0.1996  time: 339s
Epoch 3 - content_rmse: 0.5112 - wording_rmse: 0.8165 - mcrmse: 0.6639
Epoch 3 avg_val_loss: 0.1916  time: 432s
Epoch 3 - ema_content_rmse: 0.5047 - ema_wording_rmse: 0.7919 - ema_mcrmse: 0.6483


Epoch: [3][200/1289] Elapsed 8m 10s (remain 44m 17s) Loss: 0.0843(0.0563) Grad: 5.9269  LR: 0.00000219  


Epoch 3 avg_val_loss: 0.2054  time: 584s
Epoch 3 - content_rmse: 0.5091 - wording_rmse: 0.8396 - mcrmse: 0.6744
Epoch 3 avg_val_loss: 0.1942  time: 677s
Epoch 3 - ema_content_rmse: 0.5084 - ema_wording_rmse: 0.7998 - ema_mcrmse: 0.6541


Epoch: [3][300/1289] Elapsed 12m 15s (remain 40m 15s) Loss: 0.0232(0.0551) Grad: 0.8880  LR: 0.00000204  


Epoch 3 avg_val_loss: 0.1844  time: 829s
Epoch 3 - content_rmse: 0.5021 - wording_rmse: 0.7637 - mcrmse: 0.6329
Epoch 3 avg_val_loss: 0.1907  time: 922s
Epoch 3 - ema_content_rmse: 0.5012 - ema_wording_rmse: 0.7916 - ema_mcrmse: 0.6464


Epoch: [3][400/1289] Elapsed 16m 21s (remain 36m 12s) Loss: 0.0536(0.0551) Grad: 5.3127  LR: 0.00000190  


Epoch 3 avg_val_loss: 0.1778  time: 1074s
Epoch 3 - content_rmse: 0.4895 - wording_rmse: 0.7514 - mcrmse: 0.6205
Epoch 3 avg_val_loss: 0.1855  time: 1167s
Epoch 3 - ema_content_rmse: 0.4911 - ema_wording_rmse: 0.7817 - ema_mcrmse: 0.6364


Epoch: [3][500/1289] Elapsed 20m 26s (remain 32m 8s) Loss: 0.0326(0.0552) Grad: 1.8196  LR: 0.00000175  


Epoch 3 avg_val_loss: 0.2164  time: 1319s
Epoch 3 - content_rmse: 0.5021 - wording_rmse: 0.8861 - mcrmse: 0.6941
Epoch 3 avg_val_loss: 0.2047  time: 1412s
Epoch 3 - ema_content_rmse: 0.4987 - ema_wording_rmse: 0.8496 - ema_mcrmse: 0.6741


Epoch: [3][600/1289] Elapsed 24m 31s (remain 28m 3s) Loss: 0.0912(0.0549) Grad: 5.6422  LR: 0.00000160  


Epoch 3 avg_val_loss: 0.1856  time: 1564s
Epoch 3 - content_rmse: 0.4931 - wording_rmse: 0.7809 - mcrmse: 0.6370
Epoch 3 avg_val_loss: 0.1935  time: 1657s
Epoch 3 - ema_content_rmse: 0.4953 - ema_wording_rmse: 0.8094 - ema_mcrmse: 0.6524


Epoch: [3][700/1289] Elapsed 28m 36s (remain 23m 59s) Loss: 0.0272(0.0541) Grad: 2.0384  LR: 0.00000146  


Epoch 3 avg_val_loss: 0.1958  time: 1809s
Epoch 3 - content_rmse: 0.4930 - wording_rmse: 0.8236 - mcrmse: 0.6583
Epoch 3 avg_val_loss: 0.2014  time: 1903s
Epoch 3 - ema_content_rmse: 0.4941 - ema_wording_rmse: 0.8420 - ema_mcrmse: 0.6680


Epoch: [3][800/1289] Elapsed 32m 41s (remain 19m 54s) Loss: 0.0853(0.0547) Grad: 3.2022  LR: 0.00000133  


Epoch 3 avg_val_loss: 0.2189  time: 2054s
Epoch 3 - content_rmse: 0.5018 - wording_rmse: 0.8973 - mcrmse: 0.6996
Epoch 3 avg_val_loss: 0.2051  time: 2148s
Epoch 3 - ema_content_rmse: 0.4952 - ema_wording_rmse: 0.8544 - ema_mcrmse: 0.6748


Epoch: [3][900/1289] Elapsed 36m 46s (remain 15m 50s) Loss: 0.0717(0.0545) Grad: 4.7546  LR: 0.00000120  


Epoch 3 avg_val_loss: 0.2101  time: 2299s
Epoch 3 - content_rmse: 0.4972 - wording_rmse: 0.8683 - mcrmse: 0.6828
Epoch 3 avg_val_loss: 0.1997  time: 2392s
Epoch 3 - ema_content_rmse: 0.4939 - ema_wording_rmse: 0.8345 - ema_mcrmse: 0.6642


Epoch: [3][1000/1289] Elapsed 40m 51s (remain 11m 45s) Loss: 0.0470(0.0546) Grad: 2.6135  LR: 0.00000107  


Epoch 3 avg_val_loss: 0.1981  time: 2544s
Epoch 3 - content_rmse: 0.4951 - wording_rmse: 0.8261 - mcrmse: 0.6606
Epoch 3 avg_val_loss: 0.1984  time: 2638s
Epoch 3 - ema_content_rmse: 0.4942 - ema_wording_rmse: 0.8297 - ema_mcrmse: 0.6619


Epoch: [3][1100/1289] Elapsed 44m 56s (remain 7m 40s) Loss: 0.0287(0.0550) Grad: 3.6719  LR: 0.00000095  


Epoch 3 avg_val_loss: 0.1895  time: 2789s
Epoch 3 - content_rmse: 0.4917 - wording_rmse: 0.7979 - mcrmse: 0.6448
Epoch 3 avg_val_loss: 0.1906  time: 2883s
Epoch 3 - ema_content_rmse: 0.4912 - ema_wording_rmse: 0.8036 - ema_mcrmse: 0.6474


Epoch: [3][1200/1289] Elapsed 49m 1s (remain 3m 35s) Loss: 0.0332(0.0550) Grad: 3.6690  LR: 0.00000083  


Epoch 3 avg_val_loss: 0.1916  time: 3034s
Epoch 3 - content_rmse: 0.4919 - wording_rmse: 0.8024 - mcrmse: 0.6472
Epoch 3 avg_val_loss: 0.1950  time: 3127s
Epoch 3 - ema_content_rmse: 0.4930 - ema_wording_rmse: 0.8139 - ema_mcrmse: 0.6535


Epoch: [3][1288/1289] Elapsed 52m 59s (remain 0m 0s) Loss: 0.0465(0.0550) Grad: 4.4955  LR: 0.00000073  


Epoch 3 avg_val_loss: 0.1941  time: 3272s
Epoch 3 - content_rmse: 0.4931 - wording_rmse: 0.8153 - mcrmse: 0.6542
Epoch 3 avg_val_loss: 0.1946  time: 3366s
Epoch 3 - ema_content_rmse: 0.4922 - ema_wording_rmse: 0.8164 - ema_mcrmse: 0.6543


Epoch: [4][0/1289] Elapsed 0m 0s (remain 16m 46s) Loss: 0.0246(0.0246) Grad: 2.5300  LR: 0.00000073  


Epoch 4 avg_val_loss: 0.1938  time: 94s
Epoch 4 - content_rmse: 0.4930 - wording_rmse: 0.8144 - mcrmse: 0.6537
Epoch 4 avg_val_loss: 0.1946  time: 187s
Epoch 4 - ema_content_rmse: 0.4922 - ema_wording_rmse: 0.8164 - ema_mcrmse: 0.6543


Epoch: [4][100/1289] Elapsed 4m 5s (remain 48m 11s) Loss: 0.0418(0.0379) Grad: 2.7967  LR: 0.00000063  


Epoch 4 avg_val_loss: 0.1970  time: 339s
Epoch 4 - content_rmse: 0.4903 - wording_rmse: 0.8264 - mcrmse: 0.6584
Epoch 4 avg_val_loss: 0.1938  time: 432s
Epoch 4 - ema_content_rmse: 0.4908 - ema_wording_rmse: 0.8154 - ema_mcrmse: 0.6531


Epoch: [4][200/1289] Elapsed 8m 10s (remain 44m 16s) Loss: 0.0821(0.0401) Grad: 2.8100  LR: 0.00000053  


Epoch 4 avg_val_loss: 0.2079  time: 584s
Epoch 4 - content_rmse: 0.4952 - wording_rmse: 0.8642 - mcrmse: 0.6797
Epoch 4 avg_val_loss: 0.2052  time: 677s
Epoch 4 - ema_content_rmse: 0.4955 - ema_wording_rmse: 0.8539 - ema_mcrmse: 0.6747


Epoch: [4][300/1289] Elapsed 12m 15s (remain 40m 15s) Loss: 0.0374(0.0414) Grad: 2.5145  LR: 0.00000044  


Epoch 4 avg_val_loss: 0.1974  time: 829s
Epoch 4 - content_rmse: 0.4920 - wording_rmse: 0.8280 - mcrmse: 0.6600
Epoch 4 avg_val_loss: 0.1964  time: 922s
Epoch 4 - ema_content_rmse: 0.4915 - ema_wording_rmse: 0.8242 - ema_mcrmse: 0.6578


Epoch: [4][400/1289] Elapsed 16m 20s (remain 36m 12s) Loss: 0.0254(0.0405) Grad: 3.1591  LR: 0.00000036  


Epoch 4 avg_val_loss: 0.2009  time: 1074s
Epoch 4 - content_rmse: 0.4928 - wording_rmse: 0.8405 - mcrmse: 0.6667
Epoch 4 avg_val_loss: 0.1996  time: 1167s
Epoch 4 - ema_content_rmse: 0.4918 - ema_wording_rmse: 0.8359 - ema_mcrmse: 0.6638


Epoch: [4][500/1289] Elapsed 20m 25s (remain 32m 8s) Loss: 0.0418(0.0399) Grad: 1.8376  LR: 0.00000028  


Epoch 4 avg_val_loss: 0.1970  time: 1319s
Epoch 4 - content_rmse: 0.4922 - wording_rmse: 0.8262 - mcrmse: 0.6592
Epoch 4 avg_val_loss: 0.1999  time: 1412s
Epoch 4 - ema_content_rmse: 0.4923 - ema_wording_rmse: 0.8365 - ema_mcrmse: 0.6644


Epoch: [4][600/1289] Elapsed 24m 30s (remain 28m 3s) Loss: 0.0376(0.0393) Grad: 2.9845  LR: 0.00000022  


Epoch 4 avg_val_loss: 0.2014  time: 1564s
Epoch 4 - content_rmse: 0.4924 - wording_rmse: 0.8414 - mcrmse: 0.6669
Epoch 4 avg_val_loss: 0.1982  time: 1657s
Epoch 4 - ema_content_rmse: 0.4921 - ema_wording_rmse: 0.8305 - ema_mcrmse: 0.6613


Epoch: [4][700/1289] Elapsed 28m 36s (remain 23m 59s) Loss: 0.0164(0.0384) Grad: 1.8073  LR: 0.00000016  


Epoch 4 avg_val_loss: 0.2007  time: 1809s
Epoch 4 - content_rmse: 0.4930 - wording_rmse: 0.8387 - mcrmse: 0.6659
Epoch 4 avg_val_loss: 0.1990  time: 1902s
Epoch 4 - ema_content_rmse: 0.4927 - ema_wording_rmse: 0.8326 - ema_mcrmse: 0.6626


Epoch: [4][800/1289] Elapsed 32m 41s (remain 19m 54s) Loss: 0.0661(0.0383) Grad: 3.4604  LR: 0.00000011  


Epoch 4 avg_val_loss: 0.1972  time: 2054s
Epoch 4 - content_rmse: 0.4926 - wording_rmse: 0.8270 - mcrmse: 0.6598
Epoch 4 avg_val_loss: 0.2004  time: 2147s
Epoch 4 - ema_content_rmse: 0.4929 - ema_wording_rmse: 0.8381 - ema_mcrmse: 0.6655


Epoch: [4][900/1289] Elapsed 36m 45s (remain 15m 49s) Loss: 0.0299(0.0382) Grad: 3.2330  LR: 0.00000007  


Epoch 4 avg_val_loss: 0.1962  time: 2299s
Epoch 4 - content_rmse: 0.4929 - wording_rmse: 0.8220 - mcrmse: 0.6575
Epoch 4 avg_val_loss: 0.1957  time: 2392s
Epoch 4 - ema_content_rmse: 0.4926 - ema_wording_rmse: 0.8209 - ema_mcrmse: 0.6568


Epoch: [4][1000/1289] Elapsed 40m 51s (remain 11m 45s) Loss: 0.0697(0.0378) Grad: 4.3609  LR: 0.00000004  


Epoch 4 avg_val_loss: 0.1984  time: 2544s
Epoch 4 - content_rmse: 0.4924 - wording_rmse: 0.8307 - mcrmse: 0.6615
Epoch 4 avg_val_loss: 0.1984  time: 2638s
Epoch 4 - ema_content_rmse: 0.4925 - ema_wording_rmse: 0.8305 - ema_mcrmse: 0.6615


Epoch: [4][1100/1289] Elapsed 44m 56s (remain 7m 40s) Loss: 0.0254(0.0374) Grad: 1.5703  LR: 0.00000002  


Epoch 4 avg_val_loss: 0.1977  time: 2789s
Epoch 4 - content_rmse: 0.4923 - wording_rmse: 0.8280 - mcrmse: 0.6601
Epoch 4 avg_val_loss: 0.1978  time: 2883s
Epoch 4 - ema_content_rmse: 0.4924 - ema_wording_rmse: 0.8282 - ema_mcrmse: 0.6603


Epoch: [4][1200/1289] Elapsed 49m 1s (remain 3m 35s) Loss: 0.0248(0.0371) Grad: 2.8660  LR: 0.00000000  


Epoch 4 avg_val_loss: 0.1980  time: 3034s
Epoch 4 - content_rmse: 0.4922 - wording_rmse: 0.8291 - mcrmse: 0.6606
Epoch 4 avg_val_loss: 0.1979  time: 3128s
Epoch 4 - ema_content_rmse: 0.4922 - ema_wording_rmse: 0.8289 - ema_mcrmse: 0.6605


Epoch: [4][1288/1289] Elapsed 52m 59s (remain 0m 0s) Loss: 0.0467(0.0369) Grad: 4.3034  LR: 0.00000000  


Epoch 4 avg_val_loss: 0.1979  time: 3273s
Epoch 4 - content_rmse: 0.4922 - wording_rmse: 0.8289 - mcrmse: 0.6606
Epoch 4 avg_val_loss: 0.1979  time: 3366s
Epoch 4 - ema_content_rmse: 0.4922 - ema_wording_rmse: 0.8289 - ema_mcrmse: 0.6606
'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /microsoft/deberta-v3-large/resolve/main/config.json (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f05129b6460>, 'Connection to huggingface.co timed out. (connect timeout=10)'))"), '(Request ID: 3b11de2e-8c4e-4b80-a427-6d72d8088e3f)')' thrown while requesting HEAD https://huggingface.co/microsoft/deberta-v3-large/resolve/main/config.json


load pretrained model ...
Epoch: [1][0/1292] Elapsed 0m 0s (remain 20m 58s) Loss: 0.5580(0.5580) Grad: 6.8567  LR: 0.00000500  


Epoch 1 avg_val_loss: 0.4905  time: 94s
Epoch 1 - content_rmse: 0.9952 - wording_rmse: 1.1457 - mcrmse: 1.0705
Epoch 1 - Save Best Score: 1.0705 Model
Epoch 1 avg_val_loss: 0.4782  time: 189s
Epoch 1 - ema_content_rmse: 0.9964 - ema_wording_rmse: 1.1151 - ema_mcrmse: 1.0558
Epoch 1 - ema_Save Best Score: 1.0558 Model


Epoch: [1][100/1292] Elapsed 4m 10s (remain 49m 18s) Loss: 0.4795(0.3457) Grad: 5.7691  LR: 0.00000500  


Epoch 1 avg_val_loss: 0.2675  time: 344s
Epoch 1 - content_rmse: 0.6175 - wording_rmse: 0.8744 - mcrmse: 0.7460
Epoch 1 - Save Best Score: 0.7460 Model
Epoch 1 avg_val_loss: 0.2862  time: 440s
Epoch 1 - ema_content_rmse: 0.6569 - ema_wording_rmse: 0.8920 - ema_mcrmse: 0.7745
Epoch 1 - ema_Save Best Score: 0.7745 Model


Epoch: [1][200/1292] Elapsed 8m 21s (remain 45m 19s) Loss: 0.1544(0.2745) Grad: 4.0213  LR: 0.00000498  


Epoch 1 avg_val_loss: 0.2210  time: 594s
Epoch 1 - content_rmse: 0.5379 - wording_rmse: 0.8189 - mcrmse: 0.6784
Epoch 1 - Save Best Score: 0.6784 Model
Epoch 1 avg_val_loss: 0.2294  time: 690s
Epoch 1 - ema_content_rmse: 0.5602 - ema_wording_rmse: 0.8265 - ema_mcrmse: 0.6933
Epoch 1 - ema_Save Best Score: 0.6933 Model


Epoch: [1][300/1292] Elapsed 12m 31s (remain 41m 15s) Loss: 0.0481(0.2393) Grad: 4.0808  LR: 0.00000496  


Epoch 1 avg_val_loss: 0.2214  time: 845s
Epoch 1 - content_rmse: 0.5148 - wording_rmse: 0.8247 - mcrmse: 0.6698
Epoch 1 - Save Best Score: 0.6698 Model
Epoch 1 avg_val_loss: 0.2211  time: 940s
Epoch 1 - ema_content_rmse: 0.4992 - ema_wording_rmse: 0.8360 - ema_mcrmse: 0.6676
Epoch 1 - ema_Save Best Score: 0.6676 Model


Epoch: [1][400/1292] Elapsed 16m 41s (remain 37m 5s) Loss: 0.0596(0.2231) Grad: 4.0894  LR: 0.00000493  


Epoch 1 avg_val_loss: 0.1895  time: 1095s
Epoch 1 - content_rmse: 0.4682 - wording_rmse: 0.7676 - mcrmse: 0.6179
Epoch 1 - Save Best Score: 0.6179 Model
Epoch 1 avg_val_loss: 0.2120  time: 1190s
Epoch 1 - ema_content_rmse: 0.4632 - ema_wording_rmse: 0.8318 - ema_mcrmse: 0.6475
Epoch 1 - ema_Save Best Score: 0.6475 Model


Epoch: [1][500/1292] Elapsed 20m 51s (remain 32m 56s) Loss: 0.0961(0.2080) Grad: 5.4275  LR: 0.00000488  


Epoch 1 avg_val_loss: 0.1682  time: 1345s
Epoch 1 - content_rmse: 0.4817 - wording_rmse: 0.6985 - mcrmse: 0.5901
Epoch 1 - Save Best Score: 0.5901 Model
Epoch 1 avg_val_loss: 0.1599  time: 1440s
Epoch 1 - ema_content_rmse: 0.4548 - ema_wording_rmse: 0.6896 - ema_mcrmse: 0.5722
Epoch 1 - ema_Save Best Score: 0.5722 Model


Epoch: [1][600/1292] Elapsed 25m 1s (remain 28m 46s) Loss: 0.0819(0.1956) Grad: 5.2134  LR: 0.00000484  


Epoch 1 avg_val_loss: 0.1585  time: 1595s
Epoch 1 - content_rmse: 0.4622 - wording_rmse: 0.6657 - mcrmse: 0.5639
Epoch 1 - Save Best Score: 0.5639 Model
Epoch 1 avg_val_loss: 0.1526  time: 1690s
Epoch 1 - ema_content_rmse: 0.4513 - ema_wording_rmse: 0.6542 - ema_mcrmse: 0.5527
Epoch 1 - ema_Save Best Score: 0.5527 Model


Epoch: [1][700/1292] Elapsed 29m 11s (remain 24m 37s) Loss: 0.1562(0.1851) Grad: 5.2645  LR: 0.00000478  


Epoch 1 avg_val_loss: 0.1490  time: 1845s
Epoch 1 - content_rmse: 0.4623 - wording_rmse: 0.6297 - mcrmse: 0.5460
Epoch 1 - Save Best Score: 0.5460 Model
Epoch 1 avg_val_loss: 0.1522  time: 1940s
Epoch 1 - ema_content_rmse: 0.4698 - ema_wording_rmse: 0.6354 - ema_mcrmse: 0.5526
Epoch 1 - ema_Save Best Score: 0.5526 Model


Epoch: [1][800/1292] Elapsed 33m 21s (remain 20m 27s) Loss: 0.2985(0.1800) Grad: 7.8579  LR: 0.00000471  


Epoch 1 avg_val_loss: 0.1264  time: 2095s
Epoch 1 - content_rmse: 0.4387 - wording_rmse: 0.5718 - mcrmse: 0.5053
Epoch 1 - Save Best Score: 0.5053 Model
Epoch 1 avg_val_loss: 0.1281  time: 2190s
Epoch 1 - ema_content_rmse: 0.4470 - ema_wording_rmse: 0.5712 - ema_mcrmse: 0.5091
Epoch 1 - ema_Save Best Score: 0.5091 Model


Epoch: [1][900/1292] Elapsed 37m 32s (remain 16m 17s) Loss: 0.0515(0.1747) Grad: 2.3749  LR: 0.00000463  


Epoch 1 avg_val_loss: 0.1489  time: 2345s
Epoch 1 - content_rmse: 0.4628 - wording_rmse: 0.6345 - mcrmse: 0.5487
Epoch 1 avg_val_loss: 0.1421  time: 2438s
Epoch 1 - ema_content_rmse: 0.4651 - ema_wording_rmse: 0.6082 - ema_mcrmse: 0.5367


Epoch: [1][1000/1292] Elapsed 41m 36s (remain 12m 5s) Loss: 0.1875(0.1694) Grad: 4.0287  LR: 0.00000455  


Epoch 1 avg_val_loss: 0.1158  time: 2589s
Epoch 1 - content_rmse: 0.4452 - wording_rmse: 0.5242 - mcrmse: 0.4847
Epoch 1 - Save Best Score: 0.4847 Model
Epoch 1 avg_val_loss: 0.1250  time: 2685s
Epoch 1 - ema_content_rmse: 0.4412 - ema_wording_rmse: 0.5627 - ema_mcrmse: 0.5020
Epoch 1 - ema_Save Best Score: 0.5020 Model


Epoch: [1][1100/1292] Elapsed 45m 46s (remain 7m 56s) Loss: 0.2600(0.1654) Grad: 12.4162  LR: 0.00000446  


Epoch 1 avg_val_loss: 0.1700  time: 2839s
Epoch 1 - content_rmse: 0.5284 - wording_rmse: 0.6565 - mcrmse: 0.5925
Epoch 1 avg_val_loss: 0.1397  time: 2932s
Epoch 1 - ema_content_rmse: 0.4661 - ema_wording_rmse: 0.6015 - ema_mcrmse: 0.5338


Epoch: [1][1200/1292] Elapsed 49m 50s (remain 3m 46s) Loss: 0.3401(0.1618) Grad: 5.8140  LR: 0.00000436  


Epoch 1 avg_val_loss: 0.1316  time: 3083s
Epoch 1 - content_rmse: 0.4509 - wording_rmse: 0.5830 - mcrmse: 0.5170
Epoch 1 avg_val_loss: 0.1240  time: 3176s
Epoch 1 - ema_content_rmse: 0.4389 - ema_wording_rmse: 0.5632 - ema_mcrmse: 0.5010
Epoch 1 - ema_Save Best Score: 0.5010 Model


Epoch: [1][1291/1292] Elapsed 53m 52s (remain 0m 0s) Loss: 0.1508(0.1592) Grad: 4.5553  LR: 0.00000427  


Epoch 1 avg_val_loss: 0.1322  time: 3325s
Epoch 1 - content_rmse: 0.4555 - wording_rmse: 0.5852 - mcrmse: 0.5204
Epoch 1 avg_val_loss: 0.1353  time: 3418s
Epoch 1 - ema_content_rmse: 0.4473 - ema_wording_rmse: 0.6043 - ema_mcrmse: 0.5258


Epoch: [2][0/1292] Elapsed 0m 0s (remain 19m 5s) Loss: 0.0560(0.0560) Grad: 2.9591  LR: 0.00000427  


Epoch 2 avg_val_loss: 0.1324  time: 94s
Epoch 2 - content_rmse: 0.4606 - wording_rmse: 0.5814 - mcrmse: 0.5210
Epoch 2 avg_val_loss: 0.1342  time: 186s
Epoch 2 - ema_content_rmse: 0.4461 - ema_wording_rmse: 0.6011 - ema_mcrmse: 0.5236


Epoch: [2][100/1292] Elapsed 4m 5s (remain 48m 12s) Loss: 0.1162(0.0980) Grad: 4.3164  LR: 0.00000416  


Epoch 2 avg_val_loss: 0.1468  time: 338s
Epoch 2 - content_rmse: 0.4355 - wording_rmse: 0.6506 - mcrmse: 0.5430
Epoch 2 avg_val_loss: 0.1334  time: 431s
Epoch 2 - ema_content_rmse: 0.4335 - ema_wording_rmse: 0.6031 - ema_mcrmse: 0.5183


Epoch: [2][200/1292] Elapsed 8m 9s (remain 44m 16s) Loss: 0.0908(0.1024) Grad: 5.5367  LR: 0.00000404  


Epoch 2 avg_val_loss: 0.1177  time: 582s
Epoch 2 - content_rmse: 0.4339 - wording_rmse: 0.5413 - mcrmse: 0.4876
Epoch 2 avg_val_loss: 0.1181  time: 675s
Epoch 2 - ema_content_rmse: 0.4345 - ema_wording_rmse: 0.5426 - ema_mcrmse: 0.4885
Epoch 2 - ema_Save Best Score: 0.4885 Model


Epoch: [2][300/1292] Elapsed 12m 16s (remain 40m 25s) Loss: 0.1089(0.0967) Grad: 5.5363  LR: 0.00000392  


Epoch 2 avg_val_loss: 0.1112  time: 829s
Epoch 2 - content_rmse: 0.4262 - wording_rmse: 0.5197 - mcrmse: 0.4729
Epoch 2 - Save Best Score: 0.4729 Model
Epoch 2 avg_val_loss: 0.1139  time: 925s
Epoch 2 - ema_content_rmse: 0.4225 - ema_wording_rmse: 0.5333 - ema_mcrmse: 0.4779
Epoch 2 - ema_Save Best Score: 0.4779 Model


Epoch: [2][400/1292] Elapsed 16m 26s (remain 36m 32s) Loss: 0.0355(0.0952) Grad: 3.1333  LR: 0.00000379  


Epoch 2 avg_val_loss: 0.1241  time: 1080s
Epoch 2 - content_rmse: 0.4481 - wording_rmse: 0.5529 - mcrmse: 0.5005
Epoch 2 avg_val_loss: 0.1176  time: 1173s
Epoch 2 - ema_content_rmse: 0.4362 - ema_wording_rmse: 0.5382 - ema_mcrmse: 0.4872


Epoch: [2][500/1292] Elapsed 20m 31s (remain 32m 23s) Loss: 0.1177(0.0941) Grad: 5.8405  LR: 0.00000366  


Epoch 2 avg_val_loss: 0.1203  time: 1324s
Epoch 2 - content_rmse: 0.4423 - wording_rmse: 0.5481 - mcrmse: 0.4952
Epoch 2 avg_val_loss: 0.1302  time: 1417s
Epoch 2 - ema_content_rmse: 0.4436 - ema_wording_rmse: 0.5866 - ema_mcrmse: 0.5151


Epoch: [2][600/1292] Elapsed 24m 35s (remain 28m 16s) Loss: 0.0235(0.0960) Grad: 3.2223  LR: 0.00000352  


Epoch 2 avg_val_loss: 0.1181  time: 1568s
Epoch 2 - content_rmse: 0.4203 - wording_rmse: 0.5509 - mcrmse: 0.4856
Epoch 2 avg_val_loss: 0.1113  time: 1661s
Epoch 2 - ema_content_rmse: 0.4208 - ema_wording_rmse: 0.5249 - ema_mcrmse: 0.4728
Epoch 2 - ema_Save Best Score: 0.4728 Model


Epoch: [2][700/1292] Elapsed 28m 42s (remain 24m 12s) Loss: 0.0626(0.0970) Grad: 6.0898  LR: 0.00000338  


Epoch 2 avg_val_loss: 0.1157  time: 1815s
Epoch 2 - content_rmse: 0.4310 - wording_rmse: 0.5364 - mcrmse: 0.4837
Epoch 2 avg_val_loss: 0.1225  time: 1908s
Epoch 2 - ema_content_rmse: 0.4282 - ema_wording_rmse: 0.5663 - ema_mcrmse: 0.4973


Epoch: [2][800/1292] Elapsed 32m 46s (remain 20m 5s) Loss: 0.0909(0.0965) Grad: 2.2548  LR: 0.00000324  


Epoch 2 avg_val_loss: 0.1146  time: 2059s
Epoch 2 - content_rmse: 0.4230 - wording_rmse: 0.5372 - mcrmse: 0.4801
Epoch 2 avg_val_loss: 0.1273  time: 2152s
Epoch 2 - ema_content_rmse: 0.4358 - ema_wording_rmse: 0.5780 - ema_mcrmse: 0.5069


Epoch: [2][900/1292] Elapsed 36m 50s (remain 15m 59s) Loss: 0.0950(0.0948) Grad: 4.4822  LR: 0.00000309  


Epoch 2 avg_val_loss: 0.1190  time: 2303s
Epoch 2 - content_rmse: 0.4199 - wording_rmse: 0.5573 - mcrmse: 0.4886
Epoch 2 avg_val_loss: 0.1172  time: 2396s
Epoch 2 - ema_content_rmse: 0.4181 - ema_wording_rmse: 0.5512 - ema_mcrmse: 0.4846


Epoch: [2][1000/1292] Elapsed 40m 54s (remain 11m 53s) Loss: 0.0165(0.0949) Grad: 2.4111  LR: 0.00000294  


Epoch 2 avg_val_loss: 0.1300  time: 2548s
Epoch 2 - content_rmse: 0.4389 - wording_rmse: 0.5883 - mcrmse: 0.5136
Epoch 2 avg_val_loss: 0.1273  time: 2640s
Epoch 2 - ema_content_rmse: 0.4390 - ema_wording_rmse: 0.5766 - ema_mcrmse: 0.5078


Epoch: [2][1100/1292] Elapsed 44m 59s (remain 7m 48s) Loss: 0.0377(0.0946) Grad: 4.5903  LR: 0.00000279  


Epoch 2 avg_val_loss: 0.1142  time: 2792s
Epoch 2 - content_rmse: 0.4581 - wording_rmse: 0.5039 - mcrmse: 0.4810
Epoch 2 avg_val_loss: 0.1081  time: 2885s
Epoch 2 - ema_content_rmse: 0.4250 - ema_wording_rmse: 0.5082 - ema_mcrmse: 0.4666
Epoch 2 - ema_Save Best Score: 0.4666 Model


Epoch: [2][1200/1292] Elapsed 49m 6s (remain 3m 43s) Loss: 0.0952(0.0937) Grad: 4.7948  LR: 0.00000264  


Epoch 2 avg_val_loss: 0.1390  time: 3039s
Epoch 2 - content_rmse: 0.4392 - wording_rmse: 0.6236 - mcrmse: 0.5314
Epoch 2 avg_val_loss: 0.1280  time: 3132s
Epoch 2 - ema_content_rmse: 0.4324 - ema_wording_rmse: 0.5858 - ema_mcrmse: 0.5091


Epoch: [2][1291/1292] Elapsed 53m 5s (remain 0m 0s) Loss: 0.0607(0.0939) Grad: 2.8404  LR: 0.00000250  


Epoch 2 avg_val_loss: 0.1172  time: 3278s
Epoch 2 - content_rmse: 0.4210 - wording_rmse: 0.5502 - mcrmse: 0.4856
Epoch 2 avg_val_loss: 0.1194  time: 3371s
Epoch 2 - ema_content_rmse: 0.4235 - ema_wording_rmse: 0.5600 - ema_mcrmse: 0.4917


Epoch: [3][0/1292] Elapsed 0m 0s (remain 19m 0s) Loss: 0.1702(0.1702) Grad: 4.0934  LR: 0.00000250  


Epoch 3 avg_val_loss: 0.1167  time: 94s
Epoch 3 - content_rmse: 0.4203 - wording_rmse: 0.5486 - mcrmse: 0.4844
Epoch 3 avg_val_loss: 0.1192  time: 186s
Epoch 3 - ema_content_rmse: 0.4231 - ema_wording_rmse: 0.5593 - ema_mcrmse: 0.4912


Epoch: [3][100/1292] Elapsed 4m 5s (remain 48m 13s) Loss: 0.0557(0.0735) Grad: 1.4735  LR: 0.00000235  


Epoch 3 avg_val_loss: 0.1179  time: 338s
Epoch 3 - content_rmse: 0.4352 - wording_rmse: 0.5433 - mcrmse: 0.4893
Epoch 3 avg_val_loss: 0.1248  time: 431s
Epoch 3 - ema_content_rmse: 0.4338 - ema_wording_rmse: 0.5750 - ema_mcrmse: 0.5044


Epoch: [3][200/1292] Elapsed 8m 9s (remain 44m 16s) Loss: 0.0560(0.0661) Grad: 4.2541  LR: 0.00000220  


Epoch 3 avg_val_loss: 0.1221  time: 582s
Epoch 3 - content_rmse: 0.4310 - wording_rmse: 0.5656 - mcrmse: 0.4983
Epoch 3 avg_val_loss: 0.1272  time: 675s
Epoch 3 - ema_content_rmse: 0.4319 - ema_wording_rmse: 0.5841 - ema_mcrmse: 0.5080


Epoch: [3][300/1292] Elapsed 12m 13s (remain 40m 15s) Loss: 0.0456(0.0619) Grad: 3.4829  LR: 0.00000205  


Epoch 3 avg_val_loss: 0.1150  time: 827s
Epoch 3 - content_rmse: 0.4231 - wording_rmse: 0.5399 - mcrmse: 0.4815
Epoch 3 avg_val_loss: 0.1165  time: 919s
Epoch 3 - ema_content_rmse: 0.4248 - ema_wording_rmse: 0.5456 - ema_mcrmse: 0.4852


Epoch: [3][400/1292] Elapsed 16m 18s (remain 36m 13s) Loss: 0.0196(0.0611) Grad: 1.7661  LR: 0.00000190  


Epoch 3 avg_val_loss: 0.1311  time: 1071s
Epoch 3 - content_rmse: 0.4349 - wording_rmse: 0.5941 - mcrmse: 0.5145
Epoch 3 avg_val_loss: 0.1187  time: 1164s
Epoch 3 - ema_content_rmse: 0.4308 - ema_wording_rmse: 0.5505 - ema_mcrmse: 0.4907


Epoch: [3][500/1292] Elapsed 20m 22s (remain 32m 10s) Loss: 0.0274(0.0606) Grad: 2.0721  LR: 0.00000175  


Epoch 3 avg_val_loss: 0.1214  time: 1315s
Epoch 3 - content_rmse: 0.4392 - wording_rmse: 0.5521 - mcrmse: 0.4957
Epoch 3 avg_val_loss: 0.1254  time: 1408s
Epoch 3 - ema_content_rmse: 0.4339 - ema_wording_rmse: 0.5728 - ema_mcrmse: 0.5033


Epoch: [3][600/1292] Elapsed 24m 26s (remain 28m 6s) Loss: 0.0711(0.0611) Grad: 4.2896  LR: 0.00000161  


Epoch 3 avg_val_loss: 0.1159  time: 1560s
Epoch 3 - content_rmse: 0.4392 - wording_rmse: 0.5281 - mcrmse: 0.4837
Epoch 3 avg_val_loss: 0.1182  time: 1652s
Epoch 3 - ema_content_rmse: 0.4312 - ema_wording_rmse: 0.5449 - ema_mcrmse: 0.4880


Epoch: [3][700/1292] Elapsed 28m 31s (remain 24m 2s) Loss: 0.0243(0.0610) Grad: 3.5175  LR: 0.00000147  


Epoch 3 avg_val_loss: 0.1165  time: 1804s
Epoch 3 - content_rmse: 0.4243 - wording_rmse: 0.5442 - mcrmse: 0.4843
Epoch 3 avg_val_loss: 0.1177  time: 1897s
Epoch 3 - ema_content_rmse: 0.4238 - ema_wording_rmse: 0.5488 - ema_mcrmse: 0.4863


Epoch: [3][800/1292] Elapsed 32m 35s (remain 19m 58s) Loss: 0.0327(0.0608) Grad: 1.7326  LR: 0.00000133  


Epoch 3 avg_val_loss: 0.1239  time: 2048s
Epoch 3 - content_rmse: 0.4309 - wording_rmse: 0.5650 - mcrmse: 0.4980
Epoch 3 avg_val_loss: 0.1192  time: 2141s
Epoch 3 - ema_content_rmse: 0.4236 - ema_wording_rmse: 0.5538 - ema_mcrmse: 0.4887


Epoch: [3][900/1292] Elapsed 36m 39s (remain 15m 54s) Loss: 0.0444(0.0612) Grad: 2.7261  LR: 0.00000120  


Epoch 3 avg_val_loss: 0.1112  time: 2292s
Epoch 3 - content_rmse: 0.4219 - wording_rmse: 0.5247 - mcrmse: 0.4733
Epoch 3 avg_val_loss: 0.1123  time: 2385s
Epoch 3 - ema_content_rmse: 0.4203 - ema_wording_rmse: 0.5297 - ema_mcrmse: 0.4750


Epoch: [3][1000/1292] Elapsed 40m 43s (remain 11m 50s) Loss: 0.0366(0.0607) Grad: 3.7632  LR: 0.00000107  


Epoch 3 avg_val_loss: 0.1301  time: 2537s
Epoch 3 - content_rmse: 0.4281 - wording_rmse: 0.5934 - mcrmse: 0.5108
Epoch 3 avg_val_loss: 0.1210  time: 2629s
Epoch 3 - ema_content_rmse: 0.4267 - ema_wording_rmse: 0.5597 - ema_mcrmse: 0.4932


Epoch: [3][1100/1292] Elapsed 44m 48s (remain 7m 46s) Loss: 0.0492(0.0603) Grad: 1.6996  LR: 0.00000095  


Epoch 3 avg_val_loss: 0.1154  time: 2781s
Epoch 3 - content_rmse: 0.4307 - wording_rmse: 0.5354 - mcrmse: 0.4830
Epoch 3 avg_val_loss: 0.1181  time: 2874s
Epoch 3 - ema_content_rmse: 0.4287 - ema_wording_rmse: 0.5471 - ema_mcrmse: 0.4879


Epoch: [3][1200/1292] Elapsed 48m 52s (remain 3m 42s) Loss: 0.0580(0.0604) Grad: 3.3690  LR: 0.00000083  


Epoch 3 avg_val_loss: 0.1130  time: 3025s
Epoch 3 - content_rmse: 0.4348 - wording_rmse: 0.5217 - mcrmse: 0.4783
Epoch 3 avg_val_loss: 0.1137  time: 3118s
Epoch 3 - ema_content_rmse: 0.4337 - ema_wording_rmse: 0.5260 - ema_mcrmse: 0.4798


Epoch: [3][1291/1292] Elapsed 52m 51s (remain 0m 0s) Loss: 0.0486(0.0606) Grad: 4.0713  LR: 0.00000073  


Epoch 3 avg_val_loss: 0.1090  time: 3264s
Epoch 3 - content_rmse: 0.4261 - wording_rmse: 0.5132 - mcrmse: 0.4696
Epoch 3 - Save Best Score: 0.4696 Model
Epoch 3 avg_val_loss: 0.1112  time: 3360s
Epoch 3 - ema_content_rmse: 0.4295 - ema_wording_rmse: 0.5191 - ema_mcrmse: 0.4743


Epoch: [4][0/1292] Elapsed 0m 0s (remain 19m 14s) Loss: 0.0297(0.0297) Grad: 3.6454  LR: 0.00000073  


Epoch 4 avg_val_loss: 0.1092  time: 94s
Epoch 4 - content_rmse: 0.4261 - wording_rmse: 0.5137 - mcrmse: 0.4699
Epoch 4 avg_val_loss: 0.1111  time: 186s
Epoch 4 - ema_content_rmse: 0.4294 - ema_wording_rmse: 0.5189 - ema_mcrmse: 0.4741


Epoch: [4][100/1292] Elapsed 4m 5s (remain 48m 14s) Loss: 0.0414(0.0462) Grad: 3.2925  LR: 0.00000063  


Epoch 4 avg_val_loss: 0.1135  time: 338s
Epoch 4 - content_rmse: 0.4305 - wording_rmse: 0.5274 - mcrmse: 0.4790
Epoch 4 avg_val_loss: 0.1137  time: 431s
Epoch 4 - ema_content_rmse: 0.4286 - ema_wording_rmse: 0.5298 - ema_mcrmse: 0.4792


Epoch: [4][200/1292] Elapsed 8m 9s (remain 44m 17s) Loss: 0.0140(0.0424) Grad: 1.7108  LR: 0.00000053  


Epoch 4 avg_val_loss: 0.1166  time: 582s
Epoch 4 - content_rmse: 0.4309 - wording_rmse: 0.5394 - mcrmse: 0.4851
Epoch 4 avg_val_loss: 0.1168  time: 675s
Epoch 4 - ema_content_rmse: 0.4315 - ema_wording_rmse: 0.5395 - ema_mcrmse: 0.4855


Epoch: [4][300/1292] Elapsed 12m 13s (remain 40m 15s) Loss: 0.0178(0.0418) Grad: 1.6522  LR: 0.00000044  


Epoch 4 avg_val_loss: 0.1134  time: 826s
Epoch 4 - content_rmse: 0.4334 - wording_rmse: 0.5243 - mcrmse: 0.4788
Epoch 4 avg_val_loss: 0.1150  time: 919s
Epoch 4 - ema_content_rmse: 0.4311 - ema_wording_rmse: 0.5324 - ema_mcrmse: 0.4817


Epoch: [4][400/1292] Elapsed 16m 17s (remain 36m 12s) Loss: 0.0208(0.0414) Grad: 1.4298  LR: 0.00000036  


Epoch 4 avg_val_loss: 0.1112  time: 1071s
Epoch 4 - content_rmse: 0.4287 - wording_rmse: 0.5193 - mcrmse: 0.4740
Epoch 4 avg_val_loss: 0.1124  time: 1163s
Epoch 4 - ema_content_rmse: 0.4311 - ema_wording_rmse: 0.5222 - ema_mcrmse: 0.4766


Epoch: [4][500/1292] Elapsed 20m 22s (remain 32m 9s) Loss: 0.0996(0.0419) Grad: 3.3710  LR: 0.00000028  


Epoch 4 avg_val_loss: 0.1204  time: 1315s
Epoch 4 - content_rmse: 0.4362 - wording_rmse: 0.5497 - mcrmse: 0.4929
Epoch 4 avg_val_loss: 0.1165  time: 1408s
Epoch 4 - ema_content_rmse: 0.4338 - ema_wording_rmse: 0.5363 - ema_mcrmse: 0.4851


Epoch: [4][600/1292] Elapsed 24m 26s (remain 28m 5s) Loss: 0.0195(0.0418) Grad: 1.7199  LR: 0.00000022  


Epoch 4 avg_val_loss: 0.1188  time: 1559s
Epoch 4 - content_rmse: 0.4326 - wording_rmse: 0.5463 - mcrmse: 0.4895
Epoch 4 avg_val_loss: 0.1162  time: 1652s
Epoch 4 - ema_content_rmse: 0.4338 - ema_wording_rmse: 0.5352 - ema_mcrmse: 0.4845


Epoch: [4][700/1292] Elapsed 28m 30s (remain 24m 2s) Loss: 0.0205(0.0413) Grad: 3.1120  LR: 0.00000016  


Epoch 4 avg_val_loss: 0.1143  time: 1803s
Epoch 4 - content_rmse: 0.4335 - wording_rmse: 0.5278 - mcrmse: 0.4806
Epoch 4 avg_val_loss: 0.1154  time: 1896s
Epoch 4 - ema_content_rmse: 0.4336 - ema_wording_rmse: 0.5323 - ema_mcrmse: 0.4830


Epoch: [4][800/1292] Elapsed 32m 34s (remain 19m 58s) Loss: 0.0445(0.0411) Grad: 2.4572  LR: 0.00000011  


Epoch 4 avg_val_loss: 0.1170  time: 2048s
Epoch 4 - content_rmse: 0.4311 - wording_rmse: 0.5407 - mcrmse: 0.4859
Epoch 4 avg_val_loss: 0.1155  time: 2140s
Epoch 4 - ema_content_rmse: 0.4308 - ema_wording_rmse: 0.5350 - ema_mcrmse: 0.4829


Epoch: [4][900/1292] Elapsed 36m 38s (remain 15m 54s) Loss: 0.0539(0.0410) Grad: 3.0510  LR: 0.00000007  


Epoch 4 avg_val_loss: 0.1157  time: 2292s
Epoch 4 - content_rmse: 0.4340 - wording_rmse: 0.5331 - mcrmse: 0.4835
Epoch 4 avg_val_loss: 0.1165  time: 2384s
Epoch 4 - ema_content_rmse: 0.4328 - ema_wording_rmse: 0.5374 - ema_mcrmse: 0.4851


Epoch: [4][1000/1292] Elapsed 40m 43s (remain 11m 50s) Loss: 0.0283(0.0413) Grad: 3.7726  LR: 0.00000004  


Epoch 4 avg_val_loss: 0.1154  time: 2536s
Epoch 4 - content_rmse: 0.4349 - wording_rmse: 0.5311 - mcrmse: 0.4830
Epoch 4 avg_val_loss: 0.1154  time: 2629s
Epoch 4 - ema_content_rmse: 0.4349 - ema_wording_rmse: 0.5315 - ema_mcrmse: 0.4832


Epoch: [4][1100/1292] Elapsed 44m 47s (remain 7m 46s) Loss: 0.0254(0.0412) Grad: 3.2773  LR: 0.00000002  


Epoch 4 avg_val_loss: 0.1154  time: 2780s
Epoch 4 - content_rmse: 0.4337 - wording_rmse: 0.5322 - mcrmse: 0.4829
Epoch 4 avg_val_loss: 0.1154  time: 2873s
Epoch 4 - ema_content_rmse: 0.4344 - ema_wording_rmse: 0.5316 - ema_mcrmse: 0.4830


Epoch: [4][1200/1292] Elapsed 48m 51s (remain 3m 42s) Loss: 0.0495(0.0414) Grad: 5.8561  LR: 0.00000000  


Epoch 4 avg_val_loss: 0.1156  time: 3024s
Epoch 4 - content_rmse: 0.4334 - wording_rmse: 0.5332 - mcrmse: 0.4833
Epoch 4 avg_val_loss: 0.1155  time: 3117s
Epoch 4 - ema_content_rmse: 0.4336 - ema_wording_rmse: 0.5327 - ema_mcrmse: 0.4832


Epoch: [4][1291/1292] Elapsed 52m 50s (remain 0m 0s) Loss: 0.0195(0.0417) Grad: 2.9176  LR: 0.00000000  


Epoch 4 avg_val_loss: 0.1156  time: 3263s
Epoch 4 - content_rmse: 0.4334 - wording_rmse: 0.5332 - mcrmse: 0.4833
Epoch 4 avg_val_loss: 0.1156  time: 3356s
Epoch 4 - ema_content_rmse: 0.4334 - ema_wording_rmse: 0.5331 - ema_mcrmse: 0.4833


In [20]:
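# Disabled: out-of-fold check — loads each fold's saved predictions, scores them per fold, then scores all folds combined.
# total_complex = []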
# for fold in range(4):
#     va_data = train_df[train_df['fold'] == fold]
#     preds = torch.load('/content/drive/MyDrive/deb_simple/microsoft_deberta-v3-large_best{}.pth'.format(fold))['predictions']
#     va_data['preds'] = preds
#     va_data = va_data[['id', 'preds', 'score']]
#     print(compute_metrics(va_data['preds'].values.reshape(-1,1), va_data['score'].values))
#     total_complex.append(va_data)
# total_complex = pd.concat(total_complex)
# compute_metrics(total_complex['preds'].values.reshape(-1,1), total_complex['score'].values)

In [21]:
# !mkdir -p /root/.kaggle
# !cp /content/drive/MyDrive/kaggle/kaggle.json /root/.kaggle/
# !chmod 600 /root/.kaggle/kaggle.json
# !kaggle datasets init -p /content/drive/MyDrive/elc_mean/

In [22]:
#!kaggle datasets create -p /content/drive/MyDrive/elc_mean/

In [23]:
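# deberta v3 large
# NOTE(review): each row below looks like "<setting> <score>"; which setting and which metric is not recorded here — confirm.
# 1.5 0.8228
# 2 0.8197

# NOTE(review): the values below presumably omit a leading "0." (e.g. 8137 -> 0.8137) — confirm.
# 1.5 8137
# 2 8175
# 2.5 8181
# 3 8181
# 3.5 8175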

