## Log
* use 512 instead of 1024 as max_seq_len
* turn discriminative learning rates on
* use 1536 as max_seq_len

In [1]:
import argparse
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedGroupKFold
import shutil
import time
import gc
import random
import math
import torch
from torch.utils.data import DataLoader, Dataset
import transformers
from transformers import TrainingArguments, Trainer, DataCollatorForWholeWordMask
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, AutoModel
from transformers.modeling_outputs  import BaseModelOutput,SequenceClassifierOutput
from torch import nn
from torch.optim import Adam, SGD, AdamW
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup, DataCollatorWithPadding
# imports the torch_xla package
import wandb
from torch.nn.parameter import Parameter
#os.environ["WANDB_DISABLED"] = "true"

In [2]:
def parse_args():
    """Parse the command-line options of a training run.

    Returns:
        argparse.Namespace: carries ``fold`` and ``batch_size``. Both are
        converted to ``int`` when given and are ``None`` when omitted.
    """
    cli = argparse.ArgumentParser(description='Model training')
    # params of training
    cli.add_argument("--fold", dest="fold", type=int, default=None, help="Train fold")
    cli.add_argument(
        '--batch_size',
        dest='batch_size',
        type=int,
        default=None,
        help='Mini batch size of one gpu or cpu',
    )
    options = cli.parse_args()
    return options


# Config

In [3]:
class CFG:
    """Notebook-wide configuration, read as class attributes (never instantiated)."""
    # Selects the data source in train_loop: True trains on 'tmp_pessudo.csv'
    # and validates on the full labelled frame instead of a held-out fold.
    pretraining = False
    # When True, build_model() loads a saved state dict into the 'pool' model.
    load_pretrained = False
    # Directory holding prompts_train.csv and summaries_train.csv.
    input_path = './input/'
    # '1' feeds title + text + question as the prompt; any other value feeds
    # only the prompt question (see TrainDataset.tokenize).
    input_type = '2'
    # Hugging Face model id; alternatives tried: nghuyong/ernie-2.0-large-en, studio-ousia/luke-large
    model_path = 'microsoft/deberta-v3-large' #  nghuyong/ernie-2.0-large-en studio-ousia/luke-large
    # Architecture branch taken by build_model(): 'base', 'simple' or 'pool'.
    model_type = 'pool'
    scheduler = 'cosine'  # ['linear', 'cosine']
    batch_scheduler = True
    num_cycles = 0.5  # 1.5
    num_warmup_steps = 0
    # Tokenizer max_length; every sample is padded/truncated to this length.
    max_input_length = 1536
    # Written into the model config as "max_position_embeddings".
    max_position_embeddings = 1536
    # Fold ids iterated by train_loop (each one is used as the validation fold).
    folds = [2]
    epochs = 4  # 5
    # layer-wise learning rate
    discriminative_learning_rate = True
    discriminative_learning_rate_num_groups = 1
    discriminative_learning_rate_decay_rate = 0.99
    # number of top encoder layers to re-initialise (0 disables it)
    reinit_layers = 0
    
#     previously tried learning rates:
#     encoder_lr = 5e-6
#     head_lr = 5e-6
    encoder_lr = 6e-6
    head_lr = 1e-5
    
    min_lr = 1e-7
    eps = 1e-7
    betas = (0.9, 0.999)
    # Weight decay for parameters not matched by the optimizer's no-decay list.
    weight_decay = 0
    # Used for both hidden and attention dropout in the model config.
    dropout = 0
    num_fold = 5
    batch_size = 2
    seed = 42
    # Directory receiving the log file and the saved checkpoints.
    OUTPUT_DIR = './pretrain/'
    num_workers = 2
    device='cuda'
    # train_fn prints progress and runs validation every `print_freq` steps.
    print_freq = 100
    
    

## logger

In [4]:
def get_logger(filename=CFG.OUTPUT_DIR+ 'train'):
    """Return this module's logger, echoing to the console and to ``<filename>.log``.

    The function is safe to call repeatedly (e.g. when the notebook cell is
    re-run): ``getLogger(__name__)`` always returns the same logger object, so
    handlers are only attached when an equivalent one is not already present.
    Without that guard every call would add another pair of handlers and each
    message would be emitted several times.

    Args:
        filename: log file path without the ``.log`` suffix.

    Returns:
        logging.Logger: logger at INFO level with a plain ``%(message)s`` format.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    # exist_ok avoids the race between an existence check and the creation.
    os.makedirs(CFG.OUTPUT_DIR, exist_ok=True)
    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # FileHandler stores its target as an absolute path in `baseFilename`.
    log_path = os.path.abspath(f"{filename}.log")
    # FileHandler subclasses StreamHandler, hence the exact type comparison.
    has_console = any(type(h) is StreamHandler for h in logger.handlers)
    has_file = any(
        isinstance(h, FileHandler) and h.baseFilename == log_path
        for h in logger.handlers
    )

    if not has_console:
        handler1 = StreamHandler()
        handler1.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler1)
    if not has_file:
        handler2 = FileHandler(filename=f"{filename}.log")
        handler2.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler2)
    return logger

# Create the shared logger and stamp the run's key hyper-parameters at the top of the log.
LOGGER = get_logger()
LOGGER.info(f'===============lr_{CFG.encoder_lr}===============')
LOGGER.info(f'===============seed_{CFG.seed}===============')
LOGGER.info(f'===============total_epochs_{CFG.epochs}===============')
LOGGER.info(f'===============num_warmup_steps_{CFG.num_warmup_steps}===============')



# Preproc

In [5]:
# Load the prompts and the student summaries, then attach each summary to its prompt.
pdf = pd.read_csv(f"{CFG.input_path}/prompts_train.csv")
sdf = pd.read_csv(f"{CFG.input_path}/summaries_train.csv")
df = pd.merge(pdf, sdf, on="prompt_id")

# 4 prompt ids, 4 folds: every prompt forms its own fold, in this order.
id2fold = {
    prompt_id: fold_index
    for fold_index, prompt_id in enumerate(["814d6b", "39c16e", "3b9047", "ebad26"])
}
df["fold"] = df["prompt_id"].map(id2fold)

In [6]:
df 

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,student_id,text,content,wording,fold
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00791789cc1f,1 element of an ideal tragedy is that it shoul...,-0.210614,-0.471415,1
1,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0086ef22de8f,The three elements of an ideal tragedy are: H...,-0.970237,-0.417058,1
2,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0094589c7a22,Aristotle states that an ideal tragedy should ...,-0.387791,-0.584181,1
3,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00cd5736026a,One element of an Ideal tragedy is having a co...,0.088882,-0.594710,1
4,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00d98b8ff756,The 3 ideal of tragedy is how complex you need...,-0.687288,-0.460886,1
...,...,...,...,...,...,...,...,...,...
7160,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",ff37545b2805,"In paragraph two, they would use pickle meat a...",1.520355,-0.292990,3
7161,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",ff4ed38ef099,"in the first paragraph it says ""either can it...",-1.204574,-1.169784,3
7162,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",ff53b94f7ce0,They would have piles of filthy meat on the fl...,0.328739,-1.053294,3
7163,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",ff7c7e70df07,They used all sorts of chemical concoctions to...,0.205683,0.380538,3


In [7]:
class AverageMeter(object):
    """Keeps the latest value together with a running weighted mean.

    Attributes:
        val: most recent value passed to ``update``.
        sum: weighted sum of all values seen since the last ``reset``.
        count: total weight seen since the last ``reset``.
        avg: ``sum / count``.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero every statistic."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running mean."""
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration in seconds as ``'<minutes>m <seconds>s'`` (fractions dropped)."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Describe elapsed and projected remaining time.

    Args:
        since: start timestamp as returned by ``time.time()``.
        percent: fraction of the work completed so far; must be non-zero.

    Returns:
        str: ``'<elapsed> (remain <remaining>)'`` with both parts formatted by
        ``asMinutes``. The remaining time is a linear extrapolation.
    """
    elapsed = time.time() - since
    projected_total = elapsed / (percent)
    remaining = projected_total - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))

# Tokenizer

In [8]:
# Tokenizer matching CFG.model_path. The captured output below shows a request to
# huggingface.co, so this cell needs network access (or, presumably, a local cache).
tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)

'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /microsoft/deberta-v3-large/resolve/main/tokenizer_config.json (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f5ac047b5e0>, 'Connection to huggingface.co timed out. (connect timeout=10)'))"), '(Request ID: 432be5ad-1796-4a39-af51-faf5e8bb94dc)')' thrown while requesting HEAD https://huggingface.co/microsoft/deberta-v3-large/resolve/main/tokenizer_config.json
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Dataset

In [9]:
# Whole-word-masking collator used by data_collator() to corrupt the input ids.
mask_lm_datacollator = DataCollatorForWholeWordMask(tokenizer)
def data_collator(batch):
    """Collate samples into a batch whose input ids have been masked.

    Each sample is indexed positionally as
    ``(input_ids, token_type_ids, attention_mask, labels)``.
    NOTE(review): TrainDataset in this notebook yields string-keyed dicts,
    which this positional indexing would not accept - confirm which dataset
    this collator is meant for.

    Returns:
        tuple: ``(masked_input_ids, token_type_ids, attention_mask, labels)``;
        the last three are stacked along a new leading dimension.
    """
    wrapped_ids, type_ids, masks, targets = [], [], [], []
    for sample in batch:
        wrapped_ids.append({'input_ids': sample[0]})
        type_ids.append(sample[1])
        masks.append(sample[2])
        targets.append(sample[3])

    # Only the masked ids are kept from the masking collator's output.
    masked_input = mask_lm_datacollator(wrapped_ids)['input_ids']
    stacked = tuple(torch.stack(column) for column in (type_ids, masks, targets))
    return (masked_input,) + stacked

In [10]:
class TrainDataset(Dataset):
    """Torch dataset pairing each student summary with its prompt.

    Args:
        df: frame with the columns ``prompt_title``, ``prompt_text``,
            ``prompt_question``, ``text``, ``content`` and ``wording``.
        tokenizer: callable tokenizer exposing ``sep_token``; it is used for
            every sample produced by this dataset.
    """

    def __init__(self, df, tokenizer):
        self.prompt_title = df['prompt_title'].values.astype(str)
        self.prompt_text = df['prompt_text'].values.astype(str)
        self.prompt_question = df['prompt_question'].values.astype(str)
        self.text = df['text'].values.astype(str)
        self.content = df['content'].values
        self.wording = df['wording'].values
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.prompt_title)

    def tokenize(self, example):
        """Tokenize one example as the pair (summary text, prompt).

        With ``CFG.input_type == '1'`` the prompt is title, text and question
        joined by the separator token; otherwise it is the question alone.

        Returns:
            dict: the tokenizer output plus ``"labels"`` = ``[content, wording]``.
        """
        sep = self.tokenizer.sep_token
        if CFG.input_type == '1':
            prompt = sep.join([example["prompt_title"], example["prompt_text"], example["prompt_question"]])
        else:
            prompt = example["prompt_question"]

        labels = [float(example["content"]), float(example["wording"])]

        # Use the tokenizer handed to this dataset. The previous code called the
        # notebook-level global here, silently ignoring the constructor argument.
        tokenized = self.tokenizer(
            example["text"],
            prompt,
            padding='max_length',
            truncation=True,
            max_length=CFG.max_input_length,
            return_tensors=None,
        )

        return {
            **tokenized,
            "labels": labels,
        }

    def __getitem__(self, item):
        """Return sample ``item`` as a dict of tensors (ids/masks long, labels float)."""
        example = {
            "prompt_title": self.prompt_title[item],
            "prompt_text": self.prompt_text[item],
            "prompt_question": self.prompt_question[item],
            "text": self.text[item],
            "content": self.content[item],
            "wording": self.wording[item],
        }

        out = self.tokenize(example)

        return {
            'input_ids': torch.as_tensor(out['input_ids'], dtype=torch.long),
            'token_type_ids': torch.as_tensor(out['token_type_ids'], dtype=torch.long),
            'attention_mask': torch.as_tensor(out['attention_mask'], dtype=torch.long),
            'labels': torch.as_tensor(out['labels'], dtype=torch.float),
        }
        
        
        

## Model

In [11]:
def init_params(module_lst):
    """Apply Xavier-uniform initialisation, in place, to the weight matrices of each module.

    Only parameters with more than one dimension are touched, so 1-D
    parameters such as biases keep their current values.
    """
    multi_dim_params = (
        tensor
        for module in module_lst
        for tensor in module.parameters()
        if tensor.dim() > 1
    )
    for tensor in multi_dim_params:
        torch.nn.init.xavier_uniform_(tensor)

class Custom_Bert(nn.Module):
    """Encoder with a learned mix of hidden states, attention pooling and a 1-output head.

    The encoder named by ``CFG.model_path`` is loaded with
    ``output_hidden_states`` enabled. ``forward`` blends the last 24 hidden
    states with softmax-normalised learned weights, pools the blend with an
    attention module and maps it to a single value with a linear layer.
    """

    def __init__(self):
        super().__init__()

        config = AutoConfig.from_pretrained(CFG.model_path)
        config.update({"output_hidden_states":True})

        self.base = AutoModel.from_pretrained(CFG.model_path, config=config)

        dim = config.hidden_size

        self.dropout = nn.Dropout(p=0.2)
        self.high_dropout = nn.Dropout(p=0.5)

        # One mixing weight per hidden state used in forward(). All start at -3
        # except the last one (0), so after the softmax the final hidden state
        # dominates at initialisation.
        # NOTE(review): 24 is hard-coded here and in forward(); it is not
        # derived from the loaded config.
        n_weights = 24
        weights_init = torch.zeros(n_weights).float()
        weights_init.data[:-1] = -3
        self.layer_weights = torch.nn.Parameter(weights_init)

        # Scores each position and normalises the scores with a softmax over
        # dim 1 (presumably the sequence axis - TODO confirm the input layout).
        self.attention = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size),
            nn.Tanh(),
            nn.Linear(config.hidden_size, 1),
            nn.Softmax(dim=1)
        )
        self.cls = nn.Sequential(
            nn.Linear(dim,1)
        )
        # Xavier-initialise the freshly created head and attention weights.
        init_params([self.cls,self.attention])

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the model.

        Returns:
            The head output when ``labels`` is None, otherwise the tuple
            ``(mse_loss, output)`` where the loss compares the output with
            dim 1 squeezed against ``labels``.
        """
        base_output = self.base(input_ids=input_ids,
                                attention_mask=attention_mask,
                                )

        # Stack the last 24 hidden states (each passed through dropout) along a
        # new leading dimension.
        cls_outputs = torch.stack(
            [self.dropout(layer) for layer in base_output['hidden_states'][-24:]], dim=0
        )
        # Weighted sum over that leading dimension using the softmaxed layer weights.
        cls_output = (torch.softmax(self.layer_weights, dim=0).unsqueeze(1).unsqueeze(1).unsqueeze(1) * cls_outputs).sum(0)

        # Attention pooling is evaluated 5 times, each with a fresh p=0.5
        # dropout draw on the attention input, and the 5 results are averaged.
        logits = torch.mean(
            torch.stack(
                [torch.sum(self.attention(self.high_dropout(cls_output)) * cls_output, dim=1) for _ in range(5)],
                dim=0,
            ),
            dim=0,
        )

        output = self.cls(logits)
        if labels is None:
            return output

        else:
            return (nn.MSELoss()(torch.squeeze(output,1),labels), output)


class Custom_Bert_Simple(nn.Module):
    """Encoder followed by an unmasked mean over positions and a 2-output linear head."""

    def __init__(self):
        super().__init__()

        config = AutoConfig.from_pretrained(CFG.model_path)
        config.update({
            "hidden_dropout_prob": CFG.dropout,
            "attention_probs_dropout_prob": CFG.dropout,
            "num_labels": 2,
            "problem_type": "regression",
            "max_position_embeddings": CFG.max_position_embeddings
        })
        self.base = AutoModel.from_pretrained(CFG.model_path, config=config)
        dim = config.hidden_size
        # Kept so the module's attributes stay unchanged; forward() does not use it.
        self.dropout = nn.Dropout(p=0)
        self.cls = nn.Linear(dim,2)

    def forward(self, input_ids, attention_mask, token_type_ids, labels=None):
        """Run the model.

        Args:
            input_ids, attention_mask, token_type_ids: encoder inputs.
            labels: optional regression targets. When omitted (inference) no
                loss is computed.

        Returns:
            SequenceClassifierOutput with ``logits`` set and ``loss`` set to
            the MSE against ``labels``, or ``None`` without labels.
        """
        base_output = self.base(input_ids=input_ids,
                                attention_mask=attention_mask,
                                token_type_ids=token_type_ids
                               )
        output = base_output.last_hidden_state
        # NOTE(review): the mean over dim 1 ignores attention_mask, so padded
        # positions contribute to the pooled vector.
        output = self.cls(torch.mean(output, dim=1))
        loss = None if labels is None else nn.MSELoss()(output,labels)
        return SequenceClassifierOutput(
            loss=loss,
            logits=output,
            hidden_states=None,
            attentions=None
        )

class GeMText(nn.Module):
    """Masked generalized-mean pooling with a learnable exponent ``p``.

    Args:
        dim: dimension that is reduced by the pooling.
        p: initial value of the learnable exponent.
        eps: lower clamp applied to the activations and to the mask sum.
    """

    def __init__(self, dim = 1, p=3, eps=1e-6):
        super().__init__()
        self.dim = dim
        self.p = Parameter(torch.ones(1) * p)
        self.eps = eps
        self.feat_mult = 1

    def forward(self, last_hidden_state, attention_mask):
        """Pool ``last_hidden_state`` over ``self.dim``, ignoring masked-out positions."""
        # Broadcast the mask to the shape of the activations.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.shape)
        # Clamp to at least eps, zero the masked positions, raise to the power p.
        powered = (last_hidden_state.clamp(min=self.eps) * mask).pow(self.p)
        numerator = powered.sum(self.dim)
        # Clamp the divisor so a fully masked row cannot divide by zero.
        denominator = mask.sum(self.dim).clip(min=self.eps)
        pooled = numerator / denominator
        return pooled.pow(1 / self.p)

class Custom_Bert_Pool(nn.Module):
    """Encoder followed by GeM pooling and a 2-output linear regression head."""

    def __init__(self):
        super().__init__()

        self.config = AutoConfig.from_pretrained(CFG.model_path)
        self.config.update({
            "hidden_dropout_prob": CFG.dropout,
            "attention_probs_dropout_prob": CFG.dropout,
            "num_labels": 2,
            "problem_type": "regression",
            "max_position_embeddings": CFG.max_position_embeddings
        })
        # The message refers to the locally pre-trained checkpoint mentioned in
        # the commented alternative below; the weights named by CFG.model_path
        # are still loaded here.
        print('no pretrained model loaded.')
        self.base = AutoModel.from_pretrained(CFG.model_path, config=self.config)

#         Alternative: start from a locally pre-trained encoder.
#         print('load pretrained model ...');
#         self.base = AutoModel.from_pretrained('./input/pretrain/pretrained_model_1009', config = self.config)

        self.pool = GeMText()
        self.cls = nn.Linear(self.config.hidden_size,2)

    def forward(self, input_ids, attention_mask, token_type_ids, labels=None):
        """Run the model.

        Args:
            input_ids, attention_mask, token_type_ids: encoder inputs.
            labels: optional regression targets. When omitted (inference) no
                loss is computed.

        Returns:
            SequenceClassifierOutput with ``logits`` set and ``loss`` set to
            the SmoothL1 loss against ``labels``, or ``None`` without labels.
        """
        base_output = self.base(input_ids=input_ids,
                                attention_mask=attention_mask,
                                token_type_ids=token_type_ids
                               )
        output = base_output.last_hidden_state
        output = self.cls(self.pool(output, attention_mask))
        loss = None if labels is None else nn.SmoothL1Loss()(output,labels)
        return SequenceClassifierOutput(
            loss=loss,
            logits=output,
            hidden_states=None,
            attentions=None
        )

    def _init_weights(self, module):
        """Re-initialise one sub-module; meant to be passed to ``nn.Module.apply``.

        Linear and Embedding weights are drawn from a normal distribution with
        std ``config.initializer_range`` (biases and the padding row are
        zeroed); LayerNorm is reset to weight 1 and bias 0. Other module types
        are left untouched.
        """
        if isinstance(module, nn.Linear):
            print(f'Re-initialize {module}')
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            print(f'Re-initialize {module}')
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            print(f'Re-initialize {module}')
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

class Custom_Bert_Mean(nn.Module):
    """Encoder whose last hidden state is averaged over dim 1 and fed to a 1-output head."""

    def __init__(self):
        super().__init__()

        encoder_config = AutoConfig.from_pretrained(CFG.model_path)
        encoder_config.output_hidden_states = True
        self.base = AutoModel.from_pretrained(CFG.model_path, config=encoder_config)
        self.dropout = nn.Dropout(p=0)
        self.cls = nn.Linear(encoder_config.hidden_size, 1)

    def forward(self, input_ids, attention_mask,labels=None):
        """Return the head output, or ``(mse_loss, output)`` when ``labels`` is given."""
        encoded = self.base(input_ids=input_ids, attention_mask=attention_mask)
        final_layer = encoded.hidden_states[-1]
        pooled = torch.mean(final_layer, dim=1)
        output = self.cls(self.dropout(pooled))
        if labels is not None:
            loss = nn.MSELoss()(torch.squeeze(output, 1), labels)
            return (loss, output)
        return output

class Custom_Bert_M(nn.Module):
    """Variant of the layer-mixing / attention-pooling model with two heads.

    ``cls_0`` produces one regression value and ``cls_1`` produces 5 class
    scores from the same pooled vector; training combines an MSE loss and a
    cross-entropy loss.
    """

    def __init__(self):
        super().__init__()

        config = AutoConfig.from_pretrained(CFG.model_path)
        config.update({"output_hidden_states":True})

        self.base = AutoModel.from_pretrained(CFG.model_path, config=config)

        dim = config.hidden_size

        self.dropout = nn.Dropout(p=0.2)
        self.high_dropout = nn.Dropout(p=0.5)

        # One mixing weight per hidden state used in forward(); all start at -3
        # except the last (0), so the final hidden state dominates initially.
        # NOTE(review): 24 is hard-coded rather than read from the config.
        n_weights = 24
        weights_init = torch.zeros(n_weights).float()
        weights_init.data[:-1] = -3
        self.layer_weights = torch.nn.Parameter(weights_init)

        # Scores each position and softmax-normalises the scores over dim 1
        # (presumably the sequence axis - TODO confirm).
        self.attention = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size),
            nn.Tanh(),
            nn.Linear(config.hidden_size, 1),
            nn.Softmax(dim=1)
        )
        # Regression head.
        self.cls_0 = nn.Sequential(
            nn.Linear(dim,1)
        )

        # Auxiliary 5-way classification head.
        self.cls_1 = nn.Linear(dim,5)
        init_params([self.cls_0,self.cls_1,self.attention])

    # NOTE(review): `labels` has no default even though the body handles None.
    def forward(self, input_ids, attention_mask, labels):
        """Run the model.

        Returns:
            The regression output when ``labels`` is None, otherwise
            ``(0.8 * mse_loss + 0.2 * cross_entropy_loss, regression_output)``.
        """
        base_output = self.base(input_ids=input_ids,
                    attention_mask=attention_mask,
                             )

        # Stack the last 24 hidden states (each through dropout) and blend them
        # with the softmaxed layer weights.
        cls_outputs = torch.stack(
            [self.dropout(layer) for layer in base_output['hidden_states'][-24:]], dim=0
        )
        cls_output = (torch.softmax(self.layer_weights, dim=0).unsqueeze(1).unsqueeze(1).unsqueeze(1) * cls_outputs).sum(0)

        # Attention pooling evaluated 5 times with independent p=0.5 dropout
        # draws; the 5 pooled vectors are averaged.
        logits = torch.mean(
            torch.stack(
                [torch.sum(self.attention(self.high_dropout(cls_output)) * cls_output, dim=1) for _ in range(5)],
                dim=0,
            ),
            dim=0,
        )

        output_0 = self.cls_0(logits)
        output_1 = self.cls_1(logits)
        if labels is None:
            return output_0

        else:
            regression_loss = nn.MSELoss()(torch.squeeze(output_0,1),labels)
            labels = labels.double()
            # Map the label values 0.25 / 0.5 / 0.75 / 1.0 to class ids 1 / 2 / 3 / 4.
            # 1.0 has to be remapped first: the next step writes 1.0 for the
            # 0.25 labels, and those must not be turned into 4 afterwards.
            cls_labels = torch.where(labels==1.,4.0,labels)
            cls_labels = torch.where(cls_labels==0.25,1.0,cls_labels)
            cls_labels = torch.where(cls_labels==0.5,2.0,cls_labels)
            cls_labels = torch.where(cls_labels==0.75,3.0,cls_labels)
            # Any value not matched above is simply truncated to an integer here.
            cls_labels = cls_labels.long()
            cls_loss = nn.CrossEntropyLoss()(output_1, cls_labels)
            return ( 0.8 * regression_loss + 0.2 * cls_loss, output_0)

In [12]:
def build_model():
    """Instantiate the model selected by ``CFG.model_type``.

    Supported values:
        * ``'base'``   - ``AutoModelForSequenceClassification`` with 2 regression labels.
        * ``'simple'`` - ``Custom_Bert_Simple``.
        * ``'pool'``   - ``Custom_Bert_Pool``, optionally with the last
          ``CFG.reinit_layers`` encoder layers re-initialised and, when
          ``CFG.load_pretrained`` is set, a saved state dict loaded on top.

    Returns:
        The constructed model (not yet moved to any device).

    Raises:
        ValueError: if ``CFG.model_type`` is not one of the supported values.
            Previously this fell through to ``return model`` and failed with
            an UnboundLocalError.
    """
    if CFG.model_type == 'base':
        model_config = AutoConfig.from_pretrained(CFG.model_path)
        model_config.update({
            "hidden_dropout_prob": CFG.dropout,
            "attention_probs_dropout_prob": CFG.dropout,
            "num_labels": 2,
            "problem_type": "regression",
            "max_position_embeddings": CFG.max_position_embeddings
        })

        #print(model_config)
        model = AutoModelForSequenceClassification.from_pretrained(
            CFG.model_path, config=model_config
        )
    elif CFG.model_type == 'simple':
        model = Custom_Bert_Simple()
    elif CFG.model_type == 'pool':
        model = Custom_Bert_Pool()
        if CFG.reinit_layers > 0:
            print("=="*40)
            print(f"Reinitialize the last {CFG.reinit_layers} layer(s).")
            for layer in model.base.encoder.layer[-CFG.reinit_layers:]:
                print("===")
                layer.apply(model._init_weights)
            print("=="*40)
        if CFG.load_pretrained:
            # NOTE(review): the file name mentions deberta-v3-base while the
            # default CFG.model_path is deberta-v3-large - confirm they match.
            # map_location='cpu' keeps the checkpoint off the GPU;
            # load_state_dict copies the values into the model's own tensors.
            checkpoint = torch.load(
                './pretrained/microsoft_deberta-v3-base_best_ema.pth',
                map_location='cpu',
            )
            model.load_state_dict(checkpoint['model'])
    else:
        raise ValueError(
            f"Unknown CFG.model_type {CFG.model_type!r}; expected 'base', 'simple' or 'pool'"
        )
    return model

# Train

In [13]:
from copy import deepcopy
class ModelEMA:
    """Exponential moving average (EMA) of a model's ``state_dict``.

    Adapted from https://github.com/rwightman/pytorch-image-models. A frozen
    deep copy of the model is kept in eval mode, and every call to ``update``
    blends the live model's floating-point parameters and buffers into it.
    See also https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage.

    A smoothed copy of the weights is needed by some training schemes.
    Where this object is created relative to model initialisation, device
    placement and distributed wrappers matters, because the copy is taken at
    construction time.
    """

    def __init__(self, model, decay=0.9999, updates=0):
        """
        Args:
            model (nn.Module): model to track.
            decay (float): asymptotic EMA decay rate.
            updates (int): number of EMA updates already performed.
        """
        # The EMA copy never receives gradients and always stays in eval mode.
        shadow = deepcopy(model).eval()
        for weight in shadow.parameters():
            weight.requires_grad_(False)
        self.ema_model = shadow
        self.ema = shadow
        self.updates = updates

        def ramped_decay(step):
            # Exponential ramp: close to 0 for the first updates, approaching
            # `decay` as `step` grows (helps early epochs).
            return decay * (1 - math.exp(-step / 2000))

        self.decay = ramped_decay

    def update(self, model):
        """Blend the current weights of ``model`` into the EMA copy."""
        with torch.no_grad():
            self.updates += 1
            keep = self.decay(self.updates)
            live_state = model.state_dict()
            for name, shadow_tensor in self.ema.state_dict().items():
                # Integer buffers are left untouched.
                if not shadow_tensor.dtype.is_floating_point:
                    continue
                shadow_tensor *= keep
                shadow_tensor += (1.0 - keep) * live_state[name].detach()

class EMAHook:
    """EMAHook used in BEVDepth.

    Modified from https://github.com/Megvii-Base
    Detection/BEVDepth/blob/main/callbacks/ema.py.

    Wraps a ``ModelEMA`` and exposes ``after_train_iter`` to be called once
    per optimizer step.

    Args:
        model: model to track.
        init_updates: initial EMA update counter.
        decay: asymptotic EMA decay rate.
        resume: optional checkpoint path holding ``'state_dict'`` and
            ``'updates'`` entries to restore the EMA from.
        logger: optional logger used to report a resume.
    """

    def __init__(self, model, init_updates=0, decay=0.9990, resume=None, logger=None):
        super().__init__()
        self.init_updates = init_updates
        self.resume = resume
        self.decay = decay
        # Assigned before before_run(), which reads it when resuming; the old
        # order raised AttributeError whenever `resume` was given.
        self.logger = logger
        self.ema_model = self.before_run(model)

    def before_run(self, model):
        """Create the ``ModelEMA`` for ``model`` and optionally restore it from ``self.resume``."""
        from torch.nn.modules.batchnorm import SyncBatchNorm

        # Process groups are detached while ModelEMA deep-copies the model and
        # re-attached to the original modules afterwards.
        bn_model_list = list()
        bn_model_dist_group_list = list()
        for model_ref in model.modules():
            if isinstance(model_ref, SyncBatchNorm):
                bn_model_list.append(model_ref)
                bn_model_dist_group_list.append(model_ref.process_group)
                model_ref.process_group = None
        ema_model = ModelEMA(model, self.decay)

        for bn_model, dist_group in zip(bn_model_list,
                                        bn_model_dist_group_list):
            bn_model.process_group = dist_group
        ema_model.updates = self.init_updates

        if self.resume is not None:
            if self.logger is not None:
                self.logger.info(f'resume ema checkpoint from {self.resume}')
            cpt = torch.load(self.resume, map_location='cpu')
            # The previous code called a free function `load_state_dict` that
            # is neither defined nor imported in this notebook; use the
            # module's own (strict) loader instead.
            ema_model.ema.load_state_dict(cpt['state_dict'])
            ema_model.updates = cpt['updates']

        return ema_model

    def after_train_iter(self, model):
        """Fold the current weights of ``model`` into the EMA."""
        self.ema_model.update(model)

In [14]:
def compute_mcrmse(eval_pred):
    """
    Mean column-wise root mean squared error (MCRMSE).
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation

    Args:
        eval_pred: pair ``(preds, labels)`` of equally shaped arrays whose
            column 0 is the content score and column 1 the wording score.

    Returns:
        dict with the keys ``content_rmse``, ``wording_rmse`` and ``mcrmse``,
        in that order.
    """
    predictions, targets = eval_pred

    squared_error = (predictions - targets) ** 2
    per_column_rmse = np.sqrt(np.mean(squared_error, axis=0))

    metrics = {}
    metrics["content_rmse"] = per_column_rmse[0]
    metrics["wording_rmse"] = per_column_rmse[1]
    metrics["mcrmse"] = np.mean(per_column_rmse)
    return metrics

In [15]:
def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators.

    Also exports ``PYTHONHASHSEED`` and sets
    ``torch.backends.cudnn.deterministic`` to True.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    for reseed in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        reseed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything(CFG.seed)

In [16]:
CFG.discriminative_learning_rate_num_groups

1

In [17]:
def _split_weight_decay(named_params, lr, no_decay):
    """Build the (decayed, non-decayed) optimizer group pair for some parameters.

    Args:
        named_params: iterable of ``(name, parameter)`` pairs.
        lr: learning rate assigned to both groups.
        no_decay: name fragments; a parameter whose name contains any of them
            is placed in the group whose ``weight_decay`` is 0.0.

    Returns:
        list: two param-group dicts; the first uses ``CFG.weight_decay``, the
        second uses 0.0. Either ``"params"`` list may be empty.
    """
    decayed, exempt = [], []
    for name, param in named_params:
        bucket = exempt if any(nd in name for nd in no_decay) else decayed
        bucket.append(param)
    return [
        {"params": decayed, "lr": lr, "weight_decay": CFG.weight_decay},
        {"params": exempt, "lr": lr, "weight_decay": 0.0},
    ]


def get_optimizer_llr_params(model, type='s'):
    """
    Build optimizer parameter groups, optionally with layer-wise learning rates.

    With ``CFG.discriminative_learning_rate`` disabled, the whole network uses
    ``CFG.encoder_lr``. When enabled, the groups are, in order:

    * embeddings at ``decay_rate ** num_layers * encoder_lr``;
    * each encoder layer, bottom to top, where the layers are bucketed into
      ``CFG.discriminative_learning_rate_num_groups`` groups and the top group
      uses ``encoder_lr`` undecayed;
    * encoder parameters that do not belong to a layer, at the top layer's rate;
    * every parameter whose name does not contain ``'base'`` (the task head),
      at ``CFG.head_lr``.

    Each of these is split in two: parameters matching the no-decay list
    (biases and LayerNorm parameters) get ``weight_decay`` 0.0, the others
    ``CFG.weight_decay``.

    Args:
        model: module exposing ``base.embeddings``, ``base.encoder.layer`` and
            ``config.num_hidden_layers`` (only needed when discriminative
            learning rates are enabled).
        type: unused; kept so existing call sites keep working.

    Returns:
        list[dict]: parameter groups accepted by ``torch.optim`` optimizers.

    Raises:
        ValueError: if the configured number of groups is not between 1 and
            the number of encoder layers.

    reference: https://github.com/huggingface/transformers/blob/05fa1a7ac17bb7aa07b9e0c1e138ecb31a28bbfe/src/transformers/trainer.py#L804
    """
    no_decay = ["bias", "LayerNorm.weight", "LayerNorm.bias"]

    if not CFG.discriminative_learning_rate:
        # One learning rate for the entire network.
        return _split_weight_decay(model.named_parameters(), CFG.encoder_lr, no_decay)

    num_layers = model.config.num_hidden_layers
    num_groups = CFG.discriminative_learning_rate_num_groups
    # Out-of-range values used to surface as ZeroDivisionError, IndexError or
    # a cryptic "range() arg 3 must not be zero".
    if not 1 <= num_groups <= num_layers:
        raise ValueError(
            "CFG.discriminative_learning_rate_num_groups must be between 1 and "
            f"{num_layers} (the number of encoder layers), got {num_groups}"
        )
    decay_rate = CFG.discriminative_learning_rate_decay_rate
    group_size = num_layers // num_groups

    # Built top-down (power 0 = undecayed) and then reversed so that index i
    # is the rate of encoder layer i, counted from the bottom.
    layer_wise_learning_rates = [
        pow(decay_rate, power) * CFG.encoder_lr
        for power in range(0, num_layers, group_size)
        for _ in range(group_size)
    ]
    layer_wise_learning_rates = layer_wise_learning_rates[::-1]
    print('Layer-wise learning rates:', layer_wise_learning_rates)

    # Embeddings sit below every layer and get the most strongly decayed rate.
    embedding_lr = pow(decay_rate, num_layers) * CFG.encoder_lr
    optimizer_grouped_parameters = _split_weight_decay(
        model.base.embeddings.named_parameters(), embedding_lr, no_decay
    )

    # Encoder layers, bottom to top.
    for i, layer in enumerate(model.base.encoder.layer):
        optimizer_grouped_parameters += _split_weight_decay(
            layer.named_parameters(), layer_wise_learning_rates[i], no_decay
        )

    # Encoder parameters that live outside the layer list share the top rate.
    unattached = [
        (n, p) for n, p in model.base.encoder.named_parameters()
        if not n.startswith('layer')
    ]
    print(f"Detected unattached modules in model.encoder: {[n for n, p in unattached]}")
    optimizer_grouped_parameters += _split_weight_decay(
        unattached, layer_wise_learning_rates[-1], no_decay
    )

    # Task-specific head: everything that is not part of `base`.
    head = [(n, p) for n, p in model.named_parameters() if 'base' not in n]
    optimizer_grouped_parameters += _split_weight_decay(head, CFG.head_lr, no_decay)
    return optimizer_grouped_parameters

In [18]:
def valid_fn(valid_loader, model, device):
    """Evaluate ``model`` on ``valid_loader`` without tracking gradients.

    The model is switched to eval mode for the pass and put back into the mode
    it was in when the function was called. Previously it was left in eval
    mode, so a validation run in the middle of an epoch silently disabled
    dropout for the rest of that epoch.

    Args:
        valid_loader: iterable of dict batches of tensors; each batch must
            contain ``'labels'`` and is passed to the model as keyword
            arguments.
        model: module whose output exposes ``.loss`` and ``.logits``.
        device: device the batch tensors are moved to.

    Returns:
        tuple: ``(average_loss, predictions, labels)``. The average is weighted
        by batch size; the last two are NumPy arrays concatenated over batches.
    """
    losses = AverageMeter()
    was_training = model.training
    model.eval()
    preds = []
    labels = []
    batch = model_output = None
    try:
        for batch in valid_loader:
            # A new dict is built so the caller's batch is not modified.
            batch = {key: value.to(device) for key, value in batch.items()}
            batch_size = batch['labels'].size(0)
            with torch.no_grad():
                model_output = model(**batch)
            losses.update(model_output.loss.item(), batch_size)
            preds.append(model_output.logits.to('cpu').numpy())
            labels.append(batch['labels'].to('cpu').numpy())
    finally:
        # Restore the caller's train/eval mode even if evaluation fails.
        model.train(was_training)
    predictions = np.concatenate(preds)
    labels = np.concatenate(labels)
    # Drop the references to the last batch so its memory can be released.
    batch = model_output = None
    torch.cuda.empty_cache()
    gc.collect()
    return losses.avg, predictions, labels

def _evaluate_and_checkpoint(eval_model, valid_loader, epoch, fold, start_time, best_so_far, tag, prefix):
    """Validate ``eval_model``, log its metrics and checkpoint it on improvement.

    Args:
        eval_model: model to evaluate (the live model or its EMA copy).
        valid_loader: validation loader handed to ``valid_fn``.
        epoch: zero-based epoch index (logged as ``epoch + 1``).
        fold: fold id, part of the checkpoint file name.
        start_time: timestamp used to report the elapsed time.
        best_so_far: best (lowest) mcrmse seen so far, ``inf`` if none.
        tag: checkpoint name fragment, ``'best'`` or ``'best_ema'``.
        prefix: metric-name prefix used in log lines, ``''`` or ``'ema_'``.

    Returns:
        tuple: ``(score_dict, best)`` where ``best`` is the possibly updated
        best mcrmse.
    """
    avg_val_loss, predictions, valid_labels = valid_fn(valid_loader, eval_model, CFG.device)
    score = compute_mcrmse((predictions, valid_labels))
    content_rmse, wording_rmse, mcrmse = score['content_rmse'], score['wording_rmse'], score['mcrmse']
    elapsed = time.time() - start_time

    LOGGER.info(
        f'Epoch {epoch + 1} avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
    LOGGER.info(f'Epoch {epoch + 1} - {prefix}content_rmse: {content_rmse:.4f} - {prefix}wording_rmse: {wording_rmse:.4f} - {prefix}mcrmse: {mcrmse:.4f}')

    if best_so_far > mcrmse:
        model_name = CFG.model_path.replace('/', '_')
        if best_so_far != float('inf'):
            # The score is part of the file name, so the superseded checkpoint
            # has to be removed explicitly. A missing file is not an error.
            stale = CFG.OUTPUT_DIR + "{}_{}{}_{}.pth".format(model_name, tag, fold, best_so_far)
            if os.path.exists(stale):
                os.remove(stale)
        best_so_far = mcrmse
        LOGGER.info(f'Epoch {epoch + 1} - {prefix}Save Best Score: {best_so_far:.4f} Model')
        torch.save({'model': eval_model.state_dict(),
                    'predictions': predictions},
                   CFG.OUTPUT_DIR + "{}_{}{}_{}.pth".format(model_name, tag, fold, best_so_far))
    return score, best_so_far


def train_fn(train_loader, model, optimizer, epoch, scheduler, device, valid_loader, start_time, best_score, best_score_ema,ema_hook,wandb, fold):
    """Train for one epoch, validating and checkpointing every ``CFG.print_freq`` steps.

    Each step runs forward/backward, clips the gradient norm to 10, steps the
    optimizer, updates the EMA weights and steps the scheduler. Every
    ``CFG.print_freq`` steps (including step 0) and on the last step, both the
    live model and the EMA model are validated and the best of each is saved.

    Args:
        train_loader: loader yielding dict batches of tensors with ``'labels'``.
        model: model whose output exposes ``.loss``.
        optimizer, scheduler: stepped once per batch.
        epoch: zero-based epoch index.
        device: device the training batches are moved to.
        valid_loader: loader used for the periodic validation.
        start_time: timestamp used for elapsed-time log lines.
        best_score, best_score_ema: best mcrmse so far for the live / EMA
            model (``inf`` before the first evaluation).
        ema_hook: object with ``after_train_iter(model)`` and ``ema_model.ema``.
        wandb: logging object with a ``log(dict)`` method (shadows the module).
        fold: fold id used in log records and checkpoint names.

    Returns:
        tuple: ``(average_train_loss, best_score, best_score_ema)``.
    """
    model.train()
    losses = AverageMeter()
    start = time.time()
    # NOTE: the step counter restarts at 0 on every call, i.e. every epoch.
    global_step = 0
    for step, batch in enumerate(train_loader):
        for key, value in batch.items():
            batch[key] = value.to(device)
        batch_size = batch['labels'].size(0)
        loss = model(**batch).loss
        losses.update(loss.item(), batch_size)

        optimizer.zero_grad()
        loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 10)
        optimizer.step()
        ema_hook.after_train_iter(model)
        global_step += 1
        scheduler.step()

        wandb.log({
                'train loss': loss.item(),
                'step': global_step,
                'epoch': epoch,
                'fold': fold,
                'batch_size':CFG.batch_size
            })

        if step % CFG.print_freq == 0 or step == (len(train_loader) - 1):
            # get_last_lr() is the supported way to read the current rate;
            # get_lr() is only meant to be called from within step().
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch + 1, step, len(train_loader),
                          remain=timeSince(start, float(step + 1) / len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_last_lr()[0]))

            # Live model first, then its EMA copy.
            score, best_score = _evaluate_and_checkpoint(
                model, valid_loader, epoch, fold, start_time, best_score,
                tag='best', prefix='')
            ema_score, best_score_ema = _evaluate_and_checkpoint(
                ema_hook.ema_model.ema, valid_loader, epoch, fold, start_time, best_score_ema,
                tag='best_ema', prefix='ema_')

            # Validation switches the model to eval mode; go back to training
            # mode so the remaining steps of the epoch train as intended.
            model.train()

            wandb.log({
                'learning rate': optimizer.param_groups[0]['lr'],
                'validation mcrmse': score['mcrmse'],
                'validation ema mcrmse': ema_score['mcrmse'],
                'step': global_step,
                'epoch': epoch,
            })

        del batch, loss, grad_norm
        torch.cuda.empty_cache()
        gc.collect()
    return losses.avg, best_score, best_score_ema



def train_loop():
    """Train and validate one model for every fold listed in ``CFG.folds``.

    For each fold this builds the train/validation ``DataLoader`` objects,
    a fresh model (``build_model``) with its ``EMAHook``, an ``AdamW``
    optimizer over the groups returned by ``get_optimizer_llr_params``, and a
    warm-up LR scheduler, then calls ``train_fn`` once per epoch.  The best
    raw and EMA scores are threaded through the ``train_fn`` calls.

    Relies on notebook-level names: ``CFG``, ``LOGGER``, ``df``, ``pdf``,
    ``tokenizer``, ``TrainDataset``, ``build_model``, ``EMAHook``,
    ``get_optimizer_llr_params`` and ``train_fn``.

    Returns:
        None.
    """
    LOGGER.info("========== training ==========")

    # Pass the hyper-parameters through ``wandb.init`` so they are attached to
    # the run, instead of rebinding the ``wandb.config`` module attribute.
    wandb.init(
        project='kaggle-commonlit-eval-student-summaries-3009',
        config=dict(epochs=CFG.epochs,
                    batch_size=CFG.batch_size,
                    learning_rate=CFG.encoder_lr,
                    save_checkpoint=True),
    )

    def get_scheduler(cfg, optimizer, num_train_steps):
        """Build the LR scheduler selected by ``cfg.scheduler``.

        ``cfg.num_warmup_steps`` is treated as a fraction of
        ``num_train_steps``.  The absolute warm-up length is kept in a local
        variable: writing it back to ``cfg`` would compound the value on
        every subsequent fold (or re-run of the cell).

        Raises:
            ValueError: if ``cfg.scheduler`` is neither 'linear' nor 'cosine'.
        """
        warmup_steps = int(cfg.num_warmup_steps * num_train_steps)
        if cfg.scheduler == 'linear':
            return get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_train_steps
            )
        if cfg.scheduler == 'cosine':
            return get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_train_steps,
                num_cycles=cfg.num_cycles
            )
        raise ValueError(f"Unsupported scheduler: {cfg.scheduler!r}")

    for fold in CFG.folds:
        # ====================================================
        # loader
        # ====================================================
        if CFG.pretraining:
            # Train on the pseudo-labelled file, dropping every row whose
            # prompt_question also occurs in `pdf`; validate on all of `df`.
            tr_data = pd.read_csv('tmp_pessudo.csv')
            tr_data['prompt_title'] = ''
            tr_data = tr_data[~tr_data['prompt_question'].isin(pdf['prompt_question'].tolist())]
            va_data = df
        else:
            tr_data = df[df['fold'] != fold].reset_index(drop=True)
            va_data = df[df['fold'] == fold].reset_index(drop=True)
        train_dataset = TrainDataset(tr_data, tokenizer)
        valid_dataset = TrainDataset(va_data, tokenizer)
        train_loader = DataLoader(train_dataset,
                                  batch_size=CFG.batch_size,
                                  shuffle=True,
                                  num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
        valid_loader = DataLoader(valid_dataset,
                                  batch_size=CFG.batch_size,
                                  shuffle=False,
                                  num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

        # ====================================================
        # model & optimizer
        # ====================================================
        model = build_model()
        model.to(CFG.device)
        ema_hook = EMAHook(model, init_updates=3000, logger=LOGGER)

        optimizer_parameters = get_optimizer_llr_params(model)
        optimizer = AdamW(optimizer_parameters, eps=CFG.eps, betas=CFG.betas)

        # ====================================================
        # scheduler
        # ====================================================
        # NOTE(review): derived from the dataset size while train_loader uses
        # drop_last=True, so this may slightly exceed the number of batches
        # actually iterated — confirm against how train_fn steps the scheduler.
        num_train_steps = int(len(train_dataset) / CFG.batch_size * CFG.epochs)
        scheduler = get_scheduler(CFG, optimizer, num_train_steps)

        # ====================================================
        # loop
        # ====================================================
        best_score = float('inf')
        best_score_ema = float('inf')
        for epoch in range(CFG.epochs):
            start_time = time.time()

            # train (train_fn also receives the validation loader and returns
            # the updated best raw / EMA scores)
            avg_loss, best_score, best_score_ema = train_fn(
                train_loader, model, optimizer, epoch, scheduler, CFG.device,
                valid_loader, start_time, best_score, best_score_ema, ema_hook, wandb, fold)

        # Drop the references first; only then can the collector and the CUDA
        # caching allocator actually hand the memory back before the next fold.
        del scheduler, optimizer, model, ema_hook
        gc.collect()
        torch.cuda.empty_cache()
    return


In [19]:
# Run training + validation for every fold in CFG.folds; metrics go to wandb and LOGGER.
train_loop()

[34m[1mwandb[0m: Currently logged in as: [33mpeng_sun[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016669448977336288, max=1.0…

'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /microsoft/deberta-v3-large/resolve/main/config.json (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f5a992f1ac0>, 'Connection to huggingface.co timed out. (connect timeout=10)'))"), '(Request ID: 4f5e3306-f821-427a-b6f1-97ab6b7ccde2)')' thrown while requesting HEAD https://huggingface.co/microsoft/deberta-v3-large/resolve/main/config.json


no pretrained model loaded.
Layer-wise learning rates: [6e-06, 6e-06, 6e-06, 6e-06, 6e-06, 6e-06, 6e-06, 6e-06, 6e-06, 6e-06, 6e-06, 6e-06, 6e-06, 6e-06, 6e-06, 6e-06, 6e-06, 6e-06, 6e-06, 6e-06, 6e-06, 6e-06, 6e-06, 6e-06]
Detected unattached modules in model.encoder: ['rel_embeddings.weight', 'LayerNorm.weight', 'LayerNorm.bias']
Epoch: [1][0/2578] Elapsed 0m 2s (remain 88m 15s) Loss: 0.6022(0.6022) Grad: 14.0080  LR: 0.00000471  


Epoch 1 avg_val_loss: 0.5362  time: 533s
Epoch 1 - content_rmse: 1.2815 - wording_rmse: 0.9582 - mcrmse: 1.1199
Epoch 1 - Save Best Score: 1.1199 Model
Epoch 1 avg_val_loss: 0.5506  time: 1066s
Epoch 1 - ema_content_rmse: 1.3035 - ema_wording_rmse: 0.9727 - ema_mcrmse: 1.1381
Epoch 1 - ema_Save Best Score: 1.1381 Model


Epoch: [1][100/2578] Elapsed 20m 21s (remain 499m 14s) Loss: 0.1278(0.2978) Grad: 7.9477  LR: 0.00000471  


Epoch 1 avg_val_loss: 0.4376  time: 1753s
Epoch 1 - content_rmse: 0.9091 - wording_rmse: 1.0723 - mcrmse: 0.9907
Epoch 1 - Save Best Score: 0.9907 Model
Epoch 1 avg_val_loss: 0.3747  time: 2286s
Epoch 1 - ema_content_rmse: 0.8398 - ema_wording_rmse: 0.9779 - ema_mcrmse: 0.9088
Epoch 1 - ema_Save Best Score: 0.9088 Model


Epoch: [1][200/2578] Elapsed 40m 39s (remain 480m 52s) Loss: 0.1243(0.2460) Grad: 5.1945  LR: 0.00000471  


Epoch 1 avg_val_loss: 0.2331  time: 2972s
Epoch 1 - content_rmse: 0.5750 - wording_rmse: 0.8425 - mcrmse: 0.7088
Epoch 1 - Save Best Score: 0.7088 Model
Epoch 1 avg_val_loss: 0.2833  time: 3506s
Epoch 1 - ema_content_rmse: 0.6621 - ema_wording_rmse: 0.9199 - ema_mcrmse: 0.7910
Epoch 1 - ema_Save Best Score: 0.7910 Model


Epoch: [1][300/2578] Elapsed 60m 58s (remain 461m 18s) Loss: 0.0312(0.2253) Grad: 3.4290  LR: 0.00000470  


Epoch 1 avg_val_loss: 0.2222  time: 4189s
Epoch 1 - content_rmse: 0.5539 - wording_rmse: 0.8374 - mcrmse: 0.6957
Epoch 1 - Save Best Score: 0.6957 Model
Epoch 1 avg_val_loss: 0.2858  time: 4723s
Epoch 1 - ema_content_rmse: 0.6247 - ema_wording_rmse: 0.9783 - ema_mcrmse: 0.8015


Epoch: [1][400/2578] Elapsed 81m 12s (remain 440m 53s) Loss: 0.0857(0.2096) Grad: 4.1975  LR: 0.00000470  


Epoch 1 avg_val_loss: 0.2435  time: 5404s
Epoch 1 - content_rmse: 0.5917 - wording_rmse: 0.8700 - mcrmse: 0.7309
Epoch 1 avg_val_loss: 0.2913  time: 5935s
Epoch 1 - ema_content_rmse: 0.6651 - ema_wording_rmse: 0.9467 - ema_mcrmse: 0.8059


Epoch: [1][500/2578] Elapsed 101m 24s (remain 420m 26s) Loss: 0.3427(0.1959) Grad: 18.0536  LR: 0.00000469  


Epoch 1 avg_val_loss: 0.4040  time: 6617s
Epoch 1 - content_rmse: 0.8319 - wording_rmse: 1.1028 - mcrmse: 0.9673
Epoch 1 avg_val_loss: 0.3367  time: 7148s
Epoch 1 - ema_content_rmse: 0.7329 - ema_wording_rmse: 1.0188 - ema_mcrmse: 0.8758


Epoch: [1][600/2578] Elapsed 121m 37s (remain 400m 5s) Loss: 0.1304(0.1865) Grad: 8.6544  LR: 0.00000467  


Epoch 1 avg_val_loss: 0.2519  time: 7829s
Epoch 1 - content_rmse: 0.5789 - wording_rmse: 0.9112 - mcrmse: 0.7451
Epoch 1 avg_val_loss: 0.2853  time: 8360s
Epoch 1 - ema_content_rmse: 0.6349 - ema_wording_rmse: 0.9624 - ema_mcrmse: 0.7986


Epoch: [1][700/2578] Elapsed 141m 50s (remain 379m 47s) Loss: 0.0941(0.1819) Grad: 3.5276  LR: 0.00000466  


Epoch 1 avg_val_loss: 0.2939  time: 9041s
Epoch 1 - content_rmse: 0.7531 - wording_rmse: 0.8891 - mcrmse: 0.8211
Epoch 1 avg_val_loss: 0.2653  time: 9572s
Epoch 1 - ema_content_rmse: 0.7078 - ema_wording_rmse: 0.8399 - ema_mcrmse: 0.7738
Epoch 1 - ema_Save Best Score: 0.7738 Model


Epoch: [1][800/2578] Elapsed 162m 5s (remain 359m 35s) Loss: 0.3037(0.1755) Grad: 13.9654  LR: 0.00000464  


Epoch 1 avg_val_loss: 0.2637  time: 10257s
Epoch 1 - content_rmse: 0.6095 - wording_rmse: 0.9379 - mcrmse: 0.7737
Epoch 1 avg_val_loss: 0.2520  time: 10788s
Epoch 1 - ema_content_rmse: 0.6027 - ema_wording_rmse: 0.9063 - ema_mcrmse: 0.7545
Epoch 1 - ema_Save Best Score: 0.7545 Model


Epoch: [1][900/2578] Elapsed 182m 25s (remain 339m 32s) Loss: 0.3047(0.1707) Grad: 9.6131  LR: 0.00000463  


Epoch 1 avg_val_loss: 0.2101  time: 11477s
Epoch 1 - content_rmse: 0.5845 - wording_rmse: 0.7719 - mcrmse: 0.6782
Epoch 1 - Save Best Score: 0.6782 Model
Epoch 1 avg_val_loss: 0.2056  time: 12012s
Epoch 1 - ema_content_rmse: 0.5690 - ema_wording_rmse: 0.7668 - ema_mcrmse: 0.6679
Epoch 1 - ema_Save Best Score: 0.6679 Model


Epoch: [1][1000/2578] Elapsed 202m 44s (remain 319m 24s) Loss: 0.5769(0.1658) Grad: 17.0220  LR: 0.00000461  


Epoch 1 avg_val_loss: 0.2020  time: 12695s
Epoch 1 - content_rmse: 0.5322 - wording_rmse: 0.7928 - mcrmse: 0.6625
Epoch 1 - Save Best Score: 0.6625 Model
Epoch 1 avg_val_loss: 0.2061  time: 13228s
Epoch 1 - ema_content_rmse: 0.5208 - ema_wording_rmse: 0.8142 - ema_mcrmse: 0.6675
Epoch 1 - ema_Save Best Score: 0.6675 Model


Epoch: [1][1100/2578] Elapsed 223m 0s (remain 299m 10s) Loss: 0.0134(0.1629) Grad: 1.9312  LR: 0.00000458  


Epoch 1 avg_val_loss: 0.2171  time: 13912s
Epoch 1 - content_rmse: 0.5721 - wording_rmse: 0.8196 - mcrmse: 0.6958
Epoch 1 avg_val_loss: 0.2235  time: 14443s
Epoch 1 - ema_content_rmse: 0.5735 - ema_wording_rmse: 0.8377 - ema_mcrmse: 0.7056


Epoch: [1][1200/2578] Elapsed 243m 14s (remain 278m 53s) Loss: 0.0299(0.1580) Grad: 3.4412  LR: 0.00000456  


Epoch 1 avg_val_loss: 0.2354  time: 15126s
Epoch 1 - content_rmse: 0.5082 - wording_rmse: 0.9305 - mcrmse: 0.7194
Epoch 1 avg_val_loss: 0.2808  time: 15657s
Epoch 1 - ema_content_rmse: 0.5609 - ema_wording_rmse: 1.0308 - ema_mcrmse: 0.7959


Epoch: [1][1300/2578] Elapsed 263m 29s (remain 258m 37s) Loss: 0.0295(0.1542) Grad: 1.5759  LR: 0.00000453  


Epoch 1 avg_val_loss: 0.2222  time: 16341s
Epoch 1 - content_rmse: 0.5045 - wording_rmse: 0.8892 - mcrmse: 0.6969
Epoch 1 avg_val_loss: 0.2324  time: 16872s
Epoch 1 - ema_content_rmse: 0.5323 - ema_wording_rmse: 0.9032 - ema_mcrmse: 0.7177


Epoch: [1][1400/2578] Elapsed 283m 42s (remain 238m 20s) Loss: 0.1861(0.1538) Grad: 9.4352  LR: 0.00000450  


Epoch 1 avg_val_loss: 0.2386  time: 17555s
Epoch 1 - content_rmse: 0.6217 - wording_rmse: 0.8365 - mcrmse: 0.7291
Epoch 1 avg_val_loss: 0.2217  time: 18085s
Epoch 1 - ema_content_rmse: 0.5870 - ema_wording_rmse: 0.8154 - ema_mcrmse: 0.7012


Epoch: [1][1500/2578] Elapsed 303m 55s (remain 218m 4s) Loss: 0.0711(0.1518) Grad: 5.1481  LR: 0.00000447  


Epoch 1 avg_val_loss: 0.2112  time: 18767s
Epoch 1 - content_rmse: 0.5833 - wording_rmse: 0.7780 - mcrmse: 0.6807
Epoch 1 avg_val_loss: 0.2105  time: 19298s
Epoch 1 - ema_content_rmse: 0.5274 - ema_wording_rmse: 0.8425 - ema_mcrmse: 0.6850


Epoch: [1][1600/2578] Elapsed 324m 8s (remain 197m 48s) Loss: 0.1429(0.1496) Grad: 11.0712  LR: 0.00000444  


Epoch 1 avg_val_loss: 0.2064  time: 19979s
Epoch 1 - content_rmse: 0.5005 - wording_rmse: 0.8487 - mcrmse: 0.6746
Epoch 1 avg_val_loss: 0.2166  time: 20509s
Epoch 1 - ema_content_rmse: 0.5236 - ema_wording_rmse: 0.8662 - ema_mcrmse: 0.6949


Epoch: [1][1700/2578] Elapsed 344m 20s (remain 177m 31s) Loss: 0.1033(0.1482) Grad: 7.9873  LR: 0.00000440  


Epoch 1 avg_val_loss: 0.2281  time: 21191s
Epoch 1 - content_rmse: 0.5279 - wording_rmse: 0.9105 - mcrmse: 0.7192
Epoch 1 avg_val_loss: 0.2763  time: 21722s
Epoch 1 - ema_content_rmse: 0.5826 - ema_wording_rmse: 1.0163 - ema_mcrmse: 0.7994


Epoch: [1][1800/2578] Elapsed 364m 31s (remain 157m 16s) Loss: 0.0466(0.1469) Grad: 2.9197  LR: 0.00000437  


Epoch 1 avg_val_loss: 0.2244  time: 22403s
Epoch 1 - content_rmse: 0.5871 - wording_rmse: 0.8353 - mcrmse: 0.7112
Epoch 1 avg_val_loss: 0.2134  time: 22934s
Epoch 1 - ema_content_rmse: 0.5577 - ema_wording_rmse: 0.8255 - ema_mcrmse: 0.6916


Epoch: [1][1900/2578] Elapsed 384m 44s (remain 137m 1s) Loss: 0.0420(0.1446) Grad: 2.7557  LR: 0.00000433  


Epoch 1 avg_val_loss: 0.2275  time: 23615s
Epoch 1 - content_rmse: 0.5242 - wording_rmse: 0.9021 - mcrmse: 0.7131
Epoch 1 avg_val_loss: 0.2673  time: 24146s
Epoch 1 - ema_content_rmse: 0.5596 - ema_wording_rmse: 0.9995 - ema_mcrmse: 0.7795


Epoch: [1][2000/2578] Elapsed 404m 56s (remain 116m 45s) Loss: 0.0268(0.1424) Grad: 1.3120  LR: 0.00000429  


Epoch 1 avg_val_loss: 0.1831  time: 24828s
Epoch 1 - content_rmse: 0.5043 - wording_rmse: 0.7597 - mcrmse: 0.6320
Epoch 1 - Save Best Score: 0.6320 Model
Epoch 1 avg_val_loss: 0.2069  time: 25362s
Epoch 1 - ema_content_rmse: 0.5233 - ema_wording_rmse: 0.8324 - ema_mcrmse: 0.6779


Epoch: [1][2100/2578] Elapsed 425m 14s (remain 96m 32s) Loss: 0.0250(0.1409) Grad: 3.1018  LR: 0.00000425  


Epoch 1 avg_val_loss: 0.2260  time: 26046s
Epoch 1 - content_rmse: 0.5348 - wording_rmse: 0.8816 - mcrmse: 0.7082
Epoch 1 avg_val_loss: 0.2400  time: 26577s
Epoch 1 - ema_content_rmse: 0.5638 - ema_wording_rmse: 0.9056 - ema_mcrmse: 0.7347


Epoch: [1][2200/2578] Elapsed 445m 28s (remain 76m 18s) Loss: 0.1471(0.1389) Grad: 9.5807  LR: 0.00000420  


Epoch 1 avg_val_loss: 0.1922  time: 27261s
Epoch 1 - content_rmse: 0.5211 - wording_rmse: 0.7613 - mcrmse: 0.6412
Epoch 1 avg_val_loss: 0.1860  time: 27792s
Epoch 1 - ema_content_rmse: 0.5013 - ema_wording_rmse: 0.7675 - ema_mcrmse: 0.6344
Epoch 1 - ema_Save Best Score: 0.6344 Model


Epoch: [1][2300/2578] Elapsed 465m 51s (remain 56m 4s) Loss: 0.0212(0.1382) Grad: 2.0552  LR: 0.00000416  


Epoch 1 avg_val_loss: 0.2634  time: 28484s
Epoch 1 - content_rmse: 0.5787 - wording_rmse: 0.9691 - mcrmse: 0.7739
Epoch 1 avg_val_loss: 0.2317  time: 29015s
Epoch 1 - ema_content_rmse: 0.5341 - ema_wording_rmse: 0.9074 - ema_mcrmse: 0.7207


Epoch: [1][2400/2578] Elapsed 486m 5s (remain 35m 50s) Loss: 0.1084(0.1370) Grad: 7.4028  LR: 0.00000411  


Epoch 1 avg_val_loss: 0.2009  time: 29697s
Epoch 1 - content_rmse: 0.5014 - wording_rmse: 0.8236 - mcrmse: 0.6625
Epoch 1 avg_val_loss: 0.2006  time: 30228s
Epoch 1 - ema_content_rmse: 0.5192 - ema_wording_rmse: 0.8085 - ema_mcrmse: 0.6639


Epoch: [1][2500/2578] Elapsed 506m 19s (remain 15m 35s) Loss: 0.0240(0.1359) Grad: 1.2281  LR: 0.00000406  


Epoch 1 avg_val_loss: 0.1983  time: 30910s
Epoch 1 - content_rmse: 0.5119 - wording_rmse: 0.8123 - mcrmse: 0.6621
Epoch 1 avg_val_loss: 0.2385  time: 31442s
Epoch 1 - ema_content_rmse: 0.5564 - ema_wording_rmse: 0.9135 - ema_mcrmse: 0.7349


Epoch: [1][2577/2578] Elapsed 526m 3s (remain 0m 0s) Loss: 0.0235(0.1349) Grad: 2.3248  LR: 0.00000402  


Epoch 1 avg_val_loss: 0.1981  time: 32096s
Epoch 1 - content_rmse: 0.5488 - wording_rmse: 0.7679 - mcrmse: 0.6583
Epoch 1 avg_val_loss: 0.1978  time: 32627s
Epoch 1 - ema_content_rmse: 0.5435 - ema_wording_rmse: 0.7723 - ema_mcrmse: 0.6579


Epoch: [2][0/2578] Elapsed 0m 1s (remain 63m 30s) Loss: 0.0749(0.0749) Grad: 5.3975  LR: 0.00000402  


Epoch 2 avg_val_loss: 0.1978  time: 534s
Epoch 2 - content_rmse: 0.5466 - wording_rmse: 0.7688 - mcrmse: 0.6577
Epoch 2 avg_val_loss: 0.1978  time: 1065s
Epoch 2 - ema_content_rmse: 0.5436 - ema_wording_rmse: 0.7721 - ema_mcrmse: 0.6579


Epoch: [2][100/2578] Elapsed 20m 18s (remain 497m 53s) Loss: 0.0016(0.0844) Grad: 0.8350  LR: 0.00000397  


Epoch 2 avg_val_loss: 0.2361  time: 1750s
Epoch 2 - content_rmse: 0.5812 - wording_rmse: 0.8788 - mcrmse: 0.7300
Epoch 2 avg_val_loss: 0.2041  time: 2281s
Epoch 2 - ema_content_rmse: 0.5356 - ema_wording_rmse: 0.8069 - ema_mcrmse: 0.6713


Epoch: [2][200/2578] Elapsed 40m 34s (remain 479m 45s) Loss: 0.0560(0.0833) Grad: 4.3733  LR: 0.00000392  


Epoch 2 avg_val_loss: 0.1949  time: 2966s
Epoch 2 - content_rmse: 0.5126 - wording_rmse: 0.8025 - mcrmse: 0.6576
Epoch 2 avg_val_loss: 0.1992  time: 3497s
Epoch 2 - ema_content_rmse: 0.5167 - ema_wording_rmse: 0.8090 - ema_mcrmse: 0.6629


Epoch: [2][300/2578] Elapsed 60m 49s (remain 460m 7s) Loss: 0.0735(0.0782) Grad: 6.9161  LR: 0.00000386  


Epoch 2 avg_val_loss: 0.1873  time: 4181s
Epoch 2 - content_rmse: 0.5310 - wording_rmse: 0.7410 - mcrmse: 0.6360
Epoch 2 avg_val_loss: 0.1988  time: 4712s
Epoch 2 - ema_content_rmse: 0.5060 - ema_wording_rmse: 0.8245 - ema_mcrmse: 0.6653


Epoch: [2][400/2578] Elapsed 81m 3s (remain 440m 5s) Loss: 0.0183(0.0770) Grad: 3.0849  LR: 0.00000381  


Epoch 2 avg_val_loss: 0.1906  time: 5395s
Epoch 2 - content_rmse: 0.5085 - wording_rmse: 0.7770 - mcrmse: 0.6428
Epoch 2 avg_val_loss: 0.1928  time: 5925s
Epoch 2 - ema_content_rmse: 0.5101 - ema_wording_rmse: 0.7917 - ema_mcrmse: 0.6509


Epoch: [2][500/2578] Elapsed 101m 17s (remain 419m 56s) Loss: 0.1683(0.0793) Grad: 10.2183  LR: 0.00000375  


Epoch 2 avg_val_loss: 0.2722  time: 6610s
Epoch 2 - content_rmse: 0.5564 - wording_rmse: 1.0260 - mcrmse: 0.7912
Epoch 2 avg_val_loss: 0.2118  time: 7142s
Epoch 2 - ema_content_rmse: 0.5231 - ema_wording_rmse: 0.8632 - ema_mcrmse: 0.6932


Epoch: [2][600/2578] Elapsed 121m 34s (remain 399m 56s) Loss: 0.0043(0.0812) Grad: 1.3493  LR: 0.00000369  


Epoch 2 avg_val_loss: 0.1783  time: 7826s
Epoch 2 - content_rmse: 0.5179 - wording_rmse: 0.7199 - mcrmse: 0.6189
Epoch 2 - Save Best Score: 0.6189 Model
Epoch 2 avg_val_loss: 0.1828  time: 8360s
Epoch 2 - ema_content_rmse: 0.5064 - ema_wording_rmse: 0.7559 - ema_mcrmse: 0.6311
Epoch 2 - ema_Save Best Score: 0.6311 Model


Epoch: [2][700/2578] Elapsed 141m 55s (remain 380m 0s) Loss: 0.0264(0.0811) Grad: 3.5557  LR: 0.00000363  


Epoch 2 avg_val_loss: 0.1889  time: 9047s
Epoch 2 - content_rmse: 0.4952 - wording_rmse: 0.7996 - mcrmse: 0.6474
Epoch 2 avg_val_loss: 0.1949  time: 9579s
Epoch 2 - ema_content_rmse: 0.4978 - ema_wording_rmse: 0.8179 - ema_mcrmse: 0.6578


Epoch: [2][800/2578] Elapsed 162m 10s (remain 359m 47s) Loss: 0.0308(0.0812) Grad: 2.0814  LR: 0.00000357  


Epoch 2 avg_val_loss: 0.1971  time: 10263s
Epoch 2 - content_rmse: 0.5106 - wording_rmse: 0.8138 - mcrmse: 0.6622
Epoch 2 avg_val_loss: 0.2011  time: 10794s
Epoch 2 - ema_content_rmse: 0.5051 - ema_wording_rmse: 0.8366 - ema_mcrmse: 0.6708


Epoch: [2][900/2578] Elapsed 182m 25s (remain 339m 33s) Loss: 0.0773(0.0807) Grad: 3.3237  LR: 0.00000351  


Epoch 2 avg_val_loss: 0.2743  time: 11478s
Epoch 2 - content_rmse: 0.5243 - wording_rmse: 1.0529 - mcrmse: 0.7886
Epoch 2 avg_val_loss: 0.2367  time: 12009s
Epoch 2 - ema_content_rmse: 0.5239 - ema_wording_rmse: 0.9402 - ema_mcrmse: 0.7320


Epoch: [2][1000/2578] Elapsed 202m 41s (remain 319m 19s) Loss: 0.0924(0.0811) Grad: 8.0790  LR: 0.00000345  


Epoch 2 avg_val_loss: 0.2103  time: 12692s
Epoch 2 - content_rmse: 0.5455 - wording_rmse: 0.8315 - mcrmse: 0.6885
Epoch 2 avg_val_loss: 0.2267  time: 13223s
Epoch 2 - ema_content_rmse: 0.5435 - ema_wording_rmse: 0.8904 - ema_mcrmse: 0.7169


Epoch: [2][1100/2578] Elapsed 222m 55s (remain 299m 3s) Loss: 0.0815(0.0821) Grad: 4.8788  LR: 0.00000338  


Epoch 2 avg_val_loss: 0.2322  time: 13906s
Epoch 2 - content_rmse: 0.5536 - wording_rmse: 0.8934 - mcrmse: 0.7235
Epoch 2 avg_val_loss: 0.2098  time: 14437s
Epoch 2 - ema_content_rmse: 0.5338 - ema_wording_rmse: 0.8373 - ema_mcrmse: 0.6855


Epoch: [2][1200/2578] Elapsed 243m 9s (remain 278m 47s) Loss: 0.1289(0.0811) Grad: 8.0568  LR: 0.00000332  


Epoch 2 avg_val_loss: 0.1949  time: 15121s
Epoch 2 - content_rmse: 0.5017 - wording_rmse: 0.8153 - mcrmse: 0.6585
Epoch 2 avg_val_loss: 0.1961  time: 15652s
Epoch 2 - ema_content_rmse: 0.5107 - ema_wording_rmse: 0.8143 - ema_mcrmse: 0.6625


Epoch: [2][1300/2578] Elapsed 263m 24s (remain 258m 32s) Loss: 0.0564(0.0812) Grad: 3.7745  LR: 0.00000325  


Epoch 2 avg_val_loss: 0.1948  time: 16337s
Epoch 2 - content_rmse: 0.5147 - wording_rmse: 0.7985 - mcrmse: 0.6566
Epoch 2 avg_val_loss: 0.2143  time: 16867s
Epoch 2 - ema_content_rmse: 0.5229 - ema_wording_rmse: 0.8650 - ema_mcrmse: 0.6939


Epoch: [2][1400/2578] Elapsed 283m 39s (remain 238m 18s) Loss: 0.2687(0.0804) Grad: 6.7523  LR: 0.00000318  


Epoch 2 avg_val_loss: 0.1926  time: 17552s
Epoch 2 - content_rmse: 0.5129 - wording_rmse: 0.7981 - mcrmse: 0.6555
Epoch 2 avg_val_loss: 0.2042  time: 18083s
Epoch 2 - ema_content_rmse: 0.5163 - ema_wording_rmse: 0.8422 - ema_mcrmse: 0.6792


Epoch: [2][1500/2578] Elapsed 303m 55s (remain 218m 4s) Loss: 0.1037(0.0801) Grad: 9.1398  LR: 0.00000312  


Epoch 2 avg_val_loss: 0.1996  time: 18766s
Epoch 2 - content_rmse: 0.5321 - wording_rmse: 0.7988 - mcrmse: 0.6654
Epoch 2 avg_val_loss: 0.2151  time: 19298s
Epoch 2 - ema_content_rmse: 0.5244 - ema_wording_rmse: 0.8656 - ema_mcrmse: 0.6950


Epoch: [2][1600/2578] Elapsed 324m 10s (remain 197m 49s) Loss: 0.1496(0.0806) Grad: 9.8350  LR: 0.00000305  


Epoch 2 avg_val_loss: 0.1736  time: 19982s
Epoch 2 - content_rmse: 0.5274 - wording_rmse: 0.6830 - mcrmse: 0.6052
Epoch 2 - Save Best Score: 0.6052 Model
Epoch 2 avg_val_loss: 0.1909  time: 20517s
Epoch 2 - ema_content_rmse: 0.5014 - ema_wording_rmse: 0.7941 - ema_mcrmse: 0.6478


Epoch: [2][1700/2578] Elapsed 344m 29s (remain 177m 36s) Loss: 0.0541(0.0818) Grad: 4.8944  LR: 0.00000298  


Epoch 2 avg_val_loss: 0.1898  time: 21201s
Epoch 2 - content_rmse: 0.5020 - wording_rmse: 0.7875 - mcrmse: 0.6448
Epoch 2 avg_val_loss: 0.1989  time: 21732s
Epoch 2 - ema_content_rmse: 0.5019 - ema_wording_rmse: 0.8222 - ema_mcrmse: 0.6621


Epoch: [2][1800/2578] Elapsed 364m 44s (remain 157m 21s) Loss: 0.1109(0.0816) Grad: 4.8034  LR: 0.00000291  


Epoch 2 avg_val_loss: 0.1971  time: 22416s
Epoch 2 - content_rmse: 0.5235 - wording_rmse: 0.7961 - mcrmse: 0.6598
Epoch 2 avg_val_loss: 0.1988  time: 22948s
Epoch 2 - ema_content_rmse: 0.5190 - ema_wording_rmse: 0.8094 - ema_mcrmse: 0.6642


Epoch: [2][1900/2578] Elapsed 385m 0s (remain 137m 6s) Loss: 0.0459(0.0822) Grad: 2.4808  LR: 0.00000284  


Epoch 2 avg_val_loss: 0.2147  time: 23632s
Epoch 2 - content_rmse: 0.5083 - wording_rmse: 0.8819 - mcrmse: 0.6951
Epoch 2 avg_val_loss: 0.2147  time: 24163s
Epoch 2 - ema_content_rmse: 0.5141 - ema_wording_rmse: 0.8735 - ema_mcrmse: 0.6938


Epoch: [2][2000/2578] Elapsed 405m 15s (remain 116m 51s) Loss: 0.0221(0.0821) Grad: 3.8123  LR: 0.00000277  


Epoch 2 avg_val_loss: 0.2288  time: 24846s
Epoch 2 - content_rmse: 0.5216 - wording_rmse: 0.9158 - mcrmse: 0.7187
Epoch 2 avg_val_loss: 0.2097  time: 25377s
Epoch 2 - ema_content_rmse: 0.5061 - ema_wording_rmse: 0.8634 - ema_mcrmse: 0.6848


Epoch: [2][2100/2578] Elapsed 425m 30s (remain 96m 36s) Loss: 0.0183(0.0813) Grad: 1.9427  LR: 0.00000270  


Epoch 2 avg_val_loss: 0.1989  time: 26062s
Epoch 2 - content_rmse: 0.5076 - wording_rmse: 0.8192 - mcrmse: 0.6634
Epoch 2 avg_val_loss: 0.1938  time: 26593s
Epoch 2 - ema_content_rmse: 0.5079 - ema_wording_rmse: 0.8050 - ema_mcrmse: 0.6564


Epoch: [2][2200/2578] Elapsed 445m 45s (remain 76m 21s) Loss: 0.1243(0.0813) Grad: 3.9129  LR: 0.00000263  


Epoch 2 avg_val_loss: 0.2143  time: 27276s
Epoch 2 - content_rmse: 0.4982 - wording_rmse: 0.8888 - mcrmse: 0.6935
Epoch 2 avg_val_loss: 0.1993  time: 27807s
Epoch 2 - ema_content_rmse: 0.5057 - ema_wording_rmse: 0.8290 - ema_mcrmse: 0.6674


Epoch: [2][2300/2578] Elapsed 466m 0s (remain 56m 5s) Loss: 0.0239(0.0809) Grad: 2.1089  LR: 0.00000256  


Epoch 2 avg_val_loss: 0.2167  time: 28491s
Epoch 2 - content_rmse: 0.5065 - wording_rmse: 0.8791 - mcrmse: 0.6928
Epoch 2 avg_val_loss: 0.2018  time: 29022s
Epoch 2 - ema_content_rmse: 0.4956 - ema_wording_rmse: 0.8379 - ema_mcrmse: 0.6668


Epoch: [2][2400/2578] Elapsed 486m 13s (remain 35m 50s) Loss: 0.0574(0.0807) Grad: 2.7872  LR: 0.00000248  


Epoch 2 avg_val_loss: 0.1949  time: 29704s
Epoch 2 - content_rmse: 0.4885 - wording_rmse: 0.8189 - mcrmse: 0.6537
Epoch 2 avg_val_loss: 0.1916  time: 30236s
Epoch 2 - ema_content_rmse: 0.4928 - ema_wording_rmse: 0.8029 - ema_mcrmse: 0.6478


Epoch: [2][2500/2578] Elapsed 506m 28s (remain 15m 35s) Loss: 0.0087(0.0806) Grad: 2.0328  LR: 0.00000241  


Epoch 2 avg_val_loss: 0.2232  time: 30919s
Epoch 2 - content_rmse: 0.5175 - wording_rmse: 0.9038 - mcrmse: 0.7107
Epoch 2 avg_val_loss: 0.2167  time: 31451s
Epoch 2 - ema_content_rmse: 0.5059 - ema_wording_rmse: 0.8877 - ema_mcrmse: 0.6968


Epoch: [2][2577/2578] Elapsed 526m 8s (remain 0m 0s) Loss: 0.0375(0.0803) Grad: 2.2992  LR: 0.00000236  


Epoch 2 avg_val_loss: 0.1797  time: 32100s
Epoch 2 - content_rmse: 0.4933 - wording_rmse: 0.7520 - mcrmse: 0.6226
Epoch 2 avg_val_loss: 0.1950  time: 32631s
Epoch 2 - ema_content_rmse: 0.4998 - ema_wording_rmse: 0.8110 - ema_mcrmse: 0.6554


Epoch: [3][0/2578] Elapsed 0m 1s (remain 62m 22s) Loss: 0.0285(0.0285) Grad: 1.9828  LR: 0.00000236  


Epoch 3 avg_val_loss: 0.1803  time: 533s
Epoch 3 - content_rmse: 0.4935 - wording_rmse: 0.7540 - mcrmse: 0.6238
Epoch 3 avg_val_loss: 0.1947  time: 1064s
Epoch 3 - ema_content_rmse: 0.4997 - ema_wording_rmse: 0.8099 - ema_mcrmse: 0.6548


Epoch: [3][100/2578] Elapsed 20m 16s (remain 497m 11s) Loss: 0.0139(0.0470) Grad: 3.3137  LR: 0.00000228  


Epoch 3 avg_val_loss: 0.2226  time: 1748s
Epoch 3 - content_rmse: 0.5286 - wording_rmse: 0.8889 - mcrmse: 0.7088
Epoch 3 avg_val_loss: 0.2073  time: 2280s
Epoch 3 - ema_content_rmse: 0.5150 - ema_wording_rmse: 0.8474 - ema_mcrmse: 0.6812


Epoch: [3][200/2578] Elapsed 40m 31s (remain 479m 13s) Loss: 0.1049(0.0457) Grad: 5.6100  LR: 0.00000221  


Epoch 3 avg_val_loss: 0.2106  time: 2962s
Epoch 3 - content_rmse: 0.5138 - wording_rmse: 0.8578 - mcrmse: 0.6858
Epoch 3 avg_val_loss: 0.2071  time: 3493s
Epoch 3 - ema_content_rmse: 0.5087 - ema_wording_rmse: 0.8509 - ema_mcrmse: 0.6798


Epoch: [3][300/2578] Elapsed 60m 45s (remain 459m 39s) Loss: 0.0136(0.0440) Grad: 1.8894  LR: 0.00000214  


Epoch 3 avg_val_loss: 0.2372  time: 4177s
Epoch 3 - content_rmse: 0.5037 - wording_rmse: 0.9583 - mcrmse: 0.7310
Epoch 3 avg_val_loss: 0.2171  time: 4708s
Epoch 3 - ema_content_rmse: 0.5068 - ema_wording_rmse: 0.8873 - ema_mcrmse: 0.6971


Epoch: [3][400/2578] Elapsed 81m 0s (remain 439m 47s) Loss: 0.0398(0.0454) Grad: 4.6294  LR: 0.00000207  


Epoch 3 avg_val_loss: 0.2047  time: 5393s
Epoch 3 - content_rmse: 0.4953 - wording_rmse: 0.8514 - mcrmse: 0.6733
Epoch 3 avg_val_loss: 0.2185  time: 5924s
Epoch 3 - ema_content_rmse: 0.4958 - ema_wording_rmse: 0.9000 - ema_mcrmse: 0.6979


Epoch: [3][500/2578] Elapsed 101m 16s (remain 419m 50s) Loss: 0.0383(0.0448) Grad: 1.7800  LR: 0.00000200  


Epoch 3 avg_val_loss: 0.2002  time: 6609s
Epoch 3 - content_rmse: 0.5023 - wording_rmse: 0.8271 - mcrmse: 0.6647
Epoch 3 avg_val_loss: 0.2079  time: 7140s
Epoch 3 - ema_content_rmse: 0.4967 - ema_wording_rmse: 0.8608 - ema_mcrmse: 0.6788


Epoch: [3][600/2578] Elapsed 121m 32s (remain 399m 47s) Loss: 0.0673(0.0438) Grad: 6.7331  LR: 0.00000193  


Epoch 3 avg_val_loss: 0.2038  time: 7822s
Epoch 3 - content_rmse: 0.4958 - wording_rmse: 0.8507 - mcrmse: 0.6733
Epoch 3 avg_val_loss: 0.2124  time: 8353s
Epoch 3 - ema_content_rmse: 0.4954 - ema_wording_rmse: 0.8802 - ema_mcrmse: 0.6878


Epoch: [3][700/2578] Elapsed 141m 46s (remain 379m 36s) Loss: 0.0286(0.0432) Grad: 4.4552  LR: 0.00000186  


Epoch 3 avg_val_loss: 0.2096  time: 9037s
Epoch 3 - content_rmse: 0.5158 - wording_rmse: 0.8522 - mcrmse: 0.6840
Epoch 3 avg_val_loss: 0.2008  time: 9569s
Epoch 3 - ema_content_rmse: 0.4967 - ema_wording_rmse: 0.8360 - ema_mcrmse: 0.6663


Epoch: [3][800/2578] Elapsed 162m 2s (remain 359m 29s) Loss: 0.0130(0.0426) Grad: 2.4945  LR: 0.00000179  


Epoch 3 avg_val_loss: 0.2093  time: 10254s
Epoch 3 - content_rmse: 0.5047 - wording_rmse: 0.8543 - mcrmse: 0.6795
Epoch 3 avg_val_loss: 0.2030  time: 10785s
Epoch 3 - ema_content_rmse: 0.4974 - ema_wording_rmse: 0.8442 - ema_mcrmse: 0.6708


Epoch: [3][900/2578] Elapsed 182m 18s (remain 339m 18s) Loss: 0.1010(0.0428) Grad: 8.2593  LR: 0.00000172  


Epoch 3 avg_val_loss: 0.2041  time: 11470s
Epoch 3 - content_rmse: 0.5107 - wording_rmse: 0.8379 - mcrmse: 0.6743
Epoch 3 avg_val_loss: 0.2038  time: 12002s
Epoch 3 - ema_content_rmse: 0.4969 - ema_wording_rmse: 0.8477 - ema_mcrmse: 0.6723


Epoch: [3][1000/2578] Elapsed 202m 35s (remain 319m 9s) Loss: 0.0315(0.0425) Grad: 4.9682  LR: 0.00000165  


Epoch 3 avg_val_loss: 0.2023  time: 12688s
Epoch 3 - content_rmse: 0.4911 - wording_rmse: 0.8423 - mcrmse: 0.6667
Epoch 3 avg_val_loss: 0.2015  time: 13219s
Epoch 3 - ema_content_rmse: 0.4890 - ema_wording_rmse: 0.8420 - ema_mcrmse: 0.6655


Epoch: [3][1100/2578] Elapsed 222m 50s (remain 298m 56s) Loss: 0.0166(0.0427) Grad: 2.3395  LR: 0.00000158  


Epoch 3 avg_val_loss: 0.2239  time: 13902s
Epoch 3 - content_rmse: 0.4979 - wording_rmse: 0.9128 - mcrmse: 0.7053
Epoch 3 avg_val_loss: 0.2114  time: 14434s
Epoch 3 - ema_content_rmse: 0.4929 - ema_wording_rmse: 0.8743 - ema_mcrmse: 0.6836


Epoch: [3][1200/2578] Elapsed 243m 5s (remain 278m 42s) Loss: 0.0111(0.0420) Grad: 1.7857  LR: 0.00000151  


Epoch 3 avg_val_loss: 0.2230  time: 15118s
Epoch 3 - content_rmse: 0.5001 - wording_rmse: 0.9064 - mcrmse: 0.7033
Epoch 3 avg_val_loss: 0.2144  time: 15649s
Epoch 3 - ema_content_rmse: 0.4944 - ema_wording_rmse: 0.8852 - ema_mcrmse: 0.6898


Epoch: [3][1300/2578] Elapsed 263m 21s (remain 258m 29s) Loss: 0.0361(0.0415) Grad: 3.7881  LR: 0.00000145  


Epoch 3 avg_val_loss: 0.2133  time: 16333s
Epoch 3 - content_rmse: 0.4857 - wording_rmse: 0.8801 - mcrmse: 0.6829
Epoch 3 avg_val_loss: 0.2073  time: 16864s
Epoch 3 - ema_content_rmse: 0.4898 - ema_wording_rmse: 0.8619 - ema_mcrmse: 0.6759


Epoch: [3][1400/2578] Elapsed 283m 34s (remain 238m 14s) Loss: 0.1938(0.0415) Grad: 9.0706  LR: 0.00000138  


Epoch 3 avg_val_loss: 0.2013  time: 17546s
Epoch 3 - content_rmse: 0.4853 - wording_rmse: 0.8419 - mcrmse: 0.6636
Epoch 3 avg_val_loss: 0.2021  time: 18076s
Epoch 3 - ema_content_rmse: 0.4859 - ema_wording_rmse: 0.8447 - ema_mcrmse: 0.6653


Epoch: [3][1500/2578] Elapsed 303m 46s (remain 217m 58s) Loss: 0.0743(0.0413) Grad: 5.0483  LR: 0.00000132  


Epoch 3 avg_val_loss: 0.2228  time: 18758s
Epoch 3 - content_rmse: 0.4978 - wording_rmse: 0.9101 - mcrmse: 0.7039
Epoch 3 avg_val_loss: 0.2055  time: 19289s
Epoch 3 - ema_content_rmse: 0.4878 - ema_wording_rmse: 0.8577 - ema_mcrmse: 0.6728


Epoch: [3][1600/2578] Elapsed 323m 58s (remain 197m 42s) Loss: 0.0354(0.0412) Grad: 3.8115  LR: 0.00000125  


Epoch 3 avg_val_loss: 0.2076  time: 19971s
Epoch 3 - content_rmse: 0.4980 - wording_rmse: 0.8636 - mcrmse: 0.6808


KeyboardInterrupt: 

In [20]:
## total_complex = []
# for fold in range(4):
#     va_data = train_df[train_df['fold'] == fold]
#     preds = torch.load('/content/drive/MyDrive/deb_simple/microsoft_deberta-v3-large_best{}.pth'.format(fold))['predictions']
#     va_data['preds'] = preds
#     va_data = va_data[['id', 'preds', 'score']]
#     print(compute_metrics(va_data['preds'].values.reshape(-1,1), va_data['score'].values))
#     total_complex.append(va_data)
# total_complex = pd.concat(total_complex)
# compute_metrics(total_complex['preds'].values.reshape(-1,1), total_complex['score'].values)

In [21]:
# !mkdir -p /root/.kaggle
# !cp /content/drive/MyDrive/kaggle/kaggle.json /root/.kaggle/
# !chmod 600 /root/.kaggle/kaggle.json
# !kaggle datasets init -p /content/drive/MyDrive/elc_mean/

In [22]:
#!kaggle datasets create -p /content/drive/MyDrive/elc_mean/

In [23]:
# deberta v3 large
# 1.5 0.8228
# 2 0.8197

# 1.5  8137
#2 8175
#2.5 8181
#3 8181
#3.5 8175

