In [1]:
import argparse
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedGroupKFold
import shutil
import time
import gc
import random
import math
import torch
from torch.utils.data import DataLoader, Dataset
import transformers
from transformers import TrainingArguments, Trainer, DataCollatorForWholeWordMask
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, AutoModel
from transformers.modeling_outputs  import BaseModelOutput,SequenceClassifierOutput
from torch import nn
from torch.optim import Adam, SGD, AdamW
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup, DataCollatorWithPadding
# experiment tracking (Weights & Biases)
import wandb
from torch.nn.parameter import Parameter
#os.environ["WANDB_DISABLED"] = "true"

In [2]:
def parse_args():
    """Parse the command-line options of a training run.

    Returns:
        argparse.Namespace: has the integer attributes ``fold`` and
        ``batch_size``; each one is ``None`` when its option is not supplied.
    """
    cli = argparse.ArgumentParser(description='Model training')
    # Training options: every entry is an optional integer defaulting to None.
    int_options = (
        ("--fold", "fold", "Train fold"),
        ('--batch_size', 'batch_size', 'Mini batch size of one gpu or cpu'),
    )
    for flag, dest, help_text in int_options:
        cli.add_argument(flag, dest=dest, help=help_text, type=int, default=None)
    return cli.parse_args()


# Config

In [3]:
class CFG:
    """Run-wide configuration; values are read elsewhere in the notebook as ``CFG.<name>``."""
    pretraining = False
    load_pretrained = False
    input_path = './input/'
    # '1' -> prompt is title + text + question joined by the separator token;
    # any other value -> prompt is the question only (see TrainDataset.tokenize)
    input_type = '2'
    model_path = 'microsoft/deberta-v3-large' # alternatives: nghuyong/ernie-2.0-large-en, studio-ousia/luke-large
    # selects the architecture in build_model: 'base', 'simple' or 'pool'
    model_type = 'pool'
    scheduler = 'cosine'  # ['linear', 'cosine']
    batch_scheduler = True
    num_cycles = 0.5  # 1.5
    num_warmup_steps = 0
    max_input_length = 1024
    max_position_embeddings = 1024
    folds = [3]
    epochs = 4  # 5
    # layer-wise learning rate
    discriminative_learning_rate = False
    discriminative_learning_rate_num_groups = 1
    discriminative_learning_rate_decay_rate = 0.99
    # number of top encoder layers to re-initialise in build_model (0 disables)
    reinit_layers = 0

    encoder_lr = 5e-6
    head_lr = 5e-6
    min_lr = 1e-7
    eps = 1e-7
    betas = (0.9, 0.999)
    weight_decay = 0
    dropout = 0
    num_fold = 5
    batch_size = 4
    seed = 42
    # directory receiving the log file and the saved checkpoints
    OUTPUT_DIR = './pretrain/'
    num_workers = 2
    device='cuda'
    # training steps between progress prints / in-loop validations (see train_fn)
    print_freq = 100
    
    

## logger

In [4]:
def get_logger(filename=CFG.OUTPUT_DIR+ 'train'):
    """Return this module's logger, writing to the console and to ``<filename>.log``.

    ``CFG.OUTPUT_DIR`` is created when missing. Handlers already attached to
    the logger are closed and removed first: ``getLogger(__name__)`` always
    returns the same object, so without this every repeated call (for example
    re-running the notebook cell) would add another pair of handlers and each
    message would be emitted several times.

    Args:
        filename: log file path without the ``.log`` extension.

    Returns:
        logging.Logger: logger at INFO level with one stream handler and one
        file handler, both using the bare ``%(message)s`` format.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    os.makedirs(CFG.OUTPUT_DIR, exist_ok=True)
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # Drop handlers left over from an earlier call and release their streams/files.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()
    for handler in (StreamHandler(), FileHandler(filename=f"{filename}.log")):
        handler.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler)
    return logger

# Build the shared logger and record the key hyper-parameters of this run.
LOGGER = get_logger()
for _banner, _value in (
    ('===============lr_{}===============', CFG.encoder_lr),
    ('===============seed_{}===============', CFG.seed),
    ('===============total_epochs_{}===============', CFG.epochs),
    ('===============num_warmup_steps_{}===============', CFG.num_warmup_steps),
):
    LOGGER.info(_banner.format(_value))
del _banner, _value  # keep the loop helpers out of the notebook namespace



# Preproc

In [5]:
# Load prompts and student summaries, then attach every summary to its prompt.
pdf = pd.read_csv(f"{CFG.input_path}/prompts_train.csv")
sdf = pd.read_csv(f"{CFG.input_path}/summaries_train.csv")
df = pd.merge(pdf, sdf, on="prompt_id")

# 4 prompt ids, 4 folds: each prompt is its own validation fold.
id2fold = dict(zip(("814d6b", "39c16e", "3b9047", "ebad26"), range(4)))
df = df.assign(fold=df["prompt_id"].map(id2fold))

In [6]:
# Display the merged prompt/summary frame built in the previous cell.
df 

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,student_id,text,content,wording,fold
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00791789cc1f,1 element of an ideal tragedy is that it shoul...,-0.210614,-0.471415,1
1,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0086ef22de8f,The three elements of an ideal tragedy are: H...,-0.970237,-0.417058,1
2,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0094589c7a22,Aristotle states that an ideal tragedy should ...,-0.387791,-0.584181,1
3,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00cd5736026a,One element of an Ideal tragedy is having a co...,0.088882,-0.594710,1
4,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00d98b8ff756,The 3 ideal of tragedy is how complex you need...,-0.687288,-0.460886,1
...,...,...,...,...,...,...,...,...,...
7160,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",ff37545b2805,"In paragraph two, they would use pickle meat a...",1.520355,-0.292990,3
7161,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",ff4ed38ef099,"in the first paragraph it says ""either can it...",-1.204574,-1.169784,3
7162,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",ff53b94f7ce0,They would have piles of filthy meat on the fl...,0.328739,-1.053294,3
7163,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",ff7c7e70df07,They used all sorts of chemical concoctions to...,0.205683,0.380538,3


In [7]:
class AverageMeter(object):
    """Tracks the most recent value and the running weighted average of a metric.

    Attributes:
        val: value passed to the latest ``update`` call.
        sum: weighted sum of all values seen since the last ``reset``.
        count: total weight seen since the last ``reset``.
        avg: ``sum / count`` (0 while ``count`` is 0).
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero every statistic."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` (typically the batch size).

        Args:
            val: the new measurement.
            n: weight of the measurement in the running average.
        """
        self.val = val
        self.sum += val * n
        self.count += n
        # A zero total weight (e.g. an empty batch as the first update) must
        # not raise ZeroDivisionError; the average simply stays at 0.
        self.avg = self.sum / self.count if self.count else 0


def asMinutes(s):
    """Format a duration given in seconds as ``'<minutes>m <seconds>s'``.

    Args:
        s: duration in seconds (int or float).

    Returns:
        str: whole minutes and the remaining seconds, both rendered with ``%d``.
    """
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)


def timeSince(since, percent):
    """Describe the elapsed time and the projected remaining time.

    Args:
        since: start timestamp as returned by ``time.time()``.
        percent: completed fraction of the work; train_fn passes
            ``(step + 1) / len(train_loader)``.

    Returns:
        str: ``'<elapsed> (remain <remaining>)'`` with both parts formatted by
        ``asMinutes``.
    """
    elapsed = time.time() - since
    # Linear extrapolation: total = elapsed / fraction done.
    projected_total = elapsed / (percent)
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(projected_total - elapsed))

# Tokenizer

In [8]:
# Tokenizer loaded from the same checkpoint name as the backbone (CFG.model_path).
tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Dataset

In [9]:
# Whole-word-masking collator; data_collator below applies it to the input ids.
mask_lm_datacollator = DataCollatorForWholeWordMask(tokenizer)

def data_collator(batch):
    """Collate positional samples into a batch with masked input ids.

    Every element of ``batch`` is indexed positionally: ``[0]`` input ids,
    ``[1]`` token type ids, ``[2]`` attention mask, ``[3]`` labels.

    NOTE(review): TrainDataset.__getitem__ in this notebook returns a dict,
    which positional indexing does not support; confirm which dataset this
    collator is meant to be used with.

    Returns:
        tuple: ``(masked_input_ids, token_type_ids, attention_mask, labels)``;
        the last three are built with ``torch.stack``.
    """
    def column(position):
        return [sample[position] for sample in batch]

    id_features = [{'input_ids': ids} for ids in column(0)]
    segment_ids, masks, targets = column(1), column(2), column(3)
    masked_input = mask_lm_datacollator(id_features)['input_ids']
    return (
        masked_input,
        torch.stack(segment_ids),
        torch.stack(masks),
        torch.stack(targets),
    )

In [10]:
class TrainDataset(Dataset):
    """Map-style dataset that tokenizes one (summary, prompt) pair per item.

    Args:
        df: frame with the columns ``prompt_title``, ``prompt_text``,
            ``prompt_question``, ``text``, ``content`` and ``wording``.
        tokenizer: callable tokenizer exposing ``sep_token``; it is the one
            used for every encoding done by this dataset.
    """

    def __init__(self, df, tokenizer):
        self.prompt_title = df['prompt_title'].values.astype(str)
        self.prompt_text = df['prompt_text'].values.astype(str)
        self.prompt_question = df['prompt_question'].values.astype(str)
        self.text = df['text'].values.astype(str)
        self.content = df['content'].values
        self.wording = df['wording'].values
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.prompt_title)

    def tokenize(self, example):
        """Encode one example as a text pair and attach its two targets.

        The first segment is the student text; the second is the prompt, built
        from title + text + question when ``CFG.input_type == '1'`` and from
        the question alone otherwise. Sequences are padded/truncated to
        ``CFG.max_input_length``.

        Returns:
            dict: the tokenizer output plus ``"labels"`` =
            ``[content, wording]`` as floats.
        """
        sep = self.tokenizer.sep_token
        if CFG.input_type == '1':
            prompt = sep.join([example["prompt_title"], example["prompt_text"], example["prompt_question"]])
        else:
            prompt = example["prompt_question"]

        labels = [float(example["content"]), float(example["wording"])]

        # Use the tokenizer given to the constructor, not a module-level global,
        # so the separator token and the encoding always come from the same object.
        tokenized = self.tokenizer(
            example["text"],
            prompt,
            padding='max_length',
            truncation=True,
            max_length=CFG.max_input_length,
            return_tensors=None,
        )

        return {
            **tokenized,
            "labels": labels,
        }

    def __getitem__(self, item):
        """Return the tensors of row ``item``.

        Returns:
            dict: ``input_ids``, ``token_type_ids`` and ``attention_mask`` as
            long tensors and ``labels`` as a float tensor.
        """
        example = {
            "prompt_title": self.prompt_title[item],
            "prompt_text": self.prompt_text[item],
            "prompt_question": self.prompt_question[item],
            "text": self.text[item],
            "content": self.content[item],
            "wording": self.wording[item],
        }

        out = self.tokenize(example)

        return {
            'input_ids': torch.as_tensor(out['input_ids'], dtype=torch.long),
            'token_type_ids': torch.as_tensor(out['token_type_ids'], dtype=torch.long),
            'attention_mask': torch.as_tensor(out['attention_mask'], dtype=torch.long),
            'labels': torch.as_tensor(out['labels'], dtype=torch.float),
        }
        
        
        

## Model

In [11]:
def init_params(module_lst):
    """Xavier-uniform initialise, in place, every parameter with more than one dimension.

    Args:
        module_lst: iterable of ``nn.Module`` objects; parameters with one
            dimension or fewer (e.g. biases) are left untouched.
    """
    matrix_like = (
        tensor
        for module in module_lst
        for tensor in module.parameters()
        if tensor.dim() > 1
    )
    for tensor in matrix_like:
        torch.nn.init.xavier_uniform_(tensor)

class Custom_Bert(nn.Module):
    """Backbone with learned layer mixing, attention pooling and a single-output linear head.

    The backbone and its config are loaded from ``CFG.model_path`` with
    ``output_hidden_states`` enabled.
    """
    def __init__(self):
        super().__init__()

        config = AutoConfig.from_pretrained(CFG.model_path)
        config.update({"output_hidden_states":True})

        self.base = AutoModel.from_pretrained(CFG.model_path, config=config)

        dim = config.hidden_size

        self.dropout = nn.Dropout(p=0.2)
        self.high_dropout = nn.Dropout(p=0.5)

        # One learnable mixing logit per hidden state used in forward (the last 24):
        # every logit starts at -3 except the final one (0), so after the
        # softmax the last hidden state dominates initially.
        n_weights = 24
        weights_init = torch.zeros(n_weights).float()
        weights_init.data[:-1] = -3
        self.layer_weights = torch.nn.Parameter(weights_init)

        # Scores each position and normalises the scores with a softmax over dim=1
        # (presumably the sequence axis -- confirm against the backbone output layout).
        self.attention = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size),
            nn.Tanh(),
            nn.Linear(config.hidden_size, 1),
            nn.Softmax(dim=1)
        )
        self.cls = nn.Sequential(
            nn.Linear(dim,1)
        )
        # Xavier-initialise the weight matrices of the head and the attention block.
        init_params([self.cls,self.attention])

    def forward(self, input_ids, attention_mask, labels=None):
        """Return the head output, or ``(MSE loss, output)`` when ``labels`` is given."""
        base_output = self.base(input_ids=input_ids,
                                attention_mask=attention_mask,
                                )

        # Stack the last 24 hidden states (each passed through dropout) on a new leading axis.
        # NOTE(review): assumes the backbone returns at least 24 hidden states -- confirm
        # before using a smaller checkpoint.
        cls_outputs = torch.stack(
            [self.dropout(layer) for layer in base_output['hidden_states'][-24:]], dim=0
        )
        # Softmax-weighted sum of the stacked hidden states using the learned layer weights.
        cls_output = (torch.softmax(self.layer_weights, dim=0).unsqueeze(1).unsqueeze(1).unsqueeze(1) * cls_outputs).sum(0)

        # Attention-weighted sum over dim=1, computed 5 times with independent
        # high-rate dropout on the attention input and then averaged.
        logits = torch.mean(
            torch.stack(
                [torch.sum(self.attention(self.high_dropout(cls_output)) * cls_output, dim=1) for _ in range(5)],
                dim=0,
            ),
            dim=0,
        )

        output = self.cls(logits)
        if labels is None:
            return output

        else:
            # Dim 1 of the output is squeezed before comparing with the labels.
            return (nn.MSELoss()(torch.squeeze(output,1),labels), output)


class Custom_Bert_Simple(nn.Module):
    """Backbone + mean over dim 1 of the last hidden state + linear head with two outputs.

    The backbone is loaded from ``CFG.model_path`` with dropout probabilities
    and ``max_position_embeddings`` taken from ``CFG``.
    """

    def __init__(self):
        super().__init__()

        config = AutoConfig.from_pretrained(CFG.model_path)
        config.update({
            "hidden_dropout_prob": CFG.dropout,
            "attention_probs_dropout_prob": CFG.dropout,
            "num_labels": 2,
            "problem_type": "regression",
            "max_position_embeddings": CFG.max_position_embeddings
        })
        self.base = AutoModel.from_pretrained(CFG.model_path, config=config)
        dim = config.hidden_size
        # Not applied in forward; kept so the module's attributes stay unchanged.
        self.dropout = nn.Dropout(p=0)
        self.cls = nn.Linear(dim,2)

    def forward(self, input_ids, attention_mask, token_type_ids, labels=None):
        """Run the model.

        Args:
            input_ids, attention_mask, token_type_ids: forwarded to the backbone.
            labels: optional regression targets; when omitted (inference) no
                loss is computed.

        Returns:
            SequenceClassifierOutput: ``logits`` from the head and ``loss``
            (MSE against ``labels``, or ``None`` without labels).
        """
        base_output = self.base(input_ids=input_ids,
                                attention_mask=attention_mask,
                                token_type_ids=token_type_ids
                               )
        # NOTE(review): the mean runs over every position of dim 1, padding
        # included, because attention_mask is not applied here -- confirm intended.
        pooled = torch.mean(base_output.last_hidden_state, dim=1)
        output = self.cls(pooled)
        loss = None if labels is None else nn.MSELoss()(output, labels)
        return SequenceClassifierOutput(
            loss=loss,
            logits=output,
            hidden_states=None,
            attentions=None
        )

class GeMText(nn.Module):
    """Masked generalised-mean pooling with a learnable exponent.

    Args:
        dim: axis that is reduced by the pooling.
        p: initial value of the learnable exponent.
        eps: lower clamp applied to the activations and to the mask total.
    """

    def __init__(self, dim = 1, p=3, eps=1e-6):
        super().__init__()
        self.dim = dim
        self.eps = eps
        self.feat_mult = 1
        self.p = Parameter(torch.ones(1) * p)

    def forward(self, last_hidden_state, attention_mask):
        """Pool ``last_hidden_state`` along ``self.dim`` using ``attention_mask``.

        Activations are clamped to at least ``eps``, multiplied by the mask
        (broadcast over the last axis), raised to ``p``, summed, divided by the
        mask total and finally raised to ``1 / p``.
        """
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.shape)
        powered = torch.pow(last_hidden_state.clamp(min=self.eps) * mask, self.p)
        mask_total = mask.sum(self.dim).clip(min=self.eps)
        pooled = powered.sum(self.dim) / mask_total
        return torch.pow(pooled, 1 / self.p)

class Custom_Bert_Pool(nn.Module):
    """Backbone + GeM pooling + linear head with two outputs, trained with Smooth L1 loss.

    The config comes from ``CFG.model_path`` while the backbone weights are
    loaded from the local directory ``./input/pretrain/pretrained_model``.
    """

    def __init__(self):
        super().__init__()

        self.config = AutoConfig.from_pretrained(CFG.model_path)
        self.config.update({
            "hidden_dropout_prob": CFG.dropout,
            "attention_probs_dropout_prob": CFG.dropout,
            "num_labels": 2,
            "problem_type": "regression",
            "max_position_embeddings": CFG.max_position_embeddings
        })
        #self.base = AutoModel.from_pretrained(CFG.model_path, config=self.config)
        print('load pretrained model ...')
        self.base = AutoModel.from_pretrained('./input/pretrain/pretrained_model', config = self.config)

        self.pool = GeMText()
        self.cls = nn.Linear(self.config.hidden_size,2)

    def forward(self, input_ids, attention_mask, token_type_ids, labels=None):
        """Run the model.

        Args:
            input_ids, attention_mask, token_type_ids: forwarded to the
                backbone; ``attention_mask`` is also used by the pooling.
            labels: optional regression targets; when omitted (inference) no
                loss is computed.

        Returns:
            SequenceClassifierOutput: ``logits`` from the head and ``loss``
            (Smooth L1 against ``labels``, or ``None`` without labels).
        """
        base_output = self.base(input_ids=input_ids,
                                attention_mask=attention_mask,
                                token_type_ids=token_type_ids
                               )
        pooled = self.pool(base_output.last_hidden_state, attention_mask)
        output = self.cls(pooled)
        loss = None if labels is None else nn.SmoothL1Loss()(output, labels)
        return SequenceClassifierOutput(
            loss=loss,
            logits=output,
            hidden_states=None,
            attentions=None
        )

    def _init_weights(self, module):
        """Re-initialise one Linear, Embedding or LayerNorm module in place.

        Intended for ``nn.Module.apply``; modules of any other type are left
        unchanged. Weights are drawn from a normal distribution with standard
        deviation ``self.config.initializer_range``.
        """
        if isinstance(module, nn.Linear):
            print(f'Re-initialize {module}')
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            print(f'Re-initialize {module}')
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                # Keep the padding row at zero.
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            print(f'Re-initialize {module}')
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

class Custom_Bert_Mean(nn.Module):
    """Backbone + mean over dim 1 of the last hidden state + single-output linear head.

    The backbone is loaded from ``CFG.model_path`` with ``output_hidden_states`` enabled.
    """
    def __init__(self):
        super().__init__()

        config = AutoConfig.from_pretrained(CFG.model_path)
        config.output_hidden_states=True
        self.base = AutoModel.from_pretrained(CFG.model_path, config=config)
        dim = config.hidden_size
        # p=0 makes this dropout a no-op.
        self.dropout = nn.Dropout(p=0)
        self.cls = nn.Linear(dim,1)

    def forward(self, input_ids, attention_mask,labels=None):
        """Return the head output, or ``(MSE loss, output)`` when ``labels`` is given."""
        base_output = self.base(input_ids=input_ids,
                                attention_mask=attention_mask,
                            )


        # Last entry of the hidden-state tuple, averaged over dim 1.
        # NOTE(review): attention_mask is not applied to the mean, so padded
        # positions contribute -- confirm intended.
        output = base_output.hidden_states[-1]
        output = self.cls(self.dropout(torch.mean(output, dim=1)))
        if labels is None:
            return output

        else:
            # Dim 1 of the output is squeezed before comparing with the labels.
            return (nn.MSELoss()(torch.squeeze(output,1),labels), output)

class Custom_Bert_M(nn.Module):
    """Layer-mixing / attention-pooling backbone with a regression head and a 5-way classification head.

    Training combines both heads: ``0.8 * MSE + 0.2 * cross-entropy``.
    """
    def __init__(self):
        super().__init__()

        config = AutoConfig.from_pretrained(CFG.model_path)
        config.update({"output_hidden_states":True})

        self.base = AutoModel.from_pretrained(CFG.model_path, config=config)

        dim = config.hidden_size

        self.dropout = nn.Dropout(p=0.2)
        self.high_dropout = nn.Dropout(p=0.5)

        # One learnable mixing logit per hidden state used in forward (the last 24):
        # every logit starts at -3 except the final one (0).
        n_weights = 24
        weights_init = torch.zeros(n_weights).float()
        weights_init.data[:-1] = -3
        self.layer_weights = torch.nn.Parameter(weights_init)

        # Scores each position and normalises the scores with a softmax over dim=1.
        self.attention = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size),
            nn.Tanh(),
            nn.Linear(config.hidden_size, 1),
            nn.Softmax(dim=1)
        )
        # Regression head (one output).
        self.cls_0 = nn.Sequential(
            nn.Linear(dim,1)
        )

        # Classification head (five outputs).
        self.cls_1 = nn.Linear(dim,5)
        init_params([self.cls_0,self.cls_1,self.attention])

    # NOTE(review): `labels` has no default although the body handles `labels is None`;
    # callers must pass it explicitly.
    def forward(self, input_ids, attention_mask, labels):
        """Return the regression output, or ``(combined loss, regression output)`` when ``labels`` is not None."""
        base_output = self.base(input_ids=input_ids,
                    attention_mask=attention_mask,
                             )

        # Stack the last 24 hidden states (each passed through dropout) on a new leading axis.
        # NOTE(review): assumes the backbone returns at least 24 hidden states -- confirm.
        cls_outputs = torch.stack(
            [self.dropout(layer) for layer in base_output['hidden_states'][-24:]], dim=0
        )
        # Softmax-weighted sum of the stacked hidden states using the learned layer weights.
        cls_output = (torch.softmax(self.layer_weights, dim=0).unsqueeze(1).unsqueeze(1).unsqueeze(1) * cls_outputs).sum(0)

        # Attention-weighted sum over dim=1, computed 5 times with independent
        # high-rate dropout on the attention input and then averaged.
        logits = torch.mean(
            torch.stack(
                [torch.sum(self.attention(self.high_dropout(cls_output)) * cls_output, dim=1) for _ in range(5)],
                dim=0,
            ),
            dim=0,
        )

        output_0 = self.cls_0(logits)
        output_1 = self.cls_1(logits)
        if labels is None:
            return output_0

        else:
            regression_loss = nn.MSELoss()(torch.squeeze(output_0,1),labels)
            # Turn the label values into class indices for the classification head:
            # 1.0 -> 4, 0.25 -> 1, 0.5 -> 2, 0.75 -> 3; any other value is left
            # as is and truncated by .long().
            labels = labels.double()
            cls_labels = torch.where(labels==1.,4.0,labels)
            cls_labels = torch.where(cls_labels==0.25,1.0,cls_labels)
            cls_labels = torch.where(cls_labels==0.5,2.0,cls_labels)
            cls_labels = torch.where(cls_labels==0.75,3.0,cls_labels)
            cls_labels = cls_labels.long()
            cls_loss = nn.CrossEntropyLoss()(output_1, cls_labels)
            return ( 0.8 * regression_loss + 0.2 * cls_loss, output_0)

In [12]:
def build_model():
    """Instantiate the model selected by ``CFG.model_type``.

    * ``'base'``   -- ``AutoModelForSequenceClassification`` with two regression outputs.
    * ``'simple'`` -- ``Custom_Bert_Simple``.
    * ``'pool'``   -- ``Custom_Bert_Pool``; optionally re-initialises the last
      ``CFG.reinit_layers`` encoder layers and, when ``CFG.load_pretrained``
      is set, loads weights from a local checkpoint.

    Returns:
        nn.Module: the constructed model (not yet moved to a device).

    Raises:
        ValueError: if ``CFG.model_type`` is not one of the values above.
    """
    if CFG.model_type == 'base':
        model_config = AutoConfig.from_pretrained(CFG.model_path)
        model_config.update({
            "hidden_dropout_prob": CFG.dropout,
            "attention_probs_dropout_prob": CFG.dropout,
            "num_labels": 2,
            "problem_type": "regression",
            "max_position_embeddings": CFG.max_position_embeddings
        })

        #print(model_config)
        model = AutoModelForSequenceClassification.from_pretrained(
            CFG.model_path, config=model_config
        )
    elif CFG.model_type == 'simple':
        model = Custom_Bert_Simple()
    elif CFG.model_type == 'pool':
        model = Custom_Bert_Pool()
        if CFG.reinit_layers > 0:
            print("=="*40)
            print(f"Reinitialize the last {CFG.reinit_layers} layer(s).")
            for layer in model.base.encoder.layer[-CFG.reinit_layers:]:
                print("===")
                layer.apply(model._init_weights)
            print("=="*40)
        if CFG.load_pretrained:
            # Load onto the CPU: the model has not been moved to a device yet
            # and load_state_dict copies the values into the model's own tensors.
            # NOTE(review): the file name says deberta-v3-base while
            # CFG.model_path is deberta-v3-large -- confirm the checkpoint matches.
            checkpoint = torch.load(
                './pretrained/microsoft_deberta-v3-base_best_ema.pth', map_location='cpu')
            model.load_state_dict(checkpoint['model'])
    else:
        # Without this branch an unknown type fell through to `return model`
        # and surfaced as an UnboundLocalError.
        raise ValueError(
            f"Unsupported CFG.model_type {CFG.model_type!r}; expected 'base', 'simple' or 'pool'")
    return model

# Train

In [13]:
from copy import deepcopy
class ModelEMA:
    """Exponential moving average of a model's ``state_dict``.

    A frozen, eval-mode deep copy of the model is kept and blended towards the
    live model on every ``update`` call; all floating-point entries of the
    state dict (parameters and buffers) are averaged. Because the copy is taken
    at construction time, create this object after the model has been fully
    set up (device placement, wrappers).
    """

    def __init__(self, model, decay=0.9999, updates=0):
        """
        Args:
            model (nn.Module): model whose weights are averaged.
            decay (float): asymptotic EMA decay rate.
            updates (int): number of EMA updates already performed.
        """
        shadow = deepcopy(model).eval()
        for frozen in shadow.parameters():
            frozen.requires_grad_(False)
        # Both names refer to the same averaged copy.
        self.ema_model = shadow
        self.ema = shadow
        self.updates = updates

        def ramped_decay(x):
            # Exponential ramp: the effective decay grows from 0 towards
            # `decay` as the update count increases.
            return decay * (1 - math.exp(-x / 2000))

        self.decay = ramped_decay

    def update(self, model):
        """Blend the current weights of ``model`` into the averaged copy."""
        with torch.no_grad():
            self.updates += 1
            keep = self.decay(self.updates)
            live_state = model.state_dict()
            for name, averaged in self.ema.state_dict().items():
                if not averaged.dtype.is_floating_point:
                    continue  # integer entries are left as copied
                averaged *= keep
                averaged += (1.0 - keep) * live_state[name].detach()

class EMAHook:
    """Owns a ``ModelEMA`` and updates it after every training iteration.

    Modified from the EMA callback used in BEVDepth
    (https://github.com/Megvii-BaseDetection/BEVDepth, callbacks/ema.py).

    Args:
        model: model to track.
        init_updates: initial value of the EMA update counter.
        decay: EMA decay rate passed to ``ModelEMA``.
        resume: optional path of a checkpoint holding ``'state_dict'`` and
            ``'updates'`` from which the EMA state is restored.
        logger: optional logger used to report the resume.
    """

    def __init__(self, model, init_updates=0, decay=0.9990, resume=None, logger=None):
        super().__init__()
        self.init_updates = init_updates
        self.resume = resume
        self.decay = decay
        # Must be assigned before before_run(), which logs when resuming.
        self.logger = logger
        self.ema_model = self.before_run(model)

    def before_run(self, model):
        """Create the ``ModelEMA`` for ``model`` and optionally restore it from ``self.resume``."""
        from torch.nn.modules.batchnorm import SyncBatchNorm

        # Detach the process groups of SyncBatchNorm layers while ModelEMA
        # deep-copies the model, then put them back afterwards.
        bn_model_list = list()
        bn_model_dist_group_list = list()
        for model_ref in model.modules():
            if isinstance(model_ref, SyncBatchNorm):
                bn_model_list.append(model_ref)
                bn_model_dist_group_list.append(model_ref.process_group)
                model_ref.process_group = None
        ema_model = ModelEMA(model, self.decay)

        for bn_model, dist_group in zip(bn_model_list,
                                        bn_model_dist_group_list):
            bn_model.process_group = dist_group
        ema_model.updates = self.init_updates

        if self.resume is not None:
            if self.logger is not None:
                self.logger.info(f'resume ema checkpoint from {self.resume}')
            cpt = torch.load(self.resume, map_location='cpu')
            # The free function `load_state_dict` previously called here is not
            # defined in this file; use the module's own method instead.
            ema_model.ema.load_state_dict(cpt['state_dict'])
            ema_model.updates = cpt['updates']

        return ema_model

    def after_train_iter(self, model):
        """Fold the current weights of ``model`` into the moving average."""
        self.ema_model.update(model)

In [14]:
def compute_mcrmse(eval_pred):
    """Mean column-wise root mean squared error (MCRMSE).

    Metric definition:
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation

    Args:
        eval_pred: ``(preds, labels)`` pair of arrays; column 0 is reported as
            content and column 1 as wording.

    Returns:
        dict: ``content_rmse``, ``wording_rmse`` and ``mcrmse``, in that order.
    """
    preds, labels = eval_pred

    squared_error = np.square(preds - labels)
    per_column = np.sqrt(squared_error.mean(axis=0))

    # Key order matters: callers unpack list(result.values()).
    result = {"content_rmse": per_column[0], "wording_rmse": per_column[1]}
    result["mcrmse"] = per_column.mean()
    return result

In [15]:
def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators.

    Also exports ``PYTHONHASHSEED`` and turns on deterministic cuDNN mode.

    Args:
        seed: integer seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True
# Seed all generators once, up front, with the configured seed.
seed_everything(CFG.seed)

In [16]:
# Inspect the configured number of layer-wise learning-rate groups.
CFG.discriminative_learning_rate_num_groups

1

In [17]:
def get_optimizer_llr_params(model, type='s'):
    """Build the optimizer parameter groups, optionally with layer-wise learning rates.

    Parameters whose name contains ``bias``, ``LayerNorm.weight`` or
    ``LayerNorm.bias`` get ``weight_decay=0.0``; every other parameter gets
    ``CFG.weight_decay``. Each source of parameters therefore contributes two
    groups: the decayed one first, the decay-exempt one second.

    With ``CFG.discriminative_learning_rate`` disabled, all parameters use
    ``CFG.encoder_lr``. When enabled, groups are emitted in this order:
    embeddings, each encoder layer (bottom to top), encoder parameters that are
    not part of a layer, and finally every parameter without ``base`` in its
    name at ``CFG.head_lr``. The layer learning rates are
    ``CFG.encoder_lr * decay_rate ** power`` and the list is reversed, so the
    lowest layers receive the smallest rates.

    Adapted from the optimizer setup in the Hugging Face ``Trainer``:
    https://github.com/huggingface/transformers/blob/05fa1a7ac17bb7aa07b9e0c1e138ecb31a28bbfe/src/transformers/trainer.py#L804

    Args:
        model: model exposing ``named_parameters()`` and, for the layer-wise
            mode, ``config.num_hidden_layers``, ``base.embeddings`` and
            ``base.encoder.layer``.
        type: unused.

    Returns:
        list[dict]: groups with the keys ``params``, ``lr`` and ``weight_decay``.
    """
    no_decay = ["bias", "LayerNorm.weight", "LayerNorm.bias"]

    def exempt(name):
        return any(nd in name for nd in no_decay)

    def decay_pair(named_params, lr):
        # Two groups for one set of parameters: decayed first, exempt second.
        named_params = list(named_params)
        return [
            {
                "params": [p for n, p in named_params if not exempt(n)],
                "lr": lr,
                "weight_decay": CFG.weight_decay,
            },
            {
                "params": [p for n, p in named_params if exempt(n)],
                "lr": lr,
                "weight_decay": 0.0,
            },
        ]

    if not CFG.discriminative_learning_rate:
        # Single learning rate for the entire network.
        return decay_pair(model.named_parameters(), CFG.encoder_lr)

    num_layers = model.config.num_hidden_layers
    group_size = num_layers//CFG.discriminative_learning_rate_num_groups
    decay_rate = CFG.discriminative_learning_rate_decay_rate

    layer_wise_learning_rates = [
        pow(decay_rate, power) * CFG.encoder_lr
        for power in range(0, num_layers, group_size)
        for _ in range(group_size)
    ][::-1]
    print('Layer-wise learning rates:', layer_wise_learning_rates)

    # Embeddings sit below every layer and get the most strongly decayed rate.
    embedding_lr = pow(decay_rate, num_layers) * CFG.encoder_lr
    optimizer_grouped_parameters = decay_pair(
        model.base.embeddings.named_parameters(), embedding_lr)

    # One pair of groups per encoder layer.
    for i, layer in enumerate(model.base.encoder.layer):
        optimizer_grouped_parameters += decay_pair(
            layer.named_parameters(), layer_wise_learning_rates[i])

    # Encoder parameters living outside the layer stack use the top layer's rate.
    unattached = [
        (n, p) for n, p in model.base.encoder.named_parameters() if not n.startswith('layer')
    ]
    print(f"Detected unattached modules in model.encoder: {[n for n, p in unattached]}")
    optimizer_grouped_parameters += decay_pair(unattached, layer_wise_learning_rates[-1])

    # Task-specific head: everything whose name does not mention the backbone.
    head = [(n, p) for n, p in model.named_parameters() if 'base' not in n]
    optimizer_grouped_parameters += decay_pair(head, CFG.head_lr)
    return optimizer_grouped_parameters

In [18]:
def valid_fn(valid_loader, model, device):
    """Evaluate ``model`` on every batch of ``valid_loader`` without gradients.

    Args:
        valid_loader: iterable of dict batches of tensors; each batch holds a
            ``'labels'`` entry and is passed to the model as keyword arguments.
        model: callable returning an object with ``loss`` and ``logits``; it is
            switched to eval mode and left in that mode.
        device: device every tensor of the batch is moved to.

    Returns:
        tuple: ``(batch-size-weighted mean loss, predictions, labels)`` where
        the last two are NumPy arrays concatenated over all batches.
    """
    meter = AverageMeter()
    model.eval()
    pred_chunks, label_chunks = [], []
    for batch in valid_loader:
        # Move the tensors in place, inside the batch dict itself.
        batch.update({name: tensor.to(device) for name, tensor in batch.items()})
        targets = batch['labels']
        with torch.no_grad():
            result = model(**batch)
        meter.update(result.loss.item(), targets.size(0))
        pred_chunks.append(result.logits.to('cpu').numpy())
        label_chunks.append(targets.to('cpu').numpy())
    return meter.avg, np.concatenate(pred_chunks), np.concatenate(label_chunks)

def _validate_and_checkpoint(net, valid_loader, device, epoch, fold, start_time,
                             best_score, prefix, name_template):
    """Validate ``net``, log its scores and save it when it beats ``best_score``.

    Args:
        net: model to evaluate and, if it improves, to save.
        valid_loader, device: forwarded to ``valid_fn``.
        epoch: zero-based epoch index (logged as ``epoch + 1``).
        fold: fold identifier embedded in the checkpoint name.
        start_time: reference timestamp for the logged elapsed time.
        best_score: best MCRMSE so far for this model (``inf`` when none yet).
        prefix: text inserted before the metric names in log lines
            (``''`` or ``'ema_'``).
        name_template: checkpoint file-name template with three ``{}`` slots:
            model tag, fold, score.

    Returns:
        tuple: ``(score dict from compute_mcrmse, possibly updated best_score)``.
    """
    avg_val_loss, predictions, valid_labels = valid_fn(valid_loader, net, device)
    score = compute_mcrmse((predictions, valid_labels))
    elapsed = time.time() - start_time

    LOGGER.info(
        f'Epoch {epoch + 1} avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
    LOGGER.info(
        f"Epoch {epoch + 1} - {prefix}content_rmse: {score['content_rmse']:.4f} - "
        f"{prefix}wording_rmse: {score['wording_rmse']:.4f} - "
        f"{prefix}mcrmse: {score['mcrmse']:.4f}")

    if best_score > score['mcrmse']:
        model_tag = CFG.model_path.replace('/', '_')
        if best_score != float('inf'):
            # The score is part of the file name, so the superseded checkpoint
            # has to be deleted explicitly.
            os.remove(CFG.OUTPUT_DIR + name_template.format(model_tag, fold, best_score))
        best_score = score['mcrmse']
        LOGGER.info(f'Epoch {epoch + 1} - {prefix}Save Best Score: {best_score:.4f} Model')
        torch.save({'model': net.state_dict(),
                    'predictions': predictions},
                   CFG.OUTPUT_DIR + name_template.format(model_tag, fold, best_score))
    return score, best_score


def train_fn(train_loader, model, optimizer, epoch, scheduler, device, valid_loader, start_time, best_score, best_score_ema,ema_hook,wandb, fold):
    """Train ``model`` for one epoch with periodic in-loop validation.

    After every optimizer step the EMA hook and the scheduler are advanced and
    the training loss is logged to ``wandb``. Every ``CFG.print_freq`` steps,
    and on the last step, both the live model and the EMA model are validated
    and the best checkpoint of each is kept in ``CFG.OUTPUT_DIR``.

    NOTE(review): ``global_step`` restarts at 0 on every call, so the ``step``
    value sent to wandb is per epoch -- confirm that this is intended.

    Args:
        train_loader: iterable of dict batches of tensors containing ``'labels'``.
        model: model returning an object with ``loss`` when called with a batch.
        optimizer, scheduler: stepped once per batch.
        epoch: zero-based epoch index.
        device: device used for training and for validation.
        valid_loader: loader used for the in-loop validation.
        start_time: reference timestamp for logged elapsed times.
        best_score, best_score_ema: best MCRMSE so far for the live and the
            EMA model (``inf`` when none yet).
        ema_hook: object with ``after_train_iter(model)`` and ``ema_model.ema``.
        wandb: object with a ``log(dict)`` method.
        fold: fold identifier used in logs and checkpoint names.

    Returns:
        tuple: ``(mean training loss, best_score, best_score_ema)``.
    """
    model.train()
    losses = AverageMeter()
    start = time.time()
    global_step = 0
    last_step = len(train_loader) - 1
    for step, batch in enumerate(train_loader):
        for key, value in batch.items():
            batch[key] = value.to(device)
        batch_size = batch['labels'].size(0)
        loss = model(**batch).loss
        losses.update(loss.item(), batch_size)

        optimizer.zero_grad()
        loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 10)
        optimizer.step()
        ema_hook.after_train_iter(model)
        global_step += 1
        scheduler.step()

        wandb.log({
                'train loss': loss.item(),
                'step': global_step,
                'epoch': epoch,
                'fold': fold,
                'batch_size':CFG.batch_size
            })

        if step % CFG.print_freq == 0 or step == last_step:
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch + 1, step, len(train_loader),
                          remain=timeSince(start, float(step + 1) / len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          # get_last_lr() is the supported way to read the current
                          # rate; get_lr() is meant to be called only inside step().
                          lr=scheduler.get_last_lr()[0]))

            # Live model: evaluate, score and checkpoint on improvement.
            score, best_score = _validate_and_checkpoint(
                model, valid_loader, device, epoch, fold, start_time,
                best_score, '', "{}_best{}_{}.pth")

            # EMA model: same procedure, tracked separately.
            ema_score, best_score_ema = _validate_and_checkpoint(
                ema_hook.ema_model.ema, valid_loader, device, epoch, fold, start_time,
                best_score_ema, 'ema_', "{}_best_ema{}_{}.pth")

            wandb.log({
                'learning rate': optimizer.param_groups[0]['lr'],
                'validation mcrmse': score['mcrmse'],
                'validation ema mcrmse': ema_score['mcrmse'],
                'step': global_step,
                'epoch': epoch,
            })

            # valid_fn leaves the model in eval mode; switch back before the next step.
            model.train()
    return losses.avg, best_score, best_score_ema



def train_loop():
    """Train and validate one model per fold listed in ``CFG.folds``.

    For every fold this builds the train/validation loaders, the model and
    its EMA hook, an AdamW optimizer and the configured LR scheduler, then
    calls ``train_fn`` once per epoch. ``train_fn`` is handed the validation
    loader together with the running best scores and returns the updated
    best scores, which are carried over to the next epoch.

    Returns:
        None.

    Raises:
        ValueError: if ``CFG.scheduler`` is neither ``'linear'`` nor
            ``'cosine'``.
    """
    LOGGER.info("========== training ==========")

    # ====================================================
    # experiment tracking
    # ====================================================
    # Hand the hyper-parameters to the run through `config=`. Rebinding
    # `wandb.config` to a plain dict only replaces the module attribute and
    # never attaches the values to the run.
    wandb.init(
        project='kaggle-commonlit-eval-student-summaries-0509',
        config=dict(epochs=CFG.epochs,
                    batch_size=CFG.batch_size,
                    learning_rate=CFG.encoder_lr,
                    save_checkpoint=True),
    )

    def get_scheduler(cfg, optimizer, num_train_steps):
        """Build the LR scheduler selected by ``cfg.scheduler``.

        ``cfg.num_warmup_steps`` is multiplied by ``num_train_steps`` here,
        i.e. it is consumed as a fraction of the total step count. The
        product is kept in a local so ``cfg`` is not modified; writing it
        back would compound the value on every further fold or re-run.
        """
        num_warmup_steps = cfg.num_warmup_steps * num_train_steps
        if cfg.scheduler == 'linear':
            return get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=num_warmup_steps,
                num_training_steps=num_train_steps,
            )
        if cfg.scheduler == 'cosine':
            return get_cosine_schedule_with_warmup(
                optimizer,
                num_warmup_steps=num_warmup_steps,
                num_training_steps=num_train_steps,
                num_cycles=cfg.num_cycles,
            )
        raise ValueError(f"Unsupported scheduler: {cfg.scheduler!r}")

    for fold in CFG.folds:
        # ====================================================
        # loader
        # ====================================================
        if CFG.pretraining:
            tr_data = pd.read_csv('tmp_pessudo.csv')
            tr_data['prompt_title'] = ''
            # Keep only rows whose prompt_question does not occur in `pdf`.
            known_questions = pdf['prompt_question'].tolist()
            tr_data = tr_data[~tr_data['prompt_question'].isin(known_questions)]
            # In pretraining mode the whole of `df` is used for validation.
            va_data = df
        else:
            tr_data = df[df['fold'] != fold].reset_index(drop=True)
            va_data = df[df['fold'] == fold].reset_index(drop=True)

        train_dataset = TrainDataset(tr_data, tokenizer)
        valid_dataset = TrainDataset(va_data, tokenizer)
        train_loader = DataLoader(train_dataset,
                                  batch_size=CFG.batch_size,
                                  shuffle=True,
                                  num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
        valid_loader = DataLoader(valid_dataset,
                                  batch_size=CFG.batch_size * 2,
                                  shuffle=False,
                                  num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

        # ====================================================
        # model & optimizer
        # ====================================================
        model = build_model()
        model.to(CFG.device)
        ema_hook = EMAHook(model, init_updates=3000, logger=LOGGER)

        optimizer_parameters = get_optimizer_llr_params(model)
        optimizer = AdamW(optimizer_parameters, eps=CFG.eps, betas=CFG.betas)

        # ====================================================
        # scheduler
        # ====================================================
        # NOTE(review): the train loader uses drop_last=True, so this estimate
        # can exceed the number of batches actually iterated — confirm against
        # how train_fn steps the scheduler before changing it.
        num_train_steps = int(len(train_dataset) / CFG.batch_size * CFG.epochs)
        scheduler = get_scheduler(CFG, optimizer, num_train_steps)

        # ====================================================
        # loop
        # ====================================================
        best_score = float('inf')
        best_score_ema = float('inf')
        for epoch in range(CFG.epochs):
            start_time = time.time()

            # train (validation and checkpointing happen inside train_fn)
            avg_loss, best_score, best_score_ema = train_fn(train_loader, model, optimizer, epoch, scheduler, CFG.device, valid_loader, start_time, best_score, best_score_ema ,ema_hook, wandb,fold)

        # Drop this fold's references *before* collecting, otherwise neither
        # the garbage collector nor the CUDA caching allocator has anything
        # to release. The EMA hook is dropped too since it was built from
        # the model.
        del scheduler, optimizer, ema_hook, model
        gc.collect()
        torch.cuda.empty_cache()


In [19]:
# Run training/validation for every fold listed in CFG.folds (see train_loop).
train_loop()

[34m[1mwandb[0m: Currently logged in as: [33mpeng_sun[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016669552416230242, max=1.0…

load pretrained model ...
Epoch: [1][0/1292] Elapsed 0m 2s (remain 45m 52s) Loss: 0.6433(0.6433) Grad: 13.3274  LR: 0.00000500  


Epoch 1 avg_val_loss: 0.4907  time: 257s
Epoch 1 - content_rmse: 1.1861 - wording_rmse: 0.9419 - mcrmse: 1.0640
Epoch 1 - Save Best Score: 1.0640 Model
Epoch 1 avg_val_loss: 0.5089  time: 516s
Epoch 1 - ema_content_rmse: 1.2172 - ema_wording_rmse: 0.9561 - ema_mcrmse: 1.0866
Epoch 1 - ema_Save Best Score: 1.0866 Model


Epoch: [1][100/1292] Elapsed 10m 51s (remain 128m 3s) Loss: 0.2233(0.3319) Grad: 6.6434  LR: 0.00000500  


Epoch 1 avg_val_loss: 0.2557  time: 907s
Epoch 1 - content_rmse: 0.6035 - wording_rmse: 0.8529 - mcrmse: 0.7282
Epoch 1 - Save Best Score: 0.7282 Model
Epoch 1 avg_val_loss: 0.2356  time: 1166s
Epoch 1 - ema_content_rmse: 0.5582 - ema_wording_rmse: 0.8301 - ema_mcrmse: 0.6941
Epoch 1 - ema_Save Best Score: 0.6941 Model


Epoch: [1][200/1292] Elapsed 21m 42s (remain 117m 47s) Loss: 0.0782(0.2471) Grad: 6.3141  LR: 0.00000498  


Epoch 1 avg_val_loss: 0.1714  time: 1557s
Epoch 1 - content_rmse: 0.5184 - wording_rmse: 0.6712 - mcrmse: 0.5948
Epoch 1 - Save Best Score: 0.5948 Model
Epoch 1 avg_val_loss: 0.1719  time: 1816s
Epoch 1 - ema_content_rmse: 0.5167 - ema_wording_rmse: 0.6735 - ema_mcrmse: 0.5951
Epoch 1 - ema_Save Best Score: 0.5951 Model


Epoch: [1][300/1292] Elapsed 32m 31s (remain 107m 5s) Loss: 0.2948(0.2180) Grad: 12.1504  LR: 0.00000496  


Epoch 1 avg_val_loss: 0.1514  time: 2207s
Epoch 1 - content_rmse: 0.4948 - wording_rmse: 0.6176 - mcrmse: 0.5562
Epoch 1 - Save Best Score: 0.5562 Model
Epoch 1 avg_val_loss: 0.1576  time: 2465s
Epoch 1 - ema_content_rmse: 0.4860 - ema_wording_rmse: 0.6458 - ema_mcrmse: 0.5659
Epoch 1 - ema_Save Best Score: 0.5659 Model


Epoch: [1][400/1292] Elapsed 43m 21s (remain 96m 19s) Loss: 0.1158(0.2023) Grad: 7.9900  LR: 0.00000493  


Epoch 1 avg_val_loss: 0.1694  time: 2856s
Epoch 1 - content_rmse: 0.4886 - wording_rmse: 0.6866 - mcrmse: 0.5876
Epoch 1 avg_val_loss: 0.1659  time: 3112s
Epoch 1 - ema_content_rmse: 0.4914 - ema_wording_rmse: 0.6713 - ema_mcrmse: 0.5813


Epoch: [1][500/1292] Elapsed 54m 4s (remain 85m 23s) Loss: 0.1519(0.1924) Grad: 3.6263  LR: 0.00000488  


Epoch 1 avg_val_loss: 0.1230  time: 3500s
Epoch 1 - content_rmse: 0.4518 - wording_rmse: 0.5492 - mcrmse: 0.5005
Epoch 1 - Save Best Score: 0.5005 Model
Epoch 1 avg_val_loss: 0.1235  time: 3759s
Epoch 1 - ema_content_rmse: 0.4528 - ema_wording_rmse: 0.5508 - ema_mcrmse: 0.5018
Epoch 1 - ema_Save Best Score: 0.5018 Model


Epoch: [1][600/1292] Elapsed 64m 54s (remain 74m 38s) Loss: 0.2190(0.1857) Grad: 5.2220  LR: 0.00000484  


Epoch 1 avg_val_loss: 0.1140  time: 4150s
Epoch 1 - content_rmse: 0.4362 - wording_rmse: 0.5242 - mcrmse: 0.4802
Epoch 1 - Save Best Score: 0.4802 Model
Epoch 1 avg_val_loss: 0.1143  time: 4408s
Epoch 1 - ema_content_rmse: 0.4347 - ema_wording_rmse: 0.5263 - ema_mcrmse: 0.4805
Epoch 1 - ema_Save Best Score: 0.4805 Model


Epoch: [1][700/1292] Elapsed 75m 44s (remain 63m 51s) Loss: 0.0571(0.1792) Grad: 5.6324  LR: 0.00000478  


Epoch 1 avg_val_loss: 0.1258  time: 4800s
Epoch 1 - content_rmse: 0.4549 - wording_rmse: 0.5531 - mcrmse: 0.5040
Epoch 1 avg_val_loss: 0.1159  time: 5055s
Epoch 1 - ema_content_rmse: 0.4281 - ema_wording_rmse: 0.5377 - ema_mcrmse: 0.4829


Epoch: [1][800/1292] Elapsed 86m 28s (remain 53m 0s) Loss: 0.0448(0.1723) Grad: 1.7217  LR: 0.00000471  


Epoch 1 avg_val_loss: 0.1133  time: 5444s
Epoch 1 - content_rmse: 0.4265 - wording_rmse: 0.5295 - mcrmse: 0.4780
Epoch 1 - Save Best Score: 0.4780 Model
Epoch 1 avg_val_loss: 0.1221  time: 5703s
Epoch 1 - ema_content_rmse: 0.4316 - ema_wording_rmse: 0.5598 - ema_mcrmse: 0.4957


Epoch: [1][900/1292] Elapsed 97m 16s (remain 42m 12s) Loss: 0.1616(0.1669) Grad: 3.6976  LR: 0.00000463  


Epoch 1 avg_val_loss: 0.1399  time: 6092s
Epoch 1 - content_rmse: 0.4773 - wording_rmse: 0.5887 - mcrmse: 0.5330
Epoch 1 avg_val_loss: 0.1407  time: 6348s
Epoch 1 - ema_content_rmse: 0.4679 - ema_wording_rmse: 0.6021 - ema_mcrmse: 0.5350


Epoch: [1][1000/1292] Elapsed 108m 1s (remain 31m 24s) Loss: 0.1448(0.1624) Grad: 5.8580  LR: 0.00000455  


Epoch 1 avg_val_loss: 0.1306  time: 6737s
Epoch 1 - content_rmse: 0.4677 - wording_rmse: 0.5587 - mcrmse: 0.5132
Epoch 1 avg_val_loss: 0.1233  time: 6992s
Epoch 1 - ema_content_rmse: 0.4500 - ema_wording_rmse: 0.5463 - ema_mcrmse: 0.4981


Epoch: [1][1100/1292] Elapsed 118m 45s (remain 20m 36s) Loss: 0.0856(0.1586) Grad: 5.4312  LR: 0.00000446  


Epoch 1 avg_val_loss: 0.1320  time: 7381s
Epoch 1 - content_rmse: 0.4336 - wording_rmse: 0.5928 - mcrmse: 0.5132
Epoch 1 avg_val_loss: 0.1356  time: 7637s
Epoch 1 - ema_content_rmse: 0.4424 - ema_wording_rmse: 0.5977 - ema_mcrmse: 0.5200


Epoch: [1][1200/1292] Elapsed 129m 30s (remain 9m 48s) Loss: 0.2404(0.1558) Grad: 4.5654  LR: 0.00000436  


Epoch 1 avg_val_loss: 0.1194  time: 8026s
Epoch 1 - content_rmse: 0.4446 - wording_rmse: 0.5477 - mcrmse: 0.4961
Epoch 1 avg_val_loss: 0.1200  time: 8281s
Epoch 1 - ema_content_rmse: 0.4422 - ema_wording_rmse: 0.5497 - ema_mcrmse: 0.4960


Epoch: [1][1291/1292] Elapsed 140m 2s (remain 0m 0s) Loss: 0.2700(0.1535) Grad: 12.2806  LR: 0.00000427  


Epoch 1 avg_val_loss: 0.1285  time: 8658s
Epoch 1 - content_rmse: 0.4769 - wording_rmse: 0.5471 - mcrmse: 0.5120
Epoch 1 avg_val_loss: 0.1345  time: 8914s
Epoch 1 - ema_content_rmse: 0.4618 - ema_wording_rmse: 0.5853 - ema_mcrmse: 0.5235


Epoch: [2][0/1292] Elapsed 0m 1s (remain 33m 18s) Loss: 0.2382(0.2382) Grad: 6.5344  LR: 0.00000427  


Epoch 2 avg_val_loss: 0.1260  time: 257s
Epoch 2 - content_rmse: 0.4788 - wording_rmse: 0.5346 - mcrmse: 0.5067
Epoch 2 avg_val_loss: 0.1331  time: 512s
Epoch 2 - ema_content_rmse: 0.4635 - ema_wording_rmse: 0.5779 - ema_mcrmse: 0.5207


Epoch: [2][100/1292] Elapsed 10m 46s (remain 126m 59s) Loss: 0.0385(0.0951) Grad: 3.3556  LR: 0.00000416  


Epoch 2 avg_val_loss: 0.1384  time: 901s
Epoch 2 - content_rmse: 0.4411 - wording_rmse: 0.6116 - mcrmse: 0.5264
Epoch 2 avg_val_loss: 0.1366  time: 1157s
Epoch 2 - ema_content_rmse: 0.4361 - ema_wording_rmse: 0.6086 - ema_mcrmse: 0.5224


Epoch: [2][200/1292] Elapsed 21m 30s (remain 116m 42s) Loss: 0.0760(0.0920) Grad: 3.8933  LR: 0.00000404  


Epoch 2 avg_val_loss: 0.1204  time: 1545s
Epoch 2 - content_rmse: 0.4665 - wording_rmse: 0.5219 - mcrmse: 0.4942
Epoch 2 avg_val_loss: 0.1142  time: 1801s
Epoch 2 - ema_content_rmse: 0.4427 - ema_wording_rmse: 0.5184 - ema_mcrmse: 0.4805


Epoch: [2][300/1292] Elapsed 32m 14s (remain 106m 7s) Loss: 0.0284(0.0923) Grad: 1.8707  LR: 0.00000392  


Epoch 2 avg_val_loss: 0.1149  time: 2189s
Epoch 2 - content_rmse: 0.4307 - wording_rmse: 0.5308 - mcrmse: 0.4807
Epoch 2 avg_val_loss: 0.1160  time: 2445s
Epoch 2 - ema_content_rmse: 0.4371 - ema_wording_rmse: 0.5295 - ema_mcrmse: 0.4833


Epoch: [2][400/1292] Elapsed 42m 58s (remain 95m 29s) Loss: 0.1031(0.0952) Grad: 3.3970  LR: 0.00000379  


Epoch 2 avg_val_loss: 0.1217  time: 2834s
Epoch 2 - content_rmse: 0.4560 - wording_rmse: 0.5384 - mcrmse: 0.4972
Epoch 2 avg_val_loss: 0.1148  time: 3089s
Epoch 2 - ema_content_rmse: 0.4326 - ema_wording_rmse: 0.5294 - ema_mcrmse: 0.4810


Epoch: [2][500/1292] Elapsed 53m 42s (remain 84m 47s) Loss: 0.0490(0.0940) Grad: 3.9690  LR: 0.00000366  


Epoch 2 avg_val_loss: 0.1088  time: 3478s
Epoch 2 - content_rmse: 0.4259 - wording_rmse: 0.5118 - mcrmse: 0.4689
Epoch 2 - Save Best Score: 0.4689 Model
Epoch 2 avg_val_loss: 0.1071  time: 3737s
Epoch 2 - ema_content_rmse: 0.4173 - ema_wording_rmse: 0.5117 - ema_mcrmse: 0.4645
Epoch 2 - ema_Save Best Score: 0.4645 Model


Epoch: [2][600/1292] Elapsed 64m 32s (remain 74m 12s) Loss: 0.0581(0.0956) Grad: 2.9866  LR: 0.00000352  


Epoch 2 avg_val_loss: 0.1063  time: 4128s
Epoch 2 - content_rmse: 0.4143 - wording_rmse: 0.5103 - mcrmse: 0.4623
Epoch 2 - Save Best Score: 0.4623 Model
Epoch 2 avg_val_loss: 0.1163  time: 4386s
Epoch 2 - ema_content_rmse: 0.4207 - ema_wording_rmse: 0.5446 - ema_mcrmse: 0.4827


Epoch: [2][700/1292] Elapsed 75m 19s (remain 63m 30s) Loss: 0.0388(0.0933) Grad: 5.1436  LR: 0.00000338  


Epoch 2 avg_val_loss: 0.1131  time: 4775s
Epoch 2 - content_rmse: 0.4187 - wording_rmse: 0.5331 - mcrmse: 0.4759
Epoch 2 avg_val_loss: 0.1196  time: 5031s
Epoch 2 - ema_content_rmse: 0.4215 - ema_wording_rmse: 0.5557 - ema_mcrmse: 0.4886


Epoch: [2][800/1292] Elapsed 86m 4s (remain 52m 45s) Loss: 0.1588(0.0930) Grad: 5.0957  LR: 0.00000324  


Epoch 2 avg_val_loss: 0.1156  time: 5420s
Epoch 2 - content_rmse: 0.4239 - wording_rmse: 0.5411 - mcrmse: 0.4825
Epoch 2 avg_val_loss: 0.1137  time: 5675s
Epoch 2 - ema_content_rmse: 0.4258 - ema_wording_rmse: 0.5330 - ema_mcrmse: 0.4794


Epoch: [2][900/1292] Elapsed 96m 48s (remain 42m 0s) Loss: 0.1037(0.0934) Grad: 5.7582  LR: 0.00000309  


Epoch 2 avg_val_loss: 0.1121  time: 6064s
Epoch 2 - content_rmse: 0.4293 - wording_rmse: 0.5200 - mcrmse: 0.4746
Epoch 2 avg_val_loss: 0.1077  time: 6319s
Epoch 2 - ema_content_rmse: 0.4212 - ema_wording_rmse: 0.5087 - ema_mcrmse: 0.4650


Epoch: [2][1000/1292] Elapsed 107m 32s (remain 31m 15s) Loss: 0.0342(0.0927) Grad: 3.7011  LR: 0.00000294  


Epoch 2 avg_val_loss: 0.1105  time: 6708s
Epoch 2 - content_rmse: 0.4142 - wording_rmse: 0.5259 - mcrmse: 0.4701
Epoch 2 avg_val_loss: 0.1123  time: 6964s
Epoch 2 - ema_content_rmse: 0.4167 - ema_wording_rmse: 0.5317 - ema_mcrmse: 0.4742


Epoch: [2][1100/1292] Elapsed 118m 17s (remain 20m 31s) Loss: 0.0458(0.0933) Grad: 2.4769  LR: 0.00000279  


Epoch 2 avg_val_loss: 0.1223  time: 7353s
Epoch 2 - content_rmse: 0.4319 - wording_rmse: 0.5570 - mcrmse: 0.4945
Epoch 2 avg_val_loss: 0.1112  time: 7608s
Epoch 2 - ema_content_rmse: 0.4210 - ema_wording_rmse: 0.5235 - ema_mcrmse: 0.4722


Epoch: [2][1200/1292] Elapsed 129m 1s (remain 9m 46s) Loss: 0.0585(0.0926) Grad: 3.8886  LR: 0.00000264  


Epoch 2 avg_val_loss: 0.1188  time: 7997s
Epoch 2 - content_rmse: 0.4232 - wording_rmse: 0.5511 - mcrmse: 0.4872
Epoch 2 avg_val_loss: 0.1152  time: 8252s
Epoch 2 - ema_content_rmse: 0.4254 - ema_wording_rmse: 0.5355 - ema_mcrmse: 0.4804


Epoch: [2][1291/1292] Elapsed 139m 33s (remain 0m 0s) Loss: 0.0317(0.0924) Grad: 2.9553  LR: 0.00000250  


Epoch 2 avg_val_loss: 0.1145  time: 8629s
Epoch 2 - content_rmse: 0.4320 - wording_rmse: 0.5296 - mcrmse: 0.4808
Epoch 2 avg_val_loss: 0.1145  time: 8884s
Epoch 2 - ema_content_rmse: 0.4169 - ema_wording_rmse: 0.5414 - ema_mcrmse: 0.4792


Epoch: [3][0/1292] Elapsed 0m 2s (remain 44m 6s) Loss: 0.0560(0.0560) Grad: 4.1548  LR: 0.00000250  


Epoch 3 avg_val_loss: 0.1148  time: 258s
Epoch 3 - content_rmse: 0.4348 - wording_rmse: 0.5284 - mcrmse: 0.4816
Epoch 3 avg_val_loss: 0.1142  time: 513s
Epoch 3 - ema_content_rmse: 0.4172 - ema_wording_rmse: 0.5401 - ema_mcrmse: 0.4787


Epoch: [3][100/1292] Elapsed 10m 46s (remain 127m 2s) Loss: 0.0921(0.0685) Grad: 5.0340  LR: 0.00000235  


Epoch 3 avg_val_loss: 0.1144  time: 902s
Epoch 3 - content_rmse: 0.4146 - wording_rmse: 0.5408 - mcrmse: 0.4777
Epoch 3 avg_val_loss: 0.1127  time: 1157s
Epoch 3 - ema_content_rmse: 0.4146 - ema_wording_rmse: 0.5343 - ema_mcrmse: 0.4745


Epoch: [3][200/1292] Elapsed 21m 30s (remain 116m 44s) Loss: 0.0582(0.0671) Grad: 1.6158  LR: 0.00000220  


Epoch 3 avg_val_loss: 0.1097  time: 1546s
Epoch 3 - content_rmse: 0.4143 - wording_rmse: 0.5226 - mcrmse: 0.4684
Epoch 3 avg_val_loss: 0.1080  time: 1801s
Epoch 3 - ema_content_rmse: 0.4154 - ema_wording_rmse: 0.5153 - ema_mcrmse: 0.4654


Epoch: [3][300/1292] Elapsed 32m 14s (remain 106m 8s) Loss: 0.0287(0.0660) Grad: 2.5671  LR: 0.00000205  


Epoch 3 avg_val_loss: 0.1048  time: 2190s
Epoch 3 - content_rmse: 0.4255 - wording_rmse: 0.4953 - mcrmse: 0.4604
Epoch 3 - Save Best Score: 0.4604 Model
Epoch 3 avg_val_loss: 0.1061  time: 2449s
Epoch 3 - ema_content_rmse: 0.4184 - ema_wording_rmse: 0.5055 - ema_mcrmse: 0.4619
Epoch 3 - ema_Save Best Score: 0.4619 Model


Epoch: [3][400/1292] Elapsed 43m 4s (remain 95m 43s) Loss: 0.0333(0.0642) Grad: 2.5433  LR: 0.00000190  


Epoch 3 avg_val_loss: 0.1160  time: 2840s
Epoch 3 - content_rmse: 0.4188 - wording_rmse: 0.5446 - mcrmse: 0.4817
Epoch 3 avg_val_loss: 0.1131  time: 3095s
Epoch 3 - ema_content_rmse: 0.4205 - ema_wording_rmse: 0.5320 - ema_mcrmse: 0.4762


Epoch: [3][500/1292] Elapsed 53m 48s (remain 84m 57s) Loss: 0.0805(0.0624) Grad: 6.3382  LR: 0.00000175  


Epoch 3 avg_val_loss: 0.1128  time: 3484s
Epoch 3 - content_rmse: 0.4230 - wording_rmse: 0.5307 - mcrmse: 0.4768
Epoch 3 avg_val_loss: 0.1089  time: 3740s
Epoch 3 - ema_content_rmse: 0.4130 - ema_wording_rmse: 0.5222 - ema_mcrmse: 0.4676


Epoch: [3][600/1292] Elapsed 64m 33s (remain 74m 13s) Loss: 0.0351(0.0613) Grad: 3.3041  LR: 0.00000161  


Epoch 3 avg_val_loss: 0.1081  time: 4129s
Epoch 3 - content_rmse: 0.4169 - wording_rmse: 0.5157 - mcrmse: 0.4663
Epoch 3 avg_val_loss: 0.1065  time: 4384s
Epoch 3 - ema_content_rmse: 0.4161 - ema_wording_rmse: 0.5098 - ema_mcrmse: 0.4630


Epoch: [3][700/1292] Elapsed 75m 17s (remain 63m 28s) Loss: 0.0422(0.0615) Grad: 2.4590  LR: 0.00000147  


Epoch 3 avg_val_loss: 0.1080  time: 4773s
Epoch 3 - content_rmse: 0.4191 - wording_rmse: 0.5128 - mcrmse: 0.4660
Epoch 3 avg_val_loss: 0.1037  time: 5028s
Epoch 3 - ema_content_rmse: 0.4162 - ema_wording_rmse: 0.4982 - ema_mcrmse: 0.4572
Epoch 3 - ema_Save Best Score: 0.4572 Model


Epoch: [3][800/1292] Elapsed 86m 4s (remain 52m 45s) Loss: 0.0255(0.0605) Grad: 2.0015  LR: 0.00000133  


Epoch 3 avg_val_loss: 0.1074  time: 5420s
Epoch 3 - content_rmse: 0.4174 - wording_rmse: 0.5108 - mcrmse: 0.4641
Epoch 3 avg_val_loss: 0.1056  time: 5675s
Epoch 3 - ema_content_rmse: 0.4170 - ema_wording_rmse: 0.5041 - ema_mcrmse: 0.4605


Epoch: [3][900/1292] Elapsed 96m 49s (remain 42m 0s) Loss: 0.0354(0.0601) Grad: 4.1613  LR: 0.00000120  


Epoch 3 avg_val_loss: 0.1047  time: 6064s
Epoch 3 - content_rmse: 0.4113 - wording_rmse: 0.5050 - mcrmse: 0.4581
Epoch 3 - Save Best Score: 0.4581 Model
Epoch 3 avg_val_loss: 0.1047  time: 6324s
Epoch 3 - ema_content_rmse: 0.4128 - ema_wording_rmse: 0.5038 - ema_mcrmse: 0.4583


Epoch: [3][1000/1292] Elapsed 107m 36s (remain 31m 17s) Loss: 0.0739(0.0607) Grad: 5.4903  LR: 0.00000107  


Epoch 3 avg_val_loss: 0.1014  time: 6712s
Epoch 3 - content_rmse: 0.4108 - wording_rmse: 0.4925 - mcrmse: 0.4516
Epoch 3 - Save Best Score: 0.4516 Model
Epoch 3 avg_val_loss: 0.1018  time: 6971s
Epoch 3 - ema_content_rmse: 0.4090 - ema_wording_rmse: 0.4954 - ema_mcrmse: 0.4522
Epoch 3 - ema_Save Best Score: 0.4522 Model


Epoch: [3][1100/1292] Elapsed 118m 27s (remain 20m 33s) Loss: 0.0823(0.0602) Grad: 6.1501  LR: 0.00000095  


Epoch 3 avg_val_loss: 0.1030  time: 7363s
Epoch 3 - content_rmse: 0.4133 - wording_rmse: 0.4972 - mcrmse: 0.4552
Epoch 3 avg_val_loss: 0.1038  time: 7619s
Epoch 3 - ema_content_rmse: 0.4131 - ema_wording_rmse: 0.5001 - ema_mcrmse: 0.4566


Epoch: [3][1200/1292] Elapsed 129m 12s (remain 9m 47s) Loss: 0.0811(0.0596) Grad: 6.4799  LR: 0.00000083  


Epoch 3 avg_val_loss: 0.1038  time: 8008s
Epoch 3 - content_rmse: 0.4154 - wording_rmse: 0.4982 - mcrmse: 0.4568
Epoch 3 avg_val_loss: 0.1039  time: 8263s
Epoch 3 - ema_content_rmse: 0.4115 - ema_wording_rmse: 0.5016 - ema_mcrmse: 0.4566


Epoch: [3][1291/1292] Elapsed 139m 44s (remain 0m 0s) Loss: 0.0920(0.0595) Grad: 3.5704  LR: 0.00000073  


Epoch 3 avg_val_loss: 0.1061  time: 8640s
Epoch 3 - content_rmse: 0.4101 - wording_rmse: 0.5118 - mcrmse: 0.4610
Epoch 3 avg_val_loss: 0.1071  time: 8895s
Epoch 3 - ema_content_rmse: 0.4094 - ema_wording_rmse: 0.5163 - ema_mcrmse: 0.4628


Epoch: [4][0/1292] Elapsed 0m 1s (remain 33m 45s) Loss: 0.0200(0.0200) Grad: 2.1681  LR: 0.00000073  


Epoch 4 avg_val_loss: 0.1062  time: 257s
Epoch 4 - content_rmse: 0.4100 - wording_rmse: 0.5122 - mcrmse: 0.4611
Epoch 4 avg_val_loss: 0.1071  time: 513s
Epoch 4 - ema_content_rmse: 0.4094 - ema_wording_rmse: 0.5161 - ema_mcrmse: 0.4627


Epoch: [4][100/1292] Elapsed 10m 45s (remain 126m 56s) Loss: 0.0245(0.0435) Grad: 1.9923  LR: 0.00000063  


Epoch 4 avg_val_loss: 0.1071  time: 901s
Epoch 4 - content_rmse: 0.4119 - wording_rmse: 0.5145 - mcrmse: 0.4632
Epoch 4 avg_val_loss: 0.1070  time: 1157s
Epoch 4 - ema_content_rmse: 0.4115 - ema_wording_rmse: 0.5142 - ema_mcrmse: 0.4629


Epoch: [4][200/1292] Elapsed 21m 30s (remain 116m 44s) Loss: 0.0347(0.0412) Grad: 2.4873  LR: 0.00000053  


Epoch 4 avg_val_loss: 0.1074  time: 1546s
Epoch 4 - content_rmse: 0.4114 - wording_rmse: 0.5158 - mcrmse: 0.4636
Epoch 4 avg_val_loss: 0.1072  time: 1801s
Epoch 4 - ema_content_rmse: 0.4119 - ema_wording_rmse: 0.5149 - ema_mcrmse: 0.4634


Epoch: [4][300/1292] Elapsed 32m 15s (remain 106m 11s) Loss: 0.0805(0.0403) Grad: 4.0934  LR: 0.00000044  


Epoch 4 avg_val_loss: 0.1075  time: 2191s
Epoch 4 - content_rmse: 0.4118 - wording_rmse: 0.5161 - mcrmse: 0.4639
Epoch 4 avg_val_loss: 0.1062  time: 2446s
Epoch 4 - ema_content_rmse: 0.4119 - ema_wording_rmse: 0.5109 - ema_mcrmse: 0.4614


Epoch: [4][400/1292] Elapsed 42m 59s (remain 95m 30s) Loss: 0.0503(0.0418) Grad: 4.7571  LR: 0.00000036  


Epoch 4 avg_val_loss: 0.1068  time: 2834s
Epoch 4 - content_rmse: 0.4141 - wording_rmse: 0.5114 - mcrmse: 0.4627
Epoch 4 avg_val_loss: 0.1068  time: 3090s
Epoch 4 - ema_content_rmse: 0.4141 - ema_wording_rmse: 0.5115 - ema_mcrmse: 0.4628


Epoch: [4][500/1292] Elapsed 53m 43s (remain 84m 48s) Loss: 0.0232(0.0407) Grad: 1.7933  LR: 0.00000028  


Epoch 4 avg_val_loss: 0.1064  time: 3478s
Epoch 4 - content_rmse: 0.4173 - wording_rmse: 0.5076 - mcrmse: 0.4624
Epoch 4 avg_val_loss: 0.1070  time: 3734s
Epoch 4 - ema_content_rmse: 0.4132 - ema_wording_rmse: 0.5130 - ema_mcrmse: 0.4631


Epoch: [4][600/1292] Elapsed 64m 27s (remain 74m 6s) Loss: 0.0735(0.0404) Grad: 4.4796  LR: 0.00000022  


Epoch 4 avg_val_loss: 0.1061  time: 4123s
Epoch 4 - content_rmse: 0.4127 - wording_rmse: 0.5100 - mcrmse: 0.4614
Epoch 4 avg_val_loss: 0.1063  time: 4378s
Epoch 4 - ema_content_rmse: 0.4136 - ema_wording_rmse: 0.5101 - ema_mcrmse: 0.4619


Epoch: [4][700/1292] Elapsed 75m 11s (remain 63m 23s) Loss: 0.0201(0.0401) Grad: 1.6478  LR: 0.00000016  


Epoch 4 avg_val_loss: 0.1054  time: 4767s
Epoch 4 - content_rmse: 0.4136 - wording_rmse: 0.5066 - mcrmse: 0.4601
Epoch 4 avg_val_loss: 0.1056  time: 5022s
Epoch 4 - ema_content_rmse: 0.4138 - ema_wording_rmse: 0.5070 - ema_mcrmse: 0.4604


Epoch: [4][800/1292] Elapsed 85m 55s (remain 52m 40s) Loss: 0.0201(0.0402) Grad: 2.7246  LR: 0.00000011  


Epoch 4 avg_val_loss: 0.1057  time: 5411s
Epoch 4 - content_rmse: 0.4133 - wording_rmse: 0.5079 - mcrmse: 0.4606
Epoch 4 avg_val_loss: 0.1056  time: 5666s
Epoch 4 - ema_content_rmse: 0.4131 - ema_wording_rmse: 0.5076 - ema_mcrmse: 0.4604


Epoch: [4][900/1292] Elapsed 96m 39s (remain 41m 56s) Loss: 0.0150(0.0399) Grad: 2.5146  LR: 0.00000007  


Epoch 4 avg_val_loss: 0.1055  time: 6055s
Epoch 4 - content_rmse: 0.4145 - wording_rmse: 0.5060 - mcrmse: 0.4603
Epoch 4 avg_val_loss: 0.1055  time: 6310s
Epoch 4 - ema_content_rmse: 0.4135 - ema_wording_rmse: 0.5070 - ema_mcrmse: 0.4603


Epoch: [4][1000/1292] Elapsed 107m 23s (remain 31m 13s) Loss: 0.0287(0.0400) Grad: 1.9929  LR: 0.00000004  


Epoch 4 avg_val_loss: 0.1047  time: 6699s
Epoch 4 - content_rmse: 0.4146 - wording_rmse: 0.5029 - mcrmse: 0.4588
Epoch 4 avg_val_loss: 0.1049  time: 6954s
Epoch 4 - ema_content_rmse: 0.4145 - ema_wording_rmse: 0.5038 - ema_mcrmse: 0.4592


Epoch: [4][1100/1292] Elapsed 118m 7s (remain 20m 29s) Loss: 0.0696(0.0399) Grad: 6.3669  LR: 0.00000002  


Epoch 4 avg_val_loss: 0.1049  time: 7343s
Epoch 4 - content_rmse: 0.4140 - wording_rmse: 0.5041 - mcrmse: 0.4591
Epoch 4 avg_val_loss: 0.1048  time: 7598s
Epoch 4 - ema_content_rmse: 0.4142 - ema_wording_rmse: 0.5035 - ema_mcrmse: 0.4589


Epoch: [4][1200/1292] Elapsed 128m 52s (remain 9m 45s) Loss: 0.0172(0.0396) Grad: 2.0420  LR: 0.00000000  


Epoch 4 avg_val_loss: 0.1049  time: 7987s
Epoch 4 - content_rmse: 0.4139 - wording_rmse: 0.5042 - mcrmse: 0.4591
Epoch 4 avg_val_loss: 0.1049  time: 8243s
Epoch 4 - ema_content_rmse: 0.4140 - ema_wording_rmse: 0.5042 - ema_mcrmse: 0.4591


Epoch: [4][1291/1292] Elapsed 139m 23s (remain 0m 0s) Loss: 0.0135(0.0397) Grad: 2.5051  LR: 0.00000000  


Epoch 4 avg_val_loss: 0.1049  time: 8619s
Epoch 4 - content_rmse: 0.4139 - wording_rmse: 0.5042 - mcrmse: 0.4591
Epoch 4 avg_val_loss: 0.1049  time: 8875s
Epoch 4 - ema_content_rmse: 0.4139 - ema_wording_rmse: 0.5042 - ema_mcrmse: 0.4591


In [20]:
## total_complex = []
# for fold in range(4):
#     va_data = train_df[train_df['fold'] == fold]
#     preds = torch.load('/content/drive/MyDrive/deb_simple/microsoft_deberta-v3-large_best{}.pth'.format(fold))['predictions']
#     va_data['preds'] = preds
#     va_data = va_data[['id', 'preds', 'score']]
#     print(compute_metrics(va_data['preds'].values.reshape(-1,1), va_data['score'].values))
#     total_complex.append(va_data)
# total_complex = pd.concat(total_complex)
# compute_metrics(total_complex['preds'].values.reshape(-1,1), total_complex['score'].values)

In [21]:
# !mkdir -p /root/.kaggle
# !cp /content/drive/MyDrive/kaggle/kaggle.json /root/.kaggle/
# !chmod 600 /root/.kaggle/kaggle.json
# !kaggle datasets init -p /content/drive/MyDrive/elc_mean/

In [22]:
#!kaggle datasets create -p /content/drive/MyDrive/elc_mean/

In [23]:
# deberta v3 large
# 1.5 0.8228
# 2 0.8197

# 1.5  8137
#2 8175
#2.5 8181
#3 8181
#3.5 8175

