## Log
* turn discriminative learning rates on
* use 1536 as max length, no pretraining, use discriminative learning rate

In [1]:
import argparse
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedGroupKFold
import shutil
import time
import gc
import random
import math
import torch
from torch.utils.data import DataLoader, Dataset
import transformers
from transformers import TrainingArguments, Trainer, DataCollatorForWholeWordMask
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, AutoModel
from transformers.modeling_outputs  import BaseModelOutput,SequenceClassifierOutput
from torch import nn
from torch.optim import Adam, SGD, AdamW
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup, DataCollatorWithPadding
# imports the torch_xla package
import wandb
from torch.nn.parameter import Parameter
#os.environ["WANDB_DISABLED"] = "true"

In [2]:
def parse_args():
    parser = argparse.ArgumentParser(description='Model training')
    # params of training
    parser.add_argument(
        "--fold", dest="fold", help="Train fold", default=None, type=int)
    parser.add_argument(
        '--batch_size',
        dest='batch_size',
        help='Mini batch size of one gpu or cpu',
        type=int,
        default=None)
    return parser.parse_args()


# Config

In [3]:
class CFG:
    pretraining = False
    load_pretrained = False
    input_path = './input/'
    input_type = '2'
    model_path = 'microsoft/deberta-v3-large' #  nghuyong/ernie-2.0-large-en studio-ousia/luke-large
    model_type = 'custom'
    scheduler = 'cosine'  # ['linear', 'cosine']
    batch_scheduler = True
    num_cycles = 0.5  # 1.5
    num_warmup_steps = 0
    max_input_length = 1536
    max_position_embeddings = 1536
    folds = [0]
    epochs = 4  # 5
    # layer - wise larning rate 
    discriminative_learning_rate = True
    discriminative_learning_rate_num_groups = 1
    discriminative_learning_rate_decay_rate = 0.99
    # reinint layer
    reinit_layers = 0
    
#     encoder_lr = 5e-6
#     head_lr = 5e-6
    encoder_lr = 4e-6
    head_lr = 1e-5
    
    min_lr = 1e-7
    eps = 1e-7
    betas = (0.9, 0.999)
    weight_decay = 1e-4
    dropout = 0
    num_fold = 5
    batch_size = 2
    seed = 42
    OUTPUT_DIR = './pretrain/'
    num_workers = 2
    device='cuda'
    print_freq = 100
    
    

## logger

In [4]:
def get_logger(filename=CFG.OUTPUT_DIR+ 'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    if not os.path.exists(CFG.OUTPUT_DIR):
        os.makedirs(CFG.OUTPUT_DIR)
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()
LOGGER.info('===============lr_{}==============='.format(CFG.encoder_lr))
LOGGER.info('===============seed_{}==============='.format(CFG.seed))
LOGGER.info('===============total_epochs_{}==============='.format(CFG.epochs))
LOGGER.info('===============num_warmup_steps_{}==============='.format(CFG.num_warmup_steps))



# Preproc

In [5]:
pdf = pd.read_csv(f"{CFG.input_path}/prompts_train.csv")
sdf = pd.read_csv(f"{CFG.input_path}/summaries_train.csv")

df = pdf.merge(sdf, on="prompt_id")

# 4 prompt ids, 4 folds
id2fold = {
    "814d6b": 0,
    "39c16e": 1,
    "3b9047": 2,
    "ebad26": 3,
}

df["fold"] = df["prompt_id"].map(id2fold)

In [6]:
df 

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,student_id,text,content,wording,fold
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00791789cc1f,1 element of an ideal tragedy is that it shoul...,-0.210614,-0.471415,1
1,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0086ef22de8f,The three elements of an ideal tragedy are: H...,-0.970237,-0.417058,1
2,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0094589c7a22,Aristotle states that an ideal tragedy should ...,-0.387791,-0.584181,1
3,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00cd5736026a,One element of an Ideal tragedy is having a co...,0.088882,-0.594710,1
4,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00d98b8ff756,The 3 ideal of tragedy is how complex you need...,-0.687288,-0.460886,1
...,...,...,...,...,...,...,...,...,...
7160,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",ff37545b2805,"In paragraph two, they would use pickle meat a...",1.520355,-0.292990,3
7161,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",ff4ed38ef099,"in the first paragraph it says ""either can it...",-1.204574,-1.169784,3
7162,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",ff53b94f7ce0,They would have piles of filthy meat on the fl...,0.328739,-1.053294,3
7163,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",ff7c7e70df07,They used all sorts of chemical concoctions to...,0.205683,0.380538,3


In [7]:
class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))

# Tokenizer

In [8]:
tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)

'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /microsoft/deberta-v3-large/resolve/main/tokenizer_config.json (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f319c560490>, 'Connection to huggingface.co timed out. (connect timeout=10)'))"), '(Request ID: 5e887517-74dc-4ebb-9cab-9f2e591d5689)')' thrown while requesting HEAD https://huggingface.co/microsoft/deberta-v3-large/resolve/main/tokenizer_config.json
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Dataset

In [9]:
mask_lm_datacollator = DataCollatorForWholeWordMask(tokenizer)
def data_collator(batch):
    input_ids = [{'input_ids':i[0]} for i in batch]
    token_type_ids = [i[1] for i in batch]
    attention_mask = [i[2] for i in batch]
    labels = [i[3] for i in batch]
    masked_input = mask_lm_datacollator(input_ids)['input_ids']
    return masked_input,\
               torch.stack(token_type_ids),\
               torch.stack(attention_mask),\
               torch.stack(labels)

In [10]:
class TrainDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.prompt_title = df['prompt_title'].values.astype(str)
        self.prompt_text = df['prompt_text'].values.astype(str)
        self.prompt_question = df['prompt_question'].values.astype(str)
        self.text = df['text'].values.astype(str)
        self.content = df['content'].values
        self.wording = df['wording'].values
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.prompt_title)
    
    def tokenize(self, example):
        sep = self.tokenizer.sep_token
        if  CFG.input_type == '1':
            prompt = sep.join([example["prompt_title"], example["prompt_text"], example["prompt_question"]])
        else:
            prompt = example["prompt_question"] 
        
        labels = [float(example["content"]), float(example["wording"])]

        tokenized = tokenizer(
            example["text"],
            prompt,
            padding='max_length',
            truncation=True,
            max_length=CFG.max_input_length,
            return_tensors=None,
        )
        
        return {
            **tokenized,
            "labels": labels,
        }
    
    def __getitem__(self, item):
        example = {
                    "prompt_title":self.prompt_title[item],
                    "prompt_text":self.prompt_text[item],
                    "prompt_question":self.prompt_question[item],
                    "text":self.text[item],
                    "content":self.content[item],
                    "wording":self.wording[item],
                  }
        
        out = self.tokenize(example)
       
        return {
                'input_ids': torch.as_tensor(out['input_ids'], dtype=torch.long),
                'token_type_ids': torch.as_tensor(out['token_type_ids'], dtype=torch.long),
                'attention_mask': torch.as_tensor(out['attention_mask'], dtype=torch.long),
                'labels': torch.as_tensor(out['labels'], dtype=torch.float),
        }
        
        
        

## Model

In [11]:
def init_params(module_lst):
    for module in module_lst:
        for param in module.parameters():
            if param.dim() > 1:
                torch.nn.init.xavier_uniform_(param)
    return

class Custom_Bert(nn.Module):
    def __init__(self):
        super().__init__()

        self.config = AutoConfig.from_pretrained('./pretrain/model_base/')
        self.config.update({"output_hidden_states":True})
        print('No pretrained model loaded ...')
        self.base = AutoModel.from_pretrained('./pretrain/model_base/', config=self.config)
        
#         print('load pretrained model ...');
#         self.base = AutoModel.from_pretrained('./input/pretrain/pretrained_model_1009', config = config)
        
        dim = self.config.hidden_size

        self.dropout = nn.Dropout(p=0.2)
        self.high_dropout = nn.Dropout(p=0.5)

        n_weights = 24
        weights_init = torch.zeros(n_weights).float()
        weights_init.data[:-1] = -3
        self.layer_weights = torch.nn.Parameter(weights_init)

        self.attention = nn.Sequential(
            nn.Linear(self.config.hidden_size, self.config.hidden_size),
            nn.Tanh(),
            nn.Linear(self.config.hidden_size, 1),
            nn.Softmax(dim=1)
        )
        self.cls = nn.Sequential(
            nn.Linear(dim,2)
        )
        init_params([self.cls,self.attention])

        
    def forward(self, input_ids, attention_mask, token_type_ids, labels=None):
        base_output = self.base(input_ids=input_ids,
                                attention_mask=attention_mask,
                                token_type_ids=token_type_ids
                                )

        cls_outputs = torch.stack(
            [self.dropout(layer) for layer in base_output['hidden_states'][-24:]], dim=0
        )
        cls_output = (torch.softmax(self.layer_weights, dim=0).unsqueeze(1).unsqueeze(1).unsqueeze(1) * cls_outputs).sum(0)

        logits = torch.mean(
            torch.stack(
                [torch.sum(self.attention(self.high_dropout(cls_output)) * cls_output, dim=1) for _ in range(5)],
                dim=0,
            ),
            dim=0,
        )

        output = self.cls(logits)
        if labels is None:
            return output

        else:
            #return (nn.MSELoss()(torch.squeeze(output,1),labels), output)
            return SequenceClassifierOutput(
                loss=nn.MSELoss()(output,labels),
                logits=output, 
                hidden_states=None,
                attentions=None
            )


class Custom_Bert_Simple(nn.Module):
    def __init__(self):
        super().__init__()

        config = AutoConfig.from_pretrained(CFG.model_path)
        config.update({
            "hidden_dropout_prob": CFG.dropout,
            "attention_probs_dropout_prob": CFG.dropout,
            "num_labels": 2,
            "problem_type": "regression",
            "max_position_embeddings": CFG.max_position_embeddings
        })
        self.base = AutoModel.from_pretrained(CFG.model_path, config=config)
        dim = config.hidden_size
        self.dropout = nn.Dropout(p=0)
        self.cls = nn.Linear(dim,2)

    def forward(self, input_ids, attention_mask, token_type_ids, labels):
        base_output = self.base(input_ids=input_ids,
                                attention_mask=attention_mask,
                                token_type_ids=token_type_ids
                               )
        output = base_output.last_hidden_state
        output = self.cls(torch.mean(output, dim=1))
        return SequenceClassifierOutput(
            loss=nn.MSELoss()(output,labels),
            logits=output, 
            hidden_states=None,
            attentions=None
        )

class GeMText(nn.Module):
    def __init__(self, dim = 1, p=3, eps=1e-6):
        super(GeMText, self).__init__()
        self.dim = dim
        self.p = Parameter(torch.ones(1) * p)
        self.eps = eps
        self.feat_mult = 1

    def forward(self, last_hidden_state, attention_mask):
        attention_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.shape)
        x = (last_hidden_state.clamp(min=self.eps) * attention_mask_expanded).pow(self.p).sum(self.dim)
        ret = x / attention_mask_expanded.sum(self.dim).clip(min=self.eps)
        ret = ret.pow(1 / self.p)
        return ret    

class Custom_Bert_Pool(nn.Module):
    def __init__(self):
        super().__init__()

        self.config = AutoConfig.from_pretrained(CFG.model_path)
        self.config.update({
            "hidden_dropout_prob": CFG.dropout,
            "attention_probs_dropout_prob": CFG.dropout,
            "num_labels": 2,
            "problem_type": "regression",
            "max_position_embeddings": CFG.max_position_embeddings
        })
        #self.base = AutoModel.from_pretrained(CFG.model_path, config=self.config)
        print('load pretrained model ...');
        self.base = AutoModel.from_pretrained('./input/pretrain/pretrained_model_1009', config = self.config)
        
        self.pool = GeMText()
        self.cls = nn.Linear(self.config.hidden_size,2)

    def forward(self, input_ids, attention_mask, token_type_ids, labels):
        base_output = self.base(input_ids=input_ids,
                                attention_mask=attention_mask,
                                token_type_ids=token_type_ids
                               )
        output = base_output.last_hidden_state
        output = self.cls(self.pool(output, attention_mask))
        return SequenceClassifierOutput(
            loss=nn.SmoothL1Loss()(output,labels),
            logits=output, 
            hidden_states=None,
            attentions=None
        )
    
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            print(f'Re-initialize {module}')
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            print(f'Re-initialize {module}')
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            print(f'Re-initialize {module}')
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

class Custom_Bert_Mean(nn.Module):
    def __init__(self):
        super().__init__()

        config = AutoConfig.from_pretrained(CFG.model_path)
        config.output_hidden_states=True
        self.base = AutoModel.from_pretrained(CFG.model_path, config=config)
        dim = config.hidden_size
        self.dropout = nn.Dropout(p=0)
        self.cls = nn.Linear(dim,1)

    def forward(self, input_ids, attention_mask,labels=None):
        base_output = self.base(input_ids=input_ids,
                                attention_mask=attention_mask,
                            )


        output = base_output.hidden_states[-1]
        output = self.cls(self.dropout(torch.mean(output, dim=1)))
        if labels is None:
            return output

        else:
            return (nn.MSELoss()(torch.squeeze(output,1),labels), output)

class Custom_Bert_M(nn.Module):
    def __init__(self):
        super().__init__()

        config = AutoConfig.from_pretrained(CFG.model_path)
        config.update({"output_hidden_states":True})

        self.base = AutoModel.from_pretrained(CFG.model_path, config=config)

        dim = config.hidden_size

        self.dropout = nn.Dropout(p=0.2)
        self.high_dropout = nn.Dropout(p=0.5)

        n_weights = 24
        weights_init = torch.zeros(n_weights).float()
        weights_init.data[:-1] = -3
        self.layer_weights = torch.nn.Parameter(weights_init)

        self.attention = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size),
            nn.Tanh(),
            nn.Linear(config.hidden_size, 1),
            nn.Softmax(dim=1)
        )
        self.cls_0 = nn.Sequential(
            nn.Linear(dim,1)
        )

        self.cls_1 = nn.Linear(dim,5)
        init_params([self.cls_0,self.cls_1,self.attention])

    def forward(self, input_ids, attention_mask, labels):
        base_output = self.base(input_ids=input_ids,
                    attention_mask=attention_mask,
                             )

        cls_outputs = torch.stack(
            [self.dropout(layer) for layer in base_output['hidden_states'][-24:]], dim=0
        )
        cls_output = (torch.softmax(self.layer_weights, dim=0).unsqueeze(1).unsqueeze(1).unsqueeze(1) * cls_outputs).sum(0)

        logits = torch.mean(
            torch.stack(
                [torch.sum(self.attention(self.high_dropout(cls_output)) * cls_output, dim=1) for _ in range(5)],
                dim=0,
            ),
            dim=0,
        )

        output_0 = self.cls_0(logits)
        output_1 = self.cls_1(logits)
        if labels is None:
            return output_0

        else:
            regression_loss = nn.MSELoss()(torch.squeeze(output_0,1),labels)
            labels = labels.double()
            cls_labels = torch.where(labels==1.,4.0,labels)
            cls_labels = torch.where(cls_labels==0.25,1.0,cls_labels)
            cls_labels = torch.where(cls_labels==0.5,2.0,cls_labels)
            cls_labels = torch.where(cls_labels==0.75,3.0,cls_labels)
            cls_labels = cls_labels.long()
            cls_loss = nn.CrossEntropyLoss()(output_1, cls_labels)
            return ( 0.8 * regression_loss + 0.2 * cls_loss, output_0)

In [12]:
def build_model():
    if CFG.model_type == 'base':
        model_config = AutoConfig.from_pretrained(CFG.model_path)
        model_config.update({
            "hidden_dropout_prob": CFG.dropout,
            "attention_probs_dropout_prob": CFG.dropout,
            "num_labels": 2,
            "problem_type": "regression",
            "max_position_embeddings": CFG.max_position_embeddings
        })

        #print(model_config)
        model = AutoModelForSequenceClassification.from_pretrained(
            CFG.model_path, config=model_config
        )
    if CFG.model_type == 'simple':
        model = Custom_Bert_Simple()
    if CFG.model_type == 'pool':
        model = Custom_Bert_Pool()
        if CFG.reinit_layers > 0:
            print("=="*40)
            print(f"Reinitialize the last {CFG.reinit_layers} layer(s).")
            for layer in model.base.encoder.layer[-CFG.reinit_layers:]:
                print("===")
                layer.apply(model._init_weights)
            print("=="*40)
        if CFG.load_pretrained:
            model.load_state_dict(torch.load('./pretrained/microsoft_deberta-v3-base_best_ema.pth')['model'])
    
    if CFG.model_type == 'custom':
        model = Custom_Bert();
    
    return model

    
        

# Train

In [13]:
from copy import deepcopy
class ModelEMA:
    """Model Exponential Moving Average from https://github.com/rwightman/
    pytorch-image-models Keep a moving average of everything in the model
    state_dict (parameters and buffers).

    This is intended to allow functionality like
    https://www.tensorflow.org/api_docs/python/tf/train/
    ExponentialMovingAverage
    A smoothed version of the weights is necessary for some training
    schemes to perform well.
    This class is sensitive where it is initialized in the sequence
    of model init, GPU assignment and distributed training wrappers.
    """

    def __init__(self, model, decay=0.9999, updates=0):
        """
        Args:
            model (nn.Module): model to apply EMA.
            decay (float): ema decay reate.
            updates (int): counter of EMA updates.
        """
        # Create EMA(FP32)
        self.ema_model = deepcopy(model).eval()
        self.ema = self.ema_model
        self.updates = updates
        # decay exponential ramp (to help early epochs)
        self.decay = lambda x: decay * (1 - math.exp(-x / 2000))
        for p in self.ema.parameters():
            p.requires_grad_(False)

    def update(self, model):
        # Update EMA parameters
        with torch.no_grad():
            self.updates += 1
            d = self.decay(self.updates)
            msd =  model.state_dict()# model state_dict
            for k, v in self.ema.state_dict().items():
                if v.dtype.is_floating_point:
                    v *= d
                    v += (1.0 - d) * msd[k].detach()

class EMAHook:
    """EMAHook used in BEVDepth.

    Modified from https://github.com/Megvii-Base
    Detection/BEVDepth/blob/main/callbacks/ema.py.
    """

    def __init__(self, model, init_updates=0, decay=0.9990, resume=None, logger=None):
        super().__init__()
        self.init_updates = init_updates
        self.resume = resume
        self.decay = decay
        self.ema_model = self.before_run(model)
        self.logger = logger

    def before_run(self, model):
        from torch.nn.modules.batchnorm import SyncBatchNorm

        bn_model_list = list()
        bn_model_dist_group_list = list()
        for model_ref in model.modules():
            if isinstance(model_ref, SyncBatchNorm):
                bn_model_list.append(model_ref)
                bn_model_dist_group_list.append(model_ref.process_group)
                model_ref.process_group = None
        ema_model = ModelEMA(model, self.decay)

        for bn_model, dist_group in zip(bn_model_list,
                                        bn_model_dist_group_list):
            bn_model.process_group = dist_group
        ema_model.updates = self.init_updates

        if self.resume is not None:
            self.logger.info(f'resume ema checkpoint from {self.resume}')
            cpt = torch.load(self.resume, map_location='cpu')
            load_state_dict(ema_model.ema, cpt['state_dict'])
            ema_model.updates = cpt['updates']

        return ema_model

    def after_train_iter(self, model):
        self.ema_model.update(model)

In [14]:
def compute_mcrmse(eval_pred):
    """
    Calculates mean columnwise root mean squared error
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation
    """
    preds, labels = eval_pred

    col_rmse = np.sqrt(np.mean((preds - labels) ** 2, axis=0))
    mcrmse = np.mean(col_rmse)

    return {
        "content_rmse": col_rmse[0],
        "wording_rmse": col_rmse[1],
        "mcrmse": mcrmse,
    }

In [15]:
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything(CFG.seed)

In [16]:
CFG.discriminative_learning_rate_num_groups

1

In [17]:
def get_optimizer_llr_params(model, type='s'):
    """
    Setup the optimizer.
    We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
    Trainer's init through :obj:`optimizers`, or subclass and override this method in a subclass.

    MODIFIED VERSION:
    * added support for differential learning rates per layer

    reference: https://github.com/huggingface/transformers/blob/05fa1a7ac17bb7aa07b9e0c1e138ecb31a28bbfe/src/transformers/trainer.py#L804
    """

    no_decay = ["bias", "LayerNorm.weight", "LayerNorm.bias"]
    ### ADDED
    if CFG.discriminative_learning_rate:

        num_layers = model.config.num_hidden_layers

        learning_rate_powers = range(0, num_layers, num_layers//CFG.discriminative_learning_rate_num_groups)
        layer_wise_learning_rates = [
            pow(CFG.discriminative_learning_rate_decay_rate, power) * CFG.encoder_lr 
            for power in learning_rate_powers 
            for _ in range(num_layers//CFG.discriminative_learning_rate_num_groups)
          ]
        layer_wise_learning_rates = layer_wise_learning_rates[::-1]
        print('Layer-wise learning rates:', layer_wise_learning_rates)

        # group embedding paramters from the transformer encoder
        embedding_layer = model.base.embeddings
        optimizer_grouped_parameters = [
          {
              "params": [p for n, p in embedding_layer.named_parameters() if not any(nd in n for nd in no_decay)],
              "lr": pow(CFG.discriminative_learning_rate_decay_rate, num_layers) * CFG.encoder_lr ,
              "weight_decay": CFG.weight_decay,
          },
          {
              "params": [p for n, p in embedding_layer.named_parameters() if any(nd in n for nd in no_decay)],
              "lr": pow(CFG.discriminative_learning_rate_decay_rate, num_layers) * CFG.encoder_lr ,
              "weight_decay": 0.0,
          },
        ]

        # group encoding paramters from the transformer encoder
        encoding_layers = [layer for layer in model.base.encoder.layer]
        for i, layer in enumerate(encoding_layers):
            optimizer_grouped_parameters += [
                {
                    "params": [p for n, p in layer.named_parameters() if not any(nd in n for nd in no_decay)],
                    "lr": layer_wise_learning_rates[i],
                    "weight_decay": CFG.weight_decay,
                },
                {
                    "params": [p for n, p in layer.named_parameters() if any(nd in n for nd in no_decay)],
                    "lr": layer_wise_learning_rates[i],
                    "weight_decay": 0.0,
                },
            ]    
        print(f"Detected unattached modules in model.encoder: {[n for n, p in model.base.encoder.named_parameters() if not n.startswith('layer')]}")
        optimizer_grouped_parameters += [
            {
                "params": [p for n, p in model.base.encoder.named_parameters() if not n.startswith('layer') and not any(nd in n for nd in no_decay)],
                "lr": layer_wise_learning_rates[-1],
                "weight_decay": CFG.weight_decay,
            },
            {
                "params": [p for n, p in model.base.encoder.named_parameters() if not n.startswith('layer') and any(nd in n for nd in no_decay)],
                "lr": layer_wise_learning_rates[-1],
                "weight_decay": 0.0,
            },
        ]

        # group paramters from the task specific head
        optimizer_grouped_parameters += [
            {
                "params": [p for n, p in model.named_parameters() if 'base' not in n and not any(nd in n for nd in no_decay)],
                "lr": CFG.head_lr,
                "weight_decay": CFG.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if 'base' not in n and any(nd in n for nd in no_decay)],
                "lr": CFG.head_lr,
                "weight_decay": 0.0,
            },
        ]
    ### END ADDED
    else:
        # group paramters for the entire network
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "lr": CFG.encoder_lr,
                "weight_decay": CFG.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "lr": CFG.encoder_lr,
                "weight_decay": 0.0,
            },
        ]
    return optimizer_grouped_parameters

In [18]:
def valid_fn(valid_loader, model, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    labels = []
    start = end = time.time()
    for step, batch in enumerate(valid_loader):
        for key, value in batch.items():
            batch[key] = value.to(device)
        batch_size = batch['labels'].size(0)
        with torch.no_grad():
            model_output = model(**batch)
        label = batch['labels']
        loss, logits = model_output.loss, model_output.logits
        losses.update(loss.item(), batch_size)
        preds.append(logits.to('cpu').numpy())
        labels.append(label.to('cpu').numpy())
        end = time.time()
        del model_output, loss, logits
    predictions = np.concatenate(preds)
    labels = np.concatenate(labels)
    torch.cuda.empty_cache()
    return losses.avg, predictions, labels

def train_fn(train_loader, model, optimizer, epoch, scheduler, device, valid_loader, start_time, best_score, best_score_ema,ema_hook,wandb, fold):
    model.train()
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, batch in enumerate(train_loader):
        for key, value in batch.items():
            batch[key] = value.to(device)
        batch_size = batch['labels'].size(0)
        loss = model(**batch).loss
        losses.update(loss.item(), batch_size)
        
        optimizer.zero_grad()
        loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 10)
        optimizer.step()
        ema_hook.after_train_iter(model)
        global_step += 1
        scheduler.step()
        end = time.time()
        
        wandb.log({
                'train loss': loss.item(),
                'step': global_step,
                'epoch': epoch,
                'fold': fold,
                'batch_size':CFG.batch_size
            })
        
        if step % CFG.print_freq == 0 or step == (len(train_loader) - 1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch + 1, step, len(train_loader),
                          remain=timeSince(start, float(step + 1) / len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
            
            # eval
            avg_val_loss, predictions, valid_labels = valid_fn(valid_loader, model, CFG.device)

            # scoring
            score = compute_mcrmse((predictions, valid_labels))


            content_rmse, wording_rmse, mcrmse = list(score.values())
            elapsed = time.time() - start_time

            LOGGER.info(
                f'Epoch {epoch + 1} avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
            LOGGER.info(f'Epoch {epoch + 1} - content_rmse: {content_rmse:.4f} - wording_rmse: {wording_rmse:.4f} - mcrmse: {mcrmse:.4f}')
            
            
            if best_score > score['mcrmse']:
                if best_score != float('inf'):
                    os.remove(CFG.OUTPUT_DIR + "{}_best{}_{}.pth".format(CFG.model_path.replace('/', '_'),fold, best_score))
                best_score = score['mcrmse']
                best_predictions = predictions
                LOGGER.info(f'Epoch {epoch + 1} - Save Best Score: {best_score:.4f} Model')
                torch.save({'model': model.state_dict(),
                            'predictions': predictions},
                           CFG.OUTPUT_DIR + "{}_best{}_{}.pth".format(CFG.model_path.replace('/', '_'),fold, best_score))
            
            
            avg_val_loss, predictions, valid_labels = valid_fn(valid_loader, ema_hook.ema_model.ema, CFG.device)
            # ema scoring
            ema_score = compute_mcrmse((predictions, valid_labels))


            content_rmse, wording_rmse, mcrmse = list(ema_score.values())
            elapsed = time.time() - start_time

            LOGGER.info(
                f'Epoch {epoch + 1} avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
            LOGGER.info(f'Epoch {epoch + 1} - ema_content_rmse: {content_rmse:.4f} - ema_wording_rmse: {wording_rmse:.4f} - ema_mcrmse: {mcrmse:.4f}')
            
            
            if best_score_ema > ema_score['mcrmse']:
                if best_score_ema != float('inf'):
                    os.remove(CFG.OUTPUT_DIR + "{}_best_ema{}_{}.pth".format(CFG.model_path.replace('/', '_'),fold, best_score_ema))
                best_score_ema = ema_score['mcrmse']
                best_predictions = predictions
                LOGGER.info(f'Epoch {epoch + 1} - ema_Save Best Score: {best_score_ema:.4f} Model')
                torch.save({'model': ema_hook.ema_model.ema.state_dict(),
                            'predictions': predictions},
                           CFG.OUTPUT_DIR + "{}_best_ema{}_{}.pth".format(CFG.model_path.replace('/', '_'),fold,best_score_ema))
            
            wandb.log({
                'learning rate': optimizer.param_groups[0]['lr'],
                'validation mcrmse': score['mcrmse'],
                'validation ema mcrmse': ema_score['mcrmse'],
                'step': global_step,
                'epoch': epoch,
            })
            
        ## release memory
        del batch, loss
        torch.cuda.empty_cache()
        gc.collect()
    return losses.avg, best_score, best_score_ema



def train_loop():
    LOGGER.info(f"========== training ==========")

    # ====================================================
    # loader
    # ====================================================
    wandb.init(project='kaggle-commonlit-eval-student-summaries-2909')
    wandb.config = dict(epochs=CFG.epochs, 
                            batch_size=CFG.batch_size, 
                            learning_rate=CFG.encoder_lr,
                            save_checkpoint=True,
                            )
    for fold in CFG.folds:
        
        if CFG.pretraining:
            tr_data = pd.read_csv('tmp_pessudo.csv')
            tr_data['prompt_title'] = ''
            tr_data = tr_data[-(tr_data['prompt_question'].isin(pdf['prompt_question'].tolist()))]
            va_data = df #df[df['fold']==fold].reset_index(drop=True)
        else:
            tr_data = df[df['fold']!=fold].reset_index(drop=True)
            va_data = df[df['fold']==fold].reset_index(drop=True)
        train_dataset = TrainDataset(tr_data, tokenizer)
        valid_dataset = TrainDataset(va_data, tokenizer)
        train_loader = DataLoader(train_dataset,
                                  batch_size=CFG.batch_size,
                                  shuffle=True,
                                  num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
        valid_loader = DataLoader(valid_dataset,
                                  batch_size=CFG.batch_size,
                                  shuffle=False,
                                  num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

        # ====================================================
        # model & optimizer
        # ====================================================
        model = build_model()
        #model = AutoModelForSequenceClassification.from_pretrained(CFG.model_path, num_labels=1)
        model.to(CFG.device)
        # for param in model.base.parameters():
        #         param.requires_grad = False
        ema_hook = EMAHook(model, init_updates=3000, logger=LOGGER)
        def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
            param_optimizer = list(model.named_parameters())
            no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
            optimizer_parameters = [
                {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                 'lr': encoder_lr, 'weight_decay': weight_decay},
                {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                 'lr': encoder_lr, 'weight_decay': 0.0},
            ]
            return optimizer_parameters

        optimizer_parameters = get_optimizer_llr_params(model)
        optimizer = AdamW(optimizer_parameters, eps=CFG.eps, betas=CFG.betas)


        
        # ====================================================
        # scheduler
        # ====================================================
        def get_scheduler(cfg, optimizer, num_train_steps):
            cfg.num_warmup_steps = cfg.num_warmup_steps * num_train_steps
            if cfg.scheduler == 'linear':
                scheduler = get_linear_schedule_with_warmup(
                    optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
                )
            elif cfg.scheduler == 'cosine':
                scheduler = get_cosine_schedule_with_warmup(
                    optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps,
                    num_cycles=cfg.num_cycles
                )
            return scheduler

        num_train_steps = int(len(train_dataset) / CFG.batch_size * CFG.epochs)
        scheduler = get_scheduler(CFG, optimizer, num_train_steps)

        # ====================================================
        # loop
        # ====================================================
        # criterion = torch.nn.CrossEntropyLoss(ignore_index=- 1)

        # criterion = LabelSmoothingLoss()
        best_score = float('inf')
        best_score_ema = float('inf')
        for epoch in range(CFG.epochs):

            start_time = time.time()

            # train
            avg_loss, best_score, best_score_ema = train_fn(train_loader, model, optimizer, epoch, scheduler, CFG.device, valid_loader, start_time, best_score, best_score_ema ,ema_hook, wandb,fold)

        
        torch.cuda.empty_cache()
        gc.collect()
        del scheduler, optimizer, model
    return 


In [19]:
train_loop()

[34m[1mwandb[0m: Currently logged in as: [33mpeng_sun[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016670159778247276, max=1.0…

No pretrained model loaded ...
Layer-wise learning rates: [4e-06, 4e-06, 4e-06, 4e-06, 4e-06, 4e-06, 4e-06, 4e-06, 4e-06, 4e-06, 4e-06, 4e-06, 4e-06, 4e-06, 4e-06, 4e-06, 4e-06, 4e-06, 4e-06, 4e-06, 4e-06, 4e-06, 4e-06, 4e-06]
Detected unattached modules in model.encoder: ['rel_embeddings.weight', 'LayerNorm.weight', 'LayerNorm.bias']
Epoch: [1][0/3031] Elapsed 0m 2s (remain 122m 45s) Loss: 0.5405(0.5405) Grad: 12.6490  LR: 0.00000314  


Epoch 1 avg_val_loss: 1.7253  time: 297s
Epoch 1 - content_rmse: 1.2889 - wording_rmse: 1.3377 - mcrmse: 1.3133
Epoch 1 - Save Best Score: 1.3133 Model
Epoch 1 avg_val_loss: 1.7518  time: 593s
Epoch 1 - ema_content_rmse: 1.3040 - ema_wording_rmse: 1.3428 - ema_mcrmse: 1.3234
Epoch 1 - ema_Save Best Score: 1.3234 Model


Epoch: [1][100/3031] Elapsed 12m 29s (remain 362m 14s) Loss: 0.1334(0.8029) Grad: 18.2456  LR: 0.00000314  


Epoch 1 avg_val_loss: 0.4876  time: 1044s
Epoch 1 - content_rmse: 0.5970 - wording_rmse: 0.7867 - mcrmse: 0.6918
Epoch 1 - Save Best Score: 0.6918 Model
Epoch 1 avg_val_loss: 0.4984  time: 1340s
Epoch 1 - ema_content_rmse: 0.6235 - ema_wording_rmse: 0.7798 - ema_mcrmse: 0.7017
Epoch 1 - ema_Save Best Score: 0.7017 Model


Epoch: [1][200/3031] Elapsed 24m 56s (remain 351m 9s) Loss: 0.1985(0.5949) Grad: 10.8193  LR: 0.00000314  


Epoch 1 avg_val_loss: 0.5708  time: 1790s
Epoch 1 - content_rmse: 0.6233 - wording_rmse: 0.8678 - mcrmse: 0.7455
Epoch 1 avg_val_loss: 0.5423  time: 2084s
Epoch 1 - ema_content_rmse: 0.6267 - ema_wording_rmse: 0.8318 - ema_mcrmse: 0.7293


Epoch: [1][300/3031] Elapsed 37m 17s (remain 338m 14s) Loss: 0.0374(0.5172) Grad: 8.2033  LR: 0.00000314  


Epoch 1 avg_val_loss: 0.4547  time: 2531s
Epoch 1 - content_rmse: 0.5754 - wording_rmse: 0.7604 - mcrmse: 0.6679
Epoch 1 - Save Best Score: 0.6679 Model
Epoch 1 avg_val_loss: 0.4610  time: 2829s
Epoch 1 - ema_content_rmse: 0.5948 - ema_wording_rmse: 0.7539 - ema_mcrmse: 0.6743
Epoch 1 - ema_Save Best Score: 0.6743 Model


Epoch: [1][400/3031] Elapsed 49m 45s (remain 326m 20s) Loss: 0.0692(0.4676) Grad: 5.0569  LR: 0.00000313  


Epoch 1 avg_val_loss: 0.4695  time: 3280s
Epoch 1 - content_rmse: 0.6027 - wording_rmse: 0.7587 - mcrmse: 0.6807
Epoch 1 avg_val_loss: 0.4760  time: 3575s
Epoch 1 - ema_content_rmse: 0.5975 - ema_wording_rmse: 0.7713 - ema_mcrmse: 0.6844


Epoch: [1][500/3031] Elapsed 62m 8s (remain 313m 46s) Loss: 0.1191(0.4287) Grad: 12.4826  LR: 0.00000313  


Epoch 1 avg_val_loss: 0.3826  time: 4023s
Epoch 1 - content_rmse: 0.5234 - wording_rmse: 0.7009 - mcrmse: 0.6122
Epoch 1 - Save Best Score: 0.6122 Model
Epoch 1 avg_val_loss: 0.4083  time: 4319s
Epoch 1 - ema_content_rmse: 0.5423 - ema_wording_rmse: 0.7228 - ema_mcrmse: 0.6326
Epoch 1 - ema_Save Best Score: 0.6326 Model


Epoch: [1][600/3031] Elapsed 74m 35s (remain 301m 36s) Loss: 0.4707(0.4000) Grad: 22.9751  LR: 0.00000312  


Epoch 1 avg_val_loss: 0.5205  time: 4769s
Epoch 1 - content_rmse: 0.5451 - wording_rmse: 0.8626 - mcrmse: 0.7038
Epoch 1 avg_val_loss: 0.4416  time: 5064s
Epoch 1 - ema_content_rmse: 0.5433 - ema_wording_rmse: 0.7669 - ema_mcrmse: 0.6551


Epoch: [1][700/3031] Elapsed 86m 56s (remain 289m 0s) Loss: 0.1843(0.3874) Grad: 13.8699  LR: 0.00000312  


Epoch 1 avg_val_loss: 0.4765  time: 5510s
Epoch 1 - content_rmse: 0.6333 - wording_rmse: 0.7429 - mcrmse: 0.6881
Epoch 1 avg_val_loss: 0.4414  time: 5805s
Epoch 1 - ema_content_rmse: 0.5829 - ema_wording_rmse: 0.7369 - ema_mcrmse: 0.6599


Epoch: [1][800/3031] Elapsed 99m 18s (remain 276m 28s) Loss: 0.2572(0.3702) Grad: 16.2669  LR: 0.00000311  


Epoch 1 avg_val_loss: 0.4688  time: 6252s
Epoch 1 - content_rmse: 0.6197 - wording_rmse: 0.7441 - mcrmse: 0.6819
Epoch 1 avg_val_loss: 0.4360  time: 6547s
Epoch 1 - ema_content_rmse: 0.5922 - ema_wording_rmse: 0.7220 - ema_mcrmse: 0.6571


Epoch: [1][900/3031] Elapsed 111m 41s (remain 264m 2s) Loss: 0.5676(0.3545) Grad: 27.8181  LR: 0.00000310  


Epoch 1 avg_val_loss: 0.4900  time: 6995s
Epoch 1 - content_rmse: 0.6029 - wording_rmse: 0.7852 - mcrmse: 0.6940
Epoch 1 avg_val_loss: 0.4706  time: 7289s
Epoch 1 - ema_content_rmse: 0.5895 - ema_wording_rmse: 0.7705 - ema_mcrmse: 0.6800


Epoch: [1][1000/3031] Elapsed 124m 2s (remain 251m 33s) Loss: 1.1352(0.3455) Grad: 29.2772  LR: 0.00000309  


Epoch 1 avg_val_loss: 0.4299  time: 7737s
Epoch 1 - content_rmse: 0.6216 - wording_rmse: 0.6880 - mcrmse: 0.6548
Epoch 1 avg_val_loss: 0.3970  time: 8031s
Epoch 1 - ema_content_rmse: 0.5648 - ema_wording_rmse: 0.6892 - ema_mcrmse: 0.6270
Epoch 1 - ema_Save Best Score: 0.6270 Model


Epoch: [1][1100/3031] Elapsed 136m 27s (remain 239m 12s) Loss: 0.0620(0.3404) Grad: 7.9122  LR: 0.00000308  


Epoch 1 avg_val_loss: 0.4195  time: 8482s
Epoch 1 - content_rmse: 0.5576 - wording_rmse: 0.7267 - mcrmse: 0.6421
Epoch 1 avg_val_loss: 0.4210  time: 8776s
Epoch 1 - ema_content_rmse: 0.5666 - ema_wording_rmse: 0.7218 - ema_mcrmse: 0.6442


Epoch: [1][1200/3031] Elapsed 148m 49s (remain 226m 46s) Loss: 0.1755(0.3370) Grad: 11.8291  LR: 0.00000307  


Epoch 1 avg_val_loss: 0.3917  time: 9223s
Epoch 1 - content_rmse: 0.5596 - wording_rmse: 0.6858 - mcrmse: 0.6227
Epoch 1 avg_val_loss: 0.3940  time: 9517s
Epoch 1 - ema_content_rmse: 0.5831 - ema_wording_rmse: 0.6694 - ema_mcrmse: 0.6262
Epoch 1 - ema_Save Best Score: 0.6262 Model


Epoch: [1][1300/3031] Elapsed 161m 13s (remain 214m 23s) Loss: 0.1118(0.3292) Grad: 9.7548  LR: 0.00000305  


Epoch 1 avg_val_loss: 0.3888  time: 9967s
Epoch 1 - content_rmse: 0.5066 - wording_rmse: 0.7218 - mcrmse: 0.6142
Epoch 1 avg_val_loss: 0.3756  time: 10261s
Epoch 1 - ema_content_rmse: 0.5042 - ema_wording_rmse: 0.7050 - ema_mcrmse: 0.6046
Epoch 1 - ema_Save Best Score: 0.6046 Model


Epoch: [1][1400/3031] Elapsed 173m 38s (remain 202m 1s) Loss: 0.5047(0.3227) Grad: 35.6672  LR: 0.00000304  


Epoch 1 avg_val_loss: 0.3935  time: 10712s
Epoch 1 - content_rmse: 0.5319 - wording_rmse: 0.7101 - mcrmse: 0.6210
Epoch 1 avg_val_loss: 0.3760  time: 11007s
Epoch 1 - ema_content_rmse: 0.5429 - ema_wording_rmse: 0.6762 - ema_mcrmse: 0.6095


Epoch: [1][1500/3031] Elapsed 185m 59s (remain 189m 35s) Loss: 0.0535(0.3172) Grad: 6.5140  LR: 0.00000303  


Epoch 1 avg_val_loss: 0.3755  time: 11455s
Epoch 1 - content_rmse: 0.5487 - wording_rmse: 0.6707 - mcrmse: 0.6097
Epoch 1 - Save Best Score: 0.6097 Model
Epoch 1 avg_val_loss: 0.3924  time: 11752s
Epoch 1 - ema_content_rmse: 0.5843 - ema_wording_rmse: 0.6658 - ema_mcrmse: 0.6251


Epoch: [1][1600/3031] Elapsed 198m 25s (remain 177m 13s) Loss: 1.7258(0.3139) Grad: 52.2774  LR: 0.00000301  


Epoch 1 avg_val_loss: 0.4777  time: 12199s
Epoch 1 - content_rmse: 0.6037 - wording_rmse: 0.7688 - mcrmse: 0.6862
Epoch 1 avg_val_loss: 0.4910  time: 12493s
Epoch 1 - ema_content_rmse: 0.6369 - ema_wording_rmse: 0.7592 - ema_mcrmse: 0.6980


Epoch: [1][1700/3031] Elapsed 210m 46s (remain 164m 47s) Loss: 0.1382(0.3118) Grad: 9.2869  LR: 0.00000299  


Epoch 1 avg_val_loss: 0.3681  time: 12940s
Epoch 1 - content_rmse: 0.5371 - wording_rmse: 0.6690 - mcrmse: 0.6031
Epoch 1 - Save Best Score: 0.6031 Model
Epoch 1 avg_val_loss: 0.3900  time: 13237s
Epoch 1 - ema_content_rmse: 0.5645 - ema_wording_rmse: 0.6792 - ema_mcrmse: 0.6218


Epoch: [1][1800/3031] Elapsed 223m 10s (remain 152m 25s) Loss: 0.3397(0.3101) Grad: 9.6568  LR: 0.00000297  


Epoch 1 avg_val_loss: 0.4597  time: 13685s
Epoch 1 - content_rmse: 0.5989 - wording_rmse: 0.7488 - mcrmse: 0.6739
Epoch 1 avg_val_loss: 0.4382  time: 13979s
Epoch 1 - ema_content_rmse: 0.5962 - ema_wording_rmse: 0.7217 - ema_mcrmse: 0.6590


Epoch: [1][1900/3031] Elapsed 235m 32s (remain 140m 0s) Loss: 0.5102(0.3046) Grad: 27.0045  LR: 0.00000296  


Epoch 1 avg_val_loss: 0.4194  time: 14427s
Epoch 1 - content_rmse: 0.6116 - wording_rmse: 0.6817 - mcrmse: 0.6466
Epoch 1 avg_val_loss: 0.4450  time: 14722s
Epoch 1 - ema_content_rmse: 0.6336 - ema_wording_rmse: 0.6990 - ema_mcrmse: 0.6663


Epoch: [1][2000/3031] Elapsed 247m 55s (remain 127m 37s) Loss: 0.1405(0.3026) Grad: 11.8688  LR: 0.00000294  


Epoch 1 avg_val_loss: 0.4592  time: 15171s
Epoch 1 - content_rmse: 0.6717 - wording_rmse: 0.6836 - mcrmse: 0.6777
Epoch 1 avg_val_loss: 0.5043  time: 15466s
Epoch 1 - ema_content_rmse: 0.7204 - ema_wording_rmse: 0.6997 - ema_mcrmse: 0.7101


Epoch: [1][2100/3031] Elapsed 260m 19s (remain 115m 13s) Loss: 0.1819(0.2979) Grad: 15.8006  LR: 0.00000292  


Epoch 1 avg_val_loss: 0.4285  time: 15914s
Epoch 1 - content_rmse: 0.5889 - wording_rmse: 0.7143 - mcrmse: 0.6516
Epoch 1 avg_val_loss: 0.4528  time: 16208s
Epoch 1 - ema_content_rmse: 0.6199 - ema_wording_rmse: 0.7220 - ema_mcrmse: 0.6710


Epoch: [1][2200/3031] Elapsed 272m 41s (remain 102m 50s) Loss: 0.2483(0.2956) Grad: 19.7539  LR: 0.00000289  


Epoch 1 avg_val_loss: 0.3933  time: 16656s
Epoch 1 - content_rmse: 0.5726 - wording_rmse: 0.6773 - mcrmse: 0.6249
Epoch 1 avg_val_loss: 0.4097  time: 16950s
Epoch 1 - ema_content_rmse: 0.5931 - ema_wording_rmse: 0.6839 - ema_mcrmse: 0.6385


Epoch: [1][2300/3031] Elapsed 285m 4s (remain 90m 26s) Loss: 0.4183(0.2939) Grad: 18.7001  LR: 0.00000287  


Epoch 1 avg_val_loss: 0.4171  time: 17398s
Epoch 1 - content_rmse: 0.6119 - wording_rmse: 0.6781 - mcrmse: 0.6450
Epoch 1 avg_val_loss: 0.4698  time: 17692s
Epoch 1 - ema_content_rmse: 0.6847 - ema_wording_rmse: 0.6862 - ema_mcrmse: 0.6854


Epoch: [1][2400/3031] Elapsed 297m 25s (remain 78m 2s) Loss: 0.0941(0.2918) Grad: 5.3956  LR: 0.00000285  


Epoch 1 avg_val_loss: 0.4350  time: 18139s
Epoch 1 - content_rmse: 0.6125 - wording_rmse: 0.7034 - mcrmse: 0.6580
Epoch 1 avg_val_loss: 0.4476  time: 18435s
Epoch 1 - ema_content_rmse: 0.6074 - ema_wording_rmse: 0.7254 - ema_mcrmse: 0.6664


Epoch: [1][2500/3031] Elapsed 309m 48s (remain 65m 39s) Loss: 0.5088(0.2904) Grad: 26.4168  LR: 0.00000282  


Epoch 1 avg_val_loss: 0.5122  time: 18883s
Epoch 1 - content_rmse: 0.7124 - wording_rmse: 0.7189 - mcrmse: 0.7157
Epoch 1 avg_val_loss: 0.3986  time: 19177s
Epoch 1 - ema_content_rmse: 0.5892 - ema_wording_rmse: 0.6708 - ema_mcrmse: 0.6300


Epoch: [1][2600/3031] Elapsed 322m 11s (remain 53m 15s) Loss: 0.0834(0.2887) Grad: 7.5094  LR: 0.00000280  


Epoch 1 avg_val_loss: 0.4156  time: 19627s
Epoch 1 - content_rmse: 0.5921 - wording_rmse: 0.6933 - mcrmse: 0.6427
Epoch 1 avg_val_loss: 0.4383  time: 19921s
Epoch 1 - ema_content_rmse: 0.6409 - ema_wording_rmse: 0.6826 - ema_mcrmse: 0.6617


Epoch: [1][2700/3031] Elapsed 334m 35s (remain 40m 52s) Loss: 0.3964(0.2868) Grad: 12.8578  LR: 0.00000277  


Epoch 1 avg_val_loss: 0.4805  time: 20371s
Epoch 1 - content_rmse: 0.6722 - wording_rmse: 0.7136 - mcrmse: 0.6929
Epoch 1 avg_val_loss: 0.4021  time: 20665s
Epoch 1 - ema_content_rmse: 0.5876 - ema_wording_rmse: 0.6775 - ema_mcrmse: 0.6326


Epoch: [1][2800/3031] Elapsed 347m 0s (remain 28m 29s) Loss: 0.2841(0.2870) Grad: 19.8927  LR: 0.00000275  


Epoch 1 avg_val_loss: 0.4250  time: 21115s
Epoch 1 - content_rmse: 0.5994 - wording_rmse: 0.7006 - mcrmse: 0.6500
Epoch 1 avg_val_loss: 0.4231  time: 21410s
Epoch 1 - ema_content_rmse: 0.5989 - ema_wording_rmse: 0.6982 - ema_mcrmse: 0.6485


Epoch: [1][2900/3031] Elapsed 359m 25s (remain 16m 6s) Loss: 0.1125(0.2834) Grad: 6.5780  LR: 0.00000272  


Epoch 1 avg_val_loss: 0.4454  time: 21861s
Epoch 1 - content_rmse: 0.6702 - wording_rmse: 0.6646 - mcrmse: 0.6674
Epoch 1 avg_val_loss: 0.3972  time: 22155s
Epoch 1 - ema_content_rmse: 0.6060 - ema_wording_rmse: 0.6537 - ema_mcrmse: 0.6298


Epoch: [1][3000/3031] Elapsed 371m 50s (remain 3m 43s) Loss: 0.0603(0.2812) Grad: 4.0122  LR: 0.00000269  


Epoch 1 avg_val_loss: 0.6105  time: 22606s
Epoch 1 - content_rmse: 0.7277 - wording_rmse: 0.8315 - mcrmse: 0.7796
Epoch 1 avg_val_loss: 0.5002  time: 22901s
Epoch 1 - ema_content_rmse: 0.6864 - ema_wording_rmse: 0.7275 - ema_mcrmse: 0.7070


Epoch: [1][3030/3031] Elapsed 382m 27s (remain 0m 0s) Loss: 0.2230(0.2809) Grad: 12.9226  LR: 0.00000268  


Epoch 1 avg_val_loss: 0.5387  time: 23242s
Epoch 1 - content_rmse: 0.7698 - wording_rmse: 0.6962 - mcrmse: 0.7330
Epoch 1 avg_val_loss: 0.4638  time: 23536s
Epoch 1 - ema_content_rmse: 0.6678 - ema_wording_rmse: 0.6939 - ema_mcrmse: 0.6809


OutOfMemoryError: CUDA out of memory. Tried to allocate 288.00 MiB (GPU 0; 31.74 GiB total capacity; 31.04 GiB already allocated; 161.12 MiB free; 31.21 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
## total_complex = []
# for fold in range(4):
#     va_data = train_df[train_df['fold'] == fold]
#     preds = torch.load('/content/drive/MyDrive/deb_simple/microsoft_deberta-v3-large_best{}.pth'.format(fold))['predictions']
#     va_data['preds'] = preds
#     va_data = va_data[['id', 'preds', 'score']]
#     print(compute_metrics(va_data['preds'].values.reshape(-1,1), va_data['score'].values))
#     total_complex.append(va_data)
# total_complex = pd.concat(total_complex)
# compute_metrics(total_complex['preds'].values.reshape(-1,1), total_complex['score'].values)

In [None]:
# !mkdir -p /root/.kaggle
# !cp /content/drive/MyDrive/kaggle/kaggle.json /root/.kaggle/
# !chmod 600 /root/.kaggle/kaggle.json
# !kaggle datasets init -p /content/drive/MyDrive/elc_mean/

In [None]:
#!kaggle datasets create -p /content/drive/MyDrive/elc_mean/

In [None]:
# deberta v3 large
# 1.5 0.8228
# 2 0.8197

# 1.5  8137
#2 8175
#2.5 8181
#3 8181
#3.5 8175

