

### Logs
* 30.08: set both `max_seq_length` and `max_position_embeddings` to 2048

In [2]:
!pip install transformers -q
!pip install sentencepiece -q
!pip install datasets -q
!pip install accelerate -U -q



In [3]:
import pandas as pd
from datasets import Dataset, disable_progress_bar
from transformers import AutoTokenizer

# Tokenizer for the DeBERTa-v3-large checkpoint.
tok = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")

# Raw competition tables: one for the prompts, one for the student summaries.
pdf = pd.read_csv("./input/prompts_train.csv")
sdf = pd.read_csv("./input/summaries_train.csv")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
from transformers import AutoModelForSequenceClassification
# Load the public "microsoft/deberta-v3-large" checkpoint through the
# sequence-classification auto class.
model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-v3-large")
# Disabled alternative: load the locally stored pre-trained checkpoint under
# ./input/pretrain/pretrained_model instead of the public one.
# from transformers import AutoConfig
# config = AutoConfig.from_pretrained('./input/pretrain/pretrained_model/')
# model = AutoModelForSequenceClassification.from_pretrained('./input/pretrain/pretrained_model', config = config)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['pooler.dense.bias', 'classifier.weight', 'classifier.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Attach the prompt columns to every summary row by joining on the shared prompt id.
df = pd.merge(pdf, sdf, on="prompt_id")
df.head()

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,student_id,text,content,wording
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00791789cc1f,1 element of an ideal tragedy is that it shoul...,-0.210614,-0.471415
1,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0086ef22de8f,The three elements of an ideal tragedy are: H...,-0.970237,-0.417058
2,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0094589c7a22,Aristotle states that an ideal tragedy should ...,-0.387791,-0.584181
3,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00cd5736026a,One element of an Ideal tragedy is having a co...,0.088882,-0.59471
4,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00d98b8ff756,The 3 ideal of tragedy is how complex you need...,-0.687288,-0.460886


## Train

In [14]:
%%writefile train.py

import os
import logging
import warnings
from dataclasses import dataclass, field
from typing import Optional

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    AutoConfig,
    set_seed,
    Trainer,
    TrainingArguments,
    HfArgumentParser,
    DataCollatorWithPadding,
)
from datasets import Dataset, disable_progress_bar
import pandas as pd
import numpy as np

# Keep the training log quiet: ignore Python warnings, disable log records up
# to and including ERROR, and turn off the `datasets` progress bars.
warnings.simplefilter("ignore")
logging.disable(logging.ERROR)
disable_progress_bar()

# Process-wide environment settings, applied before any model or tokenizer is created.
os.environ.update(
    {
        "TOKENIZERS_PARALLELISM": "false",
        "TF_CPP_MIN_LOG_LEVEL": "3",
        "WANDB_PROJECT": "kaggle-commonlit-eval-student-summaries-3008",
    }
)

@dataclass
class Config:
    """Script-specific options, parsed from the command line next to ``TrainingArguments``.

    Every field carries a ``help`` string in its metadata; ``main()`` hands this
    class to ``HfArgumentParser`` so each field becomes a ``--<field_name>`` flag.
    """

    # Hub id or local path used to load the tokenizer and the base model config.
    model_name_or_path: Optional[str] = field(
        default="microsoft/deberta-v3-base",
        metadata={"help": "Model name or path"},
    )

    # Directory expected to contain the competition CSV files.
    data_dir: Optional[str] = field(
        default="/kaggle/input/commonlit-evaluate-student-summaries",
        metadata={"help": "Data directory"},
    )

    # Upper bound on the tokenized input length (earlier runs used 1600 and 512).
    max_seq_length: Optional[int] = field(
        default=2048,
        metadata={"help": "Max sequence length"},
    )

    # NOTE(review): the visible tokenize() always builds the same prompt and does
    # not consult the two flags below - confirm whether they are still needed.
    add_prompt_question: Optional[bool] = field(
        default=False,
        metadata={"help": "Add prompt question into input"},
    )

    add_prompt_text: Optional[bool] = field(
        default=False,
        metadata={"help": "Add prompt text into input"},
    )

    # Index of the fold held out for validation.
    fold: Optional[int] = field(
        default=0,
        metadata={"help": "Fold"},
    )

    # Worker processes used when tokenizing the datasets.
    num_proc: Optional[int] = field(
        default=4,
        metadata={"help": "Number of processes"},
    )

    # Written to both hidden and attention dropout of the model config.
    dropout: Optional[float] = field(
        default=0.,
        metadata={"help": "Amount of dropout to apply"},
    )

    # Written to the model config's `max_position_embeddings` entry
    # (earlier runs used 1600 and 512). The help text used to be a copy of the
    # dropout help, which made `--help` misleading.
    max_position_embeddings: Optional[int] = field(
        default=2048,
        metadata={"help": "Value for the model config's max_position_embeddings"},
    )


def tokenize(example, tokenizer, config):
    """Encode one merged prompt/summary row as a sequence pair with its targets.

    The student summary (``example["text"]``) is the first segment; the second
    segment is the prompt title, prompt text and prompt question joined with
    the tokenizer's separator token. The pair is truncated to
    ``config.max_seq_length`` and left unpadded.

    Note: ``config.add_prompt_question`` / ``config.add_prompt_text`` are not
    consulted here; the full prompt is always appended.

    Returns:
        dict: every entry produced by the tokenizer plus ``"labels"``, a
        two-element list ``[content, wording]``.
    """
    prompt_parts = (
        example["prompt_title"],
        example["prompt_text"],
        example["prompt_question"],
    )
    context = tokenizer.sep_token.join(prompt_parts)
    targets = [example["content"], example["wording"]]

    encoded = tokenizer(
        example["text"],
        context,
        padding=False,
        truncation=True,  # cut the pair down to max_length instead of failing on long inputs
        max_length=config.max_seq_length,
    )

    features = dict(encoded)
    features["labels"] = targets
    return features


def compute_mcrmse(eval_pred):
    """
    Mean columnwise root mean squared error (MCRMSE).
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation

    ``eval_pred`` unpacks into ``(predictions, labels)``. The RMSE is taken
    down axis 0, so each column gets its own value; column 0 is reported as
    content and column 1 as wording, and their mean is the MCRMSE.
    """
    predictions, targets = eval_pred

    squared_error = np.square(predictions - targets)
    per_column_rmse = np.sqrt(squared_error.mean(axis=0))

    return {
        "content_rmse": per_column_rmse[0],
        "wording_rmse": per_column_rmse[1],
        "mcrmse": per_column_rmse.mean(),
    }


def main():
    """Fine-tune a two-target regression model on one cross-validation fold.

    Steps:
      1. Parse command-line flags into ``Config`` and ``TrainingArguments``.
      2. Load tokenizer and base config from ``config.model_name_or_path`` and
         the weights from the local pre-trained checkpoint.
      3. Merge prompts with summaries, assign one fold per prompt id, and hold
         out ``config.fold`` for validation.
      4. Train with ``Trainer``, selecting the best model by ``mcrmse``, then
         store the best metric in the model config saved to the output dir.
    """
    parser = HfArgumentParser((Config, TrainingArguments))

    config, training_args = parser.parse_args_into_dataclasses()

    set_seed(training_args.seed)

    if "wandb" in training_args.report_to:
        import wandb

        try:
            wandb.login()
        except Exception as err:
            # Logging in is best effort, but say why it failed and do not
            # swallow KeyboardInterrupt/SystemExit as a bare `except:` would.
            print(f"Could not log in to WandB: {err!r}")

    tokenizer = AutoTokenizer.from_pretrained(config.model_name_or_path)
    model_config = AutoConfig.from_pretrained(config.model_name_or_path)

    model_config.update({
        "hidden_dropout_prob": config.dropout,
        "attention_probs_dropout_prob": config.dropout,
        "num_labels": 2,  # content and wording
        "problem_type": "regression",
        "max_position_embeddings": config.max_position_embeddings,
        "cfg": config.__dict__,  # keep the run options alongside the model config
    })

    print(model_config)

    # The weights come from the locally pre-trained checkpoint, not from
    # config.model_name_or_path (which only selects tokenizer and base config).
    print('use pretrained_model')
    model = AutoModelForSequenceClassification.from_pretrained(
        './input/pretrain/pretrained_model/', config=model_config
    )

    # NOTE(review): config.data_dir is not used; the CSVs are read from ./input.
    pdf = pd.read_csv("./input/prompts_train.csv")
    sdf = pd.read_csv("./input/summaries_train.csv")

    df = pdf.merge(sdf, on="prompt_id")

    # 4 prompt ids, 4 folds
    id2fold = {
        "814d6b": 0,
        "39c16e": 1,
        "3b9047": 2,
        "ebad26": 3,
    }

    df["fold"] = df["prompt_id"].map(id2fold)

    train_ds = Dataset.from_pandas(df[df["fold"] != config.fold])
    val_ds = Dataset.from_pandas(df[df["fold"] == config.fold])

    # Both splits are tokenized row by row with identical settings.
    map_kwargs = {
        "batched": False,
        "num_proc": config.num_proc,
        "fn_kwargs": {"tokenizer": tokenizer, "config": config},
    }
    train_ds = train_ds.map(tokenize, **map_kwargs)
    val_ds = val_ds.map(tokenize, **map_kwargs)

    # Overrides applied on top of the parsed arguments. They are set before the
    # collator is built so its padding choice sees the final precision flags.
    # NOTE(review): the notebook also launches with --fp16; bf16 is forced on
    # here after the arguments were parsed - confirm which precision is
    # actually in effect.
    training_args.bf16 = True
    training_args.gradient_accumulation_steps = 1
    training_args.load_best_model_at_end = True
    training_args.greater_is_better = False
    training_args.metric_for_best_model = 'mcrmse'

    half_precision = training_args.fp16 or training_args.bf16
    data_collator = DataCollatorWithPadding(
        tokenizer=tokenizer,
        pad_to_multiple_of=16 if half_precision else None,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_mcrmse,
    )

    trainer.train()

    # Record the best validation score in the config written to the output dir.
    model.config.best_metric = trainer.state.best_metric
    model.config.save_pretrained(training_args.output_dir)

    trainer.log({"eval_best_mcrmse": trainer.state.best_metric})


if __name__ == "__main__":
    main()

Overwriting train.py


In [15]:
from pathlib import Path

seed = 42

fold = 0

output = f"output_fold{fold}_seed{seed}_3008"

!python train.py \
  --model_name_or_path "microsoft/deberta-v3-large" \
  --add_prompt_question True \
  --fold $fold \
  --data_dir "./" \
  --output_dir $output \
  --fp16 \
  --num_train_epochs 4 \
  --dataloader_num_workers 4 \
  --learning_rate 2e-6 \
  --weight_decay 0.01 \
  --warmup_ratio 0 \
  --optim "adamw_torch" \
  --per_device_train_batch_size 2 \
  --per_device_eval_batch_size 2 \
  --evaluation_strategy "steps" \
  --eval_steps 150 \
  --save_strategy "steps" \
  --save_steps 150 \
  --save_total_limit 1 \
  --report_to "wandb" \
  --metric_for_best_model "mcrmse" \
  --greater_is_better False \
  --logging_steps 10 \
  --log_level "error" \
  --disable_tqdm True \
  --ddp_find_unused_parameters False \
  --dropout 0 \
  --seed $seed


output_dir = Path.cwd() / output
# add json files
for json_file in output_dir.glob("checkpoint*/*token*.json"):
    json_file.rename(output_dir/json_file.name)

# model files
for model_file in output_dir.glob("checkpoint*/*model*"):
    model_file.rename(output_dir/model_file.name)

# remove optimizer states and other files
to_delete = str(list(output_dir.glob("checkpoint*"))[0])
!rm -r $to_delete

[34m[1mwandb[0m: Currently logged in as: [33mpeng_sun[0m. Use [1m`wandb login --relogin`[0m to force relogin
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_probs_dropout_prob": 0.0,
  "cfg": {
    "add_prompt_question": true,
    "add_prompt_text": false,
    "data_dir": "./",
    "dropout": 0.0,
    "fold": 0,
    "max_position_embeddings": 2048,
    "max_seq_length": 2048,
    "model_name_or_path": "microsoft/deberta-v3-large",
    "num_proc": 4
  },
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 2048,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "po

{'loss': 0.2595, 'learning_rate': 1.8804025074232926e-06, 'epoch': 0.24}
{'loss': 0.202, 'learning_rate': 1.8787528868360276e-06, 'epoch': 0.24}
{'loss': 0.4289, 'learning_rate': 1.8771032662487628e-06, 'epoch': 0.25}
{'eval_loss': 0.5938531160354614, 'eval_content_rmse': 0.564494788646698, 'eval_wording_rmse': 0.9322299361228943, 'eval_mcrmse': 0.7483623623847961, 'eval_runtime': 57.7166, 'eval_samples_per_second': 19.111, 'eval_steps_per_second': 9.564, 'epoch': 0.25}
{'loss': 0.1766, 'learning_rate': 1.8754536456614978e-06, 'epoch': 0.25}
{'loss': 0.3043, 'learning_rate': 1.8738040250742327e-06, 'epoch': 0.25}
{'loss': 0.297, 'learning_rate': 1.872154404486968e-06, 'epoch': 0.26}
{'loss': 0.3252, 'learning_rate': 1.870504783899703e-06, 'epoch': 0.26}
{'loss': 0.2763, 'learning_rate': 1.8688551633124382e-06, 'epoch': 0.26}
{'loss': 0.3304, 'learning_rate': 1.8672055427251732e-06, 'epoch': 0.27}
{'loss': 0.4189, 'learning_rate': 1.8655559221379081e-06, 'epoch': 0.27}
{'loss': 0.1782, 

{'loss': 0.3804, 'learning_rate': 1.728637413394919e-06, 'epoch': 0.54}
{'eval_loss': 0.43455880880355835, 'eval_content_rmse': 0.498887300491333, 'eval_wording_rmse': 0.7875463366508484, 'eval_mcrmse': 0.6432168483734131, 'eval_runtime': 57.413, 'eval_samples_per_second': 19.212, 'eval_steps_per_second': 9.615, 'epoch': 0.54}
{'loss': 0.1832, 'learning_rate': 1.726987792807654e-06, 'epoch': 0.55}
{'loss': 0.1842, 'learning_rate': 1.7253381722203893e-06, 'epoch': 0.55}
{'loss': 0.2424, 'learning_rate': 1.7236885516331243e-06, 'epoch': 0.55}
{'loss': 0.2542, 'learning_rate': 1.7220389310458595e-06, 'epoch': 0.56}
{'loss': 0.3139, 'learning_rate': 1.7203893104585945e-06, 'epoch': 0.56}
{'loss': 0.2503, 'learning_rate': 1.7187396898713295e-06, 'epoch': 0.56}
{'loss': 0.1664, 'learning_rate': 1.7170900692840647e-06, 'epoch': 0.57}
{'loss': 0.2156, 'learning_rate': 1.7154404486967997e-06, 'epoch': 0.57}
{'loss': 0.2444, 'learning_rate': 1.713790828109535e-06, 'epoch': 0.57}
{'loss': 0.2129,

{'loss': 0.2418, 'learning_rate': 1.5785219399538106e-06, 'epoch': 0.84}
{'loss': 0.2574, 'learning_rate': 1.5768723193665456e-06, 'epoch': 0.85}
{'loss': 0.1713, 'learning_rate': 1.5752226987792806e-06, 'epoch': 0.85}
{'loss': 0.1236, 'learning_rate': 1.5735730781920158e-06, 'epoch': 0.85}
{'loss': 0.4371, 'learning_rate': 1.5719234576047508e-06, 'epoch': 0.86}
{'loss': 0.1606, 'learning_rate': 1.570273837017486e-06, 'epoch': 0.86}
{'loss': 0.2896, 'learning_rate': 1.568624216430221e-06, 'epoch': 0.86}
{'loss': 0.3826, 'learning_rate': 1.566974595842956e-06, 'epoch': 0.87}
{'loss': 0.2252, 'learning_rate': 1.5653249752556912e-06, 'epoch': 0.87}
{'loss': 0.2492, 'learning_rate': 1.5636753546684262e-06, 'epoch': 0.87}
{'loss': 0.2332, 'learning_rate': 1.5620257340811614e-06, 'epoch': 0.88}
{'loss': 0.2777, 'learning_rate': 1.5603761134938964e-06, 'epoch': 0.88}
{'loss': 0.1742, 'learning_rate': 1.5587264929066314e-06, 'epoch': 0.88}
{'loss': 0.2048, 'learning_rate': 1.5570768723193666e-

{'loss': 0.1065, 'learning_rate': 1.4267568459254372e-06, 'epoch': 1.15}
{'loss': 0.2251, 'learning_rate': 1.4251072253381722e-06, 'epoch': 1.15}
{'loss': 0.2407, 'learning_rate': 1.4234576047509072e-06, 'epoch': 1.15}
{'loss': 0.2425, 'learning_rate': 1.4218079841636424e-06, 'epoch': 1.16}
{'loss': 0.2762, 'learning_rate': 1.4201583635763774e-06, 'epoch': 1.16}
{'loss': 0.1613, 'learning_rate': 1.4185087429891126e-06, 'epoch': 1.16}
{'loss': 0.1414, 'learning_rate': 1.4168591224018476e-06, 'epoch': 1.17}
{'loss': 0.2766, 'learning_rate': 1.4152095018145826e-06, 'epoch': 1.17}
{'loss': 0.1579, 'learning_rate': 1.4135598812273178e-06, 'epoch': 1.17}
{'loss': 0.1208, 'learning_rate': 1.4119102606400528e-06, 'epoch': 1.18}
{'loss': 0.1773, 'learning_rate': 1.410260640052788e-06, 'epoch': 1.18}
{'loss': 0.1209, 'learning_rate': 1.408611019465523e-06, 'epoch': 1.18}
{'loss': 0.3137, 'learning_rate': 1.406961398878258e-06, 'epoch': 1.19}
{'eval_loss': 0.5800656080245972, 'eval_content_rmse':

{'loss': 0.1634, 'learning_rate': 1.2749917518970637e-06, 'epoch': 1.45}
{'loss': 0.3281, 'learning_rate': 1.2733421313097987e-06, 'epoch': 1.45}
{'loss': 0.2343, 'learning_rate': 1.271692510722534e-06, 'epoch': 1.46}
{'loss': 0.1885, 'learning_rate': 1.270042890135269e-06, 'epoch': 1.46}
{'loss': 0.2305, 'learning_rate': 1.2683932695480039e-06, 'epoch': 1.46}
{'loss': 0.1366, 'learning_rate': 1.266743648960739e-06, 'epoch': 1.47}
{'loss': 0.1951, 'learning_rate': 1.265094028373474e-06, 'epoch': 1.47}
{'loss': 0.126, 'learning_rate': 1.2634444077862093e-06, 'epoch': 1.47}
{'loss': 0.1732, 'learning_rate': 1.2617947871989443e-06, 'epoch': 1.48}
{'loss': 0.196, 'learning_rate': 1.2601451666116793e-06, 'epoch': 1.48}
{'loss': 0.1789, 'learning_rate': 1.2584955460244145e-06, 'epoch': 1.48}
{'eval_loss': 0.4550977051258087, 'eval_content_rmse': 0.5696753263473511, 'eval_wording_rmse': 0.7652876973152161, 'eval_mcrmse': 0.667481541633606, 'eval_runtime': 57.3654, 'eval_samples_per_second': 1

{'loss': 0.1995, 'learning_rate': 1.1232266578686902e-06, 'epoch': 1.76}
{'loss': 0.2519, 'learning_rate': 1.1215770372814252e-06, 'epoch': 1.76}
{'loss': 0.2243, 'learning_rate': 1.1199274166941604e-06, 'epoch': 1.76}
{'loss': 0.1183, 'learning_rate': 1.1182777961068954e-06, 'epoch': 1.77}
{'loss': 0.1348, 'learning_rate': 1.1166281755196304e-06, 'epoch': 1.77}
{'loss': 0.3813, 'learning_rate': 1.1149785549323656e-06, 'epoch': 1.77}
{'loss': 0.2128, 'learning_rate': 1.1133289343451006e-06, 'epoch': 1.77}
{'loss': 0.1681, 'learning_rate': 1.1116793137578358e-06, 'epoch': 1.78}
{'loss': 0.1507, 'learning_rate': 1.1100296931705708e-06, 'epoch': 1.78}
{'eval_loss': 0.46251180768013, 'eval_content_rmse': 0.5960699319839478, 'eval_wording_rmse': 0.7548007369041443, 'eval_mcrmse': 0.6754353046417236, 'eval_runtime': 57.3279, 'eval_samples_per_second': 19.24, 'eval_steps_per_second': 9.629, 'epoch': 1.78}
{'loss': 0.2164, 'learning_rate': 1.1083800725833056e-06, 'epoch': 1.78}
{'loss': 0.193,

{'loss': 0.1315, 'learning_rate': 9.714615638403166e-07, 'epoch': 2.06}
{'loss': 0.1857, 'learning_rate': 9.698119432530518e-07, 'epoch': 2.06}
{'loss': 0.1332, 'learning_rate': 9.681623226657868e-07, 'epoch': 2.07}
{'loss': 0.2077, 'learning_rate': 9.66512702078522e-07, 'epoch': 2.07}
{'loss': 0.1211, 'learning_rate': 9.64863081491257e-07, 'epoch': 2.07}
{'loss': 0.1972, 'learning_rate': 9.63213460903992e-07, 'epoch': 2.08}
{'loss': 0.1107, 'learning_rate': 9.615638403167272e-07, 'epoch': 2.08}
{'eval_loss': 0.3663038909435272, 'eval_content_rmse': 0.49338510632514954, 'eval_wording_rmse': 0.6994134187698364, 'eval_mcrmse': 0.5963992476463318, 'eval_runtime': 57.6055, 'eval_samples_per_second': 19.147, 'eval_steps_per_second': 9.582, 'epoch': 2.08}
{'loss': 0.1236, 'learning_rate': 9.599142197294622e-07, 'epoch': 2.08}
{'loss': 0.1899, 'learning_rate': 9.582645991421974e-07, 'epoch': 2.09}
{'loss': 0.1526, 'learning_rate': 9.566149785549324e-07, 'epoch': 2.09}
{'loss': 0.1525, 'learni

{'loss': 0.0806, 'learning_rate': 8.180468492246783e-07, 'epoch': 2.37}
{'loss': 0.1678, 'learning_rate': 8.163972286374133e-07, 'epoch': 2.37}
{'loss': 0.1246, 'learning_rate': 8.147476080501484e-07, 'epoch': 2.37}
{'loss': 0.2142, 'learning_rate': 8.130979874628835e-07, 'epoch': 2.38}
{'eval_loss': 0.33713018894195557, 'eval_content_rmse': 0.474677175283432, 'eval_wording_rmse': 0.6700313091278076, 'eval_mcrmse': 0.572354257106781, 'eval_runtime': 57.3794, 'eval_samples_per_second': 19.223, 'eval_steps_per_second': 9.62, 'epoch': 2.38}
{'loss': 0.1687, 'learning_rate': 8.114483668756186e-07, 'epoch': 2.38}
{'loss': 0.1639, 'learning_rate': 8.097987462883537e-07, 'epoch': 2.38}
{'loss': 0.0951, 'learning_rate': 8.081491257010887e-07, 'epoch': 2.39}
{'loss': 0.2103, 'learning_rate': 8.064995051138238e-07, 'epoch': 2.39}
{'loss': 0.1618, 'learning_rate': 8.048498845265589e-07, 'epoch': 2.39}
{'loss': 0.1276, 'learning_rate': 8.03200263939294e-07, 'epoch': 2.4}
{'loss': 0.1183, 'learning

{'loss': 0.1789, 'learning_rate': 6.646321346090399e-07, 'epoch': 2.67}
{'eval_loss': 0.34932202100753784, 'eval_content_rmse': 0.48992183804512024, 'eval_wording_rmse': 0.6772155165672302, 'eval_mcrmse': 0.5835686922073364, 'eval_runtime': 57.4773, 'eval_samples_per_second': 19.19, 'eval_steps_per_second': 9.604, 'epoch': 2.67}
{'loss': 0.1773, 'learning_rate': 6.629825140217749e-07, 'epoch': 2.68}
{'loss': 0.1442, 'learning_rate': 6.6133289343451e-07, 'epoch': 2.68}
{'loss': 0.1716, 'learning_rate': 6.596832728472451e-07, 'epoch': 2.68}
{'loss': 0.233, 'learning_rate': 6.580336522599802e-07, 'epoch': 2.69}
{'loss': 0.183, 'learning_rate': 6.563840316727153e-07, 'epoch': 2.69}
{'loss': 0.1312, 'learning_rate': 6.547344110854503e-07, 'epoch': 2.69}
{'loss': 0.1079, 'learning_rate': 6.530847904981854e-07, 'epoch': 2.7}
{'loss': 0.1468, 'learning_rate': 6.514351699109205e-07, 'epoch': 2.7}
{'loss': 0.1368, 'learning_rate': 6.497855493236555e-07, 'epoch': 2.7}
{'loss': 0.2042, 'learning_r

{'loss': 0.1847, 'learning_rate': 5.146816232266578e-07, 'epoch': 2.97}
{'loss': 0.1246, 'learning_rate': 5.130320026393929e-07, 'epoch': 2.98}
{'loss': 0.0927, 'learning_rate': 5.11382382052128e-07, 'epoch': 2.98}
{'loss': 0.1448, 'learning_rate': 5.097327614648631e-07, 'epoch': 2.98}
{'loss': 0.0872, 'learning_rate': 5.080831408775982e-07, 'epoch': 2.99}
{'loss': 0.1131, 'learning_rate': 5.064335202903331e-07, 'epoch': 2.99}
{'loss': 0.1378, 'learning_rate': 5.047838997030682e-07, 'epoch': 2.99}
{'loss': 0.252, 'learning_rate': 5.031342791158033e-07, 'epoch': 3.0}
{'loss': 0.1856, 'learning_rate': 5.014846585285384e-07, 'epoch': 3.0}
{'loss': 0.1352, 'learning_rate': 4.998350379412734e-07, 'epoch': 3.0}
{'loss': 0.1198, 'learning_rate': 4.981854173540085e-07, 'epoch': 3.01}
{'loss': 0.1143, 'learning_rate': 4.965357967667436e-07, 'epoch': 3.01}
{'loss': 0.0613, 'learning_rate': 4.948861761794787e-07, 'epoch': 3.01}
{'loss': 0.1332, 'learning_rate': 4.932365555922138e-07, 'epoch': 3.0

{'loss': 0.2445, 'learning_rate': 3.6126690861101946e-07, 'epoch': 3.28}
{'loss': 0.1068, 'learning_rate': 3.596172880237545e-07, 'epoch': 3.28}
{'loss': 0.0796, 'learning_rate': 3.579676674364896e-07, 'epoch': 3.29}
{'loss': 0.0929, 'learning_rate': 3.5631804684922465e-07, 'epoch': 3.29}
{'loss': 0.1708, 'learning_rate': 3.5466842626195976e-07, 'epoch': 3.29}
{'loss': 0.1247, 'learning_rate': 3.5301880567469486e-07, 'epoch': 3.3}
{'loss': 0.1001, 'learning_rate': 3.5136918508742985e-07, 'epoch': 3.3}
{'loss': 0.0796, 'learning_rate': 3.4971956450016495e-07, 'epoch': 3.3}
{'loss': 0.1856, 'learning_rate': 3.480699439129e-07, 'epoch': 3.31}
{'loss': 0.1181, 'learning_rate': 3.464203233256351e-07, 'epoch': 3.31}
{'loss': 0.1601, 'learning_rate': 3.4477070273837015e-07, 'epoch': 3.31}
{'loss': 0.1372, 'learning_rate': 3.4312108215110525e-07, 'epoch': 3.32}
{'eval_loss': 0.3410272002220154, 'eval_content_rmse': 0.4852299988269806, 'eval_wording_rmse': 0.6682860255241394, 'eval_mcrmse': 0.5

{'loss': 0.1008, 'learning_rate': 2.0801715605410755e-07, 'epoch': 3.59}
{'loss': 0.1412, 'learning_rate': 2.0636753546684263e-07, 'epoch': 3.59}
{'loss': 0.0957, 'learning_rate': 2.0471791487957767e-07, 'epoch': 3.59}
{'loss': 0.1198, 'learning_rate': 2.0306829429231275e-07, 'epoch': 3.6}
{'loss': 0.1864, 'learning_rate': 2.0141867370504785e-07, 'epoch': 3.6}
{'loss': 0.2027, 'learning_rate': 1.997690531177829e-07, 'epoch': 3.6}
{'loss': 0.2273, 'learning_rate': 1.9811943253051797e-07, 'epoch': 3.61}
{'loss': 0.153, 'learning_rate': 1.9646981194325305e-07, 'epoch': 3.61}
{'loss': 0.1186, 'learning_rate': 1.9482019135598812e-07, 'epoch': 3.61}
{'eval_loss': 0.35579246282577515, 'eval_content_rmse': 0.5117003917694092, 'eval_wording_rmse': 0.6706324815750122, 'eval_mcrmse': 0.5911664366722107, 'eval_runtime': 57.3883, 'eval_samples_per_second': 19.22, 'eval_steps_per_second': 9.619, 'epoch': 3.61}
{'loss': 0.1113, 'learning_rate': 1.9317057076872317e-07, 'epoch': 3.62}
{'loss': 0.0912, 

{'loss': 0.1096, 'learning_rate': 5.625206202573408e-08, 'epoch': 3.89}
{'loss': 0.0841, 'learning_rate': 5.4602441438469146e-08, 'epoch': 3.89}
{'loss': 0.0822, 'learning_rate': 5.295282085120422e-08, 'epoch': 3.9}
{'loss': 0.0626, 'learning_rate': 5.1303200263939295e-08, 'epoch': 3.9}
{'loss': 0.1463, 'learning_rate': 4.965357967667436e-08, 'epoch': 3.9}
{'loss': 0.235, 'learning_rate': 4.800395908940944e-08, 'epoch': 3.91}
{'loss': 0.1526, 'learning_rate': 4.6354338502144505e-08, 'epoch': 3.91}
{'eval_loss': 0.3581613302230835, 'eval_content_rmse': 0.5082637071609497, 'eval_wording_rmse': 0.676750123500824, 'eval_mcrmse': 0.5925068855285645, 'eval_runtime': 57.382, 'eval_samples_per_second': 19.222, 'eval_steps_per_second': 9.62, 'epoch': 3.91}
{'loss': 0.1152, 'learning_rate': 4.470471791487958e-08, 'epoch': 3.91}
{'loss': 0.0555, 'learning_rate': 4.305509732761465e-08, 'epoch': 3.92}
{'loss': 0.1038, 'learning_rate': 4.140547674034972e-08, 'epoch': 3.92}
{'loss': 0.0878, 'learning

In [16]:
from pathlib import Path

seed = 42

fold = 1

output = f"output_fold{fold}_seed{seed}_3008"

!python train.py \
  --model_name_or_path "microsoft/deberta-v3-large" \
  --add_prompt_question True \
  --fold $fold \
  --data_dir "./" \
  --output_dir $output \
  --fp16 \
  --num_train_epochs 4 \
  --dataloader_num_workers 4 \
  --learning_rate 2e-6 \
  --weight_decay 0.01 \
  --warmup_ratio 0 \
  --optim "adamw_torch" \
  --per_device_train_batch_size 2 \
  --per_device_eval_batch_size 2 \
  --evaluation_strategy "steps" \
  --eval_steps 150 \
  --save_strategy "steps" \
  --save_steps 150 \
  --save_total_limit 1 \
  --report_to "wandb" \
  --metric_for_best_model "mcrmse" \
  --greater_is_better False \
  --logging_steps 10 \
  --log_level "error" \
  --disable_tqdm True \
  --ddp_find_unused_parameters False \
  --dropout 0 \
  --seed $seed


output_dir = Path.cwd() / output
# add json files
for json_file in output_dir.glob("checkpoint*/*token*.json"):
    json_file.rename(output_dir/json_file.name)

# model files
for model_file in output_dir.glob("checkpoint*/*model*"):
    model_file.rename(output_dir/model_file.name)

# remove optimizer states and other files
to_delete = str(list(output_dir.glob("checkpoint*"))[0])
!rm -r $to_delete

[34m[1mwandb[0m: Currently logged in as: [33mpeng_sun[0m. Use [1m`wandb login --relogin`[0m to force relogin
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_probs_dropout_prob": 0.0,
  "cfg": {
    "add_prompt_question": true,
    "add_prompt_text": false,
    "data_dir": "./",
    "dropout": 0.0,
    "fold": 1,
    "max_position_embeddings": 2048,
    "max_seq_length": 2048,
    "model_name_or_path": "microsoft/deberta-v3-large",
    "num_proc": 4
  },
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 2048,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "po

{'loss': 0.3139, 'learning_rate': 1.8559122944400938e-06, 'epoch': 0.29}
{'loss': 0.685, 'learning_rate': 1.8539545810493342e-06, 'epoch': 0.29}
{'eval_loss': 0.3642939329147339, 'eval_content_rmse': 0.5703287124633789, 'eval_wording_rmse': 0.6350693106651306, 'eval_mcrmse': 0.6026990413665771, 'eval_runtime': 113.18, 'eval_samples_per_second': 18.175, 'eval_steps_per_second': 9.092, 'epoch': 0.29}
{'loss': 0.3331, 'learning_rate': 1.8519968676585747e-06, 'epoch': 0.3}
{'loss': 0.2668, 'learning_rate': 1.850039154267815e-06, 'epoch': 0.3}
{'loss': 0.2969, 'learning_rate': 1.8480814408770556e-06, 'epoch': 0.31}
{'loss': 0.6689, 'learning_rate': 1.846123727486296e-06, 'epoch': 0.31}
{'loss': 0.3004, 'learning_rate': 1.8441660140955364e-06, 'epoch': 0.31}
{'loss': 0.412, 'learning_rate': 1.8422083007047769e-06, 'epoch': 0.32}
{'loss': 0.3096, 'learning_rate': 1.8402505873140171e-06, 'epoch': 0.32}
{'loss': 0.2925, 'learning_rate': 1.8382928739232576e-06, 'epoch': 0.32}
{'loss': 0.1288, 'l

{'eval_loss': 0.2930901050567627, 'eval_content_rmse': 0.43155285716056824, 'eval_wording_rmse': 0.6324099898338318, 'eval_mcrmse': 0.5319814085960388, 'eval_runtime': 113.336, 'eval_samples_per_second': 18.15, 'eval_steps_per_second': 9.079, 'epoch': 0.65}
{'loss': 0.1881, 'learning_rate': 1.6758026624902112e-06, 'epoch': 0.65}
{'loss': 0.4192, 'learning_rate': 1.6738449490994517e-06, 'epoch': 0.65}
{'loss': 0.259, 'learning_rate': 1.6718872357086921e-06, 'epoch': 0.66}
{'loss': 0.3229, 'learning_rate': 1.6699295223179326e-06, 'epoch': 0.66}
{'loss': 0.2598, 'learning_rate': 1.667971808927173e-06, 'epoch': 0.67}
{'loss': 0.2286, 'learning_rate': 1.6660140955364135e-06, 'epoch': 0.67}
{'loss': 0.2133, 'learning_rate': 1.6640563821456537e-06, 'epoch': 0.67}
{'loss': 0.438, 'learning_rate': 1.6620986687548942e-06, 'epoch': 0.68}
{'loss': 0.3039, 'learning_rate': 1.6601409553641346e-06, 'epoch': 0.68}
{'loss': 0.2145, 'learning_rate': 1.658183241973375e-06, 'epoch': 0.69}
{'loss': 0.3944,

{'loss': 0.2511, 'learning_rate': 1.4996084573218478e-06, 'epoch': 1.0}
{'loss': 0.1157, 'learning_rate': 1.4976507439310883e-06, 'epoch': 1.01}
{'loss': 0.1525, 'learning_rate': 1.4956930305403287e-06, 'epoch': 1.01}
{'loss': 0.2026, 'learning_rate': 1.4937353171495692e-06, 'epoch': 1.01}
{'loss': 0.2284, 'learning_rate': 1.4917776037588096e-06, 'epoch': 1.02}
{'loss': 0.258, 'learning_rate': 1.48981989036805e-06, 'epoch': 1.02}
{'loss': 0.2389, 'learning_rate': 1.4878621769772905e-06, 'epoch': 1.03}
{'loss': 0.1908, 'learning_rate': 1.4859044635865307e-06, 'epoch': 1.03}
{'loss': 0.1954, 'learning_rate': 1.4839467501957712e-06, 'epoch': 1.03}
{'loss': 0.2555, 'learning_rate': 1.4819890368050116e-06, 'epoch': 1.04}
{'loss': 0.2079, 'learning_rate': 1.480031323414252e-06, 'epoch': 1.04}
{'loss': 0.3109, 'learning_rate': 1.4780736100234925e-06, 'epoch': 1.05}
{'loss': 0.1619, 'learning_rate': 1.476115896632733e-06, 'epoch': 1.05}
{'loss': 0.2213, 'learning_rate': 1.4741581832419734e-06,

{'loss': 0.5947, 'learning_rate': 1.3194988253719655e-06, 'epoch': 1.36}
{'loss': 0.0864, 'learning_rate': 1.317541111981206e-06, 'epoch': 1.37}
{'loss': 0.1317, 'learning_rate': 1.3155833985904464e-06, 'epoch': 1.37}
{'loss': 0.2322, 'learning_rate': 1.3136256851996866e-06, 'epoch': 1.37}
{'loss': 0.2713, 'learning_rate': 1.311667971808927e-06, 'epoch': 1.38}
{'loss': 0.1849, 'learning_rate': 1.3097102584181675e-06, 'epoch': 1.38}
{'loss': 0.2647, 'learning_rate': 1.3077525450274078e-06, 'epoch': 1.39}
{'loss': 0.2067, 'learning_rate': 1.3057948316366482e-06, 'epoch': 1.39}
{'loss': 0.1357, 'learning_rate': 1.3038371182458887e-06, 'epoch': 1.39}
{'loss': 0.1975, 'learning_rate': 1.3018794048551291e-06, 'epoch': 1.4}
{'loss': 0.3883, 'learning_rate': 1.2999216914643696e-06, 'epoch': 1.4}
{'loss': 0.1671, 'learning_rate': 1.29796397807361e-06, 'epoch': 1.41}
{'loss': 0.3639, 'learning_rate': 1.2960062646828505e-06, 'epoch': 1.41}
{'eval_loss': 0.24272367358207703, 'eval_content_rmse': 0

{'loss': 0.2579, 'learning_rate': 1.139780736100235e-06, 'epoch': 1.72}
{'loss': 0.2355, 'learning_rate': 1.1378230227094754e-06, 'epoch': 1.73}
{'loss': 0.2268, 'learning_rate': 1.1358653093187157e-06, 'epoch': 1.73}
{'loss': 0.1855, 'learning_rate': 1.1339075959279561e-06, 'epoch': 1.73}
{'loss': 0.1425, 'learning_rate': 1.1319498825371966e-06, 'epoch': 1.74}
{'loss': 0.1279, 'learning_rate': 1.129992169146437e-06, 'epoch': 1.74}
{'loss': 0.2543, 'learning_rate': 1.1280344557556773e-06, 'epoch': 1.75}
{'loss': 0.1337, 'learning_rate': 1.1260767423649177e-06, 'epoch': 1.75}
{'loss': 0.261, 'learning_rate': 1.1241190289741581e-06, 'epoch': 1.75}
{'loss': 0.1652, 'learning_rate': 1.1221613155833984e-06, 'epoch': 1.76}
{'loss': 0.3021, 'learning_rate': 1.1202036021926388e-06, 'epoch': 1.76}
{'eval_loss': 0.2742100656032562, 'eval_content_rmse': 0.4018018841743469, 'eval_wording_rmse': 0.6220734715461731, 'eval_mcrmse': 0.51193767786026, 'eval_runtime': 113.2113, 'eval_samples_per_second'

{'loss': 0.2421, 'learning_rate': 9.596711041503523e-07, 'epoch': 2.08}
{'loss': 0.1954, 'learning_rate': 9.577133907595927e-07, 'epoch': 2.09}
{'loss': 0.2277, 'learning_rate': 9.557556773688332e-07, 'epoch': 2.09}
{'loss': 0.1907, 'learning_rate': 9.537979639780736e-07, 'epoch': 2.09}
{'loss': 0.1487, 'learning_rate': 9.518402505873139e-07, 'epoch': 2.1}
{'loss': 0.0642, 'learning_rate': 9.498825371965544e-07, 'epoch': 2.1}
{'loss': 0.1124, 'learning_rate': 9.479248238057947e-07, 'epoch': 2.11}
{'loss': 0.2118, 'learning_rate': 9.459671104150352e-07, 'epoch': 2.11}
{'loss': 0.1346, 'learning_rate': 9.440093970242756e-07, 'epoch': 2.11}
{'eval_loss': 0.26203030347824097, 'eval_content_rmse': 0.4167264699935913, 'eval_wording_rmse': 0.5919454097747803, 'eval_mcrmse': 0.5043359398841858, 'eval_runtime': 113.4571, 'eval_samples_per_second': 18.13, 'eval_steps_per_second': 9.07, 'epoch': 2.11}
{'loss': 0.1356, 'learning_rate': 9.420516836335161e-07, 'epoch': 2.12}
{'loss': 0.193, 'learnin

{'loss': 0.1644, 'learning_rate': 7.776037588097102e-07, 'epoch': 2.45}
{'loss': 0.07, 'learning_rate': 7.756460454189506e-07, 'epoch': 2.45}
{'loss': 0.2125, 'learning_rate': 7.736883320281911e-07, 'epoch': 2.45}
{'loss': 0.3909, 'learning_rate': 7.717306186374315e-07, 'epoch': 2.46}
{'loss': 0.1737, 'learning_rate': 7.697729052466719e-07, 'epoch': 2.46}
{'loss': 0.2056, 'learning_rate': 7.678151918559122e-07, 'epoch': 2.47}
{'eval_loss': 0.2578878104686737, 'eval_content_rmse': 0.4031749665737152, 'eval_wording_rmse': 0.594327449798584, 'eval_mcrmse': 0.4987512230873108, 'eval_runtime': 113.5201, 'eval_samples_per_second': 18.12, 'eval_steps_per_second': 9.064, 'epoch': 2.47}
{'loss': 0.1877, 'learning_rate': 7.658574784651527e-07, 'epoch': 2.47}
{'loss': 0.1787, 'learning_rate': 7.63899765074393e-07, 'epoch': 2.47}
{'loss': 0.1484, 'learning_rate': 7.619420516836334e-07, 'epoch': 2.48}
{'loss': 0.1663, 'learning_rate': 7.599843382928739e-07, 'epoch': 2.48}
{'loss': 0.1021, 'learning

{'loss': 0.2589, 'learning_rate': 5.955364134690681e-07, 'epoch': 2.81}
{'loss': 0.1067, 'learning_rate': 5.935787000783085e-07, 'epoch': 2.82}
{'loss': 0.1382, 'learning_rate': 5.916209866875489e-07, 'epoch': 2.82}
{'eval_loss': 0.226210817694664, 'eval_content_rmse': 0.4014350175857544, 'eval_wording_rmse': 0.5396958589553833, 'eval_mcrmse': 0.47056543827056885, 'eval_runtime': 113.7927, 'eval_samples_per_second': 18.077, 'eval_steps_per_second': 9.043, 'epoch': 2.82}
{'loss': 0.1122, 'learning_rate': 5.896632732967893e-07, 'epoch': 2.82}
{'loss': 0.0772, 'learning_rate': 5.877055599060298e-07, 'epoch': 2.83}
{'loss': 0.105, 'learning_rate': 5.857478465152701e-07, 'epoch': 2.83}
{'loss': 0.1239, 'learning_rate': 5.837901331245106e-07, 'epoch': 2.83}
{'loss': 0.1157, 'learning_rate': 5.818324197337509e-07, 'epoch': 2.84}
{'loss': 0.1045, 'learning_rate': 5.798747063429914e-07, 'epoch': 2.84}
{'loss': 0.1305, 'learning_rate': 5.779169929522317e-07, 'epoch': 2.85}
{'loss': 0.1691, 'lear

{'eval_loss': 0.25217801332473755, 'eval_content_rmse': 0.42506369948387146, 'eval_wording_rmse': 0.5689263939857483, 'eval_mcrmse': 0.4969950318336487, 'eval_runtime': 113.6374, 'eval_samples_per_second': 18.101, 'eval_steps_per_second': 9.055, 'epoch': 3.17}
{'loss': 0.1444, 'learning_rate': 4.13469068128426e-07, 'epoch': 3.18}
{'loss': 0.0757, 'learning_rate': 4.115113547376664e-07, 'epoch': 3.18}
{'loss': 0.1375, 'learning_rate': 4.095536413469068e-07, 'epoch': 3.18}
{'loss': 0.1859, 'learning_rate': 4.075959279561472e-07, 'epoch': 3.19}
{'loss': 0.1208, 'learning_rate': 4.056382145653876e-07, 'epoch': 3.19}
{'loss': 0.0744, 'learning_rate': 4.0368050117462806e-07, 'epoch': 3.19}
{'loss': 0.0786, 'learning_rate': 4.017227877838684e-07, 'epoch': 3.2}
{'loss': 0.1067, 'learning_rate': 3.997650743931088e-07, 'epoch': 3.2}
{'loss': 0.1429, 'learning_rate': 3.9780736100234924e-07, 'epoch': 3.21}
{'loss': 0.273, 'learning_rate': 3.9584964761158964e-07, 'epoch': 3.21}
{'loss': 0.1463, 'le

{'loss': 0.121, 'learning_rate': 2.374706342991386e-07, 'epoch': 3.53}
{'loss': 0.0924, 'learning_rate': 2.3551292090837902e-07, 'epoch': 3.53}
{'loss': 0.1673, 'learning_rate': 2.335552075176194e-07, 'epoch': 3.54}
{'loss': 0.0983, 'learning_rate': 2.315974941268598e-07, 'epoch': 3.54}
{'loss': 0.1051, 'learning_rate': 2.2963978073610022e-07, 'epoch': 3.54}
{'loss': 0.0888, 'learning_rate': 2.2768206734534062e-07, 'epoch': 3.55}
{'loss': 0.13, 'learning_rate': 2.2572435395458104e-07, 'epoch': 3.55}
{'loss': 0.0767, 'learning_rate': 2.2376664056382143e-07, 'epoch': 3.56}
{'loss': 0.1514, 'learning_rate': 2.2180892717306185e-07, 'epoch': 3.56}
{'loss': 0.073, 'learning_rate': 2.1985121378230225e-07, 'epoch': 3.56}
{'loss': 0.1896, 'learning_rate': 2.1789350039154267e-07, 'epoch': 3.57}
{'loss': 0.2346, 'learning_rate': 2.1593578700078309e-07, 'epoch': 3.57}
{'loss': 0.0995, 'learning_rate': 2.1397807361002348e-07, 'epoch': 3.57}
{'loss': 0.1193, 'learning_rate': 2.120203602192639e-07, '

{'loss': 0.0823, 'learning_rate': 5.736100234925607e-08, 'epoch': 3.89}
{'loss': 0.1132, 'learning_rate': 5.540328895849648e-08, 'epoch': 3.89}
{'loss': 0.1505, 'learning_rate': 5.344557556773688e-08, 'epoch': 3.9}
{'loss': 0.1342, 'learning_rate': 5.1487862176977284e-08, 'epoch': 3.9}
{'loss': 0.1057, 'learning_rate': 4.95301487862177e-08, 'epoch': 3.9}
{'loss': 0.0811, 'learning_rate': 4.7572435395458104e-08, 'epoch': 3.91}
{'loss': 0.1531, 'learning_rate': 4.5614722004698504e-08, 'epoch': 3.91}
{'loss': 0.1568, 'learning_rate': 4.365700861393892e-08, 'epoch': 3.92}
{'loss': 0.112, 'learning_rate': 4.1699295223179324e-08, 'epoch': 3.92}
{'loss': 0.0894, 'learning_rate': 3.974158183241973e-08, 'epoch': 3.92}
{'loss': 0.1234, 'learning_rate': 3.7783868441660145e-08, 'epoch': 3.93}
{'loss': 0.1317, 'learning_rate': 3.5826155050900545e-08, 'epoch': 3.93}
{'loss': 0.1665, 'learning_rate': 3.386844166014095e-08, 'epoch': 3.94}
{'eval_loss': 0.2415931075811386, 'eval_content_rmse': 0.410585

In [17]:
from pathlib import Path

seed = 42

fold = 2

output = f"output_fold{fold}_seed{seed}_3008"

!python train.py \
  --model_name_or_path "microsoft/deberta-v3-large" \
  --add_prompt_question True \
  --fold $fold \
  --data_dir "./" \
  --output_dir $output \
  --fp16 \
  --num_train_epochs 4 \
  --dataloader_num_workers 4 \
  --learning_rate 2e-6 \
  --weight_decay 0.01 \
  --warmup_ratio 0 \
  --optim "adamw_torch" \
  --per_device_train_batch_size 2 \
  --per_device_eval_batch_size 2 \
  --evaluation_strategy "steps" \
  --eval_steps 150 \
  --save_strategy "steps" \
  --save_steps 150 \
  --save_total_limit 1 \
  --report_to "wandb" \
  --metric_for_best_model "mcrmse" \
  --greater_is_better False \
  --logging_steps 10 \
  --log_level "error" \
  --disable_tqdm True \
  --ddp_find_unused_parameters False \
  --dropout 0 \
  --seed $seed


output_dir = Path.cwd() / output
# add json files
for json_file in output_dir.glob("checkpoint*/*token*.json"):
    json_file.rename(output_dir/json_file.name)

# model files
for model_file in output_dir.glob("checkpoint*/*model*"):
    model_file.rename(output_dir/model_file.name)

# remove optimizer states and other files
to_delete = str(list(output_dir.glob("checkpoint*"))[0])
!rm -r $to_delete

[34m[1mwandb[0m: Currently logged in as: [33mpeng_sun[0m. Use [1m`wandb login --relogin`[0m to force relogin
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_probs_dropout_prob": 0.0,
  "cfg": {
    "add_prompt_question": true,
    "add_prompt_text": false,
    "data_dir": "./",
    "dropout": 0.0,
    "fold": 2,
    "max_position_embeddings": 2048,
    "max_seq_length": 2048,
    "model_name_or_path": "microsoft/deberta-v3-large",
    "num_proc": 4
  },
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 2048,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "po

{'loss': 0.2426, 'learning_rate': 1.8576415826221877e-06, 'epoch': 0.29}
{'loss': 0.1579, 'learning_rate': 1.855702094647013e-06, 'epoch': 0.29}
{'eval_loss': 0.38325920701026917, 'eval_content_rmse': 0.5728476047515869, 'eval_wording_rmse': 0.6620907783508301, 'eval_mcrmse': 0.6174691915512085, 'eval_runtime': 109.6662, 'eval_samples_per_second': 18.319, 'eval_steps_per_second': 9.164, 'epoch': 0.29}
{'loss': 0.2838, 'learning_rate': 1.8537626066718386e-06, 'epoch': 0.29}
{'loss': 0.3849, 'learning_rate': 1.851823118696664e-06, 'epoch': 0.3}
{'loss': 0.2408, 'learning_rate': 1.8498836307214893e-06, 'epoch': 0.3}
{'loss': 0.2549, 'learning_rate': 1.847944142746315e-06, 'epoch': 0.31}
{'loss': 0.186, 'learning_rate': 1.8460046547711404e-06, 'epoch': 0.31}
{'loss': 0.184, 'learning_rate': 1.8440651667959657e-06, 'epoch': 0.31}
{'loss': 0.2362, 'learning_rate': 1.8421256788207913e-06, 'epoch': 0.32}
{'loss': 0.3885, 'learning_rate': 1.8401861908456166e-06, 'epoch': 0.32}
{'loss': 0.2507, 

{'eval_loss': 0.3252255320549011, 'eval_content_rmse': 0.47312748432159424, 'eval_wording_rmse': 0.6531474590301514, 'eval_mcrmse': 0.5631374716758728, 'eval_runtime': 109.9484, 'eval_samples_per_second': 18.272, 'eval_steps_per_second': 9.141, 'epoch': 0.64}
{'loss': 0.2471, 'learning_rate': 1.6792086889061289e-06, 'epoch': 0.64}
{'loss': 0.2731, 'learning_rate': 1.6772692009309541e-06, 'epoch': 0.65}
{'loss': 0.2309, 'learning_rate': 1.6753297129557796e-06, 'epoch': 0.65}
{'loss': 0.1922, 'learning_rate': 1.673390224980605e-06, 'epoch': 0.66}
{'loss': 0.1263, 'learning_rate': 1.6714507370054305e-06, 'epoch': 0.66}
{'loss': 0.1744, 'learning_rate': 1.669511249030256e-06, 'epoch': 0.66}
{'loss': 0.3086, 'learning_rate': 1.6675717610550814e-06, 'epoch': 0.67}
{'loss': 0.2654, 'learning_rate': 1.6656322730799068e-06, 'epoch': 0.67}
{'loss': 0.1662, 'learning_rate': 1.6636927851047323e-06, 'epoch': 0.67}
{'loss': 0.2773, 'learning_rate': 1.6617532971295577e-06, 'epoch': 0.68}
{'loss': 0.3

{'loss': 0.1325, 'learning_rate': 1.504654771140419e-06, 'epoch': 0.99}
{'loss': 0.115, 'learning_rate': 1.5027152831652444e-06, 'epoch': 1.0}
{'loss': 0.2208, 'learning_rate': 1.5007757951900696e-06, 'epoch': 1.0}
{'loss': 0.2696, 'learning_rate': 1.4988363072148953e-06, 'epoch': 1.0}
{'loss': 0.1526, 'learning_rate': 1.4968968192397207e-06, 'epoch': 1.01}
{'loss': 0.1333, 'learning_rate': 1.494957331264546e-06, 'epoch': 1.01}
{'loss': 0.1988, 'learning_rate': 1.4930178432893716e-06, 'epoch': 1.02}
{'loss': 0.0916, 'learning_rate': 1.491078355314197e-06, 'epoch': 1.02}
{'loss': 0.13, 'learning_rate': 1.4891388673390223e-06, 'epoch': 1.02}
{'loss': 0.1618, 'learning_rate': 1.487199379363848e-06, 'epoch': 1.03}
{'loss': 0.2081, 'learning_rate': 1.4852598913886734e-06, 'epoch': 1.03}
{'loss': 0.2318, 'learning_rate': 1.4833204034134987e-06, 'epoch': 1.04}
{'loss': 0.2987, 'learning_rate': 1.4813809154383241e-06, 'epoch': 1.04}
{'loss': 0.1431, 'learning_rate': 1.4794414274631498e-06, 'ep

{'loss': 0.1774, 'learning_rate': 1.3262218774243599e-06, 'epoch': 1.35}
{'loss': 0.1818, 'learning_rate': 1.3242823894491853e-06, 'epoch': 1.35}
{'loss': 0.1679, 'learning_rate': 1.3223429014740108e-06, 'epoch': 1.36}
{'loss': 0.0922, 'learning_rate': 1.3204034134988362e-06, 'epoch': 1.36}
{'loss': 0.2879, 'learning_rate': 1.3184639255236617e-06, 'epoch': 1.37}
{'loss': 0.2161, 'learning_rate': 1.3165244375484871e-06, 'epoch': 1.37}
{'loss': 0.1753, 'learning_rate': 1.3145849495733126e-06, 'epoch': 1.37}
{'loss': 0.1552, 'learning_rate': 1.312645461598138e-06, 'epoch': 1.38}
{'loss': 0.111, 'learning_rate': 1.3107059736229635e-06, 'epoch': 1.38}
{'loss': 0.1664, 'learning_rate': 1.308766485647789e-06, 'epoch': 1.38}
{'loss': 0.1473, 'learning_rate': 1.3068269976726144e-06, 'epoch': 1.39}
{'loss': 0.4242, 'learning_rate': 1.3048875096974398e-06, 'epoch': 1.39}
{'loss': 0.2703, 'learning_rate': 1.3029480217222653e-06, 'epoch': 1.4}
{'eval_loss': 0.3071252703666687, 'eval_content_rmse': 

{'loss': 0.2629, 'learning_rate': 1.147788983708301e-06, 'epoch': 1.71}
{'loss': 0.1479, 'learning_rate': 1.1458494957331265e-06, 'epoch': 1.71}
{'loss': 0.203, 'learning_rate': 1.1439100077579517e-06, 'epoch': 1.71}
{'loss': 0.1227, 'learning_rate': 1.1419705197827774e-06, 'epoch': 1.72}
{'loss': 0.1623, 'learning_rate': 1.1400310318076028e-06, 'epoch': 1.72}
{'loss': 0.2073, 'learning_rate': 1.138091543832428e-06, 'epoch': 1.73}
{'loss': 0.3331, 'learning_rate': 1.1361520558572537e-06, 'epoch': 1.73}
{'loss': 0.1825, 'learning_rate': 1.1342125678820792e-06, 'epoch': 1.73}
{'loss': 0.1416, 'learning_rate': 1.1322730799069044e-06, 'epoch': 1.74}
{'loss': 0.4731, 'learning_rate': 1.13033359193173e-06, 'epoch': 1.74}
{'loss': 0.1712, 'learning_rate': 1.1283941039565553e-06, 'epoch': 1.75}
{'eval_loss': 0.3130057752132416, 'eval_content_rmse': 0.4667176604270935, 'eval_wording_rmse': 0.6388945579528809, 'eval_mcrmse': 0.5528061389923096, 'eval_runtime': 109.7115, 'eval_samples_per_second'

{'loss': 0.0996, 'learning_rate': 9.69356089992242e-07, 'epoch': 2.06}
{'loss': 0.1791, 'learning_rate': 9.674166020170674e-07, 'epoch': 2.07}
{'loss': 0.1582, 'learning_rate': 9.654771140418929e-07, 'epoch': 2.07}
{'loss': 0.1765, 'learning_rate': 9.635376260667183e-07, 'epoch': 2.08}
{'loss': 0.1212, 'learning_rate': 9.615981380915438e-07, 'epoch': 2.08}
{'loss': 0.1332, 'learning_rate': 9.596586501163692e-07, 'epoch': 2.08}
{'loss': 0.1743, 'learning_rate': 9.577191621411947e-07, 'epoch': 2.09}
{'loss': 0.0911, 'learning_rate': 9.557796741660201e-07, 'epoch': 2.09}
{'loss': 0.1073, 'learning_rate': 9.538401861908456e-07, 'epoch': 2.09}
{'eval_loss': 0.30439990758895874, 'eval_content_rmse': 0.4854724407196045, 'eval_wording_rmse': 0.6108320951461792, 'eval_mcrmse': 0.5481522679328918, 'eval_runtime': 109.425, 'eval_samples_per_second': 18.36, 'eval_steps_per_second': 9.184, 'epoch': 2.09}
{'loss': 0.1724, 'learning_rate': 9.519006982156709e-07, 'epoch': 2.1}
{'loss': 0.1449, 'learni

{'loss': 0.2133, 'learning_rate': 7.889837083010085e-07, 'epoch': 2.42}
{'loss': 0.1725, 'learning_rate': 7.870442203258339e-07, 'epoch': 2.43}
{'loss': 0.172, 'learning_rate': 7.851047323506595e-07, 'epoch': 2.43}
{'loss': 0.0676, 'learning_rate': 7.831652443754848e-07, 'epoch': 2.44}
{'loss': 0.1719, 'learning_rate': 7.812257564003103e-07, 'epoch': 2.44}
{'loss': 0.1406, 'learning_rate': 7.792862684251357e-07, 'epoch': 2.44}
{'eval_loss': 0.3051133453845978, 'eval_content_rmse': 0.4672282040119171, 'eval_wording_rmse': 0.6260384321212769, 'eval_mcrmse': 0.5466333031654358, 'eval_runtime': 109.3997, 'eval_samples_per_second': 18.364, 'eval_steps_per_second': 9.186, 'epoch': 2.44}
{'loss': 0.1104, 'learning_rate': 7.773467804499612e-07, 'epoch': 2.45}
{'loss': 0.1679, 'learning_rate': 7.754072924747866e-07, 'epoch': 2.45}
{'loss': 0.2436, 'learning_rate': 7.73467804499612e-07, 'epoch': 2.46}
{'loss': 0.105, 'learning_rate': 7.715283165244375e-07, 'epoch': 2.46}
{'loss': 0.1275, 'learni

{'loss': 0.1278, 'learning_rate': 6.08611326609775e-07, 'epoch': 2.79}
{'loss': 0.1071, 'learning_rate': 6.066718386346004e-07, 'epoch': 2.79}
{'loss': 0.1205, 'learning_rate': 6.047323506594259e-07, 'epoch': 2.79}
{'eval_loss': 0.30712977051734924, 'eval_content_rmse': 0.4556184411048889, 'eval_wording_rmse': 0.6377080082893372, 'eval_mcrmse': 0.546663224697113, 'eval_runtime': 109.4814, 'eval_samples_per_second': 18.35, 'eval_steps_per_second': 9.18, 'epoch': 2.79}
{'loss': 0.0977, 'learning_rate': 6.027928626842513e-07, 'epoch': 2.8}
{'loss': 0.2067, 'learning_rate': 6.008533747090768e-07, 'epoch': 2.8}
{'loss': 0.1359, 'learning_rate': 5.989138867339022e-07, 'epoch': 2.8}
{'loss': 0.2475, 'learning_rate': 5.969743987587277e-07, 'epoch': 2.81}
{'loss': 0.1191, 'learning_rate': 5.950349107835531e-07, 'epoch': 2.81}
{'loss': 0.1381, 'learning_rate': 5.930954228083785e-07, 'epoch': 2.82}
{'loss': 0.1404, 'learning_rate': 5.91155934833204e-07, 'epoch': 2.82}
{'loss': 0.2812, 'learning_r

{'eval_loss': 0.30563703179359436, 'eval_content_rmse': 0.47258082032203674, 'eval_wording_rmse': 0.6228494644165039, 'eval_mcrmse': 0.5477151274681091, 'eval_runtime': 110.1506, 'eval_samples_per_second': 18.239, 'eval_steps_per_second': 9.124, 'epoch': 3.14}
{'loss': 0.0802, 'learning_rate': 4.282389449185415e-07, 'epoch': 3.15}
{'loss': 0.1798, 'learning_rate': 4.2629945694336694e-07, 'epoch': 3.15}
{'loss': 0.0644, 'learning_rate': 4.243599689681924e-07, 'epoch': 3.15}
{'loss': 0.296, 'learning_rate': 4.224204809930178e-07, 'epoch': 3.16}
{'loss': 0.1012, 'learning_rate': 4.204809930178433e-07, 'epoch': 3.16}
{'loss': 0.0808, 'learning_rate': 4.185415050426687e-07, 'epoch': 3.17}
{'loss': 0.2764, 'learning_rate': 4.1660201706749415e-07, 'epoch': 3.17}
{'loss': 0.0855, 'learning_rate': 4.1466252909231965e-07, 'epoch': 3.17}
{'loss': 0.1462, 'learning_rate': 4.1272304111714505e-07, 'epoch': 3.18}
{'loss': 0.1409, 'learning_rate': 4.107835531419705e-07, 'epoch': 3.18}
{'loss': 0.108, 

{'loss': 0.1546, 'learning_rate': 2.5368502715283164e-07, 'epoch': 3.49}
{'loss': 0.092, 'learning_rate': 2.5174553917765704e-07, 'epoch': 3.5}
{'loss': 0.115, 'learning_rate': 2.4980605120248254e-07, 'epoch': 3.5}
{'loss': 0.1316, 'learning_rate': 2.47866563227308e-07, 'epoch': 3.51}
{'loss': 0.156, 'learning_rate': 2.4592707525213345e-07, 'epoch': 3.51}
{'loss': 0.0664, 'learning_rate': 2.4398758727695885e-07, 'epoch': 3.51}
{'loss': 0.0734, 'learning_rate': 2.420480993017843e-07, 'epoch': 3.52}
{'loss': 0.1693, 'learning_rate': 2.4010861132660975e-07, 'epoch': 3.52}
{'loss': 0.1525, 'learning_rate': 2.381691233514352e-07, 'epoch': 3.53}
{'loss': 0.1163, 'learning_rate': 2.3622963537626066e-07, 'epoch': 3.53}
{'loss': 0.0864, 'learning_rate': 2.342901474010861e-07, 'epoch': 3.53}
{'loss': 0.0877, 'learning_rate': 2.3235065942591156e-07, 'epoch': 3.54}
{'loss': 0.0861, 'learning_rate': 2.3041117145073699e-07, 'epoch': 3.54}
{'loss': 0.0732, 'learning_rate': 2.2847168347556244e-07, 'ep

{'loss': 0.12, 'learning_rate': 7.525213343677268e-08, 'epoch': 3.85}
{'loss': 0.1172, 'learning_rate': 7.331264546159815e-08, 'epoch': 3.86}
{'loss': 0.1518, 'learning_rate': 7.137315748642359e-08, 'epoch': 3.86}
{'loss': 0.0756, 'learning_rate': 6.943366951124902e-08, 'epoch': 3.86}
{'loss': 0.1433, 'learning_rate': 6.749418153607448e-08, 'epoch': 3.87}
{'loss': 0.1199, 'learning_rate': 6.555469356089992e-08, 'epoch': 3.87}
{'loss': 0.1228, 'learning_rate': 6.361520558572537e-08, 'epoch': 3.88}
{'loss': 0.0704, 'learning_rate': 6.167571761055082e-08, 'epoch': 3.88}
{'loss': 0.0979, 'learning_rate': 5.973622963537626e-08, 'epoch': 3.88}
{'loss': 0.1362, 'learning_rate': 5.7796741660201704e-08, 'epoch': 3.89}
{'loss': 0.1433, 'learning_rate': 5.5857253685027156e-08, 'epoch': 3.89}
{'loss': 0.108, 'learning_rate': 5.3917765709852594e-08, 'epoch': 3.89}
{'loss': 0.2088, 'learning_rate': 5.197827773467804e-08, 'epoch': 3.9}
{'eval_loss': 0.29552027583122253, 'eval_content_rmse': 0.4716262

In [18]:
from pathlib import Path

seed = 42

fold = 3

output = f"output_fold{fold}_seed{seed}_3008"

!python train.py \
  --model_name_or_path "microsoft/deberta-v3-large" \
  --add_prompt_question True \
  --fold $fold \
  --data_dir "./" \
  --output_dir $output \
  --fp16 \
  --num_train_epochs 4 \
  --dataloader_num_workers 4 \
  --learning_rate 2e-6 \
  --weight_decay 0.01 \
  --warmup_ratio 0 \
  --optim "adamw_torch" \
  --per_device_train_batch_size 2 \
  --per_device_eval_batch_size 2 \
  --evaluation_strategy "steps" \
  --eval_steps 150 \
  --save_strategy "steps" \
  --save_steps 150 \
  --save_total_limit 1 \
  --report_to "wandb" \
  --metric_for_best_model "mcrmse" \
  --greater_is_better False \
  --logging_steps 10 \
  --log_level "error" \
  --disable_tqdm True \
  --ddp_find_unused_parameters False \
  --dropout 0 \
  --seed $seed


output_dir = Path.cwd() / output
# add json files
for json_file in output_dir.glob("checkpoint*/*token*.json"):
    json_file.rename(output_dir/json_file.name)

# model files
for model_file in output_dir.glob("checkpoint*/*model*"):
    model_file.rename(output_dir/model_file.name)

# remove optimizer states and other files
to_delete = str(list(output_dir.glob("checkpoint*"))[0])
!rm -r $to_delete

[34m[1mwandb[0m: Currently logged in as: [33mpeng_sun[0m. Use [1m`wandb login --relogin`[0m to force relogin
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_probs_dropout_prob": 0.0,
  "cfg": {
    "add_prompt_question": true,
    "add_prompt_text": false,
    "data_dir": "./",
    "dropout": 0.0,
    "fold": 3,
    "max_position_embeddings": 2048,
    "max_seq_length": 2048,
    "model_name_or_path": "microsoft/deberta-v3-large",
    "num_proc": 4
  },
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 2048,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "po

{'loss': 0.157, 'learning_rate': 1.857833655705996e-06, 'epoch': 0.29}
{'loss': 0.5623, 'learning_rate': 1.8558994197292069e-06, 'epoch': 0.29}
{'eval_loss': 0.3329795300960541, 'eval_content_rmse': 0.4875447154045105, 'eval_wording_rmse': 0.6544152498245239, 'eval_mcrmse': 0.5709799528121948, 'eval_runtime': 227.0666, 'eval_samples_per_second': 8.79, 'eval_steps_per_second': 4.395, 'epoch': 0.29}
{'loss': 0.3006, 'learning_rate': 1.8539651837524176e-06, 'epoch': 0.29}
{'loss': 0.2668, 'learning_rate': 1.8520309477756286e-06, 'epoch': 0.3}
{'loss': 0.5491, 'learning_rate': 1.8500967117988392e-06, 'epoch': 0.3}
{'loss': 0.3555, 'learning_rate': 1.8481624758220502e-06, 'epoch': 0.31}
{'loss': 0.4975, 'learning_rate': 1.846228239845261e-06, 'epoch': 0.31}
{'loss': 0.3031, 'learning_rate': 1.8442940038684718e-06, 'epoch': 0.31}
{'loss': 0.3745, 'learning_rate': 1.8423597678916826e-06, 'epoch': 0.32}
{'loss': 0.3178, 'learning_rate': 1.8404255319148936e-06, 'epoch': 0.32}
{'loss': 0.4941, '

{'eval_loss': 0.24717791378498077, 'eval_content_rmse': 0.4507507085800171, 'eval_wording_rmse': 0.5396105051040649, 'eval_mcrmse': 0.495180606842041, 'eval_runtime': 226.9036, 'eval_samples_per_second': 8.797, 'eval_steps_per_second': 4.398, 'epoch': 0.64}
{'loss': 0.18, 'learning_rate': 1.6798839458413924e-06, 'epoch': 0.64}
{'loss': 0.2622, 'learning_rate': 1.6779497098646034e-06, 'epoch': 0.65}
{'loss': 0.2075, 'learning_rate': 1.6760154738878142e-06, 'epoch': 0.65}
{'loss': 0.3127, 'learning_rate': 1.6740812379110252e-06, 'epoch': 0.65}
{'loss': 0.1291, 'learning_rate': 1.6721470019342358e-06, 'epoch': 0.66}
{'loss': 0.2076, 'learning_rate': 1.6702127659574468e-06, 'epoch': 0.66}
{'loss': 0.1587, 'learning_rate': 1.6682785299806576e-06, 'epoch': 0.67}
{'loss': 0.2102, 'learning_rate': 1.6663442940038684e-06, 'epoch': 0.67}
{'loss': 0.2398, 'learning_rate': 1.6644100580270792e-06, 'epoch': 0.67}
{'loss': 0.1806, 'learning_rate': 1.6624758220502902e-06, 'epoch': 0.68}
{'loss': 0.244

{'loss': 0.3077, 'learning_rate': 1.5058027079303675e-06, 'epoch': 0.99}
{'loss': 0.297, 'learning_rate': 1.5038684719535785e-06, 'epoch': 0.99}
{'loss': 0.2805, 'learning_rate': 1.501934235976789e-06, 'epoch': 1.0}
{'loss': 0.3233, 'learning_rate': 1.5e-06, 'epoch': 1.0}
{'loss': 0.2794, 'learning_rate': 1.4980657640232108e-06, 'epoch': 1.01}
{'loss': 0.3965, 'learning_rate': 1.4961315280464214e-06, 'epoch': 1.01}
{'loss': 0.1858, 'learning_rate': 1.4941972920696324e-06, 'epoch': 1.01}
{'loss': 0.3066, 'learning_rate': 1.4922630560928434e-06, 'epoch': 1.02}
{'loss': 0.136, 'learning_rate': 1.490328820116054e-06, 'epoch': 1.02}
{'loss': 0.2412, 'learning_rate': 1.488394584139265e-06, 'epoch': 1.03}
{'loss': 0.0831, 'learning_rate': 1.4864603481624758e-06, 'epoch': 1.03}
{'loss': 0.1837, 'learning_rate': 1.4845261121856868e-06, 'epoch': 1.03}
{'loss': 0.219, 'learning_rate': 1.4825918762088973e-06, 'epoch': 1.04}
{'loss': 0.263, 'learning_rate': 1.4806576402321083e-06, 'epoch': 1.04}
{'

{'loss': 0.1109, 'learning_rate': 1.327852998065764e-06, 'epoch': 1.35}
{'loss': 0.2289, 'learning_rate': 1.3259187620889746e-06, 'epoch': 1.35}
{'loss': 0.2109, 'learning_rate': 1.3239845261121856e-06, 'epoch': 1.35}
{'loss': 0.1607, 'learning_rate': 1.3220502901353964e-06, 'epoch': 1.36}
{'loss': 0.1404, 'learning_rate': 1.3201160541586074e-06, 'epoch': 1.36}
{'loss': 0.2171, 'learning_rate': 1.318181818181818e-06, 'epoch': 1.37}
{'loss': 0.1467, 'learning_rate': 1.316247582205029e-06, 'epoch': 1.37}
{'loss': 0.2223, 'learning_rate': 1.3143133462282398e-06, 'epoch': 1.37}
{'loss': 0.1734, 'learning_rate': 1.3123791102514506e-06, 'epoch': 1.38}
{'loss': 0.2035, 'learning_rate': 1.3104448742746614e-06, 'epoch': 1.38}
{'loss': 0.3699, 'learning_rate': 1.3085106382978724e-06, 'epoch': 1.38}
{'loss': 0.2058, 'learning_rate': 1.306576402321083e-06, 'epoch': 1.39}
{'loss': 0.2341, 'learning_rate': 1.304642166344294e-06, 'epoch': 1.39}
{'eval_loss': 0.21068039536476135, 'eval_content_rmse': 

{'loss': 0.1962, 'learning_rate': 1.1499032882011606e-06, 'epoch': 1.7}
{'loss': 0.2379, 'learning_rate': 1.1479690522243712e-06, 'epoch': 1.71}
{'loss': 0.147, 'learning_rate': 1.1460348162475822e-06, 'epoch': 1.71}
{'loss': 0.1529, 'learning_rate': 1.144100580270793e-06, 'epoch': 1.71}
{'loss': 0.193, 'learning_rate': 1.1421663442940038e-06, 'epoch': 1.72}
{'loss': 0.1789, 'learning_rate': 1.1402321083172146e-06, 'epoch': 1.72}
{'loss': 0.2724, 'learning_rate': 1.1382978723404256e-06, 'epoch': 1.73}
{'loss': 0.1875, 'learning_rate': 1.1363636363636364e-06, 'epoch': 1.73}
{'loss': 0.2936, 'learning_rate': 1.1344294003868472e-06, 'epoch': 1.73}
{'loss': 0.3505, 'learning_rate': 1.132495164410058e-06, 'epoch': 1.74}
{'loss': 0.1407, 'learning_rate': 1.130560928433269e-06, 'epoch': 1.74}
{'eval_loss': 0.19701862335205078, 'eval_content_rmse': 0.4046471118927002, 'eval_wording_rmse': 0.47989365458488464, 'eval_mcrmse': 0.4422703981399536, 'eval_runtime': 227.0426, 'eval_samples_per_second

{'loss': 0.1181, 'learning_rate': 9.71953578336557e-07, 'epoch': 2.06}
{'loss': 0.1063, 'learning_rate': 9.700193423597678e-07, 'epoch': 2.06}
{'loss': 0.1899, 'learning_rate': 9.680851063829786e-07, 'epoch': 2.07}
{'loss': 0.1607, 'learning_rate': 9.661508704061894e-07, 'epoch': 2.07}
{'loss': 0.0814, 'learning_rate': 9.642166344294004e-07, 'epoch': 2.07}
{'loss': 0.2528, 'learning_rate': 9.622823984526112e-07, 'epoch': 2.08}
{'loss': 0.2426, 'learning_rate': 9.60348162475822e-07, 'epoch': 2.08}
{'loss': 0.1552, 'learning_rate': 9.584139264990328e-07, 'epoch': 2.09}
{'loss': 0.2991, 'learning_rate': 9.564796905222435e-07, 'epoch': 2.09}
{'eval_loss': 0.23223882913589478, 'eval_content_rmse': 0.4727170169353485, 'eval_wording_rmse': 0.49093419313430786, 'eval_mcrmse': 0.481825590133667, 'eval_runtime': 227.0366, 'eval_samples_per_second': 8.792, 'eval_steps_per_second': 4.396, 'epoch': 2.09}
{'loss': 0.1357, 'learning_rate': 9.545454545454546e-07, 'epoch': 2.09}
{'loss': 0.1181, 'learn

{'loss': 0.1742, 'learning_rate': 7.920696324951643e-07, 'epoch': 2.42}
{'loss': 0.1111, 'learning_rate': 7.901353965183752e-07, 'epoch': 2.42}
{'loss': 0.2144, 'learning_rate': 7.88201160541586e-07, 'epoch': 2.43}
{'loss': 0.1058, 'learning_rate': 7.862669245647968e-07, 'epoch': 2.43}
{'loss': 0.1004, 'learning_rate': 7.843326885880077e-07, 'epoch': 2.43}
{'loss': 0.1948, 'learning_rate': 7.823984526112185e-07, 'epoch': 2.44}
{'eval_loss': 0.21222645044326782, 'eval_content_rmse': 0.41108784079551697, 'eval_wording_rmse': 0.5054301619529724, 'eval_mcrmse': 0.4582589864730835, 'eval_runtime': 227.1547, 'eval_samples_per_second': 8.787, 'eval_steps_per_second': 4.393, 'epoch': 2.44}
{'loss': 0.1627, 'learning_rate': 7.804642166344294e-07, 'epoch': 2.44}
{'loss': 0.1884, 'learning_rate': 7.785299806576401e-07, 'epoch': 2.44}
{'loss': 0.1734, 'learning_rate': 7.765957446808509e-07, 'epoch': 2.45}
{'loss': 0.1403, 'learning_rate': 7.746615087040619e-07, 'epoch': 2.45}
{'loss': 0.1106, 'lea

{'loss': 0.1868, 'learning_rate': 6.123791102514506e-07, 'epoch': 2.78}
{'loss': 0.1498, 'learning_rate': 6.104448742746615e-07, 'epoch': 2.78}
{'loss': 0.1539, 'learning_rate': 6.085106382978723e-07, 'epoch': 2.79}
{'eval_loss': 0.21563953161239624, 'eval_content_rmse': 0.4238651692867279, 'eval_wording_rmse': 0.5016148686408997, 'eval_mcrmse': 0.4627400040626526, 'eval_runtime': 227.5001, 'eval_samples_per_second': 8.774, 'eval_steps_per_second': 4.387, 'epoch': 2.79}
{'loss': 0.1091, 'learning_rate': 6.065764023210831e-07, 'epoch': 2.79}
{'loss': 0.1675, 'learning_rate': 6.04642166344294e-07, 'epoch': 2.79}
{'loss': 0.2453, 'learning_rate': 6.027079303675048e-07, 'epoch': 2.8}
{'loss': 0.1641, 'learning_rate': 6.007736943907157e-07, 'epoch': 2.8}
{'loss': 0.1763, 'learning_rate': 5.988394584139264e-07, 'epoch': 2.8}
{'loss': 0.1353, 'learning_rate': 5.969052224371372e-07, 'epoch': 2.81}
{'loss': 0.2148, 'learning_rate': 5.949709864603481e-07, 'epoch': 2.81}
{'loss': 0.2919, 'learnin

{'eval_loss': 0.2053852379322052, 'eval_content_rmse': 0.4101869761943817, 'eval_wording_rmse': 0.4924601912498474, 'eval_mcrmse': 0.45132356882095337, 'eval_runtime': 227.4344, 'eval_samples_per_second': 8.776, 'eval_steps_per_second': 4.388, 'epoch': 3.13}
{'loss': 0.1289, 'learning_rate': 4.32495164410058e-07, 'epoch': 3.14}
{'loss': 0.1589, 'learning_rate': 4.3056092843326883e-07, 'epoch': 3.14}
{'loss': 0.092, 'learning_rate': 4.286266924564796e-07, 'epoch': 3.15}
{'loss': 0.1028, 'learning_rate': 4.266924564796905e-07, 'epoch': 3.15}
{'loss': 0.1889, 'learning_rate': 4.2475822050290136e-07, 'epoch': 3.15}
{'loss': 0.1, 'learning_rate': 4.228239845261122e-07, 'epoch': 3.16}
{'loss': 0.1656, 'learning_rate': 4.2088974854932304e-07, 'epoch': 3.16}
{'loss': 0.0894, 'learning_rate': 4.1895551257253383e-07, 'epoch': 3.16}
{'loss': 0.1275, 'learning_rate': 4.1702127659574467e-07, 'epoch': 3.17}
{'loss': 0.1268, 'learning_rate': 4.150870406189555e-07, 'epoch': 3.17}
{'loss': 0.1096, 'lea

{'loss': 0.1232, 'learning_rate': 2.584139264990329e-07, 'epoch': 3.49}
{'loss': 0.1268, 'learning_rate': 2.5647969052224374e-07, 'epoch': 3.49}
{'loss': 0.0772, 'learning_rate': 2.5454545454545453e-07, 'epoch': 3.49}
{'loss': 0.1363, 'learning_rate': 2.5261121856866537e-07, 'epoch': 3.5}
{'loss': 0.1253, 'learning_rate': 2.506769825918762e-07, 'epoch': 3.5}
{'loss': 0.2162, 'learning_rate': 2.4874274661508705e-07, 'epoch': 3.5}
{'loss': 0.1907, 'learning_rate': 2.4680851063829784e-07, 'epoch': 3.51}
{'loss': 0.0816, 'learning_rate': 2.448742746615087e-07, 'epoch': 3.51}
{'loss': 0.098, 'learning_rate': 2.4294003868471953e-07, 'epoch': 3.52}
{'loss': 0.1725, 'learning_rate': 2.4100580270793037e-07, 'epoch': 3.52}
{'loss': 0.1766, 'learning_rate': 2.3907156673114116e-07, 'epoch': 3.52}
{'loss': 0.1665, 'learning_rate': 2.3713733075435203e-07, 'epoch': 3.53}
{'loss': 0.0923, 'learning_rate': 2.3520309477756284e-07, 'epoch': 3.53}
{'loss': 0.2079, 'learning_rate': 2.3326885880077368e-07, 

{'loss': 0.1466, 'learning_rate': 8.065764023210832e-08, 'epoch': 3.84}
{'loss': 0.1599, 'learning_rate': 7.872340425531915e-08, 'epoch': 3.85}
{'loss': 0.0982, 'learning_rate': 7.678916827852998e-08, 'epoch': 3.85}
{'loss': 0.1073, 'learning_rate': 7.48549323017408e-08, 'epoch': 3.85}
{'loss': 0.1264, 'learning_rate': 7.292069632495163e-08, 'epoch': 3.86}
{'loss': 0.1111, 'learning_rate': 7.098646034816246e-08, 'epoch': 3.86}
{'loss': 0.1382, 'learning_rate': 6.90522243713733e-08, 'epoch': 3.86}
{'loss': 0.1217, 'learning_rate': 6.711798839458413e-08, 'epoch': 3.87}
{'loss': 0.1511, 'learning_rate': 6.518375241779497e-08, 'epoch': 3.87}
{'loss': 0.1087, 'learning_rate': 6.32495164410058e-08, 'epoch': 3.88}
{'loss': 0.1089, 'learning_rate': 6.131528046421663e-08, 'epoch': 3.88}
{'loss': 0.0865, 'learning_rate': 5.938104448742747e-08, 'epoch': 3.88}
{'loss': 0.0719, 'learning_rate': 5.7446808510638295e-08, 'epoch': 3.89}
{'eval_loss': 0.2093939185142517, 'eval_content_rmse': 0.434239417