### Logs
* 30.08: use 2048 for `max_seq_length` and `max_position_embeddings`
* 03.09: use 512 instead
* try deepset/deberta-v3-large-squad2

In [2]:
!pip install transformers -q
!pip install sentencepiece -q
!pip install datasets -q
!pip install accelerate -U -q



In [3]:
from transformers import AutoTokenizer
# #deepset/deberta-v3-large-squad2
# tok = AutoTokenizer.from_pretrained("./pretrain")

from datasets import Dataset, disable_progress_bar
import pandas as pd

# Load the training tables from the local ./input directory:
# `pdf` holds the prompts and `sdf` the summaries.
pdf = pd.read_csv("./input/prompts_train.csv")
sdf = pd.read_csv("./input/summaries_train.csv")

In [4]:
from transformers import AutoModelForSequenceClassification
# model = AutoModelForSequenceClassification.from_pretrained("./pretrain")
# ## use the pretrained model
from transformers import AutoConfig
# config = AutoConfig.from_pretrained('./pretrain')
# # model = AutoModelForSequenceClassification.from_pretrained('./input/pretrain/pretrained_model', config = config)

In [5]:
# Display the (rows, columns) shape of the prompts and summaries tables as a quick sanity check.
pdf.shape, sdf.shape

((4, 4), (7165, 5))

## Train

In [14]:
%%writefile train.py

import os
import logging
import warnings
from dataclasses import dataclass, field
from typing import Optional

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    AutoConfig,
    set_seed,
    Trainer,
    TrainingArguments,
    HfArgumentParser,
    DataCollatorWithPadding,
)
from datasets import Dataset, disable_progress_bar
import pandas as pd
import numpy as np

# Process-wide environment settings for the training script.
os.environ.update(
    {
        "TOKENIZERS_PARALLELISM": "false",
        "TF_CPP_MIN_LOG_LEVEL": "3",
        "WANDB_PROJECT": "kaggle-commonlit-eval-student-summaries-2409",
    }
)

# Keep the console output small: ignore every Python warning, suppress all
# logging calls of severity ERROR and below, and turn off the `datasets`
# progress bars.
warnings.simplefilter("ignore")
logging.disable(logging.ERROR)
disable_progress_bar()

@dataclass
class Config:
    """Script-level options parsed from the command line next to ``TrainingArguments``.

    Every field is declared with a default and a ``metadata["help"]`` string,
    so all of them are optional on the command line.
    """

    # Hub id or local path used to load the tokenizer and the model config.
    model_name_or_path: Optional[str] = field(
        default="microsoft/deberta-v3-base",
        metadata={"help": "Model name or path"},
    )

    # Directory holding the competition CSVs.
    # NOTE(review): main() currently reads from ./input and does not use this value.
    data_dir: Optional[str] = field(
        default="/kaggle/input/commonlit-evaluate-student-summaries",
        metadata={"help": "Data directory"},
    )

    # Passed to the tokenizer as `max_length` (512 was also tried, see the notebook log).
    max_seq_length: Optional[int] = field(
        default=2048,
        metadata={"help": "Max sequence length"},
    )

    # NOTE(review): tokenize() currently always appends title, text and question;
    # the two flags below are not read there.
    add_prompt_question: Optional[bool] = field(
        default=False,
        metadata={"help": "Add prompt question into input"},
    )

    add_prompt_text: Optional[bool] = field(
        default=False,
        metadata={"help": "Add prompt text into input"},
    )

    # Index of the fold used for validation; the remaining folds are used for training.
    fold: Optional[int] = field(
        default=0,
        metadata={"help": "Fold"},
    )

    # Worker processes used by `Dataset.map` during tokenization.
    num_proc: Optional[int] = field(
        default=4,
        metadata={"help": "Number of processes"},
    )

    # Written to both `hidden_dropout_prob` and `attention_probs_dropout_prob`.
    dropout: Optional[float] = field(
        default=0.,
        metadata={"help": "Amount of dropout to apply"},
    )

    # Written to the model config's `max_position_embeddings` (512 was also tried).
    max_position_embeddings: Optional[int] = field(
        default=2048,
        metadata={"help": "Max position embeddings to set in the model config"},
    )


# Spell auto correction
# from spellchecker import SpellChecker

# def correct_spelling(input_text):
#     print('input_text: ', input_text)
#     # Initialize the spell checker
#     spell = SpellChecker()
#     # Split the input text into words
#     words = input_text.split()
#     # Initialize an empty list to store the corrected words
#     corrected_words = []
#     for word in words:
#         # Check if the word has any punctuation at the end
#         if word[-1].isalpha():
#             # Extract the punctuation
#             punctuation_end = ""
#         else:
#             punctuation_end = word[-1]
#             word = word[:-1]
#         # check if the word has any punctuation at the start
#         if word[0].isalpha():
#            # Extract the punctuation
#             punctuation_start = ""
#         else:
#             punctuation_start = word[0]
#             word = word[1:]
#         # Check the spelling of the word (case insensitive)
#         corrected_word = spell.correction(word.lower())
#         # Preserve the original capitalization
#         if word[0].isupper():
#             corrected_word = corrected_word.capitalize()
#         # Combine the corrected word and punctuation (if any)
#         corrected_word = punctuation_start+corrected_word+punctuation_end
#         # Append the corrected word to the list
#         corrected_words.append(corrected_word)
#     # Join the corrected words back into a single string
#     corrected_text = " ".join(corrected_words)
#     return corrected_text

def tokenize(example, tokenizer, config):
    sep = tokenizer.sep_token

    # if config.add_prompt_question:
    #     text = sep.join(
    #         [example["prompt_question"], example["prompt_text"], example["text"]]
    #     )
    # elif config.add_prompt_text:
    #     text = sep.join([example["prompt_text"], example["text"]])
    # else:
    #     text = example["text"]
    prompt = sep.join([example["prompt_title"], example["prompt_text"], example["prompt_question"]])
    labels = [example["content"], example["wording"]]

    tokenized = tokenizer(
#         prompt,
#         example["text"],
        example['text'],
        prompt,
        padding=False,
        truncation=True, # changed by Peng, turn on the truncation
        max_length=config.max_seq_length,
    )

    return {
        **tokenized,
        "labels": labels,
    }




def compute_mcrmse(eval_pred):
    """
    Mean columnwise root mean squared error (MCRMSE) for the evaluation loop.
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation

    Args:
        eval_pred: a `(predictions, labels)` pair of arrays; column 0 is
            reported as "content" and column 1 as "wording".

    Returns:
        Dict with the per-column RMSE ("content_rmse", "wording_rmse") and
        their mean ("mcrmse").
    """
    predictions, targets = eval_pred

    # RMSE per target column (reduce over axis 0), then average across columns.
    squared_error = (predictions - targets) ** 2
    per_column_rmse = np.sqrt(np.mean(squared_error, axis=0))

    metrics = {
        "content_rmse": per_column_rmse[0],
        "wording_rmse": per_column_rmse[1],
    }
    metrics["mcrmse"] = np.mean(per_column_rmse)
    return metrics


def main():
    """Fine-tune a two-target regression model on one prompt-held-out fold.

    Steps:
      1. Parse `Config` and `TrainingArguments` from the command line and seed
         the run.
      2. Optionally log in to Weights & Biases (best effort).
      3. Load the tokenizer and model config from `config.model_name_or_path`,
         override dropout / label / position settings, and load the model
         weights from the local `./pretrain` directory.
      4. Read the prompts and summaries CSVs from `./input`, merge them on
         `prompt_id`, and assign one fold per prompt id.
      5. Tokenize the train/validation splits, train with `Trainer`, then save
         the model config (with `best_metric` attached) to the output
         directory and log the best MCRMSE.
    """
    parser = HfArgumentParser((Config, TrainingArguments))

    config, training_args = parser.parse_args_into_dataclasses()

    set_seed(training_args.seed)

    if "wandb" in training_args.report_to:
        import wandb

        try:
            # On Kaggle the key can instead be fetched with
            # kaggle_secrets.UserSecretsClient().get_secret("wandb") and
            # passed as wandb.login(key=key).
            wandb.login()
        except Exception as exc:
            # Best effort: training continues without a login. Only ordinary
            # errors are caught, so Ctrl-C / SystemExit still stop the script.
            print(f"Could not log in to WandB: {exc}")

    tokenizer = AutoTokenizer.from_pretrained(config.model_name_or_path)
    model_config = AutoConfig.from_pretrained(config.model_name_or_path)

    model_config.update({
        "hidden_dropout_prob": config.dropout,
        "attention_probs_dropout_prob": config.dropout,
        "num_labels": 2,
        "problem_type": "regression",
        "max_position_embeddings": config.max_position_embeddings,
        # Keep the script options inside the saved model config for traceability.
        "cfg": config.__dict__,
    })

    print(model_config)

    # Do not use pretrained model
#     model = AutoModelForSequenceClassification.from_pretrained(
#         config.model_name_or_path, config=model_config
#     )

    # use pretrained model
    # NOTE(review): the weights always come from ./pretrain, regardless of
    # config.model_name_or_path (which is used for the tokenizer and config).
    print('use pretrained_model')
    print('pretrain deberta-v3-large-squad2')
    model = AutoModelForSequenceClassification.from_pretrained('./pretrain', config = model_config)

    # NOTE(review): config.data_dir is not used; the CSVs are read from ./input.
    #pdf = pd.read_csv(f"{config.data_dir}/prompts_train.csv")
    pdf = pd.read_csv("./input/prompts_train.csv")
    #sdf = pd.read_csv(f"{config.data_dir}/summaries_train.csv")
    sdf = pd.read_csv("./input/summaries_train.csv")

    df = pdf.merge(sdf, on="prompt_id")

    # 4 prompt ids, 4 folds
    id2fold = {
        "814d6b": 0,
        "39c16e": 1,
        "3b9047": 2,
        "ebad26": 3,
    }

    df["fold"] = df["prompt_id"].map(id2fold)

    # Validate on the selected fold's prompt, train on the other prompts.
    train_ds = Dataset.from_pandas(df[df["fold"] != config.fold])
    val_ds = Dataset.from_pandas(df[df["fold"] == config.fold])

    train_ds = train_ds.map(
        tokenize,
        batched=False,
        num_proc=config.num_proc,
        fn_kwargs={"tokenizer": tokenizer, "config": config},
    )

    val_ds = val_ds.map(
        tokenize,
        batched=False,
        num_proc=config.num_proc,
        fn_kwargs={"tokenizer": tokenizer, "config": config},
    )

    # Examples are tokenized without padding, so each batch is padded here.
    data_collator = DataCollatorWithPadding(
        tokenizer=tokenizer,
        pad_to_multiple_of=16 if training_args.fp16 else None,
    )

    # Overrides applied after argument parsing, whatever was given on the CLI.
    # NOTE(review): bf16 is forced on here while the collator above only looks
    # at fp16, and the notebook's launch command passes --fp16 — confirm which
    # precision is intended.
    training_args.bf16 = True
    training_args.gradient_accumulation_steps = 1
    training_args.load_best_model_at_end = True
    training_args.greater_is_better = False
    training_args.metric_for_best_model = 'mcrmse'
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_mcrmse,
    )

    trainer.train()

    # Store the best validation metric in the saved config.json so it can be
    # read back later without re-running evaluation.
    model.config.best_metric = trainer.state.best_metric
    model.config.save_pretrained(training_args.output_dir)

    trainer.log({"eval_best_mcrmse": trainer.state.best_metric})


# Run training only when the file is executed as a script (e.g. `python train.py ...`).
if __name__ == "__main__":
    main()

Overwriting train.py


In [15]:
from pathlib import Path

seed = 42

fold = 0

output = f"output_fold{fold}_seed{seed}_2409"

!python train.py \
  --model_name_or_path "microsoft/deberta-v3-large" \
  --add_prompt_question True \
  --fold $fold \
  --data_dir "./" \
  --output_dir $output \
  --fp16 \
  --num_train_epochs 4 \
  --dataloader_num_workers 4 \
  --learning_rate 2e-6 \
  --weight_decay 0.01 \
  --warmup_ratio 0 \
  --optim "adamw_torch" \
  --per_device_train_batch_size 2 \
  --per_device_eval_batch_size 2 \
  --evaluation_strategy "steps" \
  --eval_steps 150 \
  --save_strategy "steps" \
  --save_steps 150 \
  --save_total_limit 1 \
  --report_to "wandb" \
  --metric_for_best_model "mcrmse" \
  --greater_is_better False \
  --logging_steps 10 \
  --log_level "error" \
  --disable_tqdm True \
  --ddp_find_unused_parameters False \
  --dropout 0 \
  --seed $seed


output_dir = Path.cwd() / output
# add json files
for json_file in output_dir.glob("checkpoint*/*token*.json"):
    json_file.rename(output_dir/json_file.name)

# model files
for model_file in output_dir.glob("checkpoint*/*model*"):
    model_file.rename(output_dir/model_file.name)

# remove optimizer states and other files
to_delete = str(list(output_dir.glob("checkpoint*"))[0])
!rm -r $to_delete

[34m[1mwandb[0m: Currently logged in as: [33mpeng_sun[0m. Use [1m`wandb login --relogin`[0m to force relogin
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_probs_dropout_prob": 0.0,
  "cfg": {
    "add_prompt_question": true,
    "add_prompt_text": false,
    "data_dir": "./",
    "dropout": 0.0,
    "fold": 0,
    "max_position_embeddings": 2048,
    "max_seq_length": 2048,
    "model_name_or_path": "microsoft/deberta-v3-large",
    "num_proc": 4
  },
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 2048,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "po

{'loss': 0.1729, 'learning_rate': 1.8804025074232926e-06, 'epoch': 0.24}
{'loss': 0.2119, 'learning_rate': 1.8787528868360276e-06, 'epoch': 0.24}
{'loss': 0.4374, 'learning_rate': 1.8771032662487628e-06, 'epoch': 0.25}
{'eval_loss': 0.6923098564147949, 'eval_content_rmse': 0.6684188842773438, 'eval_wording_rmse': 0.9684187173843384, 'eval_mcrmse': 0.8184188008308411, 'eval_runtime': 57.7825, 'eval_samples_per_second': 19.089, 'eval_steps_per_second': 9.553, 'epoch': 0.25}
{'loss': 0.157, 'learning_rate': 1.8754536456614978e-06, 'epoch': 0.25}
{'loss': 0.1896, 'learning_rate': 1.8738040250742327e-06, 'epoch': 0.25}
{'loss': 0.2953, 'learning_rate': 1.872154404486968e-06, 'epoch': 0.26}
{'loss': 0.3036, 'learning_rate': 1.870504783899703e-06, 'epoch': 0.26}
{'loss': 0.2596, 'learning_rate': 1.8688551633124382e-06, 'epoch': 0.26}
{'loss': 0.2605, 'learning_rate': 1.8672055427251732e-06, 'epoch': 0.27}
{'loss': 0.3726, 'learning_rate': 1.8655559221379081e-06, 'epoch': 0.27}
{'loss': 0.2173

{'loss': 0.3538, 'learning_rate': 1.728637413394919e-06, 'epoch': 0.54}
{'eval_loss': 0.638758659362793, 'eval_content_rmse': 0.6524243950843811, 'eval_wording_rmse': 0.9229626655578613, 'eval_mcrmse': 0.7876935005187988, 'eval_runtime': 57.817, 'eval_samples_per_second': 19.077, 'eval_steps_per_second': 9.547, 'epoch': 0.54}
{'loss': 0.1999, 'learning_rate': 1.726987792807654e-06, 'epoch': 0.55}
{'loss': 0.1994, 'learning_rate': 1.7253381722203893e-06, 'epoch': 0.55}
{'loss': 0.2357, 'learning_rate': 1.7236885516331243e-06, 'epoch': 0.55}
{'loss': 0.2277, 'learning_rate': 1.7220389310458595e-06, 'epoch': 0.56}
{'loss': 0.373, 'learning_rate': 1.7203893104585945e-06, 'epoch': 0.56}
{'loss': 0.2425, 'learning_rate': 1.7187396898713295e-06, 'epoch': 0.56}
{'loss': 0.1708, 'learning_rate': 1.7170900692840647e-06, 'epoch': 0.57}
{'loss': 0.2024, 'learning_rate': 1.7154404486967997e-06, 'epoch': 0.57}
{'loss': 0.2415, 'learning_rate': 1.713790828109535e-06, 'epoch': 0.57}
{'loss': 0.2235, '

{'loss': 0.2615, 'learning_rate': 1.5785219399538106e-06, 'epoch': 0.84}
{'loss': 0.2211, 'learning_rate': 1.5768723193665456e-06, 'epoch': 0.85}
{'loss': 0.1504, 'learning_rate': 1.5752226987792806e-06, 'epoch': 0.85}
{'loss': 0.0729, 'learning_rate': 1.5735730781920158e-06, 'epoch': 0.85}
{'loss': 0.3821, 'learning_rate': 1.5719234576047508e-06, 'epoch': 0.86}
{'loss': 0.1981, 'learning_rate': 1.570273837017486e-06, 'epoch': 0.86}
{'loss': 0.2604, 'learning_rate': 1.568624216430221e-06, 'epoch': 0.86}
{'loss': 0.3499, 'learning_rate': 1.566974595842956e-06, 'epoch': 0.87}
{'loss': 0.2215, 'learning_rate': 1.5653249752556912e-06, 'epoch': 0.87}
{'loss': 0.2553, 'learning_rate': 1.5636753546684262e-06, 'epoch': 0.87}
{'loss': 0.1685, 'learning_rate': 1.5620257340811614e-06, 'epoch': 0.88}
{'loss': 0.2962, 'learning_rate': 1.5603761134938964e-06, 'epoch': 0.88}
{'loss': 0.159, 'learning_rate': 1.5587264929066314e-06, 'epoch': 0.88}
{'loss': 0.1961, 'learning_rate': 1.5570768723193666e-0

{'loss': 0.0958, 'learning_rate': 1.4267568459254372e-06, 'epoch': 1.15}
{'loss': 0.2113, 'learning_rate': 1.4251072253381722e-06, 'epoch': 1.15}
{'loss': 0.276, 'learning_rate': 1.4234576047509072e-06, 'epoch': 1.15}
{'loss': 0.2013, 'learning_rate': 1.4218079841636424e-06, 'epoch': 1.16}
{'loss': 0.2868, 'learning_rate': 1.4201583635763774e-06, 'epoch': 1.16}
{'loss': 0.1508, 'learning_rate': 1.4185087429891126e-06, 'epoch': 1.16}
{'loss': 0.1407, 'learning_rate': 1.4168591224018476e-06, 'epoch': 1.17}
{'loss': 0.2102, 'learning_rate': 1.4152095018145826e-06, 'epoch': 1.17}
{'loss': 0.1323, 'learning_rate': 1.4135598812273178e-06, 'epoch': 1.17}
{'loss': 0.1198, 'learning_rate': 1.4119102606400528e-06, 'epoch': 1.18}
{'loss': 0.1638, 'learning_rate': 1.410260640052788e-06, 'epoch': 1.18}
{'loss': 0.1203, 'learning_rate': 1.408611019465523e-06, 'epoch': 1.18}
{'loss': 0.2927, 'learning_rate': 1.406961398878258e-06, 'epoch': 1.19}
{'eval_loss': 0.5385178327560425, 'eval_content_rmse': 

{'loss': 0.1688, 'learning_rate': 1.2751567139557902e-06, 'epoch': 1.45}
{'loss': 0.2897, 'learning_rate': 1.2735070933685252e-06, 'epoch': 1.45}
{'loss': 0.2145, 'learning_rate': 1.2718574727812604e-06, 'epoch': 1.46}
{'loss': 0.1483, 'learning_rate': 1.2702078521939954e-06, 'epoch': 1.46}
{'loss': 0.2248, 'learning_rate': 1.2685582316067306e-06, 'epoch': 1.46}
{'loss': 0.1602, 'learning_rate': 1.2669086110194656e-06, 'epoch': 1.47}
{'loss': 0.183, 'learning_rate': 1.2652589904322006e-06, 'epoch': 1.47}
{'loss': 0.1678, 'learning_rate': 1.2636093698449358e-06, 'epoch': 1.47}
{'loss': 0.1729, 'learning_rate': 1.2619597492576705e-06, 'epoch': 1.48}
{'loss': 0.1776, 'learning_rate': 1.2603101286704057e-06, 'epoch': 1.48}
{'loss': 0.2292, 'learning_rate': 1.2586605080831407e-06, 'epoch': 1.48}
{'eval_loss': 0.5485661625862122, 'eval_content_rmse': 0.6665088534355164, 'eval_wording_rmse': 0.8080209493637085, 'eval_mcrmse': 0.73726487159729, 'eval_runtime': 57.6889, 'eval_samples_per_second

{'loss': 0.2026, 'learning_rate': 1.1233916199274167e-06, 'epoch': 1.76}
{'loss': 0.2081, 'learning_rate': 1.1217419993401517e-06, 'epoch': 1.76}
{'loss': 0.2111, 'learning_rate': 1.1200923787528869e-06, 'epoch': 1.76}
{'loss': 0.1417, 'learning_rate': 1.1184427581656219e-06, 'epoch': 1.77}
{'loss': 0.1678, 'learning_rate': 1.116793137578357e-06, 'epoch': 1.77}
{'loss': 0.359, 'learning_rate': 1.115143516991092e-06, 'epoch': 1.77}
{'loss': 0.2063, 'learning_rate': 1.1134938964038269e-06, 'epoch': 1.77}
{'loss': 0.1522, 'learning_rate': 1.111844275816562e-06, 'epoch': 1.78}
{'loss': 0.167, 'learning_rate': 1.110194655229297e-06, 'epoch': 1.78}
{'eval_loss': 0.5271629691123962, 'eval_content_rmse': 0.5809796452522278, 'eval_wording_rmse': 0.84663325548172, 'eval_mcrmse': 0.7138064503669739, 'eval_runtime': 57.5949, 'eval_samples_per_second': 19.151, 'eval_steps_per_second': 9.584, 'epoch': 1.78}
{'loss': 0.2784, 'learning_rate': 1.1085450346420323e-06, 'epoch': 1.78}
{'loss': 0.1881, 'le

{'loss': 0.1299, 'learning_rate': 9.716265258990432e-07, 'epoch': 2.06}
{'loss': 0.1626, 'learning_rate': 9.699769053117782e-07, 'epoch': 2.06}
{'loss': 0.1234, 'learning_rate': 9.683272847245132e-07, 'epoch': 2.07}
{'loss': 0.2039, 'learning_rate': 9.666776641372484e-07, 'epoch': 2.07}
{'loss': 0.1006, 'learning_rate': 9.650280435499834e-07, 'epoch': 2.07}
{'loss': 0.2017, 'learning_rate': 9.633784229627186e-07, 'epoch': 2.08}
{'loss': 0.0909, 'learning_rate': 9.617288023754536e-07, 'epoch': 2.08}
{'eval_loss': 0.4346396028995514, 'eval_content_rmse': 0.5279823541641235, 'eval_wording_rmse': 0.768449068069458, 'eval_mcrmse': 0.6482157111167908, 'eval_runtime': 57.7589, 'eval_samples_per_second': 19.097, 'eval_steps_per_second': 9.557, 'epoch': 2.08}
{'loss': 0.0977, 'learning_rate': 9.600791817881886e-07, 'epoch': 2.08}
{'loss': 0.1985, 'learning_rate': 9.584295612009238e-07, 'epoch': 2.09}
{'loss': 0.1542, 'learning_rate': 9.567799406136588e-07, 'epoch': 2.09}
{'loss': 0.1572, 'learn

{'loss': 0.1036, 'learning_rate': 8.182118112834048e-07, 'epoch': 2.37}
{'loss': 0.1364, 'learning_rate': 8.165621906961399e-07, 'epoch': 2.37}
{'loss': 0.1432, 'learning_rate': 8.14912570108875e-07, 'epoch': 2.37}
{'loss': 0.1667, 'learning_rate': 8.132629495216101e-07, 'epoch': 2.38}
{'eval_loss': 0.4062572419643402, 'eval_content_rmse': 0.518634021282196, 'eval_wording_rmse': 0.7372471690177917, 'eval_mcrmse': 0.6279405951499939, 'eval_runtime': 57.6346, 'eval_samples_per_second': 19.138, 'eval_steps_per_second': 9.578, 'epoch': 2.38}
{'loss': 0.1554, 'learning_rate': 8.11613328934345e-07, 'epoch': 2.38}
{'loss': 0.2349, 'learning_rate': 8.099637083470802e-07, 'epoch': 2.38}
{'loss': 0.084, 'learning_rate': 8.083140877598153e-07, 'epoch': 2.39}
{'loss': 0.1865, 'learning_rate': 8.066644671725504e-07, 'epoch': 2.39}
{'loss': 0.1394, 'learning_rate': 8.050148465852854e-07, 'epoch': 2.39}
{'loss': 0.1342, 'learning_rate': 8.033652259980203e-07, 'epoch': 2.4}
{'loss': 0.1278, 'learning_

{'loss': 0.1869, 'learning_rate': 6.64962058726493e-07, 'epoch': 2.67}
{'eval_loss': 0.3583536148071289, 'eval_content_rmse': 0.5021491050720215, 'eval_wording_rmse': 0.6815818548202515, 'eval_mcrmse': 0.5918654799461365, 'eval_runtime': 57.6425, 'eval_samples_per_second': 19.135, 'eval_steps_per_second': 9.576, 'epoch': 2.67}
{'loss': 0.1453, 'learning_rate': 6.63312438139228e-07, 'epoch': 2.68}
{'loss': 0.1283, 'learning_rate': 6.61662817551963e-07, 'epoch': 2.68}
{'loss': 0.1518, 'learning_rate': 6.60013196964698e-07, 'epoch': 2.68}
{'loss': 0.2569, 'learning_rate': 6.583635763774331e-07, 'epoch': 2.69}
{'loss': 0.1752, 'learning_rate': 6.567139557901682e-07, 'epoch': 2.69}
{'loss': 0.1389, 'learning_rate': 6.550643352029032e-07, 'epoch': 2.69}
{'loss': 0.1107, 'learning_rate': 6.534147146156383e-07, 'epoch': 2.7}
{'loss': 0.1236, 'learning_rate': 6.517650940283734e-07, 'epoch': 2.7}
{'loss': 0.1594, 'learning_rate': 6.501154734411085e-07, 'epoch': 2.7}
{'loss': 0.2395, 'learning_ra

{'loss': 0.1797, 'learning_rate': 5.148465852853844e-07, 'epoch': 2.97}
{'loss': 0.0968, 'learning_rate': 5.131969646981195e-07, 'epoch': 2.98}
{'loss': 0.1159, 'learning_rate': 5.115473441108544e-07, 'epoch': 2.98}
{'loss': 0.1664, 'learning_rate': 5.098977235235895e-07, 'epoch': 2.98}
{'loss': 0.0913, 'learning_rate': 5.082481029363246e-07, 'epoch': 2.99}
{'loss': 0.1092, 'learning_rate': 5.065984823490597e-07, 'epoch': 2.99}
{'loss': 0.1219, 'learning_rate': 5.049488617617948e-07, 'epoch': 2.99}
{'loss': 0.2002, 'learning_rate': 5.032992411745298e-07, 'epoch': 3.0}
{'loss': 0.1728, 'learning_rate': 5.016496205872649e-07, 'epoch': 3.0}
{'loss': 0.1447, 'learning_rate': 5e-07, 'epoch': 3.0}
{'loss': 0.1428, 'learning_rate': 4.983503794127351e-07, 'epoch': 3.01}
{'loss': 0.1113, 'learning_rate': 4.967007588254702e-07, 'epoch': 3.01}
{'loss': 0.0867, 'learning_rate': 4.950511382382052e-07, 'epoch': 3.01}
{'loss': 0.1297, 'learning_rate': 4.934015176509403e-07, 'epoch': 3.02}
{'loss': 0.

{'loss': 0.2451, 'learning_rate': 3.6143187066974597e-07, 'epoch': 3.28}
{'loss': 0.0918, 'learning_rate': 3.59782250082481e-07, 'epoch': 3.28}
{'loss': 0.0741, 'learning_rate': 3.581326294952161e-07, 'epoch': 3.29}
{'loss': 0.0723, 'learning_rate': 3.564830089079511e-07, 'epoch': 3.29}
{'loss': 0.1189, 'learning_rate': 3.548333883206862e-07, 'epoch': 3.29}
{'loss': 0.1544, 'learning_rate': 3.531837677334213e-07, 'epoch': 3.3}
{'loss': 0.0993, 'learning_rate': 3.5153414714615636e-07, 'epoch': 3.3}
{'loss': 0.1081, 'learning_rate': 3.4988452655889146e-07, 'epoch': 3.3}
{'loss': 0.1369, 'learning_rate': 3.482349059716265e-07, 'epoch': 3.31}
{'loss': 0.0883, 'learning_rate': 3.465852853843616e-07, 'epoch': 3.31}
{'loss': 0.1969, 'learning_rate': 3.449356647970966e-07, 'epoch': 3.31}
{'loss': 0.1417, 'learning_rate': 3.432860442098317e-07, 'epoch': 3.32}
{'eval_loss': 0.3748910129070282, 'eval_content_rmse': 0.5021227598190308, 'eval_wording_rmse': 0.7054464221000671, 'eval_mcrmse': 0.6037

{'loss': 0.0822, 'learning_rate': 2.0966677664137248e-07, 'epoch': 3.58}
{'loss': 0.0999, 'learning_rate': 2.0801715605410755e-07, 'epoch': 3.59}
{'loss': 0.1515, 'learning_rate': 2.0636753546684263e-07, 'epoch': 3.59}
{'loss': 0.0811, 'learning_rate': 2.0471791487957767e-07, 'epoch': 3.59}
{'loss': 0.1005, 'learning_rate': 2.0306829429231275e-07, 'epoch': 3.6}
{'loss': 0.1561, 'learning_rate': 2.0141867370504785e-07, 'epoch': 3.6}
{'loss': 0.1985, 'learning_rate': 1.997690531177829e-07, 'epoch': 3.6}
{'loss': 0.1722, 'learning_rate': 1.9811943253051797e-07, 'epoch': 3.61}
{'loss': 0.157, 'learning_rate': 1.9646981194325305e-07, 'epoch': 3.61}
{'loss': 0.0972, 'learning_rate': 1.9482019135598812e-07, 'epoch': 3.61}
{'eval_loss': 0.4349913001060486, 'eval_content_rmse': 0.5780519247055054, 'eval_wording_rmse': 0.7320101261138916, 'eval_mcrmse': 0.6550310254096985, 'eval_runtime': 57.4104, 'eval_samples_per_second': 19.213, 'eval_steps_per_second': 9.615, 'epoch': 3.61}
{'loss': 0.0857, 

{'loss': 0.2267, 'learning_rate': 5.7901682612999004e-08, 'epoch': 3.89}
{'loss': 0.1176, 'learning_rate': 5.625206202573408e-08, 'epoch': 3.89}
{'loss': 0.0909, 'learning_rate': 5.4602441438469146e-08, 'epoch': 3.89}
{'loss': 0.085, 'learning_rate': 5.295282085120422e-08, 'epoch': 3.9}
{'loss': 0.0741, 'learning_rate': 5.1303200263939295e-08, 'epoch': 3.9}
{'loss': 0.113, 'learning_rate': 4.965357967667436e-08, 'epoch': 3.9}
{'loss': 0.2385, 'learning_rate': 4.800395908940944e-08, 'epoch': 3.91}
{'loss': 0.1439, 'learning_rate': 4.6354338502144505e-08, 'epoch': 3.91}
{'eval_loss': 0.4277520179748535, 'eval_content_rmse': 0.5555188059806824, 'eval_wording_rmse': 0.7395288944244385, 'eval_mcrmse': 0.6475238800048828, 'eval_runtime': 57.7095, 'eval_samples_per_second': 19.113, 'eval_steps_per_second': 9.565, 'epoch': 3.91}
{'loss': 0.0834, 'learning_rate': 4.470471791487958e-08, 'epoch': 3.91}
{'loss': 0.0636, 'learning_rate': 4.305509732761465e-08, 'epoch': 3.92}
{'loss': 0.1362, 'learn

In [1]:
import json
from pathlib import Path

seed = 42

# Read the best MCRMSE stored in each run's config.json, print it per fold,
# then print the mean over the folds read.
# NOTE(review): only folds 1-3 with the `_2209` suffix are read; fold 0 and
# the `_2409` run written by the training cell are not included — confirm
# this is intentional.
scores = []
for fold in range(1, 4):
    output = f"./output_fold{fold}_seed{seed}_2209"
    config_path = Path(output) / "config.json"
    cfg = json.loads(config_path.read_text())

    print(f"MCRMSE for fold {fold}, seed {seed}: {cfg['best_metric']:.4f}")
    scores.append(cfg['best_metric'])

print(sum(scores)/len(scores))

MCRMSE for fold 1, seed 42: 0.4728
MCRMSE for fold 2, seed 42: 0.5201
MCRMSE for fold 3, seed 42: 0.4407
0.4778555631637573


In [19]:
#df.loc[0, 'prompt_text']

In [20]:
# from pathlib import Path
# import json

# seed = 42
# scores = []
# for fold in range(4):
#     output = f"./output_fold{fold}_seed{seed}_3008"
#     p = Path(output) / "config.json"

#     with open(p) as fp:
#         cfg = json.load(fp)


#     print(f"MCRMSE for fold {fold}, seed {seed}: {cfg['best_metric']:.4f}")
#     scores.append(cfg['best_metric'])
# print(sum(scores)/len(scores))