In [None]:
import json
import torch
import random
import numpy as np
import pandas as pd
from datasets import Dataset, load_dataset
from transformers import (
    AutoConfig, AutoModelForCausalLM, AutoTokenizer, EarlyStoppingCallback    
)
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
import bitsandbytes as bnb
from huggingface_hub import login
import wandb
from transformers import BitsAndBytesConfig



# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"

def set_random_seed(seed=42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
set_random_seed()

In [None]:
# Load API tokens
CONFIG = json.load(open('/home/jupyter/datasphere/project/tokens.json'))
login(token=CONFIG["HF_TOK"])
wandb.login(key=CONFIG['WANDB_API_KEY'])

In [11]:
# Load dataset
dataset = load_dataset("csv", data_files=["/home/jupyter/datasphere/project/rugec/data/art_gec_full.tsv"], sep="\t")
dataset = dataset["train"].rename_columns({'correct': 'correct_sent', 'corrupt': 'corrupt_sent'})
# dataset = dataset.filter(lambda example: example['sentence1'].startswith('Ar'))
# Load additional datasets
def load_tsv(path):
    return pd.read_csv(path, sep="\t", index_col=None)

dev = load_tsv('/home/jupyter/datasphere/project/rugec/data/RULEC-GEC.dev.tsv')
train = load_tsv('/home/jupyter/datasphere/project/rugec/data/RULEC-GEC.train.tsv')
test = load_tsv('/home/jupyter/datasphere/project/rugec/data/RULEC-GEC.test.tsv')
clang8 = load_tsv('/home/jupyter/datasphere/project/rugec/data/clang8_source_target_ru.spacy_tokenized.tsv')
relco = load_tsv('/home/jupyter/datasphere/project/rugec/data/relco_filtered.tsv')
gera_train = load_tsv('/home/jupyter/datasphere/project/rugec/data/GERA.train.tsv')
gera_test = load_tsv('/home/jupyter/datasphere/project/rugec/data/GERA.test.tsv')
gera_dev = load_tsv('/home/jupyter/datasphere/project/rugec/data/GERA.dev.tsv')
# train.corrupt_sent = train.corrupt_sent.map(lambda x: str(x))
# test.corrupt_sent = test.corrupt_sent.map(lambda x: str(x))
# dev.corrupt_sent = dev.corrupt_sent.map(lambda x: str(x))
# train.correct_sent = train.correct_sent.map(lambda x: str(x))
# test.correct_sent = test.correct_sent.map(lambda x: str(x))
# dev.correct_sent = dev.correct_sent.map(lambda x: str(x))
# Combine datasets
train_all = pd.concat([train.sample(frac=1, replace=True), clang8, relco, gera_train, gera_dev], ignore_index=True)
augmented_data = dataset.select(range(100000))

# Convert to Hugging Face Dataset format
fine_tune = {
    'train': Dataset.from_pandas(train_all),
    'test': Dataset.from_pandas(test),
    'dev': Dataset.from_pandas(dev)
}
# fine_tune['train'] = datasets.concatenate_datasets([fine_tune['train'], augmented_data])
fine_tune

{'train': Dataset({
     features: ['corrupt_sent', 'correct_sent'],
     num_rows: 4591
 }),
 'test': Dataset({
     features: ['corrupt_sent', 'correct_sent'],
     num_rows: 1314
 }),
 'dev': Dataset({
     features: ['corrupt_sent', 'correct_sent'],
     num_rows: 775
 })}

In [39]:
train[train.correct_sent.isna()]

Unnamed: 0,corrupt_sent,correct_sent


In [18]:
fine_tune['train'] = fine_tune['train'].shuffle()

In [12]:
for i in fine_tune['train'].take(2):
    print(i)

{'corrupt_sent': 'И никого не прокляну !', 'correct_sent': 'И никого не прокляну !'}
{'corrupt_sent': '( По поэме М. Ю. Лермонтова " Мцыри " . )', 'correct_sent': '( По поэме М. Ю. Лермонтова " Мцыри " . )'}


In [7]:
# Load model and tokenizer
# model_name = "RefalMachine/ruadapt_qwen2.5_3B_ext_u48_instruct_v4"
model_name = 'Qwen/Qwen2-7B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
).to(device)
model.config.use_cache = False

# LoRA configuration
def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = {name.split(".")[-1] for name, module in model.named_modules() if isinstance(module, cls)}
    lora_module_names.discard("lm_head")
    return list(lora_module_names)

target_modules = find_all_linear_names(model)
peft_config = LoraConfig(
    lora_alpha=64,
    target_modules=target_modules,
    lora_dropout=0.05,
    r=16,
    bias="none",
    task_type="CAUSAL_LM"
)

Downloading shards: 100%|██████████| 4/4 [06:15<00:00, 93.78s/it] 
Loading checkpoint shards: 100%|██████████| 4/4 [04:13<00:00, 63.38s/it]


In [42]:
#masked out for some 

# from transformers import BitsAndBytesConfig
# from peft import PeftModel, PeftConfig
# # Your adapter repo or local dir
# # peft_model_id = "mika5883/ru_qwen_gec" # or your output_dir path
# # peft_model_id = 'mika5883/ru_qwen_gec_Ag_art'
# peft_model_id = 'mika5883/ru_qwen7b_gec_Ag'

# # Load adapter config
# peft_config = PeftConfig.from_pretrained(peft_model_id)
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16,
# )
# # Load base model
# base_model = AutoModelForCausalLM.from_pretrained(
#     peft_config.base_model_name_or_path,
#     trust_remote_code=True,
#     device_map="auto",
#     quantization_config=bnb_config,
#     low_cpu_mem_usage=True,
# )

# # Load tokenizer
# tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path, trust_remote_code=True)

# # Merge LoRA adapter with base model
# model = PeftModel.from_pretrained(base_model, peft_model_id, is_trainable=True)
# model.config.use_cache = False

Downloading shards: 100%|██████████| 4/4 [00:00<00:00, 808.31it/s]
Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.35s/it]


In [13]:
#probably not ideal format because it's a decoder model. but i needed a quick fix and it works. better option is in qwen3_ft.ipynb

def format_dataset(examples):
    # Prepare the text for tokenization
    text = [
        f"<|im_start|>system\nТы учитель русского языка, который исправляет ошибки в эссе своих учеников.<|im_end|>\n<|im_start|>user\nИсправь ошибки в следующем предложении: {corrupt}\nИсправленное предложение:<|im_end|>\n<|im_start|>assistant\n{correct}<|im_end|>"
        for corrupt, correct in zip(examples['corrupt_sent'], examples['correct_sent'])
    ]
    
    # Tokenize the text and prepare the labels
    inputs = tokenizer(text, truncation=True, padding="max_length", max_length=512)
    labels = tokenizer(examples['correct_sent'], truncation=True, padding="max_length", max_length=512)

    # Add the labels to the inputs
    inputs['labels'] = labels['input_ids']
    
    return inputs

# Apply the formatting to all splits
fine_tune = {key: value.map(format_dataset, batched=True) for key, value in fine_tune.items()}


Map: 100%|██████████| 4591/4591 [00:01<00:00, 3595.69 examples/s]
Map: 100%|██████████| 1314/1314 [00:00<00:00, 3517.38 examples/s]
Map: 100%|██████████| 775/775 [00:00<00:00, 3502.35 examples/s]


In [14]:
for i in fine_tune['train'].take(1):
    print(i)

{'corrupt_sent': 'И никого не прокляну !', 'correct_sent': 'И никого не прокляну !', 'input_ids': [151644, 8948, 198, 33995, 4552, 28519, 31885, 33513, 20396, 128698, 34011, 18673, 19849, 52587, 26988, 61676, 24725, 37708, 125727, 130084, 88663, 14746, 13, 151645, 198, 151644, 872, 198, 30174, 32693, 26988, 4824, 88663, 16748, 5805, 92029, 71019, 10090, 56825, 81841, 83098, 25, 42796, 126073, 22107, 18658, 12281, 14746, 13695, 125467, 58062, 30174, 32693, 26988, 60398, 47050, 56825, 131923, 25, 151645, 198, 151644, 77091, 198, 30174, 126073, 22107, 18658, 12281, 14746, 13695, 125467, 753, 151645, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 1516

In [18]:
len(fine_tune['train']) // 8 // 2 + 1

287

In [44]:
args = SFTConfig(
    output_dir="ru_qwen7b_gec_Ga",
    eval_strategy="steps",
    eval_steps=9,
    logging_steps=9,
    learning_rate=2e-5,
    per_device_train_batch_size=8,  # Reduce batch size to prevent memory issues
    per_device_eval_batch_size=8,
    resume_from_checkpoint="last-checkpoint",
    weight_decay=0.05,
    gradient_accumulation_steps=2,
    # warmup_steps=200,
    save_total_limit=1,
    # num_train_epochs=1,
    max_steps=288,
    # predict_with_generate=True,
    # fp16=True,
    # early_stopping_patience=6,
    # early_s
    bf16=True,
    push_to_hub=True,
    hub_strategy="checkpoint",
    metric_for_best_model='loss',
    report_to = 'all',
    save_strategy='best',
    load_best_model_at_end=True,
    lr_scheduler_type="cosine", 
    warmup_ratio=0.05,
    # report_to='wandb',
    dataset_text_field='text',
    max_seq_length=512,
    packing=False,
    # disable_tqdm=False
)


In [45]:
import logging
logging.getLogger("wandb").setLevel(logging.ERROR)

In [46]:
# model.parallelize() #doesn't work, sadly

In [47]:
# Trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=fine_tune['train'],
    eval_dataset=fine_tune['dev'],
    peft_config=peft_config,
    processing_class=tokenizer,
    args=args,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# Training
trainer.train()


  0%|          | 0/288 [10:17<?, ?it/s]
  3%|▎         | 9/288 [00:24<12:16,  2.64s/it]

{'loss': 0.7221, 'grad_norm': 1.149951457977295, 'learning_rate': 1.2e-05, 'epoch': 0.03}



  0%|          | 0/97 [00:00<?, ?it/s][A
  2%|▏         | 2/97 [00:00<00:28,  3.36it/s][A
  3%|▎         | 3/97 [00:01<00:39,  2.37it/s][A
  4%|▍         | 4/97 [00:01<00:45,  2.05it/s][A
  5%|▌         | 5/97 [00:02<00:48,  1.90it/s][A
  6%|▌         | 6/97 [00:02<00:49,  1.82it/s][A
  7%|▋         | 7/97 [00:03<00:50,  1.77it/s][A
  8%|▊         | 8/97 [00:04<00:51,  1.74it/s][A
  9%|▉         | 9/97 [00:04<00:51,  1.72it/s][A
 10%|█         | 10/97 [00:05<00:51,  1.70it/s][A
 11%|█▏        | 11/97 [00:05<00:50,  1.69it/s][A
 12%|█▏        | 12/97 [00:06<00:50,  1.69it/s][A
 13%|█▎        | 13/97 [00:07<00:49,  1.68it/s][A
 14%|█▍        | 14/97 [00:07<00:49,  1.68it/s][A
 15%|█▌        | 15/97 [00:08<00:48,  1.68it/s][A
 16%|█▋        | 16/97 [00:08<00:48,  1.67it/s][A
 18%|█▊        | 17/97 [00:09<00:47,  1.67it/s][A
 19%|█▊        | 18/97 [00:10<00:47,  1.67it/s][A
 20%|█▉        | 19/97 [00:10<00:46,  1.67it/s][A
 21%|██        | 20/97 [00:11<00:46,  1.67it/s]

{'eval_loss': 0.7347809672355652, 'eval_runtime': 58.0327, 'eval_samples_per_second': 13.355, 'eval_steps_per_second': 1.671, 'epoch': 0.03}


  6%|▋         | 18/288 [01:58<17:22,  3.86s/it]  

{'loss': 0.697, 'grad_norm': 1.082994818687439, 'learning_rate': 1.9994041405510705e-05, 'epoch': 0.06}



  0%|          | 0/97 [00:00<?, ?it/s][A
  2%|▏         | 2/97 [00:00<00:28,  3.35it/s][A
  3%|▎         | 3/97 [00:01<00:39,  2.36it/s][A
  4%|▍         | 4/97 [00:01<00:45,  2.04it/s][A
  5%|▌         | 5/97 [00:02<00:48,  1.90it/s][A
  6%|▌         | 6/97 [00:02<00:50,  1.82it/s][A
  7%|▋         | 7/97 [00:03<00:50,  1.77it/s][A
  8%|▊         | 8/97 [00:04<00:51,  1.73it/s][A
  9%|▉         | 9/97 [00:04<00:51,  1.71it/s][A
 10%|█         | 10/97 [00:05<00:51,  1.70it/s][A
 11%|█▏        | 11/97 [00:05<00:50,  1.69it/s][A
 12%|█▏        | 12/97 [00:06<00:50,  1.69it/s][A
 13%|█▎        | 13/97 [00:07<00:49,  1.68it/s][A
 14%|█▍        | 14/97 [00:07<00:49,  1.68it/s][A
 15%|█▌        | 15/97 [00:08<00:48,  1.67it/s][A
 16%|█▋        | 16/97 [00:08<00:48,  1.67it/s][A
 18%|█▊        | 17/97 [00:09<00:47,  1.67it/s][A
 19%|█▊        | 18/97 [00:10<00:47,  1.67it/s][A
 20%|█▉        | 19/97 [00:10<00:46,  1.67it/s][A
 21%|██        | 20/97 [00:11<00:46,  1.67it/s]

{'eval_loss': 0.704463541507721, 'eval_runtime': 58.0727, 'eval_samples_per_second': 13.345, 'eval_steps_per_second': 1.67, 'epoch': 0.06}


  9%|▉         | 27/288 [03:33<16:59,  3.91s/it]  

{'loss': 0.6437, 'grad_norm': 1.0259759426116943, 'learning_rate': 1.9904804439875635e-05, 'epoch': 0.09}



  0%|          | 0/97 [00:00<?, ?it/s][A
  2%|▏         | 2/97 [00:00<00:28,  3.36it/s][A
  3%|▎         | 3/97 [00:01<00:39,  2.36it/s][A
  4%|▍         | 4/97 [00:01<00:45,  2.05it/s][A
  5%|▌         | 5/97 [00:02<00:48,  1.90it/s][A
  6%|▌         | 6/97 [00:02<00:50,  1.82it/s][A
  7%|▋         | 7/97 [00:03<00:50,  1.77it/s][A
  8%|▊         | 8/97 [00:04<00:51,  1.73it/s][A
  9%|▉         | 9/97 [00:04<00:51,  1.71it/s][A
 10%|█         | 10/97 [00:05<00:51,  1.70it/s][A
 11%|█▏        | 11/97 [00:05<00:50,  1.69it/s][A
 12%|█▏        | 12/97 [00:06<00:50,  1.68it/s][A
 13%|█▎        | 13/97 [00:07<00:50,  1.68it/s][A
 14%|█▍        | 14/97 [00:07<00:49,  1.67it/s][A
 15%|█▌        | 15/97 [00:08<00:49,  1.67it/s][A
 16%|█▋        | 16/97 [00:08<00:48,  1.67it/s][A
 18%|█▊        | 17/97 [00:09<00:47,  1.67it/s][A
 19%|█▊        | 18/97 [00:10<00:47,  1.67it/s][A
 20%|█▉        | 19/97 [00:10<00:46,  1.67it/s][A
 21%|██        | 20/97 [00:11<00:46,  1.67it/s]

{'eval_loss': 0.6855674982070923, 'eval_runtime': 58.0928, 'eval_samples_per_second': 13.341, 'eval_steps_per_second': 1.67, 'epoch': 0.09}


 12%|█▎        | 36/288 [05:07<16:23,  3.90s/it]  

{'loss': 0.6577, 'grad_norm': 1.1006721258163452, 'learning_rate': 1.9709418174260523e-05, 'epoch': 0.13}



  0%|          | 0/97 [00:00<?, ?it/s][A
  2%|▏         | 2/97 [00:00<00:28,  3.36it/s][A
  3%|▎         | 3/97 [00:01<00:39,  2.36it/s][A
  4%|▍         | 4/97 [00:01<00:45,  2.05it/s][A
  5%|▌         | 5/97 [00:02<00:48,  1.90it/s][A
  6%|▌         | 6/97 [00:02<00:50,  1.82it/s][A
  7%|▋         | 7/97 [00:03<00:50,  1.77it/s][A
  8%|▊         | 8/97 [00:04<00:51,  1.73it/s][A
  9%|▉         | 9/97 [00:04<00:51,  1.71it/s][A
 10%|█         | 10/97 [00:05<00:51,  1.70it/s][A
 11%|█▏        | 11/97 [00:05<00:50,  1.69it/s][A
 12%|█▏        | 12/97 [00:06<00:50,  1.68it/s][A
 13%|█▎        | 13/97 [00:07<00:50,  1.68it/s][A
 14%|█▍        | 14/97 [00:07<00:49,  1.68it/s][A
 15%|█▌        | 15/97 [00:08<00:48,  1.67it/s][A
 16%|█▋        | 16/97 [00:08<00:48,  1.67it/s][A
 18%|█▊        | 17/97 [00:09<00:47,  1.67it/s][A
 19%|█▊        | 18/97 [00:10<00:47,  1.67it/s][A
 20%|█▉        | 19/97 [00:10<00:46,  1.67it/s][A
 21%|██        | 20/97 [00:11<00:46,  1.67it/s]

{'eval_loss': 0.6760121583938599, 'eval_runtime': 58.0978, 'eval_samples_per_second': 13.34, 'eval_steps_per_second': 1.67, 'epoch': 0.13}


 16%|█▌        | 45/288 [06:41<15:48,  3.90s/it]  

{'loss': 0.6543, 'grad_norm': 0.9172493815422058, 'learning_rate': 1.9409976553623767e-05, 'epoch': 0.16}



  0%|          | 0/97 [00:00<?, ?it/s][A
  2%|▏         | 2/97 [00:00<00:28,  3.36it/s][A
  3%|▎         | 3/97 [00:01<00:39,  2.36it/s][A
  4%|▍         | 4/97 [00:01<00:45,  2.05it/s][A
  5%|▌         | 5/97 [00:02<00:48,  1.90it/s][A
  6%|▌         | 6/97 [00:02<00:50,  1.81it/s][A
  7%|▋         | 7/97 [00:03<00:50,  1.77it/s][A
  8%|▊         | 8/97 [00:04<00:51,  1.73it/s][A
  9%|▉         | 9/97 [00:04<00:51,  1.71it/s][A
 10%|█         | 10/97 [00:05<00:51,  1.70it/s][A
 11%|█▏        | 11/97 [00:05<00:50,  1.69it/s][A
 12%|█▏        | 12/97 [00:06<00:50,  1.68it/s][A
 13%|█▎        | 13/97 [00:07<00:50,  1.68it/s][A
 14%|█▍        | 14/97 [00:07<00:49,  1.68it/s][A
 15%|█▌        | 15/97 [00:08<00:48,  1.67it/s][A
 16%|█▋        | 16/97 [00:08<00:48,  1.67it/s][A
 18%|█▊        | 17/97 [00:09<00:47,  1.67it/s][A
 19%|█▊        | 18/97 [00:10<00:47,  1.67it/s][A
 20%|█▉        | 19/97 [00:10<00:46,  1.67it/s][A
 21%|██        | 20/97 [00:11<00:46,  1.67it/s]

{'eval_loss': 0.6690042018890381, 'eval_runtime': 58.097, 'eval_samples_per_second': 13.34, 'eval_steps_per_second': 1.67, 'epoch': 0.16}


 19%|█▉        | 54/288 [08:16<15:25,  3.95s/it]  

{'loss': 0.6511, 'grad_norm': 0.9450319409370422, 'learning_rate': 1.900968867902419e-05, 'epoch': 0.19}



  0%|          | 0/97 [00:00<?, ?it/s][A
  2%|▏         | 2/97 [00:00<00:28,  3.35it/s][A
  3%|▎         | 3/97 [00:01<00:39,  2.36it/s][A
  4%|▍         | 4/97 [00:01<00:45,  2.04it/s][A
  5%|▌         | 5/97 [00:02<00:48,  1.90it/s][A
  6%|▌         | 6/97 [00:02<00:50,  1.82it/s][A
  7%|▋         | 7/97 [00:03<00:50,  1.77it/s][A
  8%|▊         | 8/97 [00:04<00:51,  1.73it/s][A
  9%|▉         | 9/97 [00:04<00:51,  1.71it/s][A
 10%|█         | 10/97 [00:05<00:51,  1.70it/s][A
 11%|█▏        | 11/97 [00:05<00:50,  1.69it/s][A
 12%|█▏        | 12/97 [00:06<00:50,  1.68it/s][A
 13%|█▎        | 13/97 [00:07<00:50,  1.68it/s][A
 14%|█▍        | 14/97 [00:07<00:49,  1.68it/s][A
 15%|█▌        | 15/97 [00:08<00:48,  1.67it/s][A
 16%|█▋        | 16/97 [00:08<00:48,  1.67it/s][A
 18%|█▊        | 17/97 [00:09<00:47,  1.67it/s][A
 19%|█▊        | 18/97 [00:10<00:47,  1.67it/s][A
 20%|█▉        | 19/97 [00:10<00:46,  1.67it/s][A
 21%|██        | 20/97 [00:11<00:46,  1.67it/s]

{'eval_loss': 0.6640349626541138, 'eval_runtime': 58.1041, 'eval_samples_per_second': 13.338, 'eval_steps_per_second': 1.669, 'epoch': 0.19}


 22%|██▏       | 63/288 [09:51<14:38,  3.91s/it]  

{'loss': 0.6433, 'grad_norm': 1.0734907388687134, 'learning_rate': 1.8512844415843514e-05, 'epoch': 0.22}



  0%|          | 0/97 [00:00<?, ?it/s][A
  2%|▏         | 2/97 [00:00<00:28,  3.36it/s][A
  3%|▎         | 3/97 [00:01<00:39,  2.36it/s][A
  4%|▍         | 4/97 [00:01<00:45,  2.05it/s][A
  5%|▌         | 5/97 [00:02<00:48,  1.90it/s][A
  6%|▌         | 6/97 [00:02<00:50,  1.82it/s][A
  7%|▋         | 7/97 [00:03<00:50,  1.77it/s][A
  8%|▊         | 8/97 [00:04<00:51,  1.73it/s][A
  9%|▉         | 9/97 [00:04<00:51,  1.71it/s][A
 10%|█         | 10/97 [00:05<00:51,  1.70it/s][A
 11%|█▏        | 11/97 [00:05<00:50,  1.69it/s][A
 12%|█▏        | 12/97 [00:06<00:50,  1.68it/s][A
 13%|█▎        | 13/97 [00:07<00:50,  1.68it/s][A
 14%|█▍        | 14/97 [00:07<00:49,  1.68it/s][A
 15%|█▌        | 15/97 [00:08<00:48,  1.67it/s][A
 16%|█▋        | 16/97 [00:08<00:48,  1.67it/s][A
 18%|█▊        | 17/97 [00:09<00:47,  1.67it/s][A
 19%|█▊        | 18/97 [00:10<00:47,  1.67it/s][A
 20%|█▉        | 19/97 [00:10<00:46,  1.67it/s][A
 21%|██        | 20/97 [00:11<00:46,  1.67it/s]

{'eval_loss': 0.6609904170036316, 'eval_runtime': 58.1036, 'eval_samples_per_second': 13.338, 'eval_steps_per_second': 1.669, 'epoch': 0.22}


 25%|██▌       | 72/288 [11:26<14:06,  3.92s/it]  

{'loss': 0.6384, 'grad_norm': 0.9293792247772217, 'learning_rate': 1.7924768419510906e-05, 'epoch': 0.25}



  0%|          | 0/97 [00:00<?, ?it/s][A
  2%|▏         | 2/97 [00:00<00:28,  3.36it/s][A
  3%|▎         | 3/97 [00:01<00:39,  2.36it/s][A
  4%|▍         | 4/97 [00:01<00:45,  2.05it/s][A
  5%|▌         | 5/97 [00:02<00:48,  1.90it/s][A
  6%|▌         | 6/97 [00:02<00:50,  1.81it/s][A
  7%|▋         | 7/97 [00:03<00:50,  1.77it/s][A
  8%|▊         | 8/97 [00:04<00:51,  1.73it/s][A
  9%|▉         | 9/97 [00:04<00:51,  1.71it/s][A
 10%|█         | 10/97 [00:05<00:51,  1.70it/s][A
 11%|█▏        | 11/97 [00:05<00:50,  1.69it/s][A
 12%|█▏        | 12/97 [00:06<00:50,  1.68it/s][A
 13%|█▎        | 13/97 [00:07<00:50,  1.68it/s][A
 14%|█▍        | 14/97 [00:07<00:49,  1.68it/s][A
 15%|█▌        | 15/97 [00:08<00:48,  1.67it/s][A
 16%|█▋        | 16/97 [00:08<00:48,  1.67it/s][A
 18%|█▊        | 17/97 [00:09<00:47,  1.67it/s][A
 19%|█▊        | 18/97 [00:10<00:47,  1.67it/s][A
 20%|█▉        | 19/97 [00:10<00:46,  1.67it/s][A
 21%|██        | 20/97 [00:11<00:46,  1.67it/s]

{'eval_loss': 0.6582931280136108, 'eval_runtime': 58.0951, 'eval_samples_per_second': 13.34, 'eval_steps_per_second': 1.67, 'epoch': 0.25}


 28%|██▊       | 81/288 [13:00<13:28,  3.91s/it]  

{'loss': 0.6404, 'grad_norm': 0.8554283976554871, 'learning_rate': 1.7251763071433767e-05, 'epoch': 0.28}



  0%|          | 0/97 [00:00<?, ?it/s][A
  2%|▏         | 2/97 [00:00<00:28,  3.36it/s][A
  3%|▎         | 3/97 [00:01<00:39,  2.36it/s][A
  4%|▍         | 4/97 [00:01<00:45,  2.05it/s][A
  5%|▌         | 5/97 [00:02<00:48,  1.90it/s][A
  6%|▌         | 6/97 [00:02<00:50,  1.82it/s][A
  7%|▋         | 7/97 [00:03<00:50,  1.77it/s][A
  8%|▊         | 8/97 [00:04<00:51,  1.74it/s][A
  9%|▉         | 9/97 [00:04<00:51,  1.71it/s][A
 10%|█         | 10/97 [00:05<00:51,  1.70it/s][A
 11%|█▏        | 11/97 [00:05<00:50,  1.69it/s][A
 12%|█▏        | 12/97 [00:06<00:50,  1.68it/s][A
 13%|█▎        | 13/97 [00:07<00:50,  1.68it/s][A
 14%|█▍        | 14/97 [00:07<00:49,  1.67it/s][A
 15%|█▌        | 15/97 [00:08<00:48,  1.67it/s][A
 16%|█▋        | 16/97 [00:08<00:48,  1.67it/s][A
 18%|█▊        | 17/97 [00:09<00:47,  1.67it/s][A
 19%|█▊        | 18/97 [00:10<00:47,  1.67it/s][A
 20%|█▉        | 19/97 [00:10<00:46,  1.67it/s][A
 21%|██        | 20/97 [00:11<00:46,  1.67it/s]

{'eval_loss': 0.6551400423049927, 'eval_runtime': 58.107, 'eval_samples_per_second': 13.337, 'eval_steps_per_second': 1.669, 'epoch': 0.28}


 31%|███▏      | 90/288 [14:35<12:54,  3.91s/it]  

{'loss': 0.6486, 'grad_norm': 0.9829873442649841, 'learning_rate': 1.6501040936687444e-05, 'epoch': 0.31}



  0%|          | 0/97 [00:00<?, ?it/s][A
  2%|▏         | 2/97 [00:00<00:28,  3.36it/s][A
  3%|▎         | 3/97 [00:01<00:39,  2.36it/s][A
  4%|▍         | 4/97 [00:01<00:45,  2.05it/s][A
  5%|▌         | 5/97 [00:02<00:48,  1.90it/s][A
  6%|▌         | 6/97 [00:02<00:50,  1.81it/s][A
  7%|▋         | 7/97 [00:03<00:50,  1.77it/s][A
  8%|▊         | 8/97 [00:04<00:51,  1.73it/s][A
  9%|▉         | 9/97 [00:04<00:51,  1.71it/s][A
 10%|█         | 10/97 [00:05<00:51,  1.70it/s][A
 11%|█▏        | 11/97 [00:05<00:50,  1.69it/s][A
 12%|█▏        | 12/97 [00:06<00:50,  1.68it/s][A
 13%|█▎        | 13/97 [00:07<00:50,  1.68it/s][A
 14%|█▍        | 14/97 [00:07<00:49,  1.68it/s][A
 15%|█▌        | 15/97 [00:08<00:49,  1.67it/s][A
 16%|█▋        | 16/97 [00:08<00:48,  1.67it/s][A
 18%|█▊        | 17/97 [00:09<00:47,  1.67it/s][A
 19%|█▊        | 18/97 [00:10<00:47,  1.67it/s][A
 20%|█▉        | 19/97 [00:10<00:46,  1.67it/s][A
 21%|██        | 20/97 [00:11<00:46,  1.67it/s]

{'eval_loss': 0.6524668335914612, 'eval_runtime': 58.0834, 'eval_samples_per_second': 13.343, 'eval_steps_per_second': 1.67, 'epoch': 0.31}


 34%|███▍      | 99/288 [16:09<12:18,  3.91s/it]  

{'loss': 0.644, 'grad_norm': 0.8760396838188171, 'learning_rate': 1.568064746731156e-05, 'epoch': 0.34}



  0%|          | 0/97 [00:00<?, ?it/s][A
  2%|▏         | 2/97 [00:00<00:28,  3.36it/s][A
  3%|▎         | 3/97 [00:01<00:39,  2.36it/s][A
  4%|▍         | 4/97 [00:01<00:45,  2.05it/s][A
  5%|▌         | 5/97 [00:02<00:48,  1.90it/s][A
  6%|▌         | 6/97 [00:02<00:50,  1.82it/s][A
  7%|▋         | 7/97 [00:03<00:50,  1.77it/s][A
  8%|▊         | 8/97 [00:04<00:51,  1.73it/s][A
  9%|▉         | 9/97 [00:04<00:51,  1.71it/s][A
 10%|█         | 10/97 [00:05<00:51,  1.70it/s][A
 11%|█▏        | 11/97 [00:05<00:50,  1.69it/s][A
 12%|█▏        | 12/97 [00:06<00:50,  1.68it/s][A
 13%|█▎        | 13/97 [00:07<00:50,  1.68it/s][A
 14%|█▍        | 14/97 [00:07<00:49,  1.68it/s][A
 15%|█▌        | 15/97 [00:08<00:48,  1.67it/s][A
 16%|█▋        | 16/97 [00:08<00:48,  1.67it/s][A
 18%|█▊        | 17/97 [00:09<00:47,  1.67it/s][A
 19%|█▊        | 18/97 [00:10<00:47,  1.67it/s][A
 20%|█▉        | 19/97 [00:10<00:46,  1.67it/s][A
 21%|██        | 20/97 [00:11<00:46,  1.67it/s]

{'eval_loss': 0.6505880951881409, 'eval_runtime': 58.0965, 'eval_samples_per_second': 13.34, 'eval_steps_per_second': 1.67, 'epoch': 0.34}


 38%|███▊      | 108/288 [17:44<12:01,  4.01s/it]  

{'loss': 0.609, 'grad_norm': 0.939542829990387, 'learning_rate': 1.4799374779597866e-05, 'epoch': 0.38}



  0%|          | 0/97 [00:00<?, ?it/s][A
  2%|▏         | 2/97 [00:00<00:28,  3.36it/s][A
  3%|▎         | 3/97 [00:01<00:39,  2.36it/s][A
  4%|▍         | 4/97 [00:01<00:45,  2.05it/s][A
  5%|▌         | 5/97 [00:02<00:48,  1.90it/s][A
  6%|▌         | 6/97 [00:02<00:50,  1.81it/s][A
  7%|▋         | 7/97 [00:03<00:50,  1.77it/s][A
  8%|▊         | 8/97 [00:04<00:51,  1.73it/s][A
  9%|▉         | 9/97 [00:04<00:51,  1.71it/s][A
 10%|█         | 10/97 [00:05<00:51,  1.70it/s][A
 11%|█▏        | 11/97 [00:05<00:50,  1.69it/s][A
 12%|█▏        | 12/97 [00:06<00:50,  1.68it/s][A
 13%|█▎        | 13/97 [00:07<00:50,  1.68it/s][A
 14%|█▍        | 14/97 [00:07<00:49,  1.68it/s][A
 15%|█▌        | 15/97 [00:08<00:48,  1.67it/s][A
 16%|█▋        | 16/97 [00:08<00:48,  1.67it/s][A
 18%|█▊        | 17/97 [00:09<00:47,  1.67it/s][A
 19%|█▊        | 18/97 [00:10<00:47,  1.67it/s][A
 20%|█▉        | 19/97 [00:10<00:46,  1.67it/s][A
 21%|██        | 20/97 [00:11<00:46,  1.67it/s]

{'eval_loss': 0.648729145526886, 'eval_runtime': 58.1072, 'eval_samples_per_second': 13.337, 'eval_steps_per_second': 1.669, 'epoch': 0.38}


 41%|████      | 117/288 [19:18<11:08,  3.91s/it]  

{'loss': 0.6408, 'grad_norm': 1.0429717302322388, 'learning_rate': 1.3866667429414188e-05, 'epoch': 0.41}



  0%|          | 0/97 [00:00<?, ?it/s][A
  2%|▏         | 2/97 [00:00<00:28,  3.36it/s][A
  3%|▎         | 3/97 [00:01<00:39,  2.36it/s][A
  4%|▍         | 4/97 [00:01<00:45,  2.05it/s][A
  5%|▌         | 5/97 [00:02<00:48,  1.90it/s][A
  6%|▌         | 6/97 [00:02<00:50,  1.82it/s][A
  7%|▋         | 7/97 [00:03<00:50,  1.77it/s][A
  8%|▊         | 8/97 [00:04<00:51,  1.73it/s][A
  9%|▉         | 9/97 [00:04<00:51,  1.71it/s][A
 10%|█         | 10/97 [00:05<00:51,  1.70it/s][A
 11%|█▏        | 11/97 [00:05<00:50,  1.69it/s][A
 12%|█▏        | 12/97 [00:06<00:50,  1.68it/s][A
 13%|█▎        | 13/97 [00:07<00:50,  1.68it/s][A
 14%|█▍        | 14/97 [00:07<00:49,  1.68it/s][A
 15%|█▌        | 15/97 [00:08<00:49,  1.67it/s][A
 16%|█▋        | 16/97 [00:08<00:48,  1.67it/s][A
 18%|█▊        | 17/97 [00:09<00:47,  1.67it/s][A
 19%|█▊        | 18/97 [00:10<00:47,  1.67it/s][A
 20%|█▉        | 19/97 [00:10<00:46,  1.67it/s][A
 21%|██        | 20/97 [00:11<00:46,  1.67it/s]

{'eval_loss': 0.6474384069442749, 'eval_runtime': 58.0957, 'eval_samples_per_second': 13.34, 'eval_steps_per_second': 1.67, 'epoch': 0.41}


 44%|████▍     | 126/288 [20:53<10:34,  3.92s/it]  

{'loss': 0.6351, 'grad_norm': 0.9410820007324219, 'learning_rate': 1.2892521195365679e-05, 'epoch': 0.44}



  0%|          | 0/97 [00:00<?, ?it/s][A
  2%|▏         | 2/97 [00:00<00:28,  3.37it/s][A
  3%|▎         | 3/97 [00:01<00:39,  2.36it/s][A
  4%|▍         | 4/97 [00:01<00:45,  2.05it/s][A
  5%|▌         | 5/97 [00:02<00:48,  1.90it/s][A
  6%|▌         | 6/97 [00:02<00:50,  1.81it/s][A
  7%|▋         | 7/97 [00:03<00:50,  1.76it/s][A
  8%|▊         | 8/97 [00:04<00:51,  1.73it/s][A
  9%|▉         | 9/97 [00:04<00:51,  1.71it/s][A
 10%|█         | 10/97 [00:05<00:51,  1.70it/s][A
 11%|█▏        | 11/97 [00:05<00:50,  1.69it/s][A
 12%|█▏        | 12/97 [00:06<00:50,  1.68it/s][A
 13%|█▎        | 13/97 [00:07<00:50,  1.68it/s][A
 14%|█▍        | 14/97 [00:07<00:49,  1.67it/s][A
 15%|█▌        | 15/97 [00:08<00:49,  1.67it/s][A
 16%|█▋        | 16/97 [00:08<00:48,  1.67it/s][A
 18%|█▊        | 17/97 [00:09<00:47,  1.67it/s][A
 19%|█▊        | 18/97 [00:10<00:47,  1.67it/s][A
 20%|█▉        | 19/97 [00:10<00:46,  1.67it/s][A
 21%|██        | 20/97 [00:11<00:46,  1.67it/s]

{'eval_loss': 0.6456689238548279, 'eval_runtime': 58.1064, 'eval_samples_per_second': 13.338, 'eval_steps_per_second': 1.669, 'epoch': 0.44}


 47%|████▋     | 135/288 [22:27<09:58,  3.91s/it]  

{'loss': 0.6167, 'grad_norm': 0.9864495992660522, 'learning_rate': 1.1887375954529167e-05, 'epoch': 0.47}



  0%|          | 0/97 [00:00<?, ?it/s][A
  2%|▏         | 2/97 [00:00<00:28,  3.36it/s][A
  3%|▎         | 3/97 [00:01<00:39,  2.36it/s][A
  4%|▍         | 4/97 [00:01<00:45,  2.05it/s][A
  5%|▌         | 5/97 [00:02<00:48,  1.90it/s][A
  6%|▌         | 6/97 [00:02<00:50,  1.82it/s][A
  7%|▋         | 7/97 [00:03<00:50,  1.76it/s][A
  8%|▊         | 8/97 [00:04<00:51,  1.73it/s][A
  9%|▉         | 9/97 [00:04<00:51,  1.71it/s][A
 10%|█         | 10/97 [00:05<00:51,  1.70it/s][A
 11%|█▏        | 11/97 [00:05<00:50,  1.69it/s][A
 12%|█▏        | 12/97 [00:06<00:50,  1.68it/s][A
 13%|█▎        | 13/97 [00:07<00:50,  1.68it/s][A
 14%|█▍        | 14/97 [00:07<00:49,  1.68it/s][A
 15%|█▌        | 15/97 [00:08<00:48,  1.67it/s][A
 16%|█▋        | 16/97 [00:08<00:48,  1.67it/s][A
 18%|█▊        | 17/97 [00:09<00:47,  1.67it/s][A
 19%|█▊        | 18/97 [00:10<00:47,  1.67it/s][A
 20%|█▉        | 19/97 [00:10<00:46,  1.67it/s][A
 21%|██        | 20/97 [00:11<00:46,  1.67it/s]

{'eval_loss': 0.6443417072296143, 'eval_runtime': 58.1179, 'eval_samples_per_second': 13.335, 'eval_steps_per_second': 1.669, 'epoch': 0.47}


 50%|█████     | 144/288 [24:02<09:24,  3.92s/it]  

{'loss': 0.6358, 'grad_norm': 1.0019580125808716, 'learning_rate': 1.0862003798806195e-05, 'epoch': 0.5}



  0%|          | 0/97 [00:00<?, ?it/s][A
  2%|▏         | 2/97 [00:00<00:28,  3.36it/s][A
  3%|▎         | 3/97 [00:01<00:39,  2.36it/s][A
  4%|▍         | 4/97 [00:01<00:45,  2.05it/s][A
  5%|▌         | 5/97 [00:02<00:48,  1.90it/s][A
  6%|▌         | 6/97 [00:02<00:50,  1.81it/s][A
  7%|▋         | 7/97 [00:03<00:51,  1.76it/s][A
  8%|▊         | 8/97 [00:04<00:51,  1.73it/s][A
  9%|▉         | 9/97 [00:04<00:51,  1.71it/s][A
 10%|█         | 10/97 [00:05<00:51,  1.70it/s][A
 11%|█▏        | 11/97 [00:05<00:50,  1.69it/s][A
 12%|█▏        | 12/97 [00:06<00:50,  1.68it/s][A
 13%|█▎        | 13/97 [00:07<00:50,  1.68it/s][A
 14%|█▍        | 14/97 [00:07<00:49,  1.68it/s][A
 15%|█▌        | 15/97 [00:08<00:49,  1.67it/s][A
 16%|█▋        | 16/97 [00:08<00:48,  1.67it/s][A
 18%|█▊        | 17/97 [00:09<00:47,  1.67it/s][A
 19%|█▊        | 18/97 [00:10<00:47,  1.67it/s][A
 20%|█▉        | 19/97 [00:10<00:46,  1.67it/s][A
 21%|██        | 20/97 [00:11<00:46,  1.67it/s]

{'eval_loss': 0.6431646347045898, 'eval_runtime': 58.1069, 'eval_samples_per_second': 13.337, 'eval_steps_per_second': 1.669, 'epoch': 0.5}


 53%|█████▎    | 153/288 [25:36<08:46,  3.90s/it]

{'loss': 0.642, 'grad_norm': 0.99605792760849, 'learning_rate': 9.827393590946116e-06, 'epoch': 0.53}



  0%|          | 0/97 [00:00<?, ?it/s][A
  2%|▏         | 2/97 [00:00<00:28,  3.36it/s][A
  3%|▎         | 3/97 [00:01<00:39,  2.36it/s][A
  4%|▍         | 4/97 [00:01<00:45,  2.05it/s][A
  5%|▌         | 5/97 [00:02<00:48,  1.90it/s][A
  6%|▌         | 6/97 [00:02<00:50,  1.82it/s][A
  7%|▋         | 7/97 [00:03<00:50,  1.77it/s][A
  8%|▊         | 8/97 [00:04<00:51,  1.73it/s][A
  9%|▉         | 9/97 [00:04<00:51,  1.71it/s][A
 10%|█         | 10/97 [00:05<00:51,  1.70it/s][A
 11%|█▏        | 11/97 [00:05<00:50,  1.69it/s][A
 12%|█▏        | 12/97 [00:06<00:50,  1.68it/s][A
 13%|█▎        | 13/97 [00:07<00:50,  1.68it/s][A
 14%|█▍        | 14/97 [00:07<00:49,  1.68it/s][A
 15%|█▌        | 15/97 [00:08<00:49,  1.67it/s][A
 16%|█▋        | 16/97 [00:08<00:48,  1.67it/s][A
 18%|█▊        | 17/97 [00:09<00:47,  1.67it/s][A
 19%|█▊        | 18/97 [00:10<00:47,  1.67it/s][A
 20%|█▉        | 19/97 [00:10<00:46,  1.67it/s][A
 21%|██        | 20/97 [00:11<00:46,  1.67it/s]

{'eval_loss': 0.6422916054725647, 'eval_runtime': 58.0912, 'eval_samples_per_second': 13.341, 'eval_steps_per_second': 1.67, 'epoch': 0.53}


 56%|█████▋    | 162/288 [27:10<08:11,  3.90s/it]

{'loss': 0.6366, 'grad_norm': 0.9673943519592285, 'learning_rate': 8.79463319744677e-06, 'epoch': 0.56}



  0%|          | 0/97 [00:00<?, ?it/s][A
  2%|▏         | 2/97 [00:00<00:28,  3.36it/s][A
  3%|▎         | 3/97 [00:01<00:39,  2.36it/s][A
  4%|▍         | 4/97 [00:01<00:45,  2.05it/s][A
  5%|▌         | 5/97 [00:02<00:48,  1.90it/s][A
  6%|▌         | 6/97 [00:02<00:50,  1.82it/s][A
  7%|▋         | 7/97 [00:03<00:50,  1.77it/s][A
  8%|▊         | 8/97 [00:04<00:51,  1.73it/s][A
  9%|▉         | 9/97 [00:04<00:51,  1.71it/s][A
 10%|█         | 10/97 [00:05<00:51,  1.70it/s][A
 11%|█▏        | 11/97 [00:05<00:50,  1.69it/s][A
 12%|█▏        | 12/97 [00:06<00:50,  1.68it/s][A
 13%|█▎        | 13/97 [00:07<00:50,  1.68it/s][A
 14%|█▍        | 14/97 [00:07<00:49,  1.68it/s][A
 15%|█▌        | 15/97 [00:08<00:48,  1.67it/s][A
 16%|█▋        | 16/97 [00:08<00:48,  1.67it/s][A
 18%|█▊        | 17/97 [00:09<00:47,  1.67it/s][A
 19%|█▊        | 18/97 [00:10<00:47,  1.67it/s][A
 20%|█▉        | 19/97 [00:10<00:46,  1.67it/s][A
 21%|██        | 20/97 [00:11<00:46,  1.67it/s]

{'eval_loss': 0.6418711543083191, 'eval_runtime': 58.4192, 'eval_samples_per_second': 13.266, 'eval_steps_per_second': 1.66, 'epoch': 0.56}


 59%|█████▉    | 171/288 [28:45<07:38,  3.91s/it]

{'loss': 0.6061, 'grad_norm': 0.961106538772583, 'learning_rate': 7.774790660436857e-06, 'epoch': 0.6}



  0%|          | 0/97 [00:00<?, ?it/s][A
  2%|▏         | 2/97 [00:00<00:28,  3.35it/s][A
  3%|▎         | 3/97 [00:01<00:40,  2.33it/s][A
  4%|▍         | 4/97 [00:01<00:45,  2.05it/s][A
  5%|▌         | 5/97 [00:02<00:48,  1.90it/s][A
  6%|▌         | 6/97 [00:02<00:50,  1.82it/s][A
  7%|▋         | 7/97 [00:03<00:50,  1.77it/s][A
  8%|▊         | 8/97 [00:04<00:51,  1.73it/s][A
  9%|▉         | 9/97 [00:04<00:51,  1.71it/s][A
 10%|█         | 10/97 [00:05<00:51,  1.70it/s][A
 11%|█▏        | 11/97 [00:05<00:50,  1.69it/s][A
 12%|█▏        | 12/97 [00:06<00:50,  1.68it/s][A
 13%|█▎        | 13/97 [00:07<00:50,  1.68it/s][A
 14%|█▍        | 14/97 [00:07<00:49,  1.68it/s][A
 15%|█▌        | 15/97 [00:08<00:49,  1.67it/s][A
 16%|█▋        | 16/97 [00:08<00:48,  1.67it/s][A
 18%|█▊        | 17/97 [00:09<00:47,  1.67it/s][A
 19%|█▊        | 18/97 [00:10<00:47,  1.67it/s][A
 20%|█▉        | 19/97 [00:10<00:46,  1.67it/s][A
 21%|██        | 20/97 [00:11<00:46,  1.67it/s]

{'eval_loss': 0.6412049531936646, 'eval_runtime': 58.1156, 'eval_samples_per_second': 13.335, 'eval_steps_per_second': 1.669, 'epoch': 0.6}


 62%|██████▎   | 180/288 [30:20<07:01,  3.91s/it]

{'loss': 0.6274, 'grad_norm': 0.8705094456672668, 'learning_rate': 6.778795582015096e-06, 'epoch': 0.63}



  0%|          | 0/97 [00:00<?, ?it/s][A
  2%|▏         | 2/97 [00:00<00:28,  3.36it/s][A
  3%|▎         | 3/97 [00:01<00:39,  2.37it/s][A
  4%|▍         | 4/97 [00:01<00:45,  2.05it/s][A
  5%|▌         | 5/97 [00:02<00:48,  1.90it/s][A
  6%|▌         | 6/97 [00:02<00:50,  1.82it/s][A
  7%|▋         | 7/97 [00:03<00:50,  1.77it/s][A
  8%|▊         | 8/97 [00:04<00:51,  1.73it/s][A
  9%|▉         | 9/97 [00:04<00:51,  1.71it/s][A
 10%|█         | 10/97 [00:05<00:51,  1.70it/s][A
 11%|█▏        | 11/97 [00:05<00:50,  1.69it/s][A
 12%|█▏        | 12/97 [00:06<00:50,  1.68it/s][A
 13%|█▎        | 13/97 [00:07<00:50,  1.68it/s][A
 14%|█▍        | 14/97 [00:07<00:49,  1.68it/s][A
 15%|█▌        | 15/97 [00:08<00:48,  1.67it/s][A
 16%|█▋        | 16/97 [00:08<00:48,  1.67it/s][A
 18%|█▊        | 17/97 [00:09<00:47,  1.67it/s][A
 19%|█▊        | 18/97 [00:10<00:47,  1.67it/s][A
 20%|█▉        | 19/97 [00:10<00:46,  1.67it/s][A
 21%|██        | 20/97 [00:11<00:46,  1.67it/s]

{'eval_loss': 0.6403592228889465, 'eval_runtime': 58.1007, 'eval_samples_per_second': 13.339, 'eval_steps_per_second': 1.67, 'epoch': 0.63}


 66%|██████▌   | 189/288 [31:54<06:26,  3.90s/it]

{'loss': 0.6073, 'grad_norm': 0.9312356114387512, 'learning_rate': 5.8173219922443516e-06, 'epoch': 0.66}



  0%|          | 0/97 [00:00<?, ?it/s][A
  2%|▏         | 2/97 [00:00<00:28,  3.35it/s][A
  3%|▎         | 3/97 [00:01<00:39,  2.36it/s][A
  4%|▍         | 4/97 [00:01<00:45,  2.04it/s][A
  5%|▌         | 5/97 [00:02<00:48,  1.90it/s][A
  6%|▌         | 6/97 [00:02<00:50,  1.81it/s][A
  7%|▋         | 7/97 [00:03<00:50,  1.77it/s][A
  8%|▊         | 8/97 [00:04<00:51,  1.73it/s][A
  9%|▉         | 9/97 [00:04<00:51,  1.71it/s][A
 10%|█         | 10/97 [00:05<00:51,  1.70it/s][A
 11%|█▏        | 11/97 [00:05<00:50,  1.69it/s][A
 12%|█▏        | 12/97 [00:06<00:50,  1.68it/s][A
 13%|█▎        | 13/97 [00:07<00:50,  1.68it/s][A
 14%|█▍        | 14/97 [00:07<00:49,  1.67it/s][A
 15%|█▌        | 15/97 [00:08<00:49,  1.67it/s][A
 16%|█▋        | 16/97 [00:08<00:48,  1.67it/s][A
 18%|█▊        | 17/97 [00:09<00:47,  1.67it/s][A
 19%|█▊        | 18/97 [00:10<00:47,  1.67it/s][A
 20%|█▉        | 19/97 [00:10<00:46,  1.67it/s][A
 21%|██        | 20/97 [00:11<00:46,  1.67it/s]

{'eval_loss': 0.6396884918212891, 'eval_runtime': 58.1001, 'eval_samples_per_second': 13.339, 'eval_steps_per_second': 1.67, 'epoch': 0.66}


 69%|██████▉   | 198/288 [33:28<05:51,  3.91s/it]

{'loss': 0.6244, 'grad_norm': 1.0328197479248047, 'learning_rate': 4.900673956098644e-06, 'epoch': 0.69}



  0%|          | 0/97 [00:00<?, ?it/s][A
  2%|▏         | 2/97 [00:00<00:28,  3.35it/s][A
  3%|▎         | 3/97 [00:01<00:39,  2.36it/s][A
  4%|▍         | 4/97 [00:01<00:45,  2.05it/s][A
  5%|▌         | 5/97 [00:02<00:48,  1.90it/s][A
  6%|▌         | 6/97 [00:02<00:50,  1.82it/s][A
  7%|▋         | 7/97 [00:03<00:50,  1.77it/s][A
  8%|▊         | 8/97 [00:04<00:51,  1.73it/s][A
  9%|▉         | 9/97 [00:04<00:51,  1.71it/s][A
 10%|█         | 10/97 [00:05<00:51,  1.70it/s][A
 11%|█▏        | 11/97 [00:05<00:50,  1.69it/s][A
 12%|█▏        | 12/97 [00:06<00:50,  1.68it/s][A
 13%|█▎        | 13/97 [00:07<00:50,  1.68it/s][A
 14%|█▍        | 14/97 [00:07<00:49,  1.68it/s][A
 15%|█▌        | 15/97 [00:08<00:48,  1.67it/s][A
 16%|█▋        | 16/97 [00:09<00:58,  1.39it/s][A
 18%|█▊        | 17/97 [00:09<00:54,  1.46it/s][A
 19%|█▊        | 18/97 [00:10<00:52,  1.52it/s][A
 20%|█▉        | 19/97 [00:11<00:49,  1.56it/s][A
 21%|██        | 20/97 [00:11<00:48,  1.59it/s]

{'eval_loss': 0.6390401124954224, 'eval_runtime': 58.5097, 'eval_samples_per_second': 13.246, 'eval_steps_per_second': 1.658, 'epoch': 0.69}


 72%|███████▏  | 207/288 [35:03<05:16,  3.91s/it]

{'loss': 0.6218, 'grad_norm': 0.8988907337188721, 'learning_rate': 4.038675145307747e-06, 'epoch': 0.72}



  0%|          | 0/97 [00:00<?, ?it/s][A
  2%|▏         | 2/97 [00:00<00:28,  3.35it/s][A
  3%|▎         | 3/97 [00:01<00:39,  2.36it/s][A
  4%|▍         | 4/97 [00:01<00:45,  2.05it/s][A
  5%|▌         | 5/97 [00:02<00:48,  1.90it/s][A
  6%|▌         | 6/97 [00:02<00:50,  1.81it/s][A
  7%|▋         | 7/97 [00:03<00:50,  1.77it/s][A
  8%|▊         | 8/97 [00:04<00:51,  1.73it/s][A
  9%|▉         | 9/97 [00:04<00:51,  1.71it/s][A
 10%|█         | 10/97 [00:05<00:51,  1.70it/s][A
 11%|█▏        | 11/97 [00:05<00:50,  1.69it/s][A
 12%|█▏        | 12/97 [00:06<00:50,  1.68it/s][A
 13%|█▎        | 13/97 [00:07<00:50,  1.68it/s][A
 14%|█▍        | 14/97 [00:07<00:49,  1.68it/s][A
 15%|█▌        | 15/97 [00:08<00:48,  1.67it/s][A
 16%|█▋        | 16/97 [00:08<00:48,  1.67it/s][A
 18%|█▊        | 17/97 [00:09<00:47,  1.67it/s][A
 19%|█▊        | 18/97 [00:10<00:47,  1.67it/s][A
 20%|█▉        | 19/97 [00:10<00:46,  1.67it/s][A
 21%|██        | 20/97 [00:11<00:46,  1.67it/s]

{'eval_loss': 0.6385577917098999, 'eval_runtime': 58.0924, 'eval_samples_per_second': 13.341, 'eval_steps_per_second': 1.67, 'epoch': 0.72}


 75%|███████▌  | 216/288 [36:37<04:41,  3.90s/it]

{'loss': 0.6052, 'grad_norm': 1.0243576765060425, 'learning_rate': 3.2405635585524566e-06, 'epoch': 0.75}



  0%|          | 0/97 [00:00<?, ?it/s][A
  2%|▏         | 2/97 [00:00<00:28,  3.37it/s][A
  3%|▎         | 3/97 [00:01<00:39,  2.36it/s][A
  4%|▍         | 4/97 [00:01<00:45,  2.05it/s][A
  5%|▌         | 5/97 [00:02<00:48,  1.90it/s][A
  6%|▌         | 6/97 [00:02<00:50,  1.82it/s][A
  7%|▋         | 7/97 [00:03<00:50,  1.77it/s][A
  8%|▊         | 8/97 [00:04<00:51,  1.73it/s][A
  9%|▉         | 9/97 [00:04<00:51,  1.71it/s][A
 10%|█         | 10/97 [00:05<00:51,  1.70it/s][A
 11%|█▏        | 11/97 [00:05<00:50,  1.69it/s][A
 12%|█▏        | 12/97 [00:06<00:50,  1.68it/s][A
 13%|█▎        | 13/97 [00:07<00:50,  1.68it/s][A
 14%|█▍        | 14/97 [00:07<00:49,  1.68it/s][A
 15%|█▌        | 15/97 [00:08<00:48,  1.67it/s][A
 16%|█▋        | 16/97 [00:08<00:48,  1.67it/s][A
 18%|█▊        | 17/97 [00:09<00:47,  1.67it/s][A
 19%|█▊        | 18/97 [00:10<00:47,  1.67it/s][A
 20%|█▉        | 19/97 [00:10<00:46,  1.67it/s][A
 21%|██        | 20/97 [00:11<00:46,  1.67it/s]

{'eval_loss': 0.6382173895835876, 'eval_runtime': 58.0999, 'eval_samples_per_second': 13.339, 'eval_steps_per_second': 1.67, 'epoch': 0.75}


 78%|███████▊  | 225/288 [38:12<04:05,  3.90s/it]

{'loss': 0.6589, 'grad_norm': 0.8781611919403076, 'learning_rate': 2.514892518288988e-06, 'epoch': 0.78}



  0%|          | 0/97 [00:00<?, ?it/s][A
  2%|▏         | 2/97 [00:00<00:28,  3.36it/s][A
  3%|▎         | 3/97 [00:01<00:39,  2.36it/s][A
  4%|▍         | 4/97 [00:01<00:45,  2.05it/s][A
  5%|▌         | 5/97 [00:02<00:48,  1.90it/s][A
  6%|▌         | 6/97 [00:02<00:50,  1.81it/s][A
  7%|▋         | 7/97 [00:03<00:50,  1.77it/s][A
  8%|▊         | 8/97 [00:04<00:51,  1.73it/s][A
  9%|▉         | 9/97 [00:04<00:51,  1.71it/s][A
 10%|█         | 10/97 [00:05<00:51,  1.70it/s][A
 11%|█▏        | 11/97 [00:05<00:50,  1.69it/s][A
 12%|█▏        | 12/97 [00:06<00:50,  1.68it/s][A
 13%|█▎        | 13/97 [00:07<00:50,  1.68it/s][A
 14%|█▍        | 14/97 [00:07<00:49,  1.68it/s][A
 15%|█▌        | 15/97 [00:08<00:48,  1.67it/s][A
 16%|█▋        | 16/97 [00:08<00:48,  1.67it/s][A
 18%|█▊        | 17/97 [00:09<00:47,  1.67it/s][A
 19%|█▊        | 18/97 [00:10<00:47,  1.67it/s][A
 20%|█▉        | 19/97 [00:10<00:46,  1.67it/s][A
 21%|██        | 20/97 [00:11<00:46,  1.67it/s]

{'eval_loss': 0.6379092335700989, 'eval_runtime': 58.1139, 'eval_samples_per_second': 13.336, 'eval_steps_per_second': 1.669, 'epoch': 0.78}


 81%|████████▏ | 234/288 [39:45<03:30,  3.90s/it]

{'loss': 0.6196, 'grad_norm': 1.0556769371032715, 'learning_rate': 1.8694390052146737e-06, 'epoch': 0.82}



  0%|          | 0/97 [00:00<?, ?it/s][A
  2%|▏         | 2/97 [00:00<00:28,  3.36it/s][A
  3%|▎         | 3/97 [00:01<00:39,  2.36it/s][A
  4%|▍         | 4/97 [00:01<00:45,  2.05it/s][A
  5%|▌         | 5/97 [00:02<00:48,  1.90it/s][A
  6%|▌         | 6/97 [00:02<00:50,  1.81it/s][A
  7%|▋         | 7/97 [00:03<00:50,  1.77it/s][A
  8%|▊         | 8/97 [00:04<00:51,  1.73it/s][A
  9%|▉         | 9/97 [00:04<00:51,  1.71it/s][A
 10%|█         | 10/97 [00:05<00:51,  1.70it/s][A
 11%|█▏        | 11/97 [00:05<00:50,  1.69it/s][A
 12%|█▏        | 12/97 [00:06<00:50,  1.68it/s][A
 13%|█▎        | 13/97 [00:07<00:50,  1.68it/s][A
 14%|█▍        | 14/97 [00:07<00:49,  1.67it/s][A
 15%|█▌        | 15/97 [00:08<00:49,  1.67it/s][A
 16%|█▋        | 16/97 [00:08<00:48,  1.67it/s][A
 18%|█▊        | 17/97 [00:09<00:47,  1.67it/s][A
 19%|█▊        | 18/97 [00:10<00:47,  1.67it/s][A
 20%|█▉        | 19/97 [00:10<00:46,  1.67it/s][A
 21%|██        | 20/97 [00:11<00:46,  1.67it/s]

{'eval_loss': 0.6377308368682861, 'eval_runtime': 58.1191, 'eval_samples_per_second': 13.335, 'eval_steps_per_second': 1.669, 'epoch': 0.82}


 84%|████████▍ | 243/288 [41:20<02:56,  3.91s/it]

{'loss': 0.6244, 'grad_norm': 0.9310774207115173, 'learning_rate': 1.311120312749935e-06, 'epoch': 0.85}



  0%|          | 0/97 [00:00<?, ?it/s][A
  2%|▏         | 2/97 [00:00<00:28,  3.36it/s][A
  3%|▎         | 3/97 [00:01<00:39,  2.36it/s][A
  4%|▍         | 4/97 [00:01<00:45,  2.05it/s][A
  5%|▌         | 5/97 [00:02<00:48,  1.90it/s][A
  6%|▌         | 6/97 [00:02<00:50,  1.81it/s][A
  7%|▋         | 7/97 [00:03<00:51,  1.76it/s][A
  8%|▊         | 8/97 [00:04<00:51,  1.73it/s][A
  9%|▉         | 9/97 [00:04<00:51,  1.71it/s][A
 10%|█         | 10/97 [00:05<00:51,  1.70it/s][A
 11%|█▏        | 11/97 [00:05<00:50,  1.69it/s][A
 12%|█▏        | 12/97 [00:06<00:50,  1.68it/s][A
 13%|█▎        | 13/97 [00:07<00:50,  1.68it/s][A
 14%|█▍        | 14/97 [00:07<00:49,  1.68it/s][A
 15%|█▌        | 15/97 [00:08<00:49,  1.67it/s][A
 16%|█▋        | 16/97 [00:08<00:48,  1.67it/s][A
 18%|█▊        | 17/97 [00:09<00:47,  1.67it/s][A
 19%|█▊        | 18/97 [00:10<00:47,  1.67it/s][A
 20%|█▉        | 19/97 [00:10<00:46,  1.67it/s][A
 21%|██        | 20/97 [00:11<00:46,  1.67it/s]

{'eval_loss': 0.6376007199287415, 'eval_runtime': 58.1128, 'eval_samples_per_second': 13.336, 'eval_steps_per_second': 1.669, 'epoch': 0.85}


 88%|████████▊ | 252/288 [42:55<02:20,  3.91s/it]

{'loss': 0.6092, 'grad_norm': 0.9245346784591675, 'learning_rate': 8.459199147463371e-07, 'epoch': 0.88}



  0%|          | 0/97 [00:00<?, ?it/s][A
  2%|▏         | 2/97 [00:00<00:28,  3.36it/s][A
  3%|▎         | 3/97 [00:01<00:39,  2.36it/s][A
  4%|▍         | 4/97 [00:01<00:45,  2.05it/s][A
  5%|▌         | 5/97 [00:02<00:48,  1.90it/s][A
  6%|▌         | 6/97 [00:02<00:50,  1.82it/s][A
  7%|▋         | 7/97 [00:03<00:50,  1.77it/s][A
  8%|▊         | 8/97 [00:04<00:51,  1.73it/s][A
  9%|▉         | 9/97 [00:04<00:51,  1.71it/s][A
 10%|█         | 10/97 [00:05<00:51,  1.70it/s][A
 11%|█▏        | 11/97 [00:05<00:50,  1.69it/s][A
 12%|█▏        | 12/97 [00:06<00:50,  1.68it/s][A
 13%|█▎        | 13/97 [00:07<00:50,  1.68it/s][A
 14%|█▍        | 14/97 [00:07<00:49,  1.68it/s][A
 15%|█▌        | 15/97 [00:08<00:48,  1.67it/s][A
 16%|█▋        | 16/97 [00:08<00:48,  1.67it/s][A
 18%|█▊        | 17/97 [00:09<00:47,  1.67it/s][A
 19%|█▊        | 18/97 [00:10<00:47,  1.67it/s][A
 20%|█▉        | 19/97 [00:10<00:46,  1.67it/s][A
 21%|██        | 20/97 [00:11<00:46,  1.67it/s]

{'eval_loss': 0.6374460458755493, 'eval_runtime': 58.0961, 'eval_samples_per_second': 13.34, 'eval_steps_per_second': 1.67, 'epoch': 0.88}


 91%|█████████ | 261/288 [44:29<01:45,  3.91s/it]

{'loss': 0.6222, 'grad_norm': 1.0773546695709229, 'learning_rate': 4.788233408928588e-07, 'epoch': 0.91}



  0%|          | 0/97 [00:00<?, ?it/s][A
  2%|▏         | 2/97 [00:00<00:28,  3.36it/s][A
  3%|▎         | 3/97 [00:01<00:39,  2.36it/s][A
  4%|▍         | 4/97 [00:01<00:45,  2.05it/s][A
  5%|▌         | 5/97 [00:02<00:48,  1.90it/s][A
  6%|▌         | 6/97 [00:02<00:50,  1.81it/s][A
  7%|▋         | 7/97 [00:03<00:50,  1.76it/s][A
  8%|▊         | 8/97 [00:04<00:51,  1.73it/s][A
  9%|▉         | 9/97 [00:04<00:51,  1.71it/s][A
 10%|█         | 10/97 [00:05<00:51,  1.70it/s][A
 11%|█▏        | 11/97 [00:05<00:50,  1.69it/s][A
 12%|█▏        | 12/97 [00:06<00:50,  1.68it/s][A
 13%|█▎        | 13/97 [00:07<00:50,  1.68it/s][A
 14%|█▍        | 14/97 [00:07<00:49,  1.68it/s][A
 15%|█▌        | 15/97 [00:08<00:48,  1.67it/s][A
 16%|█▋        | 16/97 [00:08<00:48,  1.67it/s][A
 18%|█▊        | 17/97 [00:09<00:47,  1.67it/s][A
 19%|█▊        | 18/97 [00:10<00:47,  1.67it/s][A
 20%|█▉        | 19/97 [00:10<00:46,  1.67it/s][A
 21%|██        | 20/97 [00:11<00:46,  1.67it/s]

{'eval_loss': 0.6373529434204102, 'eval_runtime': 58.0985, 'eval_samples_per_second': 13.339, 'eval_steps_per_second': 1.67, 'epoch': 0.91}


 94%|█████████▍| 270/288 [46:03<01:10,  3.90s/it]

{'loss': 0.612, 'grad_norm': 0.9593527317047119, 'learning_rate': 2.1376474704044693e-07, 'epoch': 0.94}



  0%|          | 0/97 [00:00<?, ?it/s][A
  2%|▏         | 2/97 [00:00<00:28,  3.36it/s][A
  3%|▎         | 3/97 [00:01<00:39,  2.36it/s][A
  4%|▍         | 4/97 [00:01<00:45,  2.05it/s][A
  5%|▌         | 5/97 [00:02<00:48,  1.90it/s][A
  6%|▌         | 6/97 [00:02<00:50,  1.81it/s][A
  7%|▋         | 7/97 [00:03<00:50,  1.77it/s][A
  8%|▊         | 8/97 [00:04<00:51,  1.73it/s][A
  9%|▉         | 9/97 [00:04<00:51,  1.71it/s][A
 10%|█         | 10/97 [00:05<00:51,  1.70it/s][A
 11%|█▏        | 11/97 [00:05<00:50,  1.69it/s][A
 12%|█▏        | 12/97 [00:06<00:50,  1.68it/s][A
 13%|█▎        | 13/97 [00:07<00:50,  1.68it/s][A
 14%|█▍        | 14/97 [00:07<00:49,  1.68it/s][A
 15%|█▌        | 15/97 [00:08<00:49,  1.67it/s][A
 16%|█▋        | 16/97 [00:08<00:48,  1.67it/s][A
 18%|█▊        | 17/97 [00:09<00:47,  1.67it/s][A
 19%|█▊        | 18/97 [00:10<00:47,  1.67it/s][A
 20%|█▉        | 19/97 [00:10<00:46,  1.67it/s][A
 21%|██        | 20/97 [00:11<00:46,  1.67it/s]

{'eval_loss': 0.6373732686042786, 'eval_runtime': 58.1117, 'eval_samples_per_second': 13.336, 'eval_steps_per_second': 1.669, 'epoch': 0.94}


 97%|█████████▋| 279/288 [47:25<00:33,  3.69s/it]

{'loss': 0.606, 'grad_norm': 0.9432071447372437, 'learning_rate': 5.3584753048073756e-08, 'epoch': 0.97}



  0%|          | 0/97 [00:00<?, ?it/s][A
  2%|▏         | 2/97 [00:00<00:28,  3.35it/s][A
  3%|▎         | 3/97 [00:01<00:39,  2.36it/s][A
  4%|▍         | 4/97 [00:01<00:45,  2.04it/s][A
  5%|▌         | 5/97 [00:02<00:48,  1.89it/s][A
  6%|▌         | 6/97 [00:02<00:50,  1.81it/s][A
  7%|▋         | 7/97 [00:03<00:51,  1.76it/s][A
  8%|▊         | 8/97 [00:04<00:51,  1.73it/s][A
  9%|▉         | 9/97 [00:04<00:51,  1.71it/s][A
 10%|█         | 10/97 [00:05<00:51,  1.70it/s][A
 11%|█▏        | 11/97 [00:05<00:50,  1.69it/s][A
 12%|█▏        | 12/97 [00:06<00:50,  1.68it/s][A
 13%|█▎        | 13/97 [00:07<00:50,  1.68it/s][A
 14%|█▍        | 14/97 [00:07<00:49,  1.67it/s][A
 15%|█▌        | 15/97 [00:08<00:49,  1.67it/s][A
 16%|█▋        | 16/97 [00:08<00:48,  1.67it/s][A
 18%|█▊        | 17/97 [00:09<00:47,  1.67it/s][A
 19%|█▊        | 18/97 [00:10<00:47,  1.67it/s][A
 20%|█▉        | 19/97 [00:10<00:46,  1.67it/s][A
 21%|██        | 20/97 [00:11<00:46,  1.67it/s]

{'eval_loss': 0.6374443769454956, 'eval_runtime': 58.1322, 'eval_samples_per_second': 13.332, 'eval_steps_per_second': 1.669, 'epoch': 0.97}


100%|██████████| 288/288 [48:47<00:00,  3.65s/it]

{'loss': 0.6156, 'grad_norm': 0.915183424949646, 'learning_rate': 0.0, 'epoch': 1.0}



  0%|          | 0/97 [00:00<?, ?it/s][A
  2%|▏         | 2/97 [00:00<00:28,  3.35it/s][A
  3%|▎         | 3/97 [00:01<00:39,  2.36it/s][A
  4%|▍         | 4/97 [00:01<00:45,  2.04it/s][A
  5%|▌         | 5/97 [00:02<00:48,  1.89it/s][A
  6%|▌         | 6/97 [00:02<00:50,  1.81it/s][A
  7%|▋         | 7/97 [00:03<00:51,  1.76it/s][A
  8%|▊         | 8/97 [00:04<00:51,  1.73it/s][A
  9%|▉         | 9/97 [00:04<00:51,  1.71it/s][A
 10%|█         | 10/97 [00:05<00:51,  1.70it/s][A
 11%|█▏        | 11/97 [00:05<00:50,  1.69it/s][A
 12%|█▏        | 12/97 [00:06<00:50,  1.68it/s][A
 13%|█▎        | 13/97 [00:07<00:50,  1.68it/s][A
 14%|█▍        | 14/97 [00:07<00:49,  1.67it/s][A
 15%|█▌        | 15/97 [00:08<00:49,  1.67it/s][A
 16%|█▋        | 16/97 [00:08<00:48,  1.67it/s][A
 18%|█▊        | 17/97 [00:09<00:47,  1.67it/s][A
 19%|█▊        | 18/97 [00:10<00:47,  1.67it/s][A
 20%|█▉        | 19/97 [00:10<00:46,  1.67it/s][A
 21%|██        | 20/97 [00:11<00:46,  1.67it/s]

{'eval_loss': 0.6373622417449951, 'eval_runtime': 58.1063, 'eval_samples_per_second': 13.338, 'eval_steps_per_second': 1.669, 'epoch': 1.0}


100%|██████████| 288/288 [49:45<00:00, 10.37s/it]

{'train_runtime': 2985.6714, 'train_samples_per_second': 1.543, 'train_steps_per_second': 0.096, 'train_loss': 0.6348927401834064, 'epoch': 1.0}





TrainOutput(global_step=288, training_loss=0.6348927401834064, metrics={'train_runtime': 2985.6714, 'train_samples_per_second': 1.543, 'train_steps_per_second': 0.096, 'total_flos': 1.0063972687989965e+17, 'train_loss': 0.6348927401834064, 'epoch': 1.0034843205574913})