In [None]:
from unsloth import FastLanguageModel, unsloth_train
# import os
import json
import torch
import random
import numpy as np
import pandas as pd
import datasets
from datasets import Dataset, load_dataset
# from transformers import (
#     # AutoConfig, AutoModelForCausalLM, AutoTokenizer,
#     # Seq2SeqTrainer, Seq2SeqTrainingArguments
# )
# from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
import bitsandbytes as bnb
# from transformers import BitsAndBytesConfig
# from peft import PeftModel, PeftConfig

from huggingface_hub import login
import wandb

# Load API tokens
# Read the credentials file inside a context manager so the file handle is
# closed deterministically instead of being left open until garbage collection.
with open('/home/jupyter/datasphere/project/tokens.json', encoding='utf-8') as tokens_file:
    CONFIG = json.load(tokens_file)

# Authenticate against the Hugging Face Hub and Weights & Biases with the
# "HF_TOK" and "WANDB_API_KEY" entries of the credentials file.
login(token=CONFIG["HF_TOK"])
wandb.login(key=CONFIG['WANDB_API_KEY'])

# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"

def set_random_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random generators with one value.

    Also switches cuDNN to deterministic mode and turns its benchmark mode off.

    Args:
        seed: Seed passed to every generator (defaults to 42).
    """
    # Python's and NumPy's global generators.
    random.seed(seed)
    np.random.seed(seed)
    # PyTorch generators (CPU, then all CUDA devices).
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # cuDNN flags: no benchmark mode, deterministic mode on.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


# Seed everything once at notebook start-up using the default seed.
set_random_seed()

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-06-01 01:11:35.516589: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


🦥 Unsloth Zoo will now patch everything to make training faster!


wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: rostik58533 (mika5883). Use `wandb login --relogin` to force relogin
wandb: Appending key for api.wandb.ai to your netrc file: /home/jupyter/.netrc


In [2]:
# Progress bars: use the notebook flavour of tqdm.
from tqdm.notebook import tqdm
# Register tqdm with pandas.
# NOTE(review): no pandas progress call is visible in this part of the
# notebook — confirm this registration is still needed.
tqdm.pandas()

In [3]:
# Load dataset
# The large tab-separated corpus is read through the Hugging Face "csv" loader,
# which returns a dict of splits; only its "train" split is used afterwards.
art_gec_splits = load_dataset(
    "csv",
    data_files=["/home/jupyter/datasphere/project/rugec/data/art_gec_full.tsv"],
    sep="\t",
)
# Rename the columns to the `*_sent` names used by the rest of the notebook.
dataset = art_gec_splits["train"].rename_columns(
    {'correct': 'correct_sent', 'corrupt': 'corrupt_sent'}
)
# dataset = dataset.filter(lambda example: example['sentence1'].startswith('Ar'))
# Load additional datasets
def load_tsv(path):
    """Read a tab-separated file into a pandas DataFrame.

    Args:
        path: Location of the TSV file; the first line is used as the header.

    Returns:
        A DataFrame with a default integer index (no column is used as index).
    """
    # `read_table` is the tab-delimited counterpart of `read_csv`.
    frame = pd.read_table(path, index_col=None)
    return frame

# RULEC-GEC splits: `train` goes into the training mix, `dev` and `test`
# become the evaluation splits of `fine_tune` below.
train = load_tsv('/home/jupyter/datasphere/project/rugec/data/RULEC-GEC.train.tsv')
dev = load_tsv('/home/jupyter/datasphere/project/rugec/data/RULEC-GEC.dev.tsv')
test = load_tsv('/home/jupyter/datasphere/project/rugec/data/RULEC-GEC.test.tsv')

# Corpora that are only ever added to the training mix.
clang8 = load_tsv('/home/jupyter/datasphere/project/rugec/data/clang8_source_target_ru.spacy_tokenized.tsv')
relco = load_tsv('/home/jupyter/datasphere/project/rugec/data/relco_filtered.tsv')
gera_train = load_tsv('/home/jupyter/datasphere/project/rugec/data/GERA.train.tsv')
gera_dev = load_tsv('/home/jupyter/datasphere/project/rugec/data/GERA.dev.tsv')
# gera_test = load_tsv('/home/jupyter/datasphere/project/rugec/data/GERA.test.tsv')
my_data = load_tsv('/home/jupyter/datasphere/project/rugec/data/25k_NVP.tsv')

# NOTE(review): an earlier revision cast `corrupt_sent` / `correct_sent` of the
# RULEC train/dev/test frames to `str` at this point; that cast is disabled.
# Confirm that none of the frames contain missing or non-string sentences.

# Combine datasets
training_frames = [train, clang8, relco, gera_train, gera_dev, my_data]
train_all = pd.concat(training_frames, ignore_index=True)

# Augmentation data: the first 100000 rows of `dataset`.
augmented_data = dataset.select(range(100000))

# Convert to Hugging Face Dataset format; the train split is the combined
# pandas data followed by the augmentation rows.
fine_tune = {
    'train': datasets.concatenate_datasets(
        [Dataset.from_pandas(train_all), augmented_data]
    ),
    'test': Dataset.from_pandas(test),
    'dev': Dataset.from_pandas(dev),
}
fine_tune

Generating train split: 10000000 examples [02:37, 63324.22 examples/s]


{'train': Dataset({
     features: ['corrupt_sent', 'correct_sent'],
     num_rows: 184947
 }),
 'test': Dataset({
     features: ['corrupt_sent', 'correct_sent'],
     num_rows: 5000
 }),
 'dev': Dataset({
     features: ['corrupt_sent', 'correct_sent'],
     num_rows: 2500
 })}

In [4]:
# Shuffle the training split with an explicit seed. Without one, `shuffle()`
# derives its seed from the global NumPy random state, so the resulting row
# order depends on which cells ran before this one; a fixed seed makes the
# order reproducible from a fresh kernel.
fine_tune['train'] = fine_tune['train'].shuffle(seed=42)

In [5]:
def format_dataset(examples):
    """Render a batch of sentence pairs as chat-formatted training texts.

    Each text consists of three `<|im_start|>` / `<|im_end|>` delimited turns:
    a system turn ("You are a Russian language teacher who checks the essays of
    your students."), a user turn asking to correct the errors in the corrupted
    sentence, and an assistant turn containing the corrected sentence.

    Args:
        examples: Batch mapping with the parallel lists `corrupt_sent` and
            `correct_sent`.

    Returns:
        A dict with a single key `"text"` holding one rendered string per
        (corrupt, correct) pair; pairing stops at the shorter of the two lists.
    """
    rendered_texts = []
    sentence_pairs = zip(examples['corrupt_sent'], examples['correct_sent'])
    for corrupt, correct in sentence_pairs:
        rendered_texts.append(
            f"<|im_start|>system\nТы учитель русского языка, который проверяет эссе своих учеников.<|im_end|>\n<|im_start|>user\nИсправь ошибки в следующем предложении: {corrupt}\nИсправленное предложение:<|im_end|>\n<|im_start|>assistant\n{correct}<|im_end|>"
        )
    return {"text": rendered_texts}

# Run `format_dataset` in batched mode over every split so that each one gains
# the rendered "text" column.
fine_tune = {
    split_name: split_dataset.map(format_dataset, batched=True)
    for split_name, split_dataset in fine_tune.items()
}


Map: 100%|██████████| 184947/184947 [00:06<00:00, 30551.88 examples/s]
Map: 100%|██████████| 5000/5000 [00:00<00:00, 135728.33 examples/s]
Map: 100%|██████████| 2500/2500 [00:00<00:00, 141747.35 examples/s]


In [7]:
# Sanity check: print the first row of the training split.
for first_example in fine_tune['train'].take(1):
    print(first_example)

{'corrupt_sent': 'Глава минздрава Ливана Хамад Хасан в интервью телекналу Sky News Arabia сообщил , что после взрыва в порту Бейрута ситуация катастрофическая : больницы в з сто улице и Ее окрестностях переоплнены ранеными,  а врачи испытывают нехватку медицинских принадл Бежностей .', 'correct_sent': 'Глава минздрава Ливана Хамад Хасан в интервью телеканалу Sky News Arabia сообщил , что после взрыва в порту Бейрута ситуация катастрофическая : больницы в столице и ее окрестностях переполнены ранеными , а врачи испытывают нехватку медицинских принадлежностей . ', 'text': '<|im_start|>system\nТы учитель русского языка, который проверяет эссе своих учеников.<|im_end|>\n<|im_start|>user\nИсправь ошибки в следующем предложении: Глава минздрава Ливана Хамад Хасан в интервью телекналу Sky News Arabia сообщил , что после взрыва в порту Бейрута ситуация катастрофическая : больницы в з сто улице и Ее окрестностях переоплнены ранеными,  а врачи испытывают нехватку медицинских принадл Бежностей .\

In [7]:
# Load model and tokenizer
# Alternative checkpoints (currently disabled):
#   RefalMachine/ruadapt_qwen2.5_3B_ext_u48_instruct_v4
#   Qwen/Qwen2-7B-Instruct
#   Qwen/Qwen3-4B
#   unsloth/Qwen3-4B-unsloth-bnb-4bit
model_name = 'unsloth/Qwen3-14B-unsloth-bnb-4bit'

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=512,     # Context length - can be longer, but uses more memory
    load_in_4bit=True,      # 4bit uses much less memory
    load_in_8bit=False,     # A bit more accurate, uses 2x memory
    full_finetuning=False,
)
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16,
# )

# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=bnb_config,
#     trust_remote_code=True,
#     low_cpu_mem_usage=True,
# ).to(device)
# model.config.use_cache = False

# LoRA configuration
def find_all_linear_names(model):
    """Collect the leaf names of all bitsandbytes 4-bit linear layers in `model`.

    Args:
        model: Object exposing `named_modules()` that yields
            `(qualified_name, module)` pairs.

    Returns:
        A sorted list of the unique last components of the qualified names of
        every `bnb.nn.Linear4bit` module, with "lm_head" left out.
    """
    linear_4bit_cls = bnb.nn.Linear4bit
    leaf_names = {
        qualified_name.split(".")[-1]
        for qualified_name, module in model.named_modules()
        if isinstance(module, linear_4bit_cls)
    }
    # The output head is never returned as a target module.
    leaf_names.discard("lm_head")
    # Sort before returning: iteration order of a set of strings can differ
    # between interpreter runs, which would make the returned list (and
    # anything logged or saved from it) non-reproducible.
    return sorted(leaf_names)

# Wrap the loaded model with LoRA adapters on every module name returned by
# `find_all_linear_names`. The result is assigned back to `model`, so this
# cell is not idempotent: re-running it wraps the already wrapped model.
target_modules = find_all_linear_names(model)
model = FastLanguageModel.get_peft_model(
    model,
    r=32,                             # Choose any number > 0! Suggested 8, 16, 32, 64, 128
    lora_alpha=64,                    # Best to choose alpha = rank or rank*2
    target_modules=target_modules,
    lora_dropout=0,                   # Supports any, but = 0 is optimized
    bias="none",                      # Supports any, but = "none" is optimized
    use_gradient_checkpointing=True,  # True or "unsloth" for very long context
    use_rslora=True,                  # We support rank stabilized LoRA
    # random_state = 3407,
    # loftq_config = None,  # And LoftQ
)


# peft_config = LoraConfig(
#     lora_alpha=64,
#     target_modules=target_modules,
#     lora_dropout=0.05,
#     r=16,
#     bias="none",
#     task_type="CAUSAL_LM"
# )

==((====))==  Unsloth 2025.5.7: Fast Qwen3 patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-80GB. Num GPUs = 1. Max memory: 79.325 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████| 3/3 [03:09<00:00, 63.08s/it]
Unsloth 2025.5.7 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.


In [8]:
# Sanity check: print the first row of the training split.
for first_example in fine_tune['train'].take(1):
    print(first_example)

{'corrupt_sent': 'Изд коронавируса автолизинг во всем мире сделали элитно езаболевания , опаснее которого чеовечесэво и не видывало .', 'correct_sent': 'Из коронавируса во всем мире сделали элитное заболевание , опаснее которого человечество и не видывало . ', 'text': '<|im_start|>system\nТы учитель русского языка, который проверяет эссе своих учеников.<|im_end|>\n<|im_start|>user\nИсправь ошибки в следующем предложении: Изд коронавируса автолизинг во всем мире сделали элитно езаболевания , опаснее которого чеовечесэво и не видывало .\nИсправленное предложение:<|im_end|>\n<|im_start|>assistant\nИз коронавируса во всем мире сделали элитное заболевание , опаснее которого человечество и не видывало . <|im_end|>'}


In [11]:
# Rough number of optimizer steps for one pass over the training split:
# rows floor-divided by 8 * 4 (the same values as the per-device train batch
# size and the gradient accumulation steps in the training config), plus one.
len(fine_tune['train']) // (8 * 4) + 1

5780

In [10]:
from transformers import EarlyStoppingCallback

In [12]:
args = SFTConfig(
    # --- Output / Hub ---
    output_dir="qwen3-14b_rugec",
    push_to_hub=True,
    hub_strategy="checkpoint",
    report_to='all',
    # NOTE(review): the Hugging Face docs describe this TrainingArguments field
    # as not used directly by Trainer; resuming normally requires passing
    # `resume_from_checkpoint` to the train call. Confirm the intent here.
    resume_from_checkpoint="last-checkpoint",

    # --- Batching and schedule length ---
    per_device_train_batch_size=8,  # Reduce batch size to prevent memory issues
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    max_steps=5800,
    # num_train_epochs=1,

    # --- Optimizer and learning-rate schedule ---
    optim="adamw_8bit",
    learning_rate=1e-4,
    weight_decay=0.02,
    lr_scheduler_type="cosine",
    warmup_ratio=0.01,
    # warmup_steps=200,
    bf16=True,

    # --- Logging, evaluation and checkpoint selection ---
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=100,
    save_strategy='best',
    save_total_limit=1,
    metric_for_best_model='loss',
    load_best_model_at_end=True,

    # --- SFT data handling ---
    dataset_text_field='text',
    max_seq_length=512,
    packing=False,
    # disable_tqdm=False
)


In [13]:
import logging

# Raise the "wandb" logger threshold to ERROR so that only errors are emitted.
wandb_logger = logging.getLogger("wandb")
wandb_logger.setLevel(logging.ERROR)

In [14]:
# Trainer
# Early stopping with a patience of 15 evaluations on the metric selected in
# `args` (`metric_for_best_model`).
early_stopping = EarlyStoppingCallback(early_stopping_patience=15)

trainer = SFTTrainer(
    model=model,
    args=args,
    processing_class=tokenizer,
    train_dataset=fine_tune['train'],
    eval_dataset=fine_tune['dev'],
    callbacks=[early_stopping],
    # peft_config=peft_config,
)

# Training
unsloth_train(trainer)


Map (num_proc=28): 100%|██████████| 184947/184947 [00:07<00:00, 25886.89 examples/s]
Map (num_proc=28): 100%|██████████| 2500/2500 [00:03<00:00, 729.63 examples/s]
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 184,947 | Num Epochs = 2 | Total steps = 5,800
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 4 x 1) = 32
 "-____-"     Trainable parameters = 128,450,560/14,000,000,000 (0.92% trained)
wandb: Tracking run with wandb version 0.18.5
wandb: Run data is saved locally in /home/jupyter/work/resources/rugec/notebooks/qwen_gec/wandb/run-20250601_013133-ttjrdkt8
wandb: Run `wandb offline` to turn off syncing.
wandb: Syncing run qwen3-14b_rugec
wandb: ⭐️ View project at https://wandb.ai/mika5883/huggingface
wandb: 🚀 View run at https://wandb.ai/mika5883/huggingface/runs/ttjrdkt8
  0%|          | 0/5800 [00:00<?, ?it/s]

Unsloth: Will smartly offload gradients to save VRAM!


  0%|          | 10/5800 [00:53<7:07:03,  4.43s/it]

{'loss': 2.5817, 'grad_norm': 6.015710830688477, 'learning_rate': 1.5517241379310346e-05, 'epoch': 0.0}


  0%|          | 20/5800 [01:31<5:53:02,  3.66s/it]

{'loss': 1.3876, 'grad_norm': 2.2214765548706055, 'learning_rate': 3.275862068965517e-05, 'epoch': 0.0}


  1%|          | 30/5800 [02:10<5:59:06,  3.73s/it]

{'loss': 1.0897, 'grad_norm': 1.0858349800109863, 'learning_rate': 5e-05, 'epoch': 0.01}


  1%|          | 40/5800 [02:46<5:57:08,  3.72s/it]

{'loss': 1.0129, 'grad_norm': 1.0100895166397095, 'learning_rate': 6.724137931034483e-05, 'epoch': 0.01}


  1%|          | 50/5800 [03:24<6:06:38,  3.83s/it]

{'loss': 0.9768, 'grad_norm': 0.9623083472251892, 'learning_rate': 8.448275862068966e-05, 'epoch': 0.01}


  1%|          | 60/5800 [03:58<5:23:02,  3.38s/it]

{'loss': 0.9678, 'grad_norm': 0.9459243416786194, 'learning_rate': 9.999999251635174e-05, 'epoch': 0.01}


  1%|          | 70/5800 [04:35<5:49:21,  3.66s/it]

{'loss': 1.0035, 'grad_norm': 0.8169307112693787, 'learning_rate': 9.999909448127132e-05, 'epoch': 0.01}


  1%|▏         | 80/5800 [05:11<5:35:36,  3.52s/it]

{'loss': 0.9555, 'grad_norm': 0.9122353792190552, 'learning_rate': 9.999669974734172e-05, 'epoch': 0.01}


  2%|▏         | 90/5800 [05:45<5:28:06,  3.45s/it]

{'loss': 0.9379, 'grad_norm': 0.8205231428146362, 'learning_rate': 9.999280838624815e-05, 'epoch': 0.02}


  2%|▏         | 100/5800 [06:22<5:43:05,  3.61s/it]Unsloth: Not an error, but Qwen3ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


{'loss': 0.9894, 'grad_norm': 0.9211392998695374, 'learning_rate': 9.998742051447662e-05, 'epoch': 0.02}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:44,  7.07it/s][A
  1%|          | 3/313 [00:00<01:01,  5.04it/s][A
  1%|▏         | 4/313 [00:00<01:12,  4.25it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.58it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.52it/s][A
  2%|▏         | 7/313 [00:01<01:38,  3.10it/s][A
  3%|▎         | 8/313 [00:02<01:32,  3.28it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.32it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.58it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.76it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.69it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.72it/s][A
  4%|▍         | 14/313 [00:03<01:18,  3.80it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.91it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.03it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.07it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.96it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.02it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6898553967475891, 'eval_runtime': 83.9384, 'eval_samples_per_second': 29.784, 'eval_steps_per_second': 3.729, 'epoch': 0.02}


  2%|▏         | 110/5800 [09:00<8:09:19,  5.16s/it] 

{'loss': 0.9533, 'grad_norm': 0.821162760257721, 'learning_rate': 9.998053629331052e-05, 'epoch': 0.02}


  2%|▏         | 120/5800 [09:37<5:57:32,  3.78s/it]

{'loss': 0.939, 'grad_norm': 0.9390451908111572, 'learning_rate': 9.997215592882565e-05, 'epoch': 0.02}


  2%|▏         | 130/5800 [10:13<6:06:25,  3.88s/it]

{'loss': 0.907, 'grad_norm': 0.7198857665061951, 'learning_rate': 9.996227967188423e-05, 'epoch': 0.02}


  2%|▏         | 140/5800 [10:49<5:25:07,  3.45s/it]

{'loss': 0.9593, 'grad_norm': 0.788139283657074, 'learning_rate': 9.995090781812723e-05, 'epoch': 0.02}


  3%|▎         | 150/5800 [11:27<5:53:34,  3.75s/it]

{'loss': 0.9682, 'grad_norm': 0.6435049772262573, 'learning_rate': 9.993804070796566e-05, 'epoch': 0.03}


  3%|▎         | 160/5800 [12:05<6:01:34,  3.85s/it]

{'loss': 0.9689, 'grad_norm': 0.7634875178337097, 'learning_rate': 9.992367872657025e-05, 'epoch': 0.03}


  3%|▎         | 170/5800 [12:42<5:44:49,  3.67s/it]

{'loss': 0.9501, 'grad_norm': 0.8156092166900635, 'learning_rate': 9.990782230386e-05, 'epoch': 0.03}


  3%|▎         | 180/5800 [13:17<5:29:03,  3.51s/it]

{'loss': 0.9012, 'grad_norm': 0.8065553903579712, 'learning_rate': 9.989047191448932e-05, 'epoch': 0.03}


  3%|▎         | 190/5800 [13:53<5:41:25,  3.65s/it]

{'loss': 0.9724, 'grad_norm': 0.8228743672370911, 'learning_rate': 9.987162807783377e-05, 'epoch': 0.03}


  3%|▎         | 200/5800 [14:28<5:23:08,  3.46s/it]

{'loss': 0.9284, 'grad_norm': 0.7270958423614502, 'learning_rate': 9.985129135797453e-05, 'epoch': 0.03}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:44,  7.04it/s][A
  1%|          | 3/313 [00:00<01:02,  5.00it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.22it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.50it/s][A
  2%|▏         | 7/313 [00:01<01:39,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.75it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.68it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.72it/s][A
  4%|▍         | 14/313 [00:03<01:18,  3.79it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.90it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.02it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.07it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.96it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.01it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6876827478408813, 'eval_runtime': 83.5815, 'eval_samples_per_second': 29.911, 'eval_steps_per_second': 3.745, 'epoch': 0.03}


  4%|▎         | 210/5800 [16:51<7:25:35,  4.78s/it] 

{'loss': 0.952, 'grad_norm': 0.7607027292251587, 'learning_rate': 9.982946236368152e-05, 'epoch': 0.04}


  4%|▍         | 220/5800 [17:27<5:16:22,  3.40s/it]

{'loss': 0.9448, 'grad_norm': 0.8047840595245361, 'learning_rate': 9.98061417483952e-05, 'epoch': 0.04}


  4%|▍         | 230/5800 [18:02<5:33:17,  3.59s/it]

{'loss': 0.9087, 'grad_norm': 0.7010196447372437, 'learning_rate': 9.978133021020697e-05, 'epoch': 0.04}


  4%|▍         | 240/5800 [18:39<5:24:17,  3.50s/it]

{'loss': 0.9371, 'grad_norm': 0.7305755615234375, 'learning_rate': 9.97550284918383e-05, 'epoch': 0.04}


  4%|▍         | 250/5800 [19:16<5:28:43,  3.55s/it]

{'loss': 0.9381, 'grad_norm': 0.7592856884002686, 'learning_rate': 9.972723738061847e-05, 'epoch': 0.04}


  4%|▍         | 260/5800 [19:51<5:25:21,  3.52s/it]

{'loss': 0.9138, 'grad_norm': 0.7927075624465942, 'learning_rate': 9.969795770846104e-05, 'epoch': 0.04}


  5%|▍         | 270/5800 [20:28<5:34:03,  3.62s/it]

{'loss': 0.9171, 'grad_norm': 0.7674887180328369, 'learning_rate': 9.966719035183893e-05, 'epoch': 0.05}


  5%|▍         | 280/5800 [21:05<5:44:53,  3.75s/it]

{'loss': 0.9235, 'grad_norm': 0.6968740224838257, 'learning_rate': 9.963493623175812e-05, 'epoch': 0.05}


  5%|▌         | 290/5800 [21:40<5:33:46,  3.63s/it]

{'loss': 0.9336, 'grad_norm': 0.7178575992584229, 'learning_rate': 9.960119631373022e-05, 'epoch': 0.05}


  5%|▌         | 300/5800 [22:17<5:32:10,  3.62s/it]

{'loss': 0.9312, 'grad_norm': 0.7630997896194458, 'learning_rate': 9.956597160774345e-05, 'epoch': 0.05}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.07it/s][A
  1%|          | 3/313 [00:00<01:01,  5.00it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.23it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.51it/s][A
  2%|▏         | 7/313 [00:01<01:38,  3.10it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.32it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.75it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.68it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.72it/s][A
  4%|▍         | 14/313 [00:03<01:18,  3.79it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.91it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.03it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.07it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.96it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.02it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6797415018081665, 'eval_runtime': 83.6595, 'eval_samples_per_second': 29.883, 'eval_steps_per_second': 3.741, 'epoch': 0.05}


  5%|▌         | 310/5800 [24:31<7:16:09,  4.77s/it] 

{'loss': 0.9236, 'grad_norm': 0.8914933204650879, 'learning_rate': 9.952926316823242e-05, 'epoch': 0.05}


  6%|▌         | 320/5800 [25:05<5:25:06,  3.56s/it]

{'loss': 0.935, 'grad_norm': 0.8269317746162415, 'learning_rate': 9.949107209404665e-05, 'epoch': 0.06}


  6%|▌         | 330/5800 [25:41<5:32:57,  3.65s/it]

{'loss': 0.919, 'grad_norm': 0.7308562994003296, 'learning_rate': 9.945139952841753e-05, 'epoch': 0.06}


  6%|▌         | 340/5800 [26:19<5:48:58,  3.83s/it]

{'loss': 0.9477, 'grad_norm': 0.8175477981567383, 'learning_rate': 9.94102466589243e-05, 'epoch': 0.06}


  6%|▌         | 350/5800 [26:57<5:55:17,  3.91s/it]

{'loss': 0.9645, 'grad_norm': 0.9482796788215637, 'learning_rate': 9.936761471745826e-05, 'epoch': 0.06}


  6%|▌         | 360/5800 [27:33<5:21:30,  3.55s/it]

{'loss': 0.9381, 'grad_norm': 0.9046177268028259, 'learning_rate': 9.93235049801861e-05, 'epoch': 0.06}


  6%|▋         | 370/5800 [28:10<5:36:50,  3.72s/it]

{'loss': 0.9163, 'grad_norm': 0.7312265038490295, 'learning_rate': 9.92779187675116e-05, 'epoch': 0.06}


  7%|▋         | 380/5800 [28:47<5:25:23,  3.60s/it]

{'loss': 0.9369, 'grad_norm': 0.9754659533500671, 'learning_rate': 9.92308574440361e-05, 'epoch': 0.07}


  7%|▋         | 390/5800 [29:24<5:40:03,  3.77s/it]

{'loss': 0.9207, 'grad_norm': 0.8144022226333618, 'learning_rate': 9.918232241851769e-05, 'epoch': 0.07}


  7%|▋         | 400/5800 [30:01<5:27:40,  3.64s/it]

{'loss': 0.9477, 'grad_norm': 0.7482270002365112, 'learning_rate': 9.913231514382903e-05, 'epoch': 0.07}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.09it/s][A
  1%|          | 3/313 [00:00<01:01,  5.02it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.23it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.57it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.51it/s][A
  2%|▏         | 7/313 [00:01<01:38,  3.10it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.32it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.58it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.75it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.69it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.72it/s][A
  4%|▍         | 14/313 [00:03<01:18,  3.79it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.91it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.03it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.07it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.97it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.02it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6779100298881531, 'eval_runtime': 83.578, 'eval_samples_per_second': 29.912, 'eval_steps_per_second': 3.745, 'epoch': 0.07}


  7%|▋         | 410/5800 [32:26<7:25:59,  4.96s/it] 

{'loss': 0.9442, 'grad_norm': 0.7981593608856201, 'learning_rate': 9.908083711691383e-05, 'epoch': 0.07}


  7%|▋         | 420/5800 [33:02<5:30:38,  3.69s/it]

{'loss': 0.9109, 'grad_norm': 1.0844032764434814, 'learning_rate': 9.902788987874206e-05, 'epoch': 0.07}


  7%|▋         | 430/5800 [33:38<5:17:12,  3.54s/it]

{'loss': 0.9142, 'grad_norm': 0.7185619473457336, 'learning_rate': 9.897347501426386e-05, 'epoch': 0.07}


  8%|▊         | 440/5800 [34:12<5:13:16,  3.51s/it]

{'loss': 0.8833, 'grad_norm': 0.8481583595275879, 'learning_rate': 9.891759415236201e-05, 'epoch': 0.08}


  8%|▊         | 450/5800 [34:48<5:14:46,  3.53s/it]

{'loss': 0.8987, 'grad_norm': 0.7301335334777832, 'learning_rate': 9.886024896580325e-05, 'epoch': 0.08}


  8%|▊         | 460/5800 [35:25<5:20:51,  3.61s/it]

{'loss': 0.9299, 'grad_norm': 0.8622676134109497, 'learning_rate': 9.880144117118814e-05, 'epoch': 0.08}


  8%|▊         | 470/5800 [36:00<5:23:19,  3.64s/it]

{'loss': 0.9162, 'grad_norm': 0.7100499868392944, 'learning_rate': 9.874117252889976e-05, 'epoch': 0.08}


  8%|▊         | 480/5800 [36:37<5:30:56,  3.73s/it]

{'loss': 0.8815, 'grad_norm': 0.6737704277038574, 'learning_rate': 9.867944484305094e-05, 'epoch': 0.08}


  8%|▊         | 490/5800 [37:14<5:22:37,  3.65s/it]

{'loss': 0.8988, 'grad_norm': 0.7742936611175537, 'learning_rate': 9.861625996143022e-05, 'epoch': 0.08}


  9%|▊         | 500/5800 [37:51<5:09:19,  3.50s/it]

{'loss': 0.921, 'grad_norm': 0.716029703617096, 'learning_rate': 9.855161977544672e-05, 'epoch': 0.09}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.08it/s][A
  1%|          | 3/313 [00:00<01:02,  5.00it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.22it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.51it/s][A
  2%|▏         | 7/313 [00:01<01:38,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.32it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.58it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.75it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.69it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.72it/s][A
  4%|▍         | 14/313 [00:03<01:18,  3.79it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.91it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.03it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.08it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.97it/s][A
  6%|▌         | 19/313 [00:04<01:12,  4.03it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6807900667190552, 'eval_runtime': 83.5442, 'eval_samples_per_second': 29.924, 'eval_steps_per_second': 3.747, 'epoch': 0.09}


  9%|▉         | 510/5800 [39:51<6:45:42,  4.60s/it] 

{'loss': 0.9144, 'grad_norm': 0.8720099925994873, 'learning_rate': 9.848552622007326e-05, 'epoch': 0.09}


  9%|▉         | 520/5800 [40:28<5:12:54,  3.56s/it]

{'loss': 0.9301, 'grad_norm': 0.7763680815696716, 'learning_rate': 9.841798127378866e-05, 'epoch': 0.09}


  9%|▉         | 530/5800 [41:04<5:13:09,  3.57s/it]

{'loss': 0.9072, 'grad_norm': 0.8207877278327942, 'learning_rate': 9.83489869585184e-05, 'epoch': 0.09}


  9%|▉         | 540/5800 [41:39<5:13:10,  3.57s/it]

{'loss': 0.8771, 'grad_norm': 0.7608314156532288, 'learning_rate': 9.827854533957413e-05, 'epoch': 0.09}


  9%|▉         | 550/5800 [42:16<5:15:41,  3.61s/it]

{'loss': 0.954, 'grad_norm': 0.8814701437950134, 'learning_rate': 9.820665852559186e-05, 'epoch': 0.1}


 10%|▉         | 560/5800 [42:52<5:12:26,  3.58s/it]

{'loss': 0.8643, 'grad_norm': 0.8162974715232849, 'learning_rate': 9.813332866846876e-05, 'epoch': 0.1}


 10%|▉         | 570/5800 [43:30<5:43:39,  3.94s/it]

{'loss': 0.9407, 'grad_norm': 0.8587883114814758, 'learning_rate': 9.805855796329886e-05, 'epoch': 0.1}


 10%|█         | 580/5800 [44:05<4:50:39,  3.34s/it]

{'loss': 0.8975, 'grad_norm': 0.939703643321991, 'learning_rate': 9.798234864830727e-05, 'epoch': 0.1}


 10%|█         | 590/5800 [44:41<5:16:05,  3.64s/it]

{'loss': 0.8693, 'grad_norm': 0.7515537738800049, 'learning_rate': 9.790470300478319e-05, 'epoch': 0.1}


 10%|█         | 600/5800 [45:18<5:11:21,  3.59s/it]

{'loss': 0.9214, 'grad_norm': 0.8917337656021118, 'learning_rate': 9.782562335701158e-05, 'epoch': 0.1}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.08it/s][A
  1%|          | 3/313 [00:00<01:01,  5.01it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.23it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.57it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.51it/s][A
  2%|▏         | 7/313 [00:01<01:38,  3.10it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.28it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.32it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.58it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.75it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.68it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.72it/s][A
  4%|▍         | 14/313 [00:03<01:18,  3.79it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.91it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.02it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.07it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.97it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.02it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6761459708213806, 'eval_runtime': 83.5616, 'eval_samples_per_second': 29.918, 'eval_steps_per_second': 3.746, 'epoch': 0.1}


 11%|█         | 610/5800 [47:51<7:24:51,  5.14s/it] 

{'loss': 0.9429, 'grad_norm': 0.757129967212677, 'learning_rate': 9.77451120722037e-05, 'epoch': 0.11}


 11%|█         | 620/5800 [48:27<5:15:39,  3.66s/it]

{'loss': 0.9352, 'grad_norm': 0.9175543785095215, 'learning_rate': 9.766317156042615e-05, 'epoch': 0.11}


 11%|█         | 630/5800 [49:03<5:02:14,  3.51s/it]

{'loss': 0.908, 'grad_norm': 0.7622382044792175, 'learning_rate': 9.757980427452871e-05, 'epoch': 0.11}


 11%|█         | 640/5800 [49:39<5:06:30,  3.56s/it]

{'loss': 0.8835, 'grad_norm': 0.758353590965271, 'learning_rate': 9.749501271007103e-05, 'epoch': 0.11}


 11%|█         | 650/5800 [50:16<5:15:24,  3.67s/it]

{'loss': 0.9081, 'grad_norm': 0.8489050269126892, 'learning_rate': 9.740879940524781e-05, 'epoch': 0.11}


 11%|█▏        | 660/5800 [50:53<5:22:02,  3.76s/it]

{'loss': 0.9128, 'grad_norm': 0.8480801582336426, 'learning_rate': 9.732116694081286e-05, 'epoch': 0.11}


 12%|█▏        | 670/5800 [51:28<5:06:17,  3.58s/it]

{'loss': 0.9143, 'grad_norm': 0.841681718826294, 'learning_rate': 9.723211794000185e-05, 'epoch': 0.12}


 12%|█▏        | 680/5800 [52:02<4:51:35,  3.42s/it]

{'loss': 0.8548, 'grad_norm': 0.7679477334022522, 'learning_rate': 9.714165506845382e-05, 'epoch': 0.12}


 12%|█▏        | 690/5800 [52:39<5:15:36,  3.71s/it]

{'loss': 0.9214, 'grad_norm': 0.9901122450828552, 'learning_rate': 9.704978103413132e-05, 'epoch': 0.12}


 12%|█▏        | 700/5800 [53:16<4:59:56,  3.53s/it]

{'loss': 0.8867, 'grad_norm': 0.7925032377243042, 'learning_rate': 9.695649858723938e-05, 'epoch': 0.12}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.09it/s][A
  1%|          | 3/313 [00:00<01:02,  4.99it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.22it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.51it/s][A
  2%|▏         | 7/313 [00:01<01:38,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.75it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.68it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.72it/s][A
  4%|▍         | 14/313 [00:03<01:18,  3.79it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.90it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.02it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.07it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.96it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.02it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6771001219749451, 'eval_runtime': 83.7098, 'eval_samples_per_second': 29.865, 'eval_steps_per_second': 3.739, 'epoch': 0.12}


 12%|█▏        | 710/5800 [55:15<6:13:31,  4.40s/it] 

{'loss': 0.9077, 'grad_norm': 0.6943270564079285, 'learning_rate': 9.686181052014318e-05, 'epoch': 0.12}


 12%|█▏        | 720/5800 [55:49<4:41:08,  3.32s/it]

{'loss': 0.8497, 'grad_norm': 0.7677996754646301, 'learning_rate': 9.676571966728452e-05, 'epoch': 0.12}


 13%|█▎        | 730/5800 [56:25<5:08:38,  3.65s/it]

{'loss': 0.8984, 'grad_norm': 0.838508129119873, 'learning_rate': 9.66682289050968e-05, 'epoch': 0.13}


 13%|█▎        | 740/5800 [57:01<5:08:25,  3.66s/it]

{'loss': 0.9185, 'grad_norm': 0.9653245210647583, 'learning_rate': 9.656934115191916e-05, 'epoch': 0.13}


 13%|█▎        | 750/5800 [57:37<5:06:47,  3.65s/it]

{'loss': 0.8821, 'grad_norm': 1.150577425956726, 'learning_rate': 9.64690593679089e-05, 'epoch': 0.13}


 13%|█▎        | 760/5800 [58:13<5:06:44,  3.65s/it]

{'loss': 0.8994, 'grad_norm': 0.9269518852233887, 'learning_rate': 9.636738655495305e-05, 'epoch': 0.13}


 13%|█▎        | 770/5800 [58:48<5:18:41,  3.80s/it]

{'loss': 0.8787, 'grad_norm': 0.9918815493583679, 'learning_rate': 9.626432575657833e-05, 'epoch': 0.13}


 13%|█▎        | 780/5800 [59:23<4:48:43,  3.45s/it]

{'loss': 0.9018, 'grad_norm': 0.7654539346694946, 'learning_rate': 9.615988005786019e-05, 'epoch': 0.13}


 14%|█▎        | 790/5800 [1:00:00<4:59:26,  3.59s/it]

{'loss': 0.8747, 'grad_norm': 0.9215059280395508, 'learning_rate': 9.605405258533038e-05, 'epoch': 0.14}


 14%|█▍        | 800/5800 [1:00:36<4:57:01,  3.56s/it]

{'loss': 0.891, 'grad_norm': 0.7570186257362366, 'learning_rate': 9.594684650688341e-05, 'epoch': 0.14}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.08it/s][A
  1%|          | 3/313 [00:00<01:02,  4.99it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.22it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.51it/s][A
  2%|▏         | 7/313 [00:01<01:38,  3.10it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.58it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.75it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.68it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.72it/s][A
  4%|▍         | 14/313 [00:03<01:18,  3.79it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.89it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.02it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.06it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.96it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.01it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6750118136405945, 'eval_runtime': 83.6854, 'eval_samples_per_second': 29.874, 'eval_steps_per_second': 3.74, 'epoch': 0.14}


 14%|█▍        | 810/5800 [1:03:08<6:37:50,  4.78s/it] 

{'loss': 0.8912, 'grad_norm': 0.8019583821296692, 'learning_rate': 9.58382650316817e-05, 'epoch': 0.14}


 14%|█▍        | 820/5800 [1:03:43<4:43:19,  3.41s/it]

{'loss': 0.9, 'grad_norm': 0.8997704386711121, 'learning_rate': 9.572831141005947e-05, 'epoch': 0.14}


 14%|█▍        | 830/5800 [1:04:19<5:01:42,  3.64s/it]

{'loss': 0.8879, 'grad_norm': 0.8337780237197876, 'learning_rate': 9.561698893342551e-05, 'epoch': 0.14}


 14%|█▍        | 840/5800 [1:04:55<5:03:08,  3.67s/it]

{'loss': 0.8872, 'grad_norm': 0.872983992099762, 'learning_rate': 9.550430093416464e-05, 'epoch': 0.15}


 15%|█▍        | 850/5800 [1:05:33<5:05:49,  3.71s/it]

{'loss': 0.8952, 'grad_norm': 0.8959748148918152, 'learning_rate': 9.539025078553792e-05, 'epoch': 0.15}


 15%|█▍        | 860/5800 [1:06:08<4:52:41,  3.55s/it]

{'loss': 0.8792, 'grad_norm': 0.8051807880401611, 'learning_rate': 9.52748419015817e-05, 'epoch': 0.15}


 15%|█▌        | 870/5800 [1:06:44<4:58:39,  3.63s/it]

{'loss': 0.8951, 'grad_norm': 0.8593394756317139, 'learning_rate': 9.515807773700543e-05, 'epoch': 0.15}


 15%|█▌        | 880/5800 [1:07:18<4:38:25,  3.40s/it]

{'loss': 0.879, 'grad_norm': 0.9438800811767578, 'learning_rate': 9.50399617870882e-05, 'epoch': 0.15}


 15%|█▌        | 890/5800 [1:07:55<5:05:02,  3.73s/it]

{'loss': 0.9213, 'grad_norm': 0.8278340101242065, 'learning_rate': 9.492049758757422e-05, 'epoch': 0.15}


 16%|█▌        | 900/5800 [1:08:31<5:02:42,  3.71s/it]

{'loss': 0.8583, 'grad_norm': 0.8920608162879944, 'learning_rate': 9.479968871456679e-05, 'epoch': 0.16}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.08it/s][A
  1%|          | 3/313 [00:00<01:02,  4.98it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.21it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.51it/s][A
  2%|▏         | 7/313 [00:01<01:38,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.74it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.68it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.72it/s][A
  4%|▍         | 14/313 [00:03<01:18,  3.79it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.90it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.02it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.07it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.96it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.02it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6754932403564453, 'eval_runtime': 83.7086, 'eval_samples_per_second': 29.866, 'eval_steps_per_second': 3.739, 'epoch': 0.16}


 16%|█▌        | 910/5800 [1:10:31<6:22:44,  4.70s/it] 

{'loss': 0.8766, 'grad_norm': 0.8289142847061157, 'learning_rate': 9.467753878442148e-05, 'epoch': 0.16}


 16%|█▌        | 920/5800 [1:11:07<4:51:03,  3.58s/it]

{'loss': 0.9292, 'grad_norm': 0.8719796538352966, 'learning_rate': 9.45540514536377e-05, 'epoch': 0.16}


 16%|█▌        | 930/5800 [1:11:43<4:56:25,  3.65s/it]

{'loss': 0.8948, 'grad_norm': 1.0385291576385498, 'learning_rate': 9.442923041874927e-05, 'epoch': 0.16}


 16%|█▌        | 940/5800 [1:12:18<4:56:26,  3.66s/it]

{'loss': 0.9094, 'grad_norm': 0.9549623131752014, 'learning_rate': 9.43030794162139e-05, 'epoch': 0.16}


 16%|█▋        | 950/5800 [1:12:56<4:54:30,  3.64s/it]

{'loss': 0.9403, 'grad_norm': 0.8118468523025513, 'learning_rate': 9.417560222230115e-05, 'epoch': 0.16}


 17%|█▋        | 960/5800 [1:13:33<4:55:41,  3.67s/it]

{'loss': 0.9248, 'grad_norm': 0.8081384897232056, 'learning_rate': 9.404680265297954e-05, 'epoch': 0.17}


 17%|█▋        | 970/5800 [1:14:10<4:51:35,  3.62s/it]

{'loss': 0.8565, 'grad_norm': 0.9302375912666321, 'learning_rate': 9.391668456380221e-05, 'epoch': 0.17}


 17%|█▋        | 980/5800 [1:14:47<5:01:24,  3.75s/it]

{'loss': 0.9351, 'grad_norm': 0.8262361288070679, 'learning_rate': 9.378525184979162e-05, 'epoch': 0.17}


 17%|█▋        | 990/5800 [1:15:23<4:42:03,  3.52s/it]

{'loss': 0.9208, 'grad_norm': 0.8189195990562439, 'learning_rate': 9.365250844532282e-05, 'epoch': 0.17}


 17%|█▋        | 1000/5800 [1:15:58<4:39:48,  3.50s/it]

{'loss': 0.9186, 'grad_norm': 0.836380124092102, 'learning_rate': 9.351845832400584e-05, 'epoch': 0.17}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.07it/s][A
  1%|          | 3/313 [00:00<01:02,  4.99it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.22it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.51it/s][A
  2%|▏         | 7/313 [00:01<01:38,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.74it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.68it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.72it/s][A
  4%|▍         | 14/313 [00:03<01:18,  3.79it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.90it/s][A
  5%|▌         | 16/313 [00:04<01:14,  4.01it/s][A
  5%|▌         | 17/313 [00:04<01:13,  4.05it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.96it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.01it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6780780553817749, 'eval_runtime': 83.7238, 'eval_samples_per_second': 29.86, 'eval_steps_per_second': 3.738, 'epoch': 0.17}


 17%|█▋        | 1010/5800 [1:17:56<6:00:34,  4.52s/it] 

{'loss': 0.9075, 'grad_norm': 0.8620930910110474, 'learning_rate': 9.338310549856657e-05, 'epoch': 0.17}


 18%|█▊        | 1020/5800 [1:18:34<4:59:08,  3.75s/it]

{'loss': 0.9181, 'grad_norm': 1.0194014310836792, 'learning_rate': 9.324645402072673e-05, 'epoch': 0.18}


 18%|█▊        | 1030/5800 [1:19:15<5:16:44,  3.98s/it]

{'loss': 0.8923, 'grad_norm': 0.9024031162261963, 'learning_rate': 9.310850798108263e-05, 'epoch': 0.18}


 18%|█▊        | 1040/5800 [1:19:52<4:49:22,  3.65s/it]

{'loss': 0.9252, 'grad_norm': 0.8997750282287598, 'learning_rate': 9.29692715089826e-05, 'epoch': 0.18}


 18%|█▊        | 1050/5800 [1:20:30<4:59:31,  3.78s/it]

{'loss': 0.9403, 'grad_norm': 0.9035513997077942, 'learning_rate': 9.282874877240348e-05, 'epoch': 0.18}


 18%|█▊        | 1060/5800 [1:21:06<4:28:47,  3.40s/it]

{'loss': 0.8753, 'grad_norm': 0.8628132939338684, 'learning_rate': 9.268694397782585e-05, 'epoch': 0.18}


 18%|█▊        | 1070/5800 [1:21:41<4:38:22,  3.53s/it]

{'loss': 0.9069, 'grad_norm': 1.0172667503356934, 'learning_rate': 9.2543861370108e-05, 'epoch': 0.19}


 19%|█▊        | 1080/5800 [1:22:19<4:48:00,  3.66s/it]

{'loss': 0.8938, 'grad_norm': 0.8527135252952576, 'learning_rate': 9.2399505232359e-05, 'epoch': 0.19}


 19%|█▉        | 1090/5800 [1:22:54<4:41:37,  3.59s/it]

{'loss': 0.8871, 'grad_norm': 0.7801002860069275, 'learning_rate': 9.22538798858104e-05, 'epoch': 0.19}


 19%|█▉        | 1100/5800 [1:23:31<4:38:42,  3.56s/it]

{'loss': 0.9254, 'grad_norm': 0.9427697658538818, 'learning_rate': 9.210698968968695e-05, 'epoch': 0.19}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.09it/s][A
  1%|          | 3/313 [00:00<01:01,  5.00it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.23it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.50it/s][A
  2%|▏         | 7/313 [00:01<01:38,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.58it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.75it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.69it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.72it/s][A
  4%|▍         | 14/313 [00:03<01:18,  3.80it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.91it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.04it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.07it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.97it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.02it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6722095012664795, 'eval_runtime': 83.6414, 'eval_samples_per_second': 29.889, 'eval_steps_per_second': 3.742, 'epoch': 0.19}


 19%|█▉        | 1110/5800 [1:25:56<6:31:58,  5.01s/it] 

{'loss': 0.9039, 'grad_norm': 0.8215741515159607, 'learning_rate': 9.1958839041076e-05, 'epoch': 0.19}


 19%|█▉        | 1120/5800 [1:26:30<4:25:46,  3.41s/it]

{'loss': 0.8651, 'grad_norm': 0.822327733039856, 'learning_rate': 9.1809432374796e-05, 'epoch': 0.19}


 19%|█▉        | 1130/5800 [1:27:06<4:36:36,  3.55s/it]

{'loss': 0.9437, 'grad_norm': 0.8510898351669312, 'learning_rate': 9.165877416326366e-05, 'epoch': 0.2}


 20%|█▉        | 1140/5800 [1:27:44<5:05:08,  3.93s/it]

{'loss': 0.8763, 'grad_norm': 0.8635074496269226, 'learning_rate': 9.150686891636005e-05, 'epoch': 0.2}


 20%|█▉        | 1150/5800 [1:28:21<4:48:40,  3.72s/it]

{'loss': 0.8531, 'grad_norm': 1.0166690349578857, 'learning_rate': 9.13537211812957e-05, 'epoch': 0.2}


 20%|██        | 1160/5800 [1:29:00<4:42:47,  3.66s/it]

{'loss': 0.9374, 'grad_norm': 0.9273441433906555, 'learning_rate': 9.119933554247445e-05, 'epoch': 0.2}


 20%|██        | 1170/5800 [1:29:35<4:28:42,  3.48s/it]

{'loss': 0.8548, 'grad_norm': 0.8464263081550598, 'learning_rate': 9.104371662135612e-05, 'epoch': 0.2}


 20%|██        | 1180/5800 [1:30:12<4:41:06,  3.65s/it]

{'loss': 0.8966, 'grad_norm': 0.9665746688842773, 'learning_rate': 9.08868690763183e-05, 'epoch': 0.2}


 21%|██        | 1190/5800 [1:30:47<4:22:37,  3.42s/it]

{'loss': 0.868, 'grad_norm': 0.8274655938148499, 'learning_rate': 9.072879760251679e-05, 'epoch': 0.21}


 21%|██        | 1200/5800 [1:31:22<4:20:44,  3.40s/it]

{'loss': 0.8674, 'grad_norm': 0.9298135042190552, 'learning_rate': 9.056950693174518e-05, 'epoch': 0.21}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:44,  7.06it/s][A
  1%|          | 3/313 [00:00<01:01,  5.00it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.22it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.51it/s][A
  2%|▏         | 7/313 [00:01<01:38,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.74it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.68it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.90it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.02it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.06it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.96it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.01it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6757708787918091, 'eval_runtime': 83.7024, 'eval_samples_per_second': 29.868, 'eval_steps_per_second': 3.739, 'epoch': 0.21}


 21%|██        | 1210/5800 [1:33:24<6:02:31,  4.74s/it] 

{'loss': 0.92, 'grad_norm': 0.9235158562660217, 'learning_rate': 9.040900183229306e-05, 'epoch': 0.21}


 21%|██        | 1220/5800 [1:34:00<4:41:30,  3.69s/it]

{'loss': 0.8964, 'grad_norm': 0.9878303408622742, 'learning_rate': 9.024728710880345e-05, 'epoch': 0.21}


 21%|██        | 1230/5800 [1:34:37<4:46:58,  3.77s/it]

{'loss': 0.8958, 'grad_norm': 0.9292741417884827, 'learning_rate': 9.008436760212878e-05, 'epoch': 0.21}


 21%|██▏       | 1240/5800 [1:35:12<4:31:02,  3.57s/it]

{'loss': 0.8831, 'grad_norm': 0.7761751413345337, 'learning_rate': 8.992024818918617e-05, 'epoch': 0.21}


 22%|██▏       | 1250/5800 [1:35:48<4:36:25,  3.65s/it]

{'loss': 0.9257, 'grad_norm': 0.8773901462554932, 'learning_rate': 8.975493378281128e-05, 'epoch': 0.22}


 22%|██▏       | 1260/5800 [1:36:24<4:24:23,  3.49s/it]

{'loss': 0.8716, 'grad_norm': 0.7828934192657471, 'learning_rate': 8.958842933161142e-05, 'epoch': 0.22}


 22%|██▏       | 1270/5800 [1:37:01<4:40:22,  3.71s/it]

{'loss': 0.8839, 'grad_norm': 0.8220714926719666, 'learning_rate': 8.942073981981723e-05, 'epoch': 0.22}


 22%|██▏       | 1280/5800 [1:37:35<4:18:06,  3.43s/it]

{'loss': 0.8991, 'grad_norm': 0.8851328492164612, 'learning_rate': 8.925187026713362e-05, 'epoch': 0.22}


 22%|██▏       | 1290/5800 [1:38:11<4:28:45,  3.58s/it]

{'loss': 0.9119, 'grad_norm': 0.7971526980400085, 'learning_rate': 8.908182572858944e-05, 'epoch': 0.22}


 22%|██▏       | 1300/5800 [1:38:47<4:23:09,  3.51s/it]

{'loss': 0.8636, 'grad_norm': 0.8458296060562134, 'learning_rate': 8.891061129438618e-05, 'epoch': 0.22}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.08it/s][A
  1%|          | 3/313 [00:00<01:02,  4.99it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.22it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.51it/s][A
  2%|▏         | 7/313 [00:01<01:38,  3.10it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.32it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.58it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.75it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.68it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.72it/s][A
  4%|▍         | 14/313 [00:03<01:18,  3.79it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.90it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.02it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.07it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.96it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.02it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.679550290107727, 'eval_runtime': 83.738, 'eval_samples_per_second': 29.855, 'eval_steps_per_second': 3.738, 'epoch': 0.22}


 23%|██▎       | 1310/5800 [1:40:48<5:48:33,  4.66s/it] 

{'loss': 0.8784, 'grad_norm': 0.8123028874397278, 'learning_rate': 8.873823208974556e-05, 'epoch': 0.23}


 23%|██▎       | 1320/5800 [1:41:25<4:33:20,  3.66s/it]

{'loss': 0.913, 'grad_norm': 0.8134354948997498, 'learning_rate': 8.856469327475623e-05, 'epoch': 0.23}


 23%|██▎       | 1330/5800 [1:42:06<4:42:51,  3.80s/it]

{'loss': 0.9002, 'grad_norm': 1.0804033279418945, 'learning_rate': 8.839000004421914e-05, 'epoch': 0.23}


 23%|██▎       | 1340/5800 [1:42:44<4:42:48,  3.80s/it]

{'loss': 0.8856, 'grad_norm': 0.8233842849731445, 'learning_rate': 8.821415762749213e-05, 'epoch': 0.23}


 23%|██▎       | 1350/5800 [1:43:20<4:28:19,  3.62s/it]

{'loss': 0.89, 'grad_norm': 0.9836586117744446, 'learning_rate': 8.80371712883334e-05, 'epoch': 0.23}


 23%|██▎       | 1360/5800 [1:43:57<4:39:35,  3.78s/it]

{'loss': 0.9272, 'grad_norm': 0.9974381923675537, 'learning_rate': 8.785904632474386e-05, 'epoch': 0.24}


 24%|██▎       | 1370/5800 [1:44:33<4:41:03,  3.81s/it]

{'loss': 0.863, 'grad_norm': 0.8438715934753418, 'learning_rate': 8.76797880688087e-05, 'epoch': 0.24}


 24%|██▍       | 1380/5800 [1:45:09<4:19:59,  3.53s/it]

{'loss': 0.9288, 'grad_norm': 0.8910900354385376, 'learning_rate': 8.749940188653754e-05, 'epoch': 0.24}


 24%|██▍       | 1390/5800 [1:45:45<4:15:32,  3.48s/it]

{'loss': 0.873, 'grad_norm': 0.7809755802154541, 'learning_rate': 8.731789317770407e-05, 'epoch': 0.24}


 24%|██▍       | 1400/5800 [1:46:22<4:26:02,  3.63s/it]

{'loss': 0.9084, 'grad_norm': 1.0406297445297241, 'learning_rate': 8.713526737568413e-05, 'epoch': 0.24}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.07it/s][A
  1%|          | 3/313 [00:00<01:02,  4.98it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.21it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.50it/s][A
  2%|▏         | 7/313 [00:01<01:39,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.73it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.67it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.89it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.01it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.06it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.96it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.02it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6754797697067261, 'eval_runtime': 83.7035, 'eval_samples_per_second': 29.867, 'eval_steps_per_second': 3.739, 'epoch': 0.24}


 24%|██▍       | 1410/5800 [1:48:22<5:31:15,  4.53s/it] 

{'loss': 0.9164, 'grad_norm': 0.9217818379402161, 'learning_rate': 8.695152994729334e-05, 'epoch': 0.24}


 24%|██▍       | 1420/5800 [1:48:58<4:29:46,  3.70s/it]

{'loss': 0.8917, 'grad_norm': 0.9482040405273438, 'learning_rate': 8.676668639262324e-05, 'epoch': 0.25}


 25%|██▍       | 1430/5800 [1:49:35<4:18:41,  3.55s/it]

{'loss': 0.8905, 'grad_norm': 0.7724895477294922, 'learning_rate': 8.658074224487675e-05, 'epoch': 0.25}


 25%|██▍       | 1440/5800 [1:50:11<4:30:40,  3.72s/it]

{'loss': 0.8944, 'grad_norm': 0.7965384125709534, 'learning_rate': 8.639370307020252e-05, 'epoch': 0.25}


 25%|██▌       | 1450/5800 [1:50:47<4:06:49,  3.40s/it]

{'loss': 0.8746, 'grad_norm': 0.8107068538665771, 'learning_rate': 8.620557446752827e-05, 'epoch': 0.25}


 25%|██▌       | 1460/5800 [1:51:24<4:23:04,  3.64s/it]

{'loss': 0.8839, 'grad_norm': 0.9051913619041443, 'learning_rate': 8.601636206839328e-05, 'epoch': 0.25}


 25%|██▌       | 1470/5800 [1:52:00<4:22:27,  3.64s/it]

{'loss': 0.9023, 'grad_norm': 0.7543138861656189, 'learning_rate': 8.582607153677973e-05, 'epoch': 0.25}


 26%|██▌       | 1480/5800 [1:52:37<4:21:01,  3.63s/it]

{'loss': 0.877, 'grad_norm': 0.9856760501861572, 'learning_rate': 8.563470856894316e-05, 'epoch': 0.26}


 26%|██▌       | 1490/5800 [1:53:13<4:21:08,  3.64s/it]

{'loss': 0.8962, 'grad_norm': 0.8227445483207703, 'learning_rate': 8.544227889324198e-05, 'epoch': 0.26}


 26%|██▌       | 1500/5800 [1:53:48<4:06:28,  3.44s/it]

{'loss': 0.8762, 'grad_norm': 0.8740897178649902, 'learning_rate': 8.5248788269966e-05, 'epoch': 0.26}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.08it/s][A
  1%|          | 3/313 [00:00<01:02,  4.98it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.21it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.55it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.50it/s][A
  2%|▏         | 7/313 [00:01<01:39,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:25,  3.56it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.74it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.68it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.90it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.02it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.06it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.96it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.02it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6715632081031799, 'eval_runtime': 83.726, 'eval_samples_per_second': 29.859, 'eval_steps_per_second': 3.738, 'epoch': 0.26}


 26%|██▌       | 1510/5800 [1:56:20<5:43:15,  4.80s/it] 

{'loss': 0.8808, 'grad_norm': 0.8504020571708679, 'learning_rate': 8.505424249116402e-05, 'epoch': 0.26}


 26%|██▌       | 1520/5800 [1:56:55<4:25:52,  3.73s/it]

{'loss': 0.9051, 'grad_norm': 0.8679038882255554, 'learning_rate': 8.485864738047031e-05, 'epoch': 0.26}


 26%|██▋       | 1530/5800 [1:57:34<4:28:52,  3.78s/it]

{'loss': 0.8986, 'grad_norm': 0.945533037185669, 'learning_rate': 8.466200879293049e-05, 'epoch': 0.26}


 27%|██▋       | 1540/5800 [1:58:13<4:35:54,  3.89s/it]

{'loss': 0.8795, 'grad_norm': 0.8132824897766113, 'learning_rate': 8.446433261482611e-05, 'epoch': 0.27}


 27%|██▋       | 1550/5800 [1:58:47<4:09:57,  3.53s/it]

{'loss': 0.8769, 'grad_norm': 0.9359444379806519, 'learning_rate': 8.426562476349848e-05, 'epoch': 0.27}


 27%|██▋       | 1560/5800 [1:59:24<4:17:43,  3.65s/it]

{'loss': 0.8806, 'grad_norm': 0.8513033390045166, 'learning_rate': 8.406589118717161e-05, 'epoch': 0.27}


 27%|██▋       | 1570/5800 [1:59:59<4:06:35,  3.50s/it]

{'loss': 0.8782, 'grad_norm': 0.8694882392883301, 'learning_rate': 8.386513786477401e-05, 'epoch': 0.27}


 27%|██▋       | 1580/5800 [2:00:35<4:15:03,  3.63s/it]

{'loss': 0.8876, 'grad_norm': 0.7655044198036194, 'learning_rate': 8.36633708057599e-05, 'epoch': 0.27}


 27%|██▋       | 1590/5800 [2:01:11<4:07:10,  3.52s/it]

{'loss': 0.8987, 'grad_norm': 0.8315529227256775, 'learning_rate': 8.346059604992912e-05, 'epoch': 0.28}


 28%|██▊       | 1600/5800 [2:01:49<4:33:20,  3.90s/it]

{'loss': 0.8972, 'grad_norm': 0.8563298583030701, 'learning_rate': 8.325681966724647e-05, 'epoch': 0.28}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.09it/s][A
  1%|          | 3/313 [00:00<01:01,  5.00it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.22it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.51it/s][A
  2%|▏         | 7/313 [00:01<01:39,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.30it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.74it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.68it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.90it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.02it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.06it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.95it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.01it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6726592779159546, 'eval_runtime': 83.7602, 'eval_samples_per_second': 29.847, 'eval_steps_per_second': 3.737, 'epoch': 0.28}


 28%|██▊       | 1610/5800 [2:03:47<5:12:28,  4.47s/it] 

{'loss': 0.8346, 'grad_norm': 0.8989658355712891, 'learning_rate': 8.305204775766003e-05, 'epoch': 0.28}


 28%|██▊       | 1620/5800 [2:04:23<4:16:35,  3.68s/it]

{'loss': 0.9044, 'grad_norm': 0.8175770044326782, 'learning_rate': 8.284628645091836e-05, 'epoch': 0.28}


 28%|██▊       | 1630/5800 [2:05:00<4:14:26,  3.66s/it]

{'loss': 0.8526, 'grad_norm': 0.9095108509063721, 'learning_rate': 8.263954190638728e-05, 'epoch': 0.28}


 28%|██▊       | 1640/5800 [2:05:36<4:06:10,  3.55s/it]

{'loss': 0.9133, 'grad_norm': 0.9947728514671326, 'learning_rate': 8.243182031286531e-05, 'epoch': 0.28}


 28%|██▊       | 1650/5800 [2:06:12<4:13:04,  3.66s/it]

{'loss': 0.8737, 'grad_norm': 0.8599993586540222, 'learning_rate': 8.222312788839843e-05, 'epoch': 0.29}


 29%|██▊       | 1660/5800 [2:06:49<4:08:18,  3.60s/it]

{'loss': 0.8789, 'grad_norm': 0.9148308634757996, 'learning_rate': 8.201347088009403e-05, 'epoch': 0.29}


 29%|██▉       | 1670/5800 [2:07:24<4:13:00,  3.68s/it]

{'loss': 0.8699, 'grad_norm': 0.8280636072158813, 'learning_rate': 8.180285556393383e-05, 'epoch': 0.29}


 29%|██▉       | 1680/5800 [2:08:02<4:17:28,  3.75s/it]

{'loss': 0.9233, 'grad_norm': 0.8332366943359375, 'learning_rate': 8.159128824458604e-05, 'epoch': 0.29}


 29%|██▉       | 1690/5800 [2:08:37<4:04:27,  3.57s/it]

{'loss': 0.8614, 'grad_norm': 0.8075991868972778, 'learning_rate': 8.137877525521662e-05, 'epoch': 0.29}


 29%|██▉       | 1700/5800 [2:09:11<3:59:39,  3.51s/it]

{'loss': 0.871, 'grad_norm': 0.9421436190605164, 'learning_rate': 8.11653229572997e-05, 'epoch': 0.29}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.07it/s][A
  1%|          | 3/313 [00:00<01:02,  4.99it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.21it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.50it/s][A
  2%|▏         | 7/313 [00:01<01:38,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.74it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.68it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.90it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.02it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.06it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.96it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.02it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6779306530952454, 'eval_runtime': 83.7479, 'eval_samples_per_second': 29.852, 'eval_steps_per_second': 3.737, 'epoch': 0.29}


 29%|██▉       | 1710/5800 [2:11:11<5:09:58,  4.55s/it] 

{'loss': 0.8798, 'grad_norm': 0.8142483830451965, 'learning_rate': 8.095093774042717e-05, 'epoch': 0.3}


 30%|██▉       | 1720/5800 [2:11:47<4:11:56,  3.71s/it]

{'loss': 0.894, 'grad_norm': 0.7512742280960083, 'learning_rate': 8.073562602211741e-05, 'epoch': 0.3}


 30%|██▉       | 1730/5800 [2:12:27<4:07:48,  3.65s/it]

{'loss': 0.8958, 'grad_norm': 0.8227419257164001, 'learning_rate': 8.051939424762319e-05, 'epoch': 0.3}


 30%|███       | 1740/5800 [2:13:04<4:13:06,  3.74s/it]

{'loss': 0.8769, 'grad_norm': 0.8768590092658997, 'learning_rate': 8.030224888973866e-05, 'epoch': 0.3}


 30%|███       | 1750/5800 [2:13:41<4:05:45,  3.64s/it]

{'loss': 0.885, 'grad_norm': 1.001565933227539, 'learning_rate': 8.008419644860569e-05, 'epoch': 0.3}


 30%|███       | 1760/5800 [2:14:18<3:59:34,  3.56s/it]

{'loss': 0.882, 'grad_norm': 0.8985353708267212, 'learning_rate': 7.986524345151925e-05, 'epoch': 0.3}


 31%|███       | 1770/5800 [2:14:54<4:04:19,  3.64s/it]

{'loss': 0.8719, 'grad_norm': 0.898499608039856, 'learning_rate': 7.964539645273204e-05, 'epoch': 0.31}


 31%|███       | 1780/5800 [2:15:31<4:08:11,  3.70s/it]

{'loss': 0.8831, 'grad_norm': 0.871583104133606, 'learning_rate': 7.94246620332582e-05, 'epoch': 0.31}


 31%|███       | 1790/5800 [2:16:05<3:53:08,  3.49s/it]

{'loss': 0.8684, 'grad_norm': 0.7986581325531006, 'learning_rate': 7.920304680067646e-05, 'epoch': 0.31}


 31%|███       | 1800/5800 [2:16:42<4:03:25,  3.65s/it]

{'loss': 0.8718, 'grad_norm': 0.9577500224113464, 'learning_rate': 7.898055738893223e-05, 'epoch': 0.31}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:44,  7.04it/s][A
  1%|          | 3/313 [00:00<01:02,  4.99it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.21it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.55it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.50it/s][A
  2%|▏         | 7/313 [00:01<01:39,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.26it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:25,  3.56it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.74it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.68it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.90it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.02it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.07it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.96it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.01it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6697911024093628, 'eval_runtime': 83.8037, 'eval_samples_per_second': 29.832, 'eval_steps_per_second': 3.735, 'epoch': 0.31}


 31%|███       | 1810/5800 [2:19:07<5:40:40,  5.12s/it] 

{'loss': 0.9289, 'grad_norm': 0.9021440148353577, 'learning_rate': 7.875720045813905e-05, 'epoch': 0.31}


 31%|███▏      | 1820/5800 [2:19:44<4:02:48,  3.66s/it]

{'loss': 0.914, 'grad_norm': 0.8513519763946533, 'learning_rate': 7.853298269437923e-05, 'epoch': 0.31}


 32%|███▏      | 1830/5800 [2:20:21<4:00:19,  3.63s/it]

{'loss': 0.8726, 'grad_norm': 0.8369174599647522, 'learning_rate': 7.830791080950373e-05, 'epoch': 0.32}


 32%|███▏      | 1840/5800 [2:20:56<3:52:15,  3.52s/it]

{'loss': 0.8678, 'grad_norm': 0.816008448600769, 'learning_rate': 7.808199154093116e-05, 'epoch': 0.32}


 32%|███▏      | 1850/5800 [2:21:32<3:49:28,  3.49s/it]

{'loss': 0.8752, 'grad_norm': 0.835021436214447, 'learning_rate': 7.785523165144619e-05, 'epoch': 0.32}


 32%|███▏      | 1860/5800 [2:22:08<3:49:14,  3.49s/it]

{'loss': 0.8914, 'grad_norm': 0.9539132118225098, 'learning_rate': 7.762763792899707e-05, 'epoch': 0.32}


 32%|███▏      | 1870/5800 [2:22:46<4:22:43,  4.01s/it]

{'loss': 0.8986, 'grad_norm': 1.1131680011749268, 'learning_rate': 7.739921718649242e-05, 'epoch': 0.32}


 32%|███▏      | 1880/5800 [2:23:23<4:07:41,  3.79s/it]

{'loss': 0.8751, 'grad_norm': 0.9413450360298157, 'learning_rate': 7.716997626159734e-05, 'epoch': 0.33}


 33%|███▎      | 1890/5800 [2:23:59<3:48:14,  3.50s/it]

{'loss': 0.879, 'grad_norm': 1.0507797002792358, 'learning_rate': 7.693992201652867e-05, 'epoch': 0.33}


 33%|███▎      | 1900/5800 [2:24:36<4:00:02,  3.69s/it]

{'loss': 0.9044, 'grad_norm': 0.9121330380439758, 'learning_rate': 7.67090613378496e-05, 'epoch': 0.33}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.08it/s][A
  1%|          | 3/313 [00:00<01:02,  4.99it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.21it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.50it/s][A
  2%|▏         | 7/313 [00:01<01:39,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.74it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.68it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.90it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.02it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.06it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.95it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.01it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6667335629463196, 'eval_runtime': 83.7258, 'eval_samples_per_second': 29.859, 'eval_steps_per_second': 3.738, 'epoch': 0.33}


 33%|███▎      | 1910/5800 [2:26:48<4:54:13,  4.54s/it] 

{'loss': 0.8657, 'grad_norm': 0.8443592190742493, 'learning_rate': 7.647740113626354e-05, 'epoch': 0.33}


 33%|███▎      | 1920/5800 [2:27:25<4:06:17,  3.81s/it]

{'loss': 0.8886, 'grad_norm': 0.8647570013999939, 'learning_rate': 7.624494834640719e-05, 'epoch': 0.33}


 33%|███▎      | 1930/5800 [2:27:59<3:37:06,  3.37s/it]

{'loss': 0.829, 'grad_norm': 0.8407773971557617, 'learning_rate': 7.601170992664307e-05, 'epoch': 0.33}


 33%|███▎      | 1940/5800 [2:28:36<4:06:11,  3.83s/it]

{'loss': 0.878, 'grad_norm': 0.9583560824394226, 'learning_rate': 7.577769285885109e-05, 'epoch': 0.34}


 34%|███▎      | 1950/5800 [2:29:12<3:52:28,  3.62s/it]

{'loss': 0.888, 'grad_norm': 0.8576187491416931, 'learning_rate': 7.554290414821965e-05, 'epoch': 0.34}


 34%|███▍      | 1960/5800 [2:29:48<3:45:20,  3.52s/it]

{'loss': 0.8751, 'grad_norm': 0.8445473909378052, 'learning_rate': 7.530735082303588e-05, 'epoch': 0.34}


 34%|███▍      | 1970/5800 [2:30:23<3:45:54,  3.54s/it]

{'loss': 0.8945, 'grad_norm': 0.9504017233848572, 'learning_rate': 7.507103993447531e-05, 'epoch': 0.34}


 34%|███▍      | 1980/5800 [2:31:01<3:56:18,  3.71s/it]

{'loss': 0.8862, 'grad_norm': 0.8330782055854797, 'learning_rate': 7.483397855639074e-05, 'epoch': 0.34}


 34%|███▍      | 1990/5800 [2:31:37<3:45:50,  3.56s/it]

{'loss': 0.8628, 'grad_norm': 0.8790995478630066, 'learning_rate': 7.45961737851005e-05, 'epoch': 0.34}


 34%|███▍      | 2000/5800 [2:32:13<3:47:27,  3.59s/it]

{'loss': 0.8696, 'grad_norm': 0.8248181939125061, 'learning_rate': 7.435763273917611e-05, 'epoch': 0.35}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.09it/s][A
  1%|          | 3/313 [00:00<01:01,  5.00it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.22it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.50it/s][A
  2%|▏         | 7/313 [00:01<01:38,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.32it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.58it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.75it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.69it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.72it/s][A
  4%|▍         | 14/313 [00:03<01:18,  3.79it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.91it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.03it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.07it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.96it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.02it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6702947020530701, 'eval_runtime': 83.7264, 'eval_samples_per_second': 29.859, 'eval_steps_per_second': 3.738, 'epoch': 0.35}


 35%|███▍      | 2010/5800 [2:34:13<4:50:39,  4.60s/it] 

{'loss': 0.8941, 'grad_norm': 0.9419578909873962, 'learning_rate': 7.411836255922903e-05, 'epoch': 0.35}


 35%|███▍      | 2020/5800 [2:34:48<3:41:36,  3.52s/it]

{'loss': 0.8427, 'grad_norm': 0.9193402528762817, 'learning_rate': 7.387837040769705e-05, 'epoch': 0.35}


 35%|███▌      | 2030/5800 [2:35:25<3:54:47,  3.74s/it]

{'loss': 0.9158, 'grad_norm': 0.8948056101799011, 'learning_rate': 7.363766346862981e-05, 'epoch': 0.35}


 35%|███▌      | 2040/5800 [2:36:01<3:49:39,  3.66s/it]

{'loss': 0.9079, 'grad_norm': 0.8924005627632141, 'learning_rate': 7.339624894747377e-05, 'epoch': 0.35}


 35%|███▌      | 2050/5800 [2:36:37<3:41:06,  3.54s/it]

{'loss': 0.859, 'grad_norm': 0.8811632990837097, 'learning_rate': 7.315413407085655e-05, 'epoch': 0.35}


 36%|███▌      | 2060/5800 [2:37:12<3:48:08,  3.66s/it]

{'loss': 0.8722, 'grad_norm': 0.9034895896911621, 'learning_rate': 7.291132608637052e-05, 'epoch': 0.36}


 36%|███▌      | 2070/5800 [2:37:46<3:37:38,  3.50s/it]

{'loss': 0.8323, 'grad_norm': 0.8921817541122437, 'learning_rate': 7.266783226235596e-05, 'epoch': 0.36}


 36%|███▌      | 2080/5800 [2:38:22<3:44:27,  3.62s/it]

{'loss': 0.8417, 'grad_norm': 0.8138280510902405, 'learning_rate': 7.242365988768338e-05, 'epoch': 0.36}


 36%|███▌      | 2090/5800 [2:38:58<3:43:49,  3.62s/it]

{'loss': 0.8694, 'grad_norm': 1.1583423614501953, 'learning_rate': 7.21788162715354e-05, 'epoch': 0.36}


 36%|███▌      | 2100/5800 [2:39:35<3:52:30,  3.77s/it]

{'loss': 0.8923, 'grad_norm': 0.8686999082565308, 'learning_rate': 7.193330874318792e-05, 'epoch': 0.36}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.08it/s][A
  1%|          | 3/313 [00:00<01:02,  5.00it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.22it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.51it/s][A
  2%|▏         | 7/313 [00:01<01:38,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.74it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.68it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:18,  3.79it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.90it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.02it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.07it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.96it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.02it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6663892269134521, 'eval_runtime': 83.7747, 'eval_samples_per_second': 29.842, 'eval_steps_per_second': 3.736, 'epoch': 0.36}


 36%|███▋      | 2110/5800 [2:42:13<5:21:07,  5.22s/it] 

{'loss': 0.8825, 'grad_norm': 0.8704347610473633, 'learning_rate': 7.168714465179076e-05, 'epoch': 0.37}


 37%|███▋      | 2120/5800 [2:42:50<3:53:53,  3.81s/it]

{'loss': 0.894, 'grad_norm': 0.9947754144668579, 'learning_rate': 7.14403313661476e-05, 'epoch': 0.37}


 37%|███▋      | 2130/5800 [2:43:26<3:49:19,  3.75s/it]

{'loss': 0.9025, 'grad_norm': 0.8267766833305359, 'learning_rate': 7.119287627449545e-05, 'epoch': 0.37}


 37%|███▋      | 2140/5800 [2:44:05<3:54:57,  3.85s/it]

{'loss': 0.9273, 'grad_norm': 0.841367244720459, 'learning_rate': 7.09447867842835e-05, 'epoch': 0.37}


 37%|███▋      | 2150/5800 [2:44:41<3:44:50,  3.70s/it]

{'loss': 0.8526, 'grad_norm': 0.9064012765884399, 'learning_rate': 7.069607032195131e-05, 'epoch': 0.37}


 37%|███▋      | 2160/5800 [2:45:16<3:30:13,  3.47s/it]

{'loss': 0.8544, 'grad_norm': 1.0095405578613281, 'learning_rate': 7.044673433270659e-05, 'epoch': 0.37}


 37%|███▋      | 2170/5800 [2:45:49<3:20:04,  3.31s/it]

{'loss': 0.8302, 'grad_norm': 0.9836069345474243, 'learning_rate': 7.019678628030228e-05, 'epoch': 0.38}


 38%|███▊      | 2180/5800 [2:46:26<3:39:17,  3.63s/it]

{'loss': 0.8655, 'grad_norm': 0.8343697786331177, 'learning_rate': 6.994623364681312e-05, 'epoch': 0.38}


 38%|███▊      | 2190/5800 [2:47:03<3:38:51,  3.64s/it]

{'loss': 0.8856, 'grad_norm': 0.7839730978012085, 'learning_rate': 6.969508393241171e-05, 'epoch': 0.38}


 38%|███▊      | 2200/5800 [2:47:39<3:43:30,  3.73s/it]

{'loss': 0.8524, 'grad_norm': 0.8051570057868958, 'learning_rate': 6.944334465514395e-05, 'epoch': 0.38}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.07it/s][A
  1%|          | 3/313 [00:00<01:02,  4.98it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.21it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.55it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.49it/s][A
  2%|▏         | 7/313 [00:01<01:39,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.26it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.74it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.68it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.90it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.02it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.06it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.96it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.01it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6668261885643005, 'eval_runtime': 83.8019, 'eval_samples_per_second': 29.832, 'eval_steps_per_second': 3.735, 'epoch': 0.38}


 38%|███▊      | 2210/5800 [2:49:40<4:48:56,  4.83s/it] 

{'loss': 0.8689, 'grad_norm': 0.9022552371025085, 'learning_rate': 6.91910233507041e-05, 'epoch': 0.38}


 38%|███▊      | 2220/5800 [2:50:19<3:46:08,  3.79s/it]

{'loss': 0.9271, 'grad_norm': 0.8260829448699951, 'learning_rate': 6.893812757220903e-05, 'epoch': 0.38}


 38%|███▊      | 2230/5800 [2:50:56<3:50:28,  3.87s/it]

{'loss': 0.9065, 'grad_norm': 0.8443773984909058, 'learning_rate': 6.868466488997227e-05, 'epoch': 0.39}


 39%|███▊      | 2240/5800 [2:51:33<3:34:57,  3.62s/it]

{'loss': 0.8581, 'grad_norm': 0.9480289220809937, 'learning_rate': 6.84306428912773e-05, 'epoch': 0.39}


 39%|███▉      | 2250/5800 [2:52:08<3:21:11,  3.40s/it]

{'loss': 0.8465, 'grad_norm': 0.8045953512191772, 'learning_rate': 6.817606918015052e-05, 'epoch': 0.39}


 39%|███▉      | 2260/5800 [2:52:44<3:41:11,  3.75s/it]

{'loss': 0.8575, 'grad_norm': 0.8419873714447021, 'learning_rate': 6.792095137713354e-05, 'epoch': 0.39}


 39%|███▉      | 2270/5800 [2:53:19<3:18:41,  3.38s/it]

{'loss': 0.8422, 'grad_norm': 0.776531994342804, 'learning_rate': 6.766529711905513e-05, 'epoch': 0.39}


 39%|███▉      | 2280/5800 [2:53:54<3:24:59,  3.49s/it]

{'loss': 0.8338, 'grad_norm': 0.8773966431617737, 'learning_rate': 6.740911405880253e-05, 'epoch': 0.39}


 39%|███▉      | 2290/5800 [2:54:28<3:27:06,  3.54s/it]

{'loss': 0.8491, 'grad_norm': 0.8604735732078552, 'learning_rate': 6.715240986509246e-05, 'epoch': 0.4}


 40%|███▉      | 2300/5800 [2:55:04<3:25:25,  3.52s/it]

{'loss': 0.8575, 'grad_norm': 0.9003579020500183, 'learning_rate': 6.689519222224153e-05, 'epoch': 0.4}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.08it/s][A
  1%|          | 3/313 [00:00<01:01,  5.00it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.22it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.51it/s][A
  2%|▏         | 7/313 [00:01<01:38,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.74it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.67it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.90it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.02it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.06it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.96it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.00it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6661560535430908, 'eval_runtime': 83.7395, 'eval_samples_per_second': 29.854, 'eval_steps_per_second': 3.738, 'epoch': 0.4}


 40%|███▉      | 2310/5800 [2:57:26<4:36:39,  4.76s/it] 

{'loss': 0.8491, 'grad_norm': 0.7852982878684998, 'learning_rate': 6.663746882993616e-05, 'epoch': 0.4}


 40%|████      | 2320/5800 [2:58:01<3:30:16,  3.63s/it]

{'loss': 0.8579, 'grad_norm': 1.048638105392456, 'learning_rate': 6.637924740300219e-05, 'epoch': 0.4}


 40%|████      | 2330/5800 [2:58:38<3:32:55,  3.68s/it]

{'loss': 0.8477, 'grad_norm': 0.7904406785964966, 'learning_rate': 6.612053567117384e-05, 'epoch': 0.4}


 40%|████      | 2340/5800 [2:59:14<3:29:24,  3.63s/it]

{'loss': 0.8443, 'grad_norm': 0.8780279755592346, 'learning_rate': 6.58613413788624e-05, 'epoch': 0.4}


 41%|████      | 2350/5800 [2:59:51<3:31:16,  3.67s/it]

{'loss': 0.8888, 'grad_norm': 0.8488750457763672, 'learning_rate': 6.560167228492436e-05, 'epoch': 0.41}


 41%|████      | 2360/5800 [3:00:28<3:24:33,  3.57s/it]

{'loss': 0.8635, 'grad_norm': 0.7373877167701721, 'learning_rate': 6.534153616242918e-05, 'epoch': 0.41}


 41%|████      | 2370/5800 [3:01:04<3:21:38,  3.53s/it]

{'loss': 0.8433, 'grad_norm': 0.8790000677108765, 'learning_rate': 6.508094079842657e-05, 'epoch': 0.41}


 41%|████      | 2380/5800 [3:01:39<3:15:45,  3.43s/it]

{'loss': 0.8505, 'grad_norm': 0.9458850622177124, 'learning_rate': 6.481989399371347e-05, 'epoch': 0.41}


 41%|████      | 2390/5800 [3:02:15<3:21:16,  3.54s/it]

{'loss': 0.8696, 'grad_norm': 0.8672084212303162, 'learning_rate': 6.455840356260041e-05, 'epoch': 0.41}


 41%|████▏     | 2400/5800 [3:02:50<3:17:34,  3.49s/it]

{'loss': 0.865, 'grad_norm': 0.9661089181900024, 'learning_rate': 6.429647733267772e-05, 'epoch': 0.42}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:44,  7.05it/s][A
  1%|          | 3/313 [00:00<01:02,  4.99it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.22it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.50it/s][A
  2%|▏         | 7/313 [00:01<01:39,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:32,  3.30it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.74it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.67it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.90it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.02it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.06it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.95it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.01it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6657170653343201, 'eval_runtime': 83.7789, 'eval_samples_per_second': 29.84, 'eval_steps_per_second': 3.736, 'epoch': 0.42}


 42%|████▏     | 2410/5800 [3:05:13<4:32:52,  4.83s/it] 

{'loss': 0.8519, 'grad_norm': 1.0278488397598267, 'learning_rate': 6.40341231445811e-05, 'epoch': 0.42}


 42%|████▏     | 2420/5800 [3:05:48<3:18:10,  3.52s/it]

{'loss': 0.8343, 'grad_norm': 0.7547295093536377, 'learning_rate': 6.377134885175705e-05, 'epoch': 0.42}


 42%|████▏     | 2430/5800 [3:06:24<3:19:43,  3.56s/it]

{'loss': 0.8904, 'grad_norm': 0.801518976688385, 'learning_rate': 6.350816232022764e-05, 'epoch': 0.42}


 42%|████▏     | 2440/5800 [3:07:02<3:34:58,  3.84s/it]

{'loss': 0.8737, 'grad_norm': 0.9058918356895447, 'learning_rate': 6.324457142835513e-05, 'epoch': 0.42}


 42%|████▏     | 2450/5800 [3:07:35<3:07:38,  3.36s/it]

{'loss': 0.8301, 'grad_norm': 1.0331343412399292, 'learning_rate': 6.298058406660611e-05, 'epoch': 0.42}


 42%|████▏     | 2460/5800 [3:08:11<3:09:11,  3.40s/it]

{'loss': 0.8461, 'grad_norm': 0.9198092222213745, 'learning_rate': 6.27162081373153e-05, 'epoch': 0.43}


 43%|████▎     | 2470/5800 [3:08:45<3:09:12,  3.41s/it]

{'loss': 0.8639, 'grad_norm': 0.8707752823829651, 'learning_rate': 6.245145155444901e-05, 'epoch': 0.43}


 43%|████▎     | 2480/5800 [3:09:22<3:16:33,  3.55s/it]

{'loss': 0.8904, 'grad_norm': 0.8431342244148254, 'learning_rate': 6.218632224336824e-05, 'epoch': 0.43}


 43%|████▎     | 2490/5800 [3:09:58<3:18:02,  3.59s/it]

{'loss': 0.9016, 'grad_norm': 0.8598293662071228, 'learning_rate': 6.19208281405914e-05, 'epoch': 0.43}


 43%|████▎     | 2500/5800 [3:10:34<3:11:49,  3.49s/it]

{'loss': 0.9047, 'grad_norm': 0.9135134220123291, 'learning_rate': 6.16549771935568e-05, 'epoch': 0.43}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.09it/s][A
  1%|          | 3/313 [00:00<01:01,  5.00it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.23it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.57it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.51it/s][A
  2%|▏         | 7/313 [00:01<01:38,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.58it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.75it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.68it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.72it/s][A
  4%|▍         | 14/313 [00:03<01:18,  3.79it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.91it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.03it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.08it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.97it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.02it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6637772917747498, 'eval_runtime': 83.5955, 'eval_samples_per_second': 29.906, 'eval_steps_per_second': 3.744, 'epoch': 0.43}


 43%|████▎     | 2510/5800 [3:12:49<4:27:08,  4.87s/it] 

{'loss': 0.8733, 'grad_norm': 1.1695159673690796, 'learning_rate': 6.138877736038467e-05, 'epoch': 0.43}


 43%|████▎     | 2520/5800 [3:13:23<3:08:35,  3.45s/it]

{'loss': 0.8437, 'grad_norm': 0.9245681166648865, 'learning_rate': 6.112223660963903e-05, 'epoch': 0.44}


 44%|████▎     | 2530/5800 [3:14:00<3:14:09,  3.56s/it]

{'loss': 0.8505, 'grad_norm': 0.8198302388191223, 'learning_rate': 6.085536292008904e-05, 'epoch': 0.44}


 44%|████▍     | 2540/5800 [3:14:35<3:05:21,  3.41s/it]

{'loss': 0.8431, 'grad_norm': 0.7480392456054688, 'learning_rate': 6.0588164280470314e-05, 'epoch': 0.44}


 44%|████▍     | 2550/5800 [3:15:12<3:27:38,  3.83s/it]

{'loss': 0.8686, 'grad_norm': 0.9015849828720093, 'learning_rate': 6.032064868924561e-05, 'epoch': 0.44}


 44%|████▍     | 2560/5800 [3:15:47<3:14:46,  3.61s/it]

{'loss': 0.8387, 'grad_norm': 0.8270828723907471, 'learning_rate': 6.0052824154365474e-05, 'epoch': 0.44}


 44%|████▍     | 2570/5800 [3:16:24<3:21:55,  3.75s/it]

{'loss': 0.8557, 'grad_norm': 1.00472891330719, 'learning_rate': 5.9784698693028607e-05, 'epoch': 0.44}


 44%|████▍     | 2580/5800 [3:17:01<3:17:02,  3.67s/it]

{'loss': 0.9083, 'grad_norm': 0.9346905946731567, 'learning_rate': 5.951628033144173e-05, 'epoch': 0.45}


 45%|████▍     | 2590/5800 [3:17:36<3:02:33,  3.41s/it]

{'loss': 0.8613, 'grad_norm': 0.8845067620277405, 'learning_rate': 5.924757710457943e-05, 'epoch': 0.45}


 45%|████▍     | 2600/5800 [3:18:12<3:18:14,  3.72s/it]

{'loss': 0.8615, 'grad_norm': 0.8662773966789246, 'learning_rate': 5.897859705594359e-05, 'epoch': 0.45}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:44,  7.03it/s][A
  1%|          | 3/313 [00:00<01:02,  4.95it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.19it/s][A
  2%|▏         | 5/313 [00:01<01:27,  3.54it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.49it/s][A
  2%|▏         | 7/313 [00:01<01:39,  3.08it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.26it/s][A
  3%|▎         | 9/313 [00:02<01:32,  3.30it/s][A
  3%|▎         | 10/313 [00:02<01:25,  3.56it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.73it/s][A
  4%|▍         | 12/313 [00:03<01:22,  3.67it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:04<01:16,  3.89it/s][A
  5%|▌         | 16/313 [00:04<01:14,  4.01it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.06it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.95it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.01it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6662167906761169, 'eval_runtime': 83.7733, 'eval_samples_per_second': 29.842, 'eval_steps_per_second': 3.736, 'epoch': 0.45}


 45%|████▌     | 2610/5800 [3:20:12<3:56:47,  4.45s/it] 

{'loss': 0.8663, 'grad_norm': 1.0314249992370605, 'learning_rate': 5.87093482373226e-05, 'epoch': 0.45}


 45%|████▌     | 2620/5800 [3:20:46<3:04:27,  3.48s/it]

{'loss': 0.8233, 'grad_norm': 0.8143996596336365, 'learning_rate': 5.843983870855038e-05, 'epoch': 0.45}


 45%|████▌     | 2630/5800 [3:21:21<3:04:42,  3.50s/it]

{'loss': 0.8295, 'grad_norm': 0.9047433137893677, 'learning_rate': 5.817007653726504e-05, 'epoch': 0.46}


 46%|████▌     | 2640/5800 [3:21:58<3:14:54,  3.70s/it]

{'loss': 0.8622, 'grad_norm': 0.9644585847854614, 'learning_rate': 5.79000697986675e-05, 'epoch': 0.46}


 46%|████▌     | 2650/5800 [3:22:33<3:03:29,  3.49s/it]

{'loss': 0.8574, 'grad_norm': 0.7915915846824646, 'learning_rate': 5.762982657527961e-05, 'epoch': 0.46}


 46%|████▌     | 2660/5800 [3:23:09<3:07:29,  3.58s/it]

{'loss': 0.8543, 'grad_norm': 1.0433694124221802, 'learning_rate': 5.735935495670229e-05, 'epoch': 0.46}


 46%|████▌     | 2670/5800 [3:23:45<3:06:23,  3.57s/it]

{'loss': 0.894, 'grad_norm': 0.8824267387390137, 'learning_rate': 5.708866303937339e-05, 'epoch': 0.46}


 46%|████▌     | 2680/5800 [3:24:20<3:01:13,  3.49s/it]

{'loss': 0.8754, 'grad_norm': 0.855444073677063, 'learning_rate': 5.681775892632528e-05, 'epoch': 0.46}


 46%|████▋     | 2690/5800 [3:24:57<3:07:20,  3.61s/it]

{'loss': 0.8679, 'grad_norm': 0.8884530663490295, 'learning_rate': 5.654665072694232e-05, 'epoch': 0.47}


 47%|████▋     | 2700/5800 [3:25:31<3:00:18,  3.49s/it]

{'loss': 0.8441, 'grad_norm': 0.8568119406700134, 'learning_rate': 5.6275346556718075e-05, 'epoch': 0.47}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.08it/s][A
  1%|          | 3/313 [00:00<01:02,  4.99it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.22it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.50it/s][A
  2%|▏         | 7/313 [00:01<01:39,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.74it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.67it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.89it/s][A
  5%|▌         | 16/313 [00:04<01:14,  4.01it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.06it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.95it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.00it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6617206931114197, 'eval_runtime': 83.8265, 'eval_samples_per_second': 29.823, 'eval_steps_per_second': 3.734, 'epoch': 0.47}


 47%|████▋     | 2710/5800 [3:28:08<4:33:29,  5.31s/it] 

{'loss': 0.8705, 'grad_norm': 0.8303757309913635, 'learning_rate': 5.600385453701241e-05, 'epoch': 0.47}


 47%|████▋     | 2720/5800 [3:28:44<3:00:23,  3.51s/it]

{'loss': 0.8921, 'grad_norm': 0.8137895464897156, 'learning_rate': 5.573218279480837e-05, 'epoch': 0.47}


 47%|████▋     | 2730/5800 [3:29:20<3:02:28,  3.57s/it]

{'loss': 0.8822, 'grad_norm': 0.8158031702041626, 'learning_rate': 5.546033946246894e-05, 'epoch': 0.47}


 47%|████▋     | 2740/5800 [3:29:56<3:03:09,  3.59s/it]

{'loss': 0.8426, 'grad_norm': 1.073317527770996, 'learning_rate': 5.518833267749352e-05, 'epoch': 0.47}


 47%|████▋     | 2750/5800 [3:30:32<2:48:28,  3.31s/it]

{'loss': 0.8394, 'grad_norm': 0.9511677622795105, 'learning_rate': 5.491617058227443e-05, 'epoch': 0.48}


 48%|████▊     | 2760/5800 [3:31:08<2:56:24,  3.48s/it]

{'loss': 0.8234, 'grad_norm': 0.8250383138656616, 'learning_rate': 5.4643861323853093e-05, 'epoch': 0.48}


 48%|████▊     | 2770/5800 [3:31:43<2:56:45,  3.50s/it]

{'loss': 0.869, 'grad_norm': 1.073413610458374, 'learning_rate': 5.4371413053676215e-05, 'epoch': 0.48}


 48%|████▊     | 2780/5800 [3:32:19<3:07:14,  3.72s/it]

{'loss': 0.8689, 'grad_norm': 0.9527281522750854, 'learning_rate': 5.409883392735176e-05, 'epoch': 0.48}


 48%|████▊     | 2790/5800 [3:32:54<2:55:50,  3.51s/it]

{'loss': 0.8136, 'grad_norm': 0.9321504235267639, 'learning_rate': 5.382613210440477e-05, 'epoch': 0.48}


 48%|████▊     | 2800/5800 [3:33:30<2:48:23,  3.37s/it]

{'loss': 0.8821, 'grad_norm': 0.9234850406646729, 'learning_rate': 5.3553315748033195e-05, 'epoch': 0.48}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.07it/s][A
  1%|          | 3/313 [00:00<01:02,  5.00it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.22it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.55it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.50it/s][A
  2%|▏         | 7/313 [00:01<01:39,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.74it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.68it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.72it/s][A
  4%|▍         | 14/313 [00:03<01:18,  3.79it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.91it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.02it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.07it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.96it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.02it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6629632115364075, 'eval_runtime': 83.7469, 'eval_samples_per_second': 29.852, 'eval_steps_per_second': 3.737, 'epoch': 0.48}


 48%|████▊     | 2810/5800 [3:35:29<3:51:05,  4.64s/it] 

{'loss': 0.8766, 'grad_norm': 1.041621446609497, 'learning_rate': 5.328039302486346e-05, 'epoch': 0.49}


 49%|████▊     | 2820/5800 [3:36:05<3:03:32,  3.70s/it]

{'loss': 0.8758, 'grad_norm': 0.8459653854370117, 'learning_rate': 5.300737210470603e-05, 'epoch': 0.49}


 49%|████▉     | 2830/5800 [3:36:42<3:02:04,  3.68s/it]

{'loss': 0.8332, 'grad_norm': 1.0246378183364868, 'learning_rate': 5.273426116031088e-05, 'epoch': 0.49}


 49%|████▉     | 2840/5800 [3:37:18<2:54:59,  3.55s/it]

{'loss': 0.8909, 'grad_norm': 0.8772057294845581, 'learning_rate': 5.2461068367122766e-05, 'epoch': 0.49}


 49%|████▉     | 2850/5800 [3:37:54<2:54:54,  3.56s/it]

{'loss': 0.885, 'grad_norm': 0.8590616583824158, 'learning_rate': 5.2187801903036595e-05, 'epoch': 0.49}


 49%|████▉     | 2860/5800 [3:38:32<2:59:11,  3.66s/it]

{'loss': 0.8842, 'grad_norm': 0.917966365814209, 'learning_rate': 5.191446994815254e-05, 'epoch': 0.49}


 49%|████▉     | 2870/5800 [3:39:08<2:54:21,  3.57s/it]

{'loss': 0.8566, 'grad_norm': 0.944231390953064, 'learning_rate': 5.164108068453125e-05, 'epoch': 0.5}


 50%|████▉     | 2880/5800 [3:39:44<3:05:51,  3.82s/it]

{'loss': 0.8591, 'grad_norm': 0.8490015864372253, 'learning_rate': 5.1367642295948834e-05, 'epoch': 0.5}


 50%|████▉     | 2890/5800 [3:40:21<3:07:42,  3.87s/it]

{'loss': 0.8822, 'grad_norm': 0.9323723316192627, 'learning_rate': 5.109416296765199e-05, 'epoch': 0.5}


 50%|█████     | 2900/5800 [3:40:56<2:46:19,  3.44s/it]

{'loss': 0.8516, 'grad_norm': 0.8235582709312439, 'learning_rate': 5.082065088611289e-05, 'epoch': 0.5}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:44,  7.07it/s][A
  1%|          | 3/313 [00:00<01:02,  4.98it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.20it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.50it/s][A
  2%|▏         | 7/313 [00:01<01:39,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.26it/s][A
  3%|▎         | 9/313 [00:02<01:32,  3.30it/s][A
  3%|▎         | 10/313 [00:02<01:25,  3.56it/s][A
  4%|▎         | 11/313 [00:02<01:21,  3.73it/s][A
  4%|▍         | 12/313 [00:03<01:22,  3.67it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:04<01:16,  3.89it/s][A
  5%|▌         | 16/313 [00:04<01:14,  4.01it/s][A
  5%|▌         | 17/313 [00:04<01:13,  4.05it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.95it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.01it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6605252623558044, 'eval_runtime': 83.7802, 'eval_samples_per_second': 29.84, 'eval_steps_per_second': 3.736, 'epoch': 0.5}


 50%|█████     | 2910/5800 [3:43:19<3:46:47,  4.71s/it] 

{'loss': 0.8933, 'grad_norm': 0.9088184237480164, 'learning_rate': 5.054711423878415e-05, 'epoch': 0.5}


 50%|█████     | 2920/5800 [3:43:55<2:54:30,  3.64s/it]

{'loss': 0.8756, 'grad_norm': 0.8653523921966553, 'learning_rate': 5.027356121385377e-05, 'epoch': 0.51}


 51%|█████     | 2930/5800 [3:44:30<2:46:49,  3.49s/it]

{'loss': 0.8329, 'grad_norm': 0.8679718971252441, 'learning_rate': 5e-05, 'epoch': 0.51}


 51%|█████     | 2940/5800 [3:45:04<2:36:48,  3.29s/it]

{'loss': 0.8341, 'grad_norm': 0.8079841732978821, 'learning_rate': 4.972643878614624e-05, 'epoch': 0.51}


 51%|█████     | 2950/5800 [3:45:43<3:11:01,  4.02s/it]

{'loss': 0.8855, 'grad_norm': 0.8463074564933777, 'learning_rate': 4.9452885761215866e-05, 'epoch': 0.51}


 51%|█████     | 2960/5800 [3:46:20<2:50:08,  3.59s/it]

{'loss': 0.8809, 'grad_norm': 0.8410806655883789, 'learning_rate': 4.917934911388712e-05, 'epoch': 0.51}


 51%|█████     | 2970/5800 [3:46:55<2:45:50,  3.52s/it]

{'loss': 0.8412, 'grad_norm': 1.1662734746932983, 'learning_rate': 4.890583703234801e-05, 'epoch': 0.51}


 51%|█████▏    | 2980/5800 [3:47:30<2:50:29,  3.63s/it]

{'loss': 0.8743, 'grad_norm': 0.9582754969596863, 'learning_rate': 4.863235770405116e-05, 'epoch': 0.52}


 52%|█████▏    | 2990/5800 [3:48:08<2:51:25,  3.66s/it]

{'loss': 0.9036, 'grad_norm': 0.9041896462440491, 'learning_rate': 4.8358919315468755e-05, 'epoch': 0.52}


 52%|█████▏    | 3000/5800 [3:48:45<2:54:27,  3.74s/it]

{'loss': 0.8716, 'grad_norm': 0.9822123646736145, 'learning_rate': 4.8085530051847464e-05, 'epoch': 0.52}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.09it/s][A
  1%|          | 3/313 [00:00<01:02,  5.00it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.22it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.50it/s][A
  2%|▏         | 7/313 [00:01<01:38,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.74it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.67it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.90it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.01it/s][A
  5%|▌         | 17/313 [00:04<01:13,  4.05it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.95it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.01it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6637321710586548, 'eval_runtime': 83.73, 'eval_samples_per_second': 29.858, 'eval_steps_per_second': 3.738, 'epoch': 0.52}


 52%|█████▏    | 3010/5800 [3:50:44<3:26:58,  4.45s/it] 

{'loss': 0.8425, 'grad_norm': 0.8704565763473511, 'learning_rate': 4.7812198096963416e-05, 'epoch': 0.52}


 52%|█████▏    | 3020/5800 [3:51:21<2:50:58,  3.69s/it]

{'loss': 0.8617, 'grad_norm': 0.8596230149269104, 'learning_rate': 4.753893163287725e-05, 'epoch': 0.52}


 52%|█████▏    | 3030/5800 [3:51:59<2:51:31,  3.72s/it]

{'loss': 0.8864, 'grad_norm': 0.85665363073349, 'learning_rate': 4.7265738839689146e-05, 'epoch': 0.52}


 52%|█████▏    | 3040/5800 [3:52:33<2:39:08,  3.46s/it]

{'loss': 0.821, 'grad_norm': 0.9429534673690796, 'learning_rate': 4.699262789529396e-05, 'epoch': 0.53}


 53%|█████▎    | 3050/5800 [3:53:10<2:41:28,  3.52s/it]

{'loss': 0.8687, 'grad_norm': 0.9251335263252258, 'learning_rate': 4.6719606975136545e-05, 'epoch': 0.53}


 53%|█████▎    | 3060/5800 [3:53:45<2:39:41,  3.50s/it]

{'loss': 0.9166, 'grad_norm': 0.9651950001716614, 'learning_rate': 4.6446684251966816e-05, 'epoch': 0.53}


 53%|█████▎    | 3070/5800 [3:54:21<2:40:16,  3.52s/it]

{'loss': 0.8703, 'grad_norm': 0.8002276420593262, 'learning_rate': 4.617386789559524e-05, 'epoch': 0.53}


 53%|█████▎    | 3080/5800 [3:54:57<2:37:05,  3.47s/it]

{'loss': 0.8407, 'grad_norm': 0.8060377240180969, 'learning_rate': 4.590116607264826e-05, 'epoch': 0.53}


 53%|█████▎    | 3090/5800 [3:55:33<2:44:18,  3.64s/it]

{'loss': 0.8542, 'grad_norm': 0.8819029331207275, 'learning_rate': 4.5628586946323797e-05, 'epoch': 0.53}


 53%|█████▎    | 3100/5800 [3:56:09<2:38:32,  3.52s/it]

{'loss': 0.8876, 'grad_norm': 0.9806045889854431, 'learning_rate': 4.535613867614693e-05, 'epoch': 0.54}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:44,  7.06it/s][A
  1%|          | 3/313 [00:00<01:02,  4.99it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.22it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.50it/s][A
  2%|▏         | 7/313 [00:01<01:39,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.74it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.68it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.89it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.01it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.06it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.96it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.01it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6599287390708923, 'eval_runtime': 83.7718, 'eval_samples_per_second': 29.843, 'eval_steps_per_second': 3.736, 'epoch': 0.54}


 54%|█████▎    | 3110/5800 [3:58:33<3:49:20,  5.12s/it] 

{'loss': 0.8968, 'grad_norm': 0.8471876382827759, 'learning_rate': 4.508382941772558e-05, 'epoch': 0.54}


 54%|█████▍    | 3120/5800 [3:59:09<2:39:19,  3.57s/it]

{'loss': 0.8212, 'grad_norm': 0.7295441627502441, 'learning_rate': 4.4811667322506486e-05, 'epoch': 0.54}


 54%|█████▍    | 3130/5800 [3:59:43<2:26:35,  3.29s/it]

{'loss': 0.8787, 'grad_norm': 0.8593926429748535, 'learning_rate': 4.4539660537531066e-05, 'epoch': 0.54}


 54%|█████▍    | 3140/5800 [4:00:18<2:33:19,  3.46s/it]

{'loss': 0.8487, 'grad_norm': 0.9011257290840149, 'learning_rate': 4.4267817205191634e-05, 'epoch': 0.54}


 54%|█████▍    | 3150/5800 [4:00:54<2:41:16,  3.65s/it]

{'loss': 0.8733, 'grad_norm': 0.8291382789611816, 'learning_rate': 4.3996145462987606e-05, 'epoch': 0.55}


 54%|█████▍    | 3160/5800 [4:01:32<2:49:58,  3.86s/it]

{'loss': 0.8734, 'grad_norm': 0.8237162232398987, 'learning_rate': 4.3724653443281936e-05, 'epoch': 0.55}


 55%|█████▍    | 3170/5800 [4:02:09<2:43:20,  3.73s/it]

{'loss': 0.8954, 'grad_norm': 0.9533537030220032, 'learning_rate': 4.3453349273057686e-05, 'epoch': 0.55}


 55%|█████▍    | 3180/5800 [4:02:45<2:35:11,  3.55s/it]

{'loss': 0.8771, 'grad_norm': 0.8565129637718201, 'learning_rate': 4.318224107367471e-05, 'epoch': 0.55}


 55%|█████▌    | 3190/5800 [4:03:21<2:30:04,  3.45s/it]

{'loss': 0.8487, 'grad_norm': 0.8235485553741455, 'learning_rate': 4.2911336960626613e-05, 'epoch': 0.55}


 55%|█████▌    | 3200/5800 [4:03:57<2:35:59,  3.60s/it]

{'loss': 0.8739, 'grad_norm': 0.8455910086631775, 'learning_rate': 4.264064504329772e-05, 'epoch': 0.55}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.08it/s][A
  1%|          | 3/313 [00:00<01:02,  4.96it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.20it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.55it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.50it/s][A
  2%|▏         | 7/313 [00:01<01:39,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.26it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.74it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.67it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:04<01:16,  3.89it/s][A
  5%|▌         | 16/313 [00:04<01:14,  4.01it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.06it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.96it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.01it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6598584055900574, 'eval_runtime': 83.7506, 'eval_samples_per_second': 29.851, 'eval_steps_per_second': 3.737, 'epoch': 0.55}


 55%|█████▌    | 3210/5800 [4:06:25<3:40:08,  5.10s/it] 

{'loss': 0.8471, 'grad_norm': 0.8767728805541992, 'learning_rate': 4.237017342472041e-05, 'epoch': 0.56}


 56%|█████▌    | 3220/5800 [4:07:03<2:43:53,  3.81s/it]

{'loss': 0.8852, 'grad_norm': 0.8573007583618164, 'learning_rate': 4.20999302013325e-05, 'epoch': 0.56}


 56%|█████▌    | 3230/5800 [4:07:39<2:31:49,  3.54s/it]

{'loss': 0.8882, 'grad_norm': 0.8164220452308655, 'learning_rate': 4.1829923462734974e-05, 'epoch': 0.56}


 56%|█████▌    | 3240/5800 [4:08:16<2:33:57,  3.61s/it]

{'loss': 0.8964, 'grad_norm': 0.9677931070327759, 'learning_rate': 4.156016129144965e-05, 'epoch': 0.56}


 56%|█████▌    | 3250/5800 [4:08:53<2:40:24,  3.77s/it]

{'loss': 0.8825, 'grad_norm': 0.956145703792572, 'learning_rate': 4.1290651762677424e-05, 'epoch': 0.56}


 56%|█████▌    | 3260/5800 [4:09:30<2:28:46,  3.51s/it]

{'loss': 0.8731, 'grad_norm': 1.0125654935836792, 'learning_rate': 4.1021402944056416e-05, 'epoch': 0.56}


 56%|█████▋    | 3270/5800 [4:10:05<2:34:04,  3.65s/it]

{'loss': 0.8177, 'grad_norm': 0.8288902044296265, 'learning_rate': 4.075242289542058e-05, 'epoch': 0.57}


 57%|█████▋    | 3280/5800 [4:10:41<2:37:53,  3.76s/it]

{'loss': 0.8544, 'grad_norm': 0.8907710313796997, 'learning_rate': 4.048371966855828e-05, 'epoch': 0.57}


 57%|█████▋    | 3290/5800 [4:11:17<2:30:22,  3.59s/it]

{'loss': 0.8954, 'grad_norm': 0.8775736093521118, 'learning_rate': 4.021530130697141e-05, 'epoch': 0.57}


 57%|█████▋    | 3300/5800 [4:11:53<2:28:32,  3.57s/it]

{'loss': 0.8528, 'grad_norm': 1.0089690685272217, 'learning_rate': 3.9947175845634544e-05, 'epoch': 0.57}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.09it/s][A
  1%|          | 3/313 [00:00<01:02,  4.99it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.21it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.50it/s][A
  2%|▏         | 7/313 [00:01<01:39,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.74it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.68it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.90it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.02it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.06it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.95it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.01it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6625052690505981, 'eval_runtime': 83.7421, 'eval_samples_per_second': 29.854, 'eval_steps_per_second': 3.738, 'epoch': 0.57}


 57%|█████▋    | 3310/5800 [4:13:54<3:17:39,  4.76s/it] 

{'loss': 0.8449, 'grad_norm': 1.0221985578536987, 'learning_rate': 3.967935131075442e-05, 'epoch': 0.57}


 57%|█████▋    | 3320/5800 [4:14:30<2:31:51,  3.67s/it]

{'loss': 0.899, 'grad_norm': 0.8658663630485535, 'learning_rate': 3.94118357195297e-05, 'epoch': 0.57}


 57%|█████▋    | 3330/5800 [4:15:05<2:32:24,  3.70s/it]

{'loss': 0.8235, 'grad_norm': 0.8022268414497375, 'learning_rate': 3.914463707991096e-05, 'epoch': 0.58}


 58%|█████▊    | 3340/5800 [4:15:43<2:34:41,  3.77s/it]

{'loss': 0.8452, 'grad_norm': 0.8720452189445496, 'learning_rate': 3.887776339036099e-05, 'epoch': 0.58}


 58%|█████▊    | 3350/5800 [4:16:20<2:33:27,  3.76s/it]

{'loss': 0.8588, 'grad_norm': 0.984487771987915, 'learning_rate': 3.861122263961534e-05, 'epoch': 0.58}


 58%|█████▊    | 3360/5800 [4:16:56<2:29:08,  3.67s/it]

{'loss': 0.8642, 'grad_norm': 0.8549281358718872, 'learning_rate': 3.834502280644322e-05, 'epoch': 0.58}


 58%|█████▊    | 3370/5800 [4:17:31<2:17:06,  3.39s/it]

{'loss': 0.8263, 'grad_norm': 0.9158286452293396, 'learning_rate': 3.8079171859408614e-05, 'epoch': 0.58}


 58%|█████▊    | 3380/5800 [4:18:07<2:24:52,  3.59s/it]

{'loss': 0.8495, 'grad_norm': 1.0675089359283447, 'learning_rate': 3.7813677756631774e-05, 'epoch': 0.58}


 58%|█████▊    | 3390/5800 [4:18:44<2:26:55,  3.66s/it]

{'loss': 0.8804, 'grad_norm': 0.9692695736885071, 'learning_rate': 3.7548548445551005e-05, 'epoch': 0.59}


 59%|█████▊    | 3400/5800 [4:19:20<2:20:20,  3.51s/it]

{'loss': 0.8265, 'grad_norm': 0.871367871761322, 'learning_rate': 3.72837918626847e-05, 'epoch': 0.59}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.08it/s][A
  1%|          | 3/313 [00:00<01:02,  4.99it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.22it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.50it/s][A
  2%|▏         | 7/313 [00:01<01:38,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.73it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.67it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.90it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.02it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.06it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.95it/s][A
  6%|▌         | 19/313 [00:04<01:13,  3.99it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6595235466957092, 'eval_runtime': 83.7717, 'eval_samples_per_second': 29.843, 'eval_steps_per_second': 3.736, 'epoch': 0.59}


 59%|█████▉    | 3410/5800 [4:21:52<3:18:47,  4.99s/it] 

{'loss': 0.8198, 'grad_norm': 0.8301199674606323, 'learning_rate': 3.7019415933393887e-05, 'epoch': 0.59}


 59%|█████▉    | 3420/5800 [4:22:27<2:24:18,  3.64s/it]

{'loss': 0.8661, 'grad_norm': 0.8343949317932129, 'learning_rate': 3.675542857164487e-05, 'epoch': 0.59}


 59%|█████▉    | 3430/5800 [4:23:03<2:15:54,  3.44s/it]

{'loss': 0.8817, 'grad_norm': 0.9289570450782776, 'learning_rate': 3.649183767977237e-05, 'epoch': 0.59}


 59%|█████▉    | 3440/5800 [4:23:39<2:24:04,  3.66s/it]

{'loss': 0.8531, 'grad_norm': 0.8043168187141418, 'learning_rate': 3.622865114824296e-05, 'epoch': 0.6}


 59%|█████▉    | 3450/5800 [4:24:14<2:11:33,  3.36s/it]

{'loss': 0.8608, 'grad_norm': 0.8710333108901978, 'learning_rate': 3.5965876855418914e-05, 'epoch': 0.6}


 60%|█████▉    | 3460/5800 [4:24:50<2:20:26,  3.60s/it]

{'loss': 0.8664, 'grad_norm': 0.8030906915664673, 'learning_rate': 3.570352266732231e-05, 'epoch': 0.6}


                                                       

{'loss': 0.8574, 'grad_norm': 1.0066078901290894, 'learning_rate': 3.544159643739959e-05, 'epoch': 0.6}

 60%|█████▉    | 3470/5800 [4:25:27<2:19:16,  3.59s/it]




 60%|██████    | 3480/5800 [4:26:03<2:19:03,  3.60s/it]

{'loss': 0.8598, 'grad_norm': 1.028137445449829, 'learning_rate': 3.5180106006286545e-05, 'epoch': 0.6}


 60%|██████    | 3490/5800 [4:26:39<2:18:30,  3.60s/it]

{'loss': 0.8477, 'grad_norm': 0.8472615480422974, 'learning_rate': 3.4919059201573435e-05, 'epoch': 0.6}


 60%|██████    | 3500/5800 [4:27:15<2:20:03,  3.65s/it]

{'loss': 0.8483, 'grad_norm': 0.9736593961715698, 'learning_rate': 3.4658463837570836e-05, 'epoch': 0.61}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:44,  7.05it/s][A
  1%|          | 3/313 [00:00<01:02,  4.99it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.21it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.50it/s][A
  2%|▏         | 7/313 [00:01<01:39,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:32,  3.30it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.74it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.68it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:18,  3.79it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.90it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.02it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.06it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.95it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.01it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6575087904930115, 'eval_runtime': 83.7325, 'eval_samples_per_second': 29.857, 'eval_steps_per_second': 3.738, 'epoch': 0.61}


 61%|██████    | 3510/5800 [4:29:30<3:11:14,  5.01s/it] 

{'loss': 0.8619, 'grad_norm': 0.9046918749809265, 'learning_rate': 3.439832771507565e-05, 'epoch': 0.61}


 61%|██████    | 3520/5800 [4:30:06<2:14:22,  3.54s/it]

{'loss': 0.8814, 'grad_norm': 0.876126229763031, 'learning_rate': 3.4138658621137606e-05, 'epoch': 0.61}


 61%|██████    | 3530/5800 [4:30:41<2:15:09,  3.57s/it]

{'loss': 0.8376, 'grad_norm': 1.0104631185531616, 'learning_rate': 3.3879464328826174e-05, 'epoch': 0.61}


 61%|██████    | 3540/5800 [4:31:20<2:21:32,  3.76s/it]

{'loss': 0.9009, 'grad_norm': 0.8963676691055298, 'learning_rate': 3.362075259699781e-05, 'epoch': 0.61}


 61%|██████    | 3550/5800 [4:31:57<2:18:09,  3.68s/it]

{'loss': 0.8738, 'grad_norm': 0.8964412808418274, 'learning_rate': 3.3362531170063835e-05, 'epoch': 0.61}


 61%|██████▏   | 3560/5800 [4:32:34<2:17:00,  3.67s/it]

{'loss': 0.8575, 'grad_norm': 0.9453503489494324, 'learning_rate': 3.310480777775849e-05, 'epoch': 0.62}


 62%|██████▏   | 3570/5800 [4:33:11<2:19:11,  3.74s/it]

{'loss': 0.8305, 'grad_norm': 0.8934575319290161, 'learning_rate': 3.284759013490755e-05, 'epoch': 0.62}


 62%|██████▏   | 3580/5800 [4:33:47<2:11:40,  3.56s/it]

{'loss': 0.8129, 'grad_norm': 0.9231907725334167, 'learning_rate': 3.259088594119749e-05, 'epoch': 0.62}


 62%|██████▏   | 3590/5800 [4:34:23<2:14:00,  3.64s/it]

{'loss': 0.8458, 'grad_norm': 0.9398667812347412, 'learning_rate': 3.2334702880944886e-05, 'epoch': 0.62}


 62%|██████▏   | 3600/5800 [4:34:58<2:06:32,  3.45s/it]

{'loss': 0.8368, 'grad_norm': 0.8743715286254883, 'learning_rate': 3.207904862286647e-05, 'epoch': 0.62}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.09it/s][A
  1%|          | 3/313 [00:00<01:02,  5.00it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.22it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.51it/s][A
  2%|▏         | 7/313 [00:01<01:38,  3.10it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.28it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.74it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.68it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.72it/s][A
  4%|▍         | 14/313 [00:03<01:18,  3.79it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.90it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.02it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.06it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.96it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.02it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6557506918907166, 'eval_runtime': 83.7178, 'eval_samples_per_second': 29.862, 'eval_steps_per_second': 3.739, 'epoch': 0.62}


 62%|██████▏   | 3610/5800 [4:37:12<2:57:58,  4.88s/it] 

{'loss': 0.8428, 'grad_norm': 0.8043060302734375, 'learning_rate': 3.182393081984948e-05, 'epoch': 0.62}


 62%|██████▏   | 3620/5800 [4:37:47<2:08:59,  3.55s/it]

{'loss': 0.8156, 'grad_norm': 0.8945440649986267, 'learning_rate': 3.1569357108722695e-05, 'epoch': 0.63}


 63%|██████▎   | 3630/5800 [4:38:24<2:19:59,  3.87s/it]

{'loss': 0.8572, 'grad_norm': 0.9173685312271118, 'learning_rate': 3.131533511002774e-05, 'epoch': 0.63}


 63%|██████▎   | 3640/5800 [4:39:02<2:15:03,  3.75s/it]

{'loss': 0.8654, 'grad_norm': 0.9087398648262024, 'learning_rate': 3.1061872427790986e-05, 'epoch': 0.63}


 63%|██████▎   | 3650/5800 [4:39:37<2:04:04,  3.46s/it]

{'loss': 0.8103, 'grad_norm': 0.8149991035461426, 'learning_rate': 3.0808976649295915e-05, 'epoch': 0.63}


 63%|██████▎   | 3660/5800 [4:40:11<1:57:42,  3.30s/it]

{'loss': 0.8227, 'grad_norm': 0.797858715057373, 'learning_rate': 3.0556655344856056e-05, 'epoch': 0.63}


 63%|██████▎   | 3670/5800 [4:40:46<2:02:27,  3.45s/it]

{'loss': 0.8319, 'grad_norm': 0.8591544032096863, 'learning_rate': 3.030491606758832e-05, 'epoch': 0.63}


 63%|██████▎   | 3680/5800 [4:41:24<2:11:57,  3.73s/it]

{'loss': 0.8676, 'grad_norm': 0.838949978351593, 'learning_rate': 3.0053766353186885e-05, 'epoch': 0.64}


 64%|██████▎   | 3690/5800 [4:42:02<2:10:02,  3.70s/it]

{'loss': 0.8668, 'grad_norm': 0.841770589351654, 'learning_rate': 2.9803213719697737e-05, 'epoch': 0.64}


 64%|██████▍   | 3700/5800 [4:42:38<2:04:03,  3.54s/it]

{'loss': 0.8397, 'grad_norm': 0.9666942358016968, 'learning_rate': 2.9553265667293428e-05, 'epoch': 0.64}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.07it/s][A
  1%|          | 3/313 [00:00<01:02,  4.99it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.22it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.50it/s][A
  2%|▏         | 7/313 [00:01<01:39,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.74it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.68it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.90it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.02it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.07it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.96it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.01it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.654485285282135, 'eval_runtime': 83.757, 'eval_samples_per_second': 29.848, 'eval_steps_per_second': 3.737, 'epoch': 0.64}


 64%|██████▍   | 3710/5800 [4:45:14<2:56:21,  5.06s/it] 

{'loss': 0.836, 'grad_norm': 0.8501335978507996, 'learning_rate': 2.9303929678048707e-05, 'epoch': 0.64}


 64%|██████▍   | 3720/5800 [4:45:51<2:11:00,  3.78s/it]

{'loss': 0.8592, 'grad_norm': 0.8128991723060608, 'learning_rate': 2.9055213215716515e-05, 'epoch': 0.64}


 64%|██████▍   | 3730/5800 [4:46:28<2:10:14,  3.78s/it]

{'loss': 0.8687, 'grad_norm': 0.9388959407806396, 'learning_rate': 2.880712372550458e-05, 'epoch': 0.65}


 64%|██████▍   | 3740/5800 [4:47:04<2:01:59,  3.55s/it]

{'loss': 0.8561, 'grad_norm': 0.8251085877418518, 'learning_rate': 2.8559668633852433e-05, 'epoch': 0.65}


 65%|██████▍   | 3750/5800 [4:47:41<2:06:55,  3.71s/it]

{'loss': 0.8483, 'grad_norm': 0.8402504324913025, 'learning_rate': 2.8312855348209242e-05, 'epoch': 0.65}


 65%|██████▍   | 3760/5800 [4:48:16<1:58:46,  3.49s/it]

{'loss': 0.8075, 'grad_norm': 0.8170495629310608, 'learning_rate': 2.8066691256812084e-05, 'epoch': 0.65}


 65%|██████▌   | 3770/5800 [4:48:52<1:58:25,  3.50s/it]

{'loss': 0.8047, 'grad_norm': 0.9519358277320862, 'learning_rate': 2.7821183728464605e-05, 'epoch': 0.65}


 65%|██████▌   | 3780/5800 [4:49:28<2:01:56,  3.62s/it]

{'loss': 0.8699, 'grad_norm': 0.9089421033859253, 'learning_rate': 2.7576340112316628e-05, 'epoch': 0.65}


 65%|██████▌   | 3790/5800 [4:50:04<2:01:18,  3.62s/it]

{'loss': 0.8511, 'grad_norm': 0.9570057988166809, 'learning_rate': 2.7332167737644048e-05, 'epoch': 0.66}


 66%|██████▌   | 3800/5800 [4:50:39<1:58:05,  3.54s/it]

{'loss': 0.8658, 'grad_norm': 0.9322117567062378, 'learning_rate': 2.708867391362948e-05, 'epoch': 0.66}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.07it/s][A
  1%|          | 3/313 [00:00<01:02,  4.98it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.21it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.55it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.49it/s][A
  2%|▏         | 7/313 [00:01<01:39,  3.08it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.26it/s][A
  3%|▎         | 9/313 [00:02<01:32,  3.30it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.74it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.67it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:04<01:16,  3.89it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.02it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.06it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.96it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.01it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6553770303726196, 'eval_runtime': 83.8468, 'eval_samples_per_second': 29.816, 'eval_steps_per_second': 3.733, 'epoch': 0.66}


 66%|██████▌   | 3810/5800 [4:52:41<2:35:05,  4.68s/it] 

{'loss': 0.8439, 'grad_norm': 0.7686997652053833, 'learning_rate': 2.6845865929143465e-05, 'epoch': 0.66}


 66%|██████▌   | 3820/5800 [4:53:16<2:00:28,  3.65s/it]

{'loss': 0.8489, 'grad_norm': 0.910336434841156, 'learning_rate': 2.6603751052526238e-05, 'epoch': 0.66}


 66%|██████▌   | 3830/5800 [4:53:51<1:50:58,  3.38s/it]

{'loss': 0.8305, 'grad_norm': 0.8650779724121094, 'learning_rate': 2.63623365313702e-05, 'epoch': 0.66}


 66%|██████▌   | 3840/5800 [4:54:27<1:54:27,  3.50s/it]

{'loss': 0.8059, 'grad_norm': 0.8294004201889038, 'learning_rate': 2.6121629592302964e-05, 'epoch': 0.66}


 66%|██████▋   | 3850/5800 [4:55:02<1:58:59,  3.66s/it]

{'loss': 0.8437, 'grad_norm': 1.0551459789276123, 'learning_rate': 2.5881637440770983e-05, 'epoch': 0.67}


 67%|██████▋   | 3860/5800 [4:55:39<1:56:55,  3.62s/it]

{'loss': 0.8404, 'grad_norm': 0.8755999207496643, 'learning_rate': 2.5642367260823908e-05, 'epoch': 0.67}


 67%|██████▋   | 3870/5800 [4:56:15<1:55:46,  3.60s/it]

{'loss': 0.8636, 'grad_norm': 0.9033029675483704, 'learning_rate': 2.5403826214899506e-05, 'epoch': 0.67}


 67%|██████▋   | 3880/5800 [4:56:51<1:56:04,  3.63s/it]

{'loss': 0.849, 'grad_norm': 0.9658823609352112, 'learning_rate': 2.5166021443609288e-05, 'epoch': 0.67}


 67%|██████▋   | 3890/5800 [4:57:25<1:47:17,  3.37s/it]

{'loss': 0.8385, 'grad_norm': 0.9306058883666992, 'learning_rate': 2.4928960065524716e-05, 'epoch': 0.67}


 67%|██████▋   | 3900/5800 [4:58:00<1:49:17,  3.45s/it]

{'loss': 0.8254, 'grad_norm': 0.852604866027832, 'learning_rate': 2.469264917696412e-05, 'epoch': 0.67}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.08it/s][A
  1%|          | 3/313 [00:00<01:02,  4.97it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.21it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.50it/s][A
  2%|▏         | 7/313 [00:01<01:39,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.26it/s][A
  3%|▎         | 9/313 [00:02<01:32,  3.29it/s][A
  3%|▎         | 10/313 [00:02<01:25,  3.56it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.73it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.67it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:04<01:16,  3.89it/s][A
  5%|▌         | 16/313 [00:04<01:14,  4.01it/s][A
  5%|▌         | 17/313 [00:04<01:13,  4.05it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.95it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.00it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6552149653434753, 'eval_runtime': 83.8305, 'eval_samples_per_second': 29.822, 'eval_steps_per_second': 3.734, 'epoch': 0.67}


 67%|██████▋   | 3910/5800 [5:00:01<2:30:59,  4.79s/it] 

{'loss': 0.8734, 'grad_norm': 0.9169074296951294, 'learning_rate': 2.4457095851780353e-05, 'epoch': 0.68}


 68%|██████▊   | 3920/5800 [5:00:37<1:52:14,  3.58s/it]

{'loss': 0.8238, 'grad_norm': 0.9737362265586853, 'learning_rate': 2.422230714114891e-05, 'epoch': 0.68}


 68%|██████▊   | 3930/5800 [5:01:12<1:50:09,  3.53s/it]

{'loss': 0.8669, 'grad_norm': 0.8690634369850159, 'learning_rate': 2.398829007335695e-05, 'epoch': 0.68}


 68%|██████▊   | 3940/5800 [5:01:47<1:46:57,  3.45s/it]

{'loss': 0.8496, 'grad_norm': 0.8990504741668701, 'learning_rate': 2.3755051653592826e-05, 'epoch': 0.68}


 68%|██████▊   | 3950/5800 [5:02:22<1:47:54,  3.50s/it]

{'loss': 0.8468, 'grad_norm': 0.8085954785346985, 'learning_rate': 2.352259886373649e-05, 'epoch': 0.68}


 68%|██████▊   | 3960/5800 [5:02:58<1:50:04,  3.59s/it]

{'loss': 0.8527, 'grad_norm': 0.9067398905754089, 'learning_rate': 2.3290938662150424e-05, 'epoch': 0.69}


 68%|██████▊   | 3970/5800 [5:03:32<1:41:45,  3.34s/it]

{'loss': 0.8319, 'grad_norm': 0.9207316637039185, 'learning_rate': 2.3060077983471334e-05, 'epoch': 0.69}


 69%|██████▊   | 3980/5800 [5:04:09<1:48:57,  3.59s/it]

{'loss': 0.8756, 'grad_norm': 0.9318008422851562, 'learning_rate': 2.2830023738402662e-05, 'epoch': 0.69}


 69%|██████▉   | 3990/5800 [5:04:45<1:50:11,  3.65s/it]

{'loss': 0.8657, 'grad_norm': 0.8993096947669983, 'learning_rate': 2.2600782813507583e-05, 'epoch': 0.69}


 69%|██████▉   | 4000/5800 [5:05:21<1:45:49,  3.53s/it]

{'loss': 0.7899, 'grad_norm': 0.8442991971969604, 'learning_rate': 2.2372362071002946e-05, 'epoch': 0.69}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.07it/s][A
  1%|          | 3/313 [00:00<01:02,  4.98it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.21it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.50it/s][A
  2%|▏         | 7/313 [00:01<01:39,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.26it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:25,  3.56it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.74it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.67it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:04<01:16,  3.89it/s][A
  5%|▌         | 16/313 [00:04<01:14,  4.01it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.06it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.95it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.01it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6561059355735779, 'eval_runtime': 83.7746, 'eval_samples_per_second': 29.842, 'eval_steps_per_second': 3.736, 'epoch': 0.69}


 69%|██████▉   | 4010/5800 [5:07:21<2:21:50,  4.75s/it] 

{'loss': 0.8553, 'grad_norm': 0.8599461317062378, 'learning_rate': 2.2144768348553818e-05, 'epoch': 0.69}


 69%|██████▉   | 4020/5800 [5:07:56<1:49:05,  3.68s/it]

{'loss': 0.867, 'grad_norm': 0.8618471622467041, 'learning_rate': 2.191800845906885e-05, 'epoch': 0.7}


 69%|██████▉   | 4030/5800 [5:08:32<1:47:36,  3.65s/it]

{'loss': 0.8543, 'grad_norm': 0.9367401003837585, 'learning_rate': 2.1692089190496285e-05, 'epoch': 0.7}


 70%|██████▉   | 4040/5800 [5:09:08<1:42:08,  3.48s/it]

{'loss': 0.8275, 'grad_norm': 0.8981068730354309, 'learning_rate': 2.1467017305620773e-05, 'epoch': 0.7}


 70%|██████▉   | 4050/5800 [5:09:42<1:41:04,  3.47s/it]

{'loss': 0.8153, 'grad_norm': 0.9293004274368286, 'learning_rate': 2.1242799541860962e-05, 'epoch': 0.7}


 70%|███████   | 4060/5800 [5:10:18<1:47:56,  3.72s/it]

{'loss': 0.8456, 'grad_norm': 0.8880953788757324, 'learning_rate': 2.1019442611067785e-05, 'epoch': 0.7}


 70%|███████   | 4070/5800 [5:10:54<1:42:12,  3.54s/it]

{'loss': 0.8578, 'grad_norm': 0.8906163573265076, 'learning_rate': 2.0796953199323555e-05, 'epoch': 0.7}


 70%|███████   | 4080/5800 [5:11:29<1:39:45,  3.48s/it]

{'loss': 0.8366, 'grad_norm': 0.885780930519104, 'learning_rate': 2.0575337966741814e-05, 'epoch': 0.71}


 71%|███████   | 4090/5800 [5:12:05<1:44:16,  3.66s/it]

{'loss': 0.8321, 'grad_norm': 0.9465751647949219, 'learning_rate': 2.0354603547267985e-05, 'epoch': 0.71}


 71%|███████   | 4100/5800 [5:12:42<1:42:24,  3.61s/it]

{'loss': 0.8519, 'grad_norm': 0.7990870475769043, 'learning_rate': 2.013475654848076e-05, 'epoch': 0.71}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.08it/s][A
  1%|          | 3/313 [00:00<01:02,  4.99it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.21it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.55it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.50it/s][A
  2%|▏         | 7/313 [00:01<01:39,  3.08it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.26it/s][A
  3%|▎         | 9/313 [00:02<01:32,  3.30it/s][A
  3%|▎         | 10/313 [00:02<01:25,  3.56it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.74it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.68it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.90it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.02it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.06it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.96it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.02it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6526935696601868, 'eval_runtime': 83.7932, 'eval_samples_per_second': 29.835, 'eval_steps_per_second': 3.735, 'epoch': 0.71}


 71%|███████   | 4110/5800 [5:15:05<2:17:45,  4.89s/it] 

{'loss': 0.8588, 'grad_norm': 0.8554151654243469, 'learning_rate': 1.9915803551394313e-05, 'epoch': 0.71}


 71%|███████   | 4120/5800 [5:15:42<1:43:50,  3.71s/it]

{'loss': 0.8244, 'grad_norm': 0.7776785492897034, 'learning_rate': 1.9697751110261347e-05, 'epoch': 0.71}


 71%|███████   | 4130/5800 [5:16:17<1:38:15,  3.53s/it]

{'loss': 0.8473, 'grad_norm': 0.8707390427589417, 'learning_rate': 1.9480605752376814e-05, 'epoch': 0.71}


 71%|███████▏  | 4140/5800 [5:16:53<1:36:35,  3.49s/it]

{'loss': 0.8279, 'grad_norm': 0.8835735321044922, 'learning_rate': 1.9264373977882598e-05, 'epoch': 0.72}


 72%|███████▏  | 4150/5800 [5:17:30<1:39:42,  3.63s/it]

{'loss': 0.8325, 'grad_norm': 0.8386856317520142, 'learning_rate': 1.9049062259572847e-05, 'epoch': 0.72}


 72%|███████▏  | 4160/5800 [5:18:09<1:46:11,  3.89s/it]

{'loss': 0.8722, 'grad_norm': 0.9012479782104492, 'learning_rate': 1.883467704270033e-05, 'epoch': 0.72}


 72%|███████▏  | 4170/5800 [5:18:46<1:42:35,  3.78s/it]

{'loss': 0.8134, 'grad_norm': 1.0068141222000122, 'learning_rate': 1.8621224744783405e-05, 'epoch': 0.72}


 72%|███████▏  | 4180/5800 [5:19:22<1:32:54,  3.44s/it]

{'loss': 0.8362, 'grad_norm': 0.8330762386322021, 'learning_rate': 1.840871175541396e-05, 'epoch': 0.72}


 72%|███████▏  | 4190/5800 [5:19:58<1:34:37,  3.53s/it]

{'loss': 0.8542, 'grad_norm': 0.8940346837043762, 'learning_rate': 1.8197144436066167e-05, 'epoch': 0.72}


                                                       

{'loss': 0.8573, 'grad_norm': 0.7980161905288696, 'learning_rate': 1.798652911990597e-05, 'epoch': 0.73}


 72%|███████▏  | 4200/5800 [5:20:35<1:34:35,  3.55s/it]
  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.07it/s][A
  1%|          | 3/313 [00:00<01:02,  5.00it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.21it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.51it/s][A
  2%|▏         | 7/313 [00:01<01:38,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.74it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.68it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.72it/s][A
  4%|▍         | 14/313 [00:03<01:18,  3.79it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.91it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.03it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.07it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.95it/s][A
  6%|▌         | 19/313 [

{'eval_loss': 0.654498279094696, 'eval_runtime': 83.7111, 'eval_samples_per_second': 29.865, 'eval_steps_per_second': 3.739, 'epoch': 0.73}


 73%|███████▎  | 4210/5800 [5:22:35<2:04:14,  4.69s/it] 

{'loss': 0.851, 'grad_norm': 0.8033690452575684, 'learning_rate': 1.7776872111601574e-05, 'epoch': 0.73}


 73%|███████▎  | 4220/5800 [5:23:11<1:35:42,  3.63s/it]

{'loss': 0.8268, 'grad_norm': 0.9372020959854126, 'learning_rate': 1.75681796871347e-05, 'epoch': 0.73}


 73%|███████▎  | 4230/5800 [5:23:47<1:35:42,  3.66s/it]

{'loss': 0.8558, 'grad_norm': 0.9732645750045776, 'learning_rate': 1.736045809361272e-05, 'epoch': 0.73}


 73%|███████▎  | 4240/5800 [5:24:25<1:33:28,  3.59s/it]

{'loss': 0.8772, 'grad_norm': 0.8117193579673767, 'learning_rate': 1.7153713549081644e-05, 'epoch': 0.73}


 73%|███████▎  | 4250/5800 [5:25:01<1:31:12,  3.53s/it]

{'loss': 0.8645, 'grad_norm': 0.9765875339508057, 'learning_rate': 1.6947952242339992e-05, 'epoch': 0.74}


 73%|███████▎  | 4260/5800 [5:25:36<1:35:45,  3.73s/it]

{'loss': 0.8229, 'grad_norm': 0.8561289310455322, 'learning_rate': 1.6743180332753527e-05, 'epoch': 0.74}


 74%|███████▎  | 4270/5800 [5:26:11<1:30:02,  3.53s/it]

{'loss': 0.8559, 'grad_norm': 0.910971999168396, 'learning_rate': 1.653940395007089e-05, 'epoch': 0.74}


 74%|███████▍  | 4280/5800 [5:26:48<1:30:52,  3.59s/it]

{'loss': 0.8494, 'grad_norm': 0.8737408518791199, 'learning_rate': 1.6336629194240116e-05, 'epoch': 0.74}


 74%|███████▍  | 4290/5800 [5:27:23<1:27:22,  3.47s/it]

{'loss': 0.8416, 'grad_norm': 0.9456875324249268, 'learning_rate': 1.6134862135225987e-05, 'epoch': 0.74}


 74%|███████▍  | 4300/5800 [5:27:59<1:29:32,  3.58s/it]

{'loss': 0.8376, 'grad_norm': 0.8676287531852722, 'learning_rate': 1.59341088128284e-05, 'epoch': 0.74}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.08it/s][A
  1%|          | 3/313 [00:00<01:01,  5.01it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.23it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.50it/s][A
  2%|▏         | 7/313 [00:01<01:38,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.74it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.68it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.89it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.01it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.06it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.96it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.01it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6531615257263184, 'eval_runtime': 83.7612, 'eval_samples_per_second': 29.847, 'eval_steps_per_second': 3.737, 'epoch': 0.74}


 74%|███████▍  | 4310/5800 [5:30:01<2:01:27,  4.89s/it] 

{'loss': 0.8133, 'grad_norm': 0.8592010736465454, 'learning_rate': 1.573437523650152e-05, 'epoch': 0.75}


 74%|███████▍  | 4320/5800 [5:30:39<1:39:08,  4.02s/it]

{'loss': 0.8789, 'grad_norm': 0.9149606227874756, 'learning_rate': 1.5535667385173885e-05, 'epoch': 0.75}


 75%|███████▍  | 4330/5800 [5:31:20<1:28:09,  3.60s/it]

{'loss': 0.8815, 'grad_norm': 0.8884323239326477, 'learning_rate': 1.5337991207069503e-05, 'epoch': 0.75}


 75%|███████▍  | 4340/5800 [5:31:56<1:30:42,  3.73s/it]

{'loss': 0.8326, 'grad_norm': 0.8145123720169067, 'learning_rate': 1.514135261952968e-05, 'epoch': 0.75}


 75%|███████▌  | 4350/5800 [5:32:32<1:24:38,  3.50s/it]

{'loss': 0.839, 'grad_norm': 0.8850173950195312, 'learning_rate': 1.4945757508836001e-05, 'epoch': 0.75}


 75%|███████▌  | 4360/5800 [5:33:07<1:22:30,  3.44s/it]

{'loss': 0.8012, 'grad_norm': 0.8279746770858765, 'learning_rate': 1.4751211730034003e-05, 'epoch': 0.75}


 75%|███████▌  | 4370/5800 [5:33:43<1:23:13,  3.49s/it]

{'loss': 0.7818, 'grad_norm': 0.8701013922691345, 'learning_rate': 1.4557721106758037e-05, 'epoch': 0.76}


 76%|███████▌  | 4380/5800 [5:34:20<1:23:51,  3.54s/it]

{'loss': 0.8589, 'grad_norm': 0.8487833738327026, 'learning_rate': 1.4365291431056871e-05, 'epoch': 0.76}


 76%|███████▌  | 4390/5800 [5:34:55<1:24:11,  3.58s/it]

{'loss': 0.8442, 'grad_norm': 0.8643510341644287, 'learning_rate': 1.41739284632203e-05, 'epoch': 0.76}


 76%|███████▌  | 4400/5800 [5:35:32<1:23:23,  3.57s/it]

{'loss': 0.8598, 'grad_norm': 0.848984956741333, 'learning_rate': 1.3983637931606725e-05, 'epoch': 0.76}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.08it/s][A
  1%|          | 3/313 [00:00<01:02,  4.99it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.21it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.55it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.49it/s][A
  2%|▏         | 7/313 [00:01<01:39,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.74it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.68it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.90it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.02it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.06it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.96it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.02it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6539838314056396, 'eval_runtime': 83.8049, 'eval_samples_per_second': 29.831, 'eval_steps_per_second': 3.735, 'epoch': 0.76}


 76%|███████▌  | 4410/5800 [5:37:33<1:54:17,  4.93s/it] 

{'loss': 0.8581, 'grad_norm': 0.8755882382392883, 'learning_rate': 1.3794425532471732e-05, 'epoch': 0.76}


 76%|███████▌  | 4420/5800 [5:38:09<1:23:26,  3.63s/it]

{'loss': 0.8299, 'grad_norm': 0.8237429857254028, 'learning_rate': 1.3606296929797497e-05, 'epoch': 0.76}


 76%|███████▋  | 4430/5800 [5:38:45<1:20:01,  3.50s/it]

{'loss': 0.8418, 'grad_norm': 0.8508621454238892, 'learning_rate': 1.3419257755123255e-05, 'epoch': 0.77}


 77%|███████▋  | 4440/5800 [5:39:21<1:19:34,  3.51s/it]

{'loss': 0.8267, 'grad_norm': 0.9261108636856079, 'learning_rate': 1.3233313607376757e-05, 'epoch': 0.77}


 77%|███████▋  | 4450/5800 [5:39:58<1:22:04,  3.65s/it]

{'loss': 0.8431, 'grad_norm': 0.7781408429145813, 'learning_rate': 1.3048470052706657e-05, 'epoch': 0.77}


 77%|███████▋  | 4460/5800 [5:40:35<1:22:02,  3.67s/it]

{'loss': 0.8676, 'grad_norm': 0.8403439521789551, 'learning_rate': 1.2864732624315867e-05, 'epoch': 0.77}


 77%|███████▋  | 4470/5800 [5:41:13<1:23:34,  3.77s/it]

{'loss': 0.8415, 'grad_norm': 0.9151895642280579, 'learning_rate': 1.2682106822295948e-05, 'epoch': 0.77}


 77%|███████▋  | 4480/5800 [5:41:49<1:18:40,  3.58s/it]

{'loss': 0.8594, 'grad_norm': 1.0098903179168701, 'learning_rate': 1.2500598113462458e-05, 'epoch': 0.78}


 77%|███████▋  | 4490/5800 [5:42:25<1:22:25,  3.78s/it]

{'loss': 0.8066, 'grad_norm': 0.7790947556495667, 'learning_rate': 1.2320211931191321e-05, 'epoch': 0.78}


 78%|███████▊  | 4500/5800 [5:43:01<1:18:32,  3.62s/it]

{'loss': 0.8478, 'grad_norm': 0.946742594242096, 'learning_rate': 1.2140953675256145e-05, 'epoch': 0.78}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.08it/s][A
  1%|          | 3/313 [00:00<01:02,  4.99it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.21it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.50it/s][A
  2%|▏         | 7/313 [00:01<01:38,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.74it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.68it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.90it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.02it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.06it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.94it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.00it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6508173942565918, 'eval_runtime': 83.751, 'eval_samples_per_second': 29.85, 'eval_steps_per_second': 3.737, 'epoch': 0.78}


 78%|███████▊  | 4510/5800 [5:45:35<1:46:35,  4.96s/it] 

{'loss': 0.8551, 'grad_norm': 0.8273771405220032, 'learning_rate': 1.1962828711666623e-05, 'epoch': 0.78}


 78%|███████▊  | 4520/5800 [5:46:12<1:22:31,  3.87s/it]

{'loss': 0.8256, 'grad_norm': 0.922135591506958, 'learning_rate': 1.1785842372507883e-05, 'epoch': 0.78}


 78%|███████▊  | 4530/5800 [5:46:47<1:14:19,  3.51s/it]

{'loss': 0.8226, 'grad_norm': 1.0060337781906128, 'learning_rate': 1.1609999955780871e-05, 'epoch': 0.78}


 78%|███████▊  | 4540/5800 [5:47:24<1:16:57,  3.66s/it]

{'loss': 0.8477, 'grad_norm': 0.8869373798370361, 'learning_rate': 1.143530672524376e-05, 'epoch': 0.79}


 78%|███████▊  | 4550/5800 [5:48:00<1:15:50,  3.64s/it]

{'loss': 0.8318, 'grad_norm': 0.8254683613777161, 'learning_rate': 1.1261767910254423e-05, 'epoch': 0.79}


 79%|███████▊  | 4560/5800 [5:48:38<1:16:51,  3.72s/it]

{'loss': 0.8788, 'grad_norm': 0.955811619758606, 'learning_rate': 1.108938870561384e-05, 'epoch': 0.79}


 79%|███████▉  | 4570/5800 [5:49:13<1:14:01,  3.61s/it]

{'loss': 0.8113, 'grad_norm': 0.8777124285697937, 'learning_rate': 1.0918174271410575e-05, 'epoch': 0.79}


 79%|███████▉  | 4580/5800 [5:49:49<1:11:38,  3.52s/it]

{'loss': 0.8506, 'grad_norm': 0.8121776580810547, 'learning_rate': 1.0748129732866391e-05, 'epoch': 0.79}


 79%|███████▉  | 4590/5800 [5:50:24<1:11:32,  3.55s/it]

{'loss': 0.8303, 'grad_norm': 0.9414090514183044, 'learning_rate': 1.0579260180182783e-05, 'epoch': 0.79}


 79%|███████▉  | 4600/5800 [5:51:02<1:17:46,  3.89s/it]

{'loss': 0.8842, 'grad_norm': 0.8575741052627563, 'learning_rate': 1.0411570668388598e-05, 'epoch': 0.8}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.07it/s][A
  1%|          | 3/313 [00:00<01:02,  4.99it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.22it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.50it/s][A
  2%|▏         | 7/313 [00:01<01:38,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:25,  3.56it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.73it/s][A
  4%|▍         | 12/313 [00:03<01:22,  3.67it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.90it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.02it/s][A
  5%|▌         | 17/313 [00:04<01:13,  4.05it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.94it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.00it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6515723466873169, 'eval_runtime': 83.7616, 'eval_samples_per_second': 29.847, 'eval_steps_per_second': 3.737, 'epoch': 0.8}


 79%|███████▉  | 4610/5800 [5:53:01<1:29:51,  4.53s/it]

{'loss': 0.8161, 'grad_norm': 0.800351083278656, 'learning_rate': 1.0245066217188714e-05, 'epoch': 0.8}


 80%|███████▉  | 4620/5800 [5:53:37<1:10:53,  3.60s/it]

{'loss': 0.8387, 'grad_norm': 0.8377434015274048, 'learning_rate': 1.0079751810813848e-05, 'epoch': 0.8}


 80%|███████▉  | 4630/5800 [5:54:12<1:07:15,  3.45s/it]

{'loss': 0.8352, 'grad_norm': 0.8871465921401978, 'learning_rate': 9.915632397871228e-06, 'epoch': 0.8}


 80%|████████  | 4640/5800 [5:54:49<1:10:57,  3.67s/it]

{'loss': 0.8598, 'grad_norm': 0.9222315549850464, 'learning_rate': 9.752712891196557e-06, 'epoch': 0.8}


 80%|████████  | 4650/5800 [5:55:26<1:08:21,  3.57s/it]

{'loss': 0.9044, 'grad_norm': 0.9370970129966736, 'learning_rate': 9.590998167706938e-06, 'epoch': 0.8}


 80%|████████  | 4660/5800 [5:56:01<1:06:27,  3.50s/it]

{'loss': 0.8477, 'grad_norm': 0.9016663432121277, 'learning_rate': 9.430493068254831e-06, 'epoch': 0.81}


 81%|████████  | 4670/5800 [5:56:36<1:06:49,  3.55s/it]

{'loss': 0.8001, 'grad_norm': 0.9230769872665405, 'learning_rate': 9.271202397483215e-06, 'epoch': 0.81}


 81%|████████  | 4680/5800 [5:57:11<1:04:45,  3.47s/it]

{'loss': 0.8096, 'grad_norm': 0.8771980404853821, 'learning_rate': 9.113130923681717e-06, 'epoch': 0.81}


 81%|████████  | 4690/5800 [5:57:49<1:07:08,  3.63s/it]

{'loss': 0.8914, 'grad_norm': 0.8430121541023254, 'learning_rate': 8.956283378643888e-06, 'epoch': 0.81}


 81%|████████  | 4700/5800 [5:58:25<1:05:51,  3.59s/it]

{'loss': 0.8153, 'grad_norm': 0.8357175588607788, 'learning_rate': 8.800664457525553e-06, 'epoch': 0.81}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.16it/s][A
  1%|          | 3/313 [00:00<01:01,  5.03it/s][A
  1%|▏         | 4/313 [00:00<01:12,  4.23it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.51it/s][A
  2%|▏         | 7/313 [00:01<01:38,  3.10it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.74it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.68it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.90it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.02it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.07it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.96it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.02it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6509979963302612, 'eval_runtime': 83.7151, 'eval_samples_per_second': 29.863, 'eval_steps_per_second': 3.739, 'epoch': 0.81}


 81%|████████  | 4710/5800 [6:00:24<1:22:07,  4.52s/it]

{'loss': 0.8243, 'grad_norm': 0.8813720941543579, 'learning_rate': 8.646278818704302e-06, 'epoch': 0.81}


 81%|████████▏ | 4720/5800 [6:01:01<1:08:11,  3.79s/it]

{'loss': 0.8107, 'grad_norm': 0.8807786703109741, 'learning_rate': 8.493131083639965e-06, 'epoch': 0.82}


 82%|████████▏ | 4730/5800 [6:01:43<1:05:38,  3.68s/it]

{'loss': 0.8818, 'grad_norm': 0.907750129699707, 'learning_rate': 8.341225836736366e-06, 'epoch': 0.82}


 82%|████████▏ | 4740/5800 [6:02:18<1:01:30,  3.48s/it]

{'loss': 0.8222, 'grad_norm': 0.8916952610015869, 'learning_rate': 8.190567625204004e-06, 'epoch': 0.82}


 82%|████████▏ | 4750/5800 [6:02:52<1:02:19,  3.56s/it]

{'loss': 0.8002, 'grad_norm': 0.8048254251480103, 'learning_rate': 8.041160958923988e-06, 'epoch': 0.82}


 82%|████████▏ | 4760/5800 [6:03:28<1:00:36,  3.50s/it]

{'loss': 0.8542, 'grad_norm': 0.8713552951812744, 'learning_rate': 7.893010310313049e-06, 'epoch': 0.82}


 82%|████████▏ | 4770/5800 [6:04:02<58:48,  3.43s/it]  

{'loss': 0.8222, 'grad_norm': 0.8411381840705872, 'learning_rate': 7.74612011418961e-06, 'epoch': 0.83}


 82%|████████▏ | 4780/5800 [6:04:38<1:01:12,  3.60s/it]

{'loss': 0.8288, 'grad_norm': 0.886915385723114, 'learning_rate': 7.60049476764102e-06, 'epoch': 0.83}


 83%|████████▎ | 4790/5800 [6:05:14<1:01:18,  3.64s/it]

{'loss': 0.8277, 'grad_norm': 0.8959079384803772, 'learning_rate': 7.456138629892018e-06, 'epoch': 0.83}


 83%|████████▎ | 4800/5800 [6:05:51<1:01:25,  3.69s/it]

{'loss': 0.8802, 'grad_norm': 0.8886606693267822, 'learning_rate': 7.31305602217417e-06, 'epoch': 0.83}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.08it/s][A
  1%|          | 3/313 [00:00<01:02,  4.99it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.21it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.55it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.50it/s][A
  2%|▏         | 7/313 [00:01<01:39,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.74it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.67it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.90it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.02it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.06it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.96it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.01it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6503723859786987, 'eval_runtime': 83.749, 'eval_samples_per_second': 29.851, 'eval_steps_per_second': 3.737, 'epoch': 0.83}


 83%|████████▎ | 4810/5800 [6:08:14<1:18:39,  4.77s/it] 

{'loss': 0.8324, 'grad_norm': 0.8921840190887451, 'learning_rate': 7.17125122759652e-06, 'epoch': 0.83}


 83%|████████▎ | 4820/5800 [6:08:48<55:16,  3.38s/it]  

{'loss': 0.8212, 'grad_norm': 0.845605731010437, 'learning_rate': 7.030728491017408e-06, 'epoch': 0.83}


 83%|████████▎ | 4830/5800 [6:09:24<1:01:15,  3.79s/it]

{'loss': 0.8395, 'grad_norm': 0.9134535193443298, 'learning_rate': 6.891492018917378e-06, 'epoch': 0.84}


 83%|████████▎ | 4840/5800 [6:09:59<56:49,  3.55s/it]  

{'loss': 0.8543, 'grad_norm': 0.8226277232170105, 'learning_rate': 6.753545979273274e-06, 'epoch': 0.84}


 84%|████████▎ | 4850/5800 [6:10:36<57:57,  3.66s/it]  

{'loss': 0.8605, 'grad_norm': 0.8438500165939331, 'learning_rate': 6.616894501433441e-06, 'epoch': 0.84}


 84%|████████▍ | 4860/5800 [6:11:13<56:07,  3.58s/it]  

{'loss': 0.8135, 'grad_norm': 0.8950964212417603, 'learning_rate': 6.481541675994163e-06, 'epoch': 0.84}


 84%|████████▍ | 4870/5800 [6:11:50<59:06,  3.81s/it]  

{'loss': 0.8206, 'grad_norm': 0.7902204394340515, 'learning_rate': 6.347491554677171e-06, 'epoch': 0.84}


 84%|████████▍ | 4880/5800 [6:12:26<55:43,  3.63s/it]

{'loss': 0.8268, 'grad_norm': 0.8832891583442688, 'learning_rate': 6.214748150208388e-06, 'epoch': 0.84}


 84%|████████▍ | 4890/5800 [6:13:02<52:55,  3.49s/it]

{'loss': 0.8181, 'grad_norm': 0.9893276691436768, 'learning_rate': 6.083315436197795e-06, 'epoch': 0.85}


 84%|████████▍ | 4900/5800 [6:13:41<58:15,  3.88s/it]  

{'loss': 0.861, 'grad_norm': 0.9865627884864807, 'learning_rate': 5.953197347020473e-06, 'epoch': 0.85}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.08it/s][A
  1%|          | 3/313 [00:00<01:02,  4.99it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.21it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.51it/s][A
  2%|▏         | 7/313 [00:01<01:38,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.58it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.75it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.68it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.72it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.90it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.02it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.07it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.96it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.02it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6503816843032837, 'eval_runtime': 83.6927, 'eval_samples_per_second': 29.871, 'eval_steps_per_second': 3.74, 'epoch': 0.85}


 85%|████████▍ | 4910/5800 [6:15:42<1:10:04,  4.72s/it]

{'loss': 0.839, 'grad_norm': 0.8991257548332214, 'learning_rate': 5.824397777698859e-06, 'epoch': 0.85}


 85%|████████▍ | 4920/5800 [6:16:17<51:21,  3.50s/it]  

{'loss': 0.7874, 'grad_norm': 0.8012162446975708, 'learning_rate': 5.696920583786109e-06, 'epoch': 0.85}


 85%|████████▌ | 4930/5800 [6:16:54<51:55,  3.58s/it]

{'loss': 0.8339, 'grad_norm': 0.7983232140541077, 'learning_rate': 5.570769581250734e-06, 'epoch': 0.85}


 85%|████████▌ | 4940/5800 [6:17:31<52:33,  3.67s/it]

{'loss': 0.8195, 'grad_norm': 0.7671348452568054, 'learning_rate': 5.445948546362317e-06, 'epoch': 0.85}


 85%|████████▌ | 4950/5800 [6:18:07<51:26,  3.63s/it]

{'loss': 0.824, 'grad_norm': 0.7504797577857971, 'learning_rate': 5.3224612155785225e-06, 'epoch': 0.86}


 86%|████████▌ | 4960/5800 [6:18:42<49:43,  3.55s/it]

{'loss': 0.812, 'grad_norm': 0.7647593021392822, 'learning_rate': 5.200311285433213e-06, 'epoch': 0.86}


 86%|████████▌ | 4970/5800 [6:19:18<48:43,  3.52s/it]

{'loss': 0.8361, 'grad_norm': 0.9653403759002686, 'learning_rate': 5.079502412425785e-06, 'epoch': 0.86}


 86%|████████▌ | 4980/5800 [6:19:52<44:56,  3.29s/it]

{'loss': 0.8263, 'grad_norm': 0.7896056771278381, 'learning_rate': 4.9600382129118e-06, 'epoch': 0.86}


 86%|████████▌ | 4990/5800 [6:20:29<48:32,  3.60s/it]

{'loss': 0.8514, 'grad_norm': 0.8942765593528748, 'learning_rate': 4.841922262994586e-06, 'epoch': 0.86}


 86%|████████▌ | 5000/5800 [6:21:06<49:21,  3.70s/it]

{'loss': 0.842, 'grad_norm': 0.8576406240463257, 'learning_rate': 4.725158098418308e-06, 'epoch': 0.87}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.10it/s][A
  1%|          | 3/313 [00:00<01:02,  5.00it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.21it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.50it/s][A
  2%|▏         | 7/313 [00:01<01:39,  3.08it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.26it/s][A
  3%|▎         | 9/313 [00:02<01:32,  3.30it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.74it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.68it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.90it/s][A
  5%|▌         | 16/313 [00:04<01:14,  4.01it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.06it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.95it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.01it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6501176953315735, 'eval_runtime': 83.7433, 'eval_samples_per_second': 29.853, 'eval_steps_per_second': 3.738, 'epoch': 0.87}


 86%|████████▋ | 5010/5800 [6:23:29<1:02:32,  4.75s/it]

{'loss': 0.811, 'grad_norm': 1.0966944694519043, 'learning_rate': 4.609749214462089e-06, 'epoch': 0.87}


 87%|████████▋ | 5020/5800 [6:24:05<47:15,  3.64s/it]  

{'loss': 0.8169, 'grad_norm': 0.8019987344741821, 'learning_rate': 4.495699065835368e-06, 'epoch': 0.87}


 87%|████████▋ | 5030/5800 [6:24:41<43:33,  3.39s/it]

{'loss': 0.8086, 'grad_norm': 0.838291585445404, 'learning_rate': 4.3830110665745e-06, 'epoch': 0.87}


 87%|████████▋ | 5040/5800 [6:25:17<48:36,  3.84s/it]

{'loss': 0.8396, 'grad_norm': 0.8680248856544495, 'learning_rate': 4.271688589940537e-06, 'epoch': 0.87}


 87%|████████▋ | 5050/5800 [6:25:53<45:01,  3.60s/it]

{'loss': 0.8489, 'grad_norm': 0.7845426797866821, 'learning_rate': 4.161734968318309e-06, 'epoch': 0.87}


 87%|████████▋ | 5060/5800 [6:26:29<44:17,  3.59s/it]

{'loss': 0.8004, 'grad_norm': 0.8403197526931763, 'learning_rate': 4.053153493116596e-06, 'epoch': 0.88}


 87%|████████▋ | 5070/5800 [6:27:06<47:23,  3.90s/it]

{'loss': 0.829, 'grad_norm': 0.8972249627113342, 'learning_rate': 3.945947414669632e-06, 'epoch': 0.88}


 88%|████████▊ | 5080/5800 [6:27:42<43:21,  3.61s/it]

{'loss': 0.8154, 'grad_norm': 0.8747959136962891, 'learning_rate': 3.840119942139825e-06, 'epoch': 0.88}


 88%|████████▊ | 5090/5800 [6:28:17<42:52,  3.62s/it]

{'loss': 0.8063, 'grad_norm': 0.9360843896865845, 'learning_rate': 3.7356742434216775e-06, 'epoch': 0.88}


 88%|████████▊ | 5100/5800 [6:28:55<42:01,  3.60s/it]

{'loss': 0.8535, 'grad_norm': 0.9068611264228821, 'learning_rate': 3.6326134450469618e-06, 'epoch': 0.88}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.07it/s][A
  1%|          | 3/313 [00:00<01:02,  4.99it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.22it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.50it/s][A
  2%|▏         | 7/313 [00:01<01:38,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.74it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.68it/s][A
  4%|▍         | 13/313 [00:03<01:21,  3.70it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.77it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.89it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.01it/s][A
  5%|▌         | 17/313 [00:04<01:13,  4.05it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.95it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.01it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6496022343635559, 'eval_runtime': 83.6951, 'eval_samples_per_second': 29.87, 'eval_steps_per_second': 3.74, 'epoch': 0.88}


 88%|████████▊ | 5110/5800 [6:31:25<55:52,  4.86s/it]  

{'loss': 0.8625, 'grad_norm': 0.8242397308349609, 'learning_rate': 3.53094063209109e-06, 'epoch': 0.88}


 88%|████████▊ | 5120/5800 [6:32:03<44:28,  3.92s/it]

{'loss': 0.867, 'grad_norm': 0.8037448525428772, 'learning_rate': 3.4306588480808424e-06, 'epoch': 0.89}


 88%|████████▊ | 5130/5800 [6:32:39<40:21,  3.61s/it]

{'loss': 0.815, 'grad_norm': 0.8369397521018982, 'learning_rate': 3.3317710949032022e-06, 'epoch': 0.89}


 89%|████████▊ | 5140/5800 [6:33:14<39:20,  3.58s/it]

{'loss': 0.8148, 'grad_norm': 0.7860448360443115, 'learning_rate': 3.2342803327155e-06, 'epoch': 0.89}


 89%|████████▉ | 5150/5800 [6:33:53<41:42,  3.85s/it]

{'loss': 0.8681, 'grad_norm': 0.8447273969650269, 'learning_rate': 3.138189479856818e-06, 'epoch': 0.89}


 89%|████████▉ | 5160/5800 [6:34:30<39:20,  3.69s/it]

{'loss': 0.8074, 'grad_norm': 0.7752765417098999, 'learning_rate': 3.04350141276063e-06, 'epoch': 0.89}


 89%|████████▉ | 5170/5800 [6:35:05<37:16,  3.55s/it]

{'loss': 0.8275, 'grad_norm': 0.8633524179458618, 'learning_rate': 2.9502189658686897e-06, 'epoch': 0.89}


 89%|████████▉ | 5180/5800 [6:35:42<37:44,  3.65s/it]

{'loss': 0.812, 'grad_norm': 0.9065542817115784, 'learning_rate': 2.8583449315461807e-06, 'epoch': 0.9}


 89%|████████▉ | 5190/5800 [6:36:18<36:18,  3.57s/it]

{'loss': 0.8229, 'grad_norm': 0.8139297962188721, 'learning_rate': 2.7678820599981615e-06, 'epoch': 0.9}


 90%|████████▉ | 5200/5800 [6:36:55<36:31,  3.65s/it]

{'loss': 0.8314, 'grad_norm': 1.05283522605896, 'learning_rate': 2.678833059187158e-06, 'epoch': 0.9}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.07it/s][A
  1%|          | 3/313 [00:00<01:02,  4.99it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.21it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.56it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.50it/s][A
  2%|▏         | 7/313 [00:01<01:38,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.73it/s][A
  4%|▍         | 12/313 [00:03<01:22,  3.67it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.89it/s][A
  5%|▌         | 16/313 [00:04<01:14,  4.01it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.06it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.96it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.02it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6504056453704834, 'eval_runtime': 83.807, 'eval_samples_per_second': 29.83, 'eval_steps_per_second': 3.735, 'epoch': 0.9}


 90%|████████▉ | 5210/5800 [6:38:54<46:17,  4.71s/it]  

{'loss': 0.8375, 'grad_norm': 0.935758113861084, 'learning_rate': 2.591200594752202e-06, 'epoch': 0.9}


 90%|█████████ | 5220/5800 [6:39:30<35:09,  3.64s/it]

{'loss': 0.8233, 'grad_norm': 0.944236695766449, 'learning_rate': 2.504987289928973e-06, 'epoch': 0.9}


 90%|█████████ | 5230/5800 [6:40:06<34:21,  3.62s/it]

{'loss': 0.7932, 'grad_norm': 0.9816475510597229, 'learning_rate': 2.420195725471297e-06, 'epoch': 0.9}


 90%|█████████ | 5240/5800 [6:40:42<33:55,  3.63s/it]

{'loss': 0.8223, 'grad_norm': 0.9601492881774902, 'learning_rate': 2.3368284395738684e-06, 'epoch': 0.91}


 91%|█████████ | 5250/5800 [6:41:19<33:30,  3.66s/it]

{'loss': 0.8419, 'grad_norm': 0.9596843123435974, 'learning_rate': 2.2548879277963064e-06, 'epoch': 0.91}


 91%|█████████ | 5260/5800 [6:41:56<33:03,  3.67s/it]

{'loss': 0.823, 'grad_norm': 0.924895167350769, 'learning_rate': 2.1743766429884295e-06, 'epoch': 0.91}


 91%|█████████ | 5270/5800 [6:42:32<31:11,  3.53s/it]

{'loss': 0.7985, 'grad_norm': 0.9037458300590515, 'learning_rate': 2.0952969952168276e-06, 'epoch': 0.91}


 91%|█████████ | 5280/5800 [6:43:08<30:47,  3.55s/it]

{'loss': 0.8019, 'grad_norm': 0.8102977871894836, 'learning_rate': 2.017651351692734e-06, 'epoch': 0.91}


 91%|█████████ | 5290/5800 [6:43:45<30:46,  3.62s/it]

{'loss': 0.8408, 'grad_norm': 0.8493402600288391, 'learning_rate': 1.941442036701141e-06, 'epoch': 0.92}


 91%|█████████▏| 5300/5800 [6:44:22<30:19,  3.64s/it]

{'loss': 0.8443, 'grad_norm': 0.9564257860183716, 'learning_rate': 1.86667133153125e-06, 'epoch': 0.92}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.07it/s][A
  1%|          | 3/313 [00:00<01:02,  4.99it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.21it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.55it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.49it/s][A
  2%|▏         | 7/313 [00:01<01:39,  3.08it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.26it/s][A
  3%|▎         | 9/313 [00:02<01:32,  3.30it/s][A
  3%|▎         | 10/313 [00:02<01:25,  3.56it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.73it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.67it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:04<01:16,  3.89it/s][A
  5%|▌         | 16/313 [00:04<01:14,  4.01it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.06it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.95it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.01it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6497800946235657, 'eval_runtime': 83.8965, 'eval_samples_per_second': 29.799, 'eval_steps_per_second': 3.731, 'epoch': 0.92}


 92%|█████████▏| 5310/5800 [6:46:22<37:32,  4.60s/it]  

{'loss': 0.8371, 'grad_norm': 0.9974389672279358, 'learning_rate': 1.7933414744081612e-06, 'epoch': 0.92}


 92%|█████████▏| 5320/5800 [6:46:58<27:54,  3.49s/it]

{'loss': 0.8093, 'grad_norm': 0.8989366292953491, 'learning_rate': 1.7214546604258753e-06, 'epoch': 0.92}


 92%|█████████▏| 5330/5800 [6:47:34<27:24,  3.50s/it]

{'loss': 0.856, 'grad_norm': 0.8425466418266296, 'learning_rate': 1.6510130414816138e-06, 'epoch': 0.92}


 92%|█████████▏| 5340/5800 [6:48:11<26:44,  3.49s/it]

{'loss': 0.8514, 'grad_norm': 0.8291002511978149, 'learning_rate': 1.5820187262113595e-06, 'epoch': 0.92}


 92%|█████████▏| 5350/5800 [6:48:47<25:54,  3.45s/it]

{'loss': 0.8745, 'grad_norm': 0.9805192947387695, 'learning_rate': 1.5144737799267615e-06, 'epoch': 0.93}


 92%|█████████▏| 5360/5800 [6:49:22<26:12,  3.57s/it]

{'loss': 0.8309, 'grad_norm': 1.0622402429580688, 'learning_rate': 1.448380224553303e-06, 'epoch': 0.93}


 93%|█████████▎| 5370/5800 [6:49:58<25:07,  3.51s/it]

{'loss': 0.8557, 'grad_norm': 0.9509430527687073, 'learning_rate': 1.3837400385697808e-06, 'epoch': 0.93}


 93%|█████████▎| 5380/5800 [6:50:33<25:11,  3.60s/it]

{'loss': 0.8208, 'grad_norm': 0.8721368908882141, 'learning_rate': 1.320555156949077e-06, 'epoch': 0.93}


 93%|█████████▎| 5390/5800 [6:51:09<25:13,  3.69s/it]

{'loss': 0.8095, 'grad_norm': 0.9346984624862671, 'learning_rate': 1.2588274711002323e-06, 'epoch': 0.93}


 93%|█████████▎| 5400/5800 [6:51:46<26:07,  3.92s/it]

{'loss': 0.8119, 'grad_norm': 0.8308231830596924, 'learning_rate': 1.1985588288118576e-06, 'epoch': 0.93}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.08it/s][A
  1%|          | 3/313 [00:00<01:02,  4.99it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.21it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.55it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.50it/s][A
  2%|▏         | 7/313 [00:01<01:39,  3.08it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.26it/s][A
  3%|▎         | 9/313 [00:02<01:32,  3.30it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.74it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.68it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.89it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.02it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.06it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.96it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.01it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6497735381126404, 'eval_runtime': 83.8579, 'eval_samples_per_second': 29.812, 'eval_steps_per_second': 3.733, 'epoch': 0.93}


 93%|█████████▎| 5410/5800 [6:53:46<29:33,  4.55s/it]  

{'loss': 0.8345, 'grad_norm': 0.8601716160774231, 'learning_rate': 1.139751034196762e-06, 'epoch': 0.94}


 93%|█████████▎| 5420/5800 [6:54:23<23:05,  3.64s/it]

{'loss': 0.8636, 'grad_norm': 0.8301287293434143, 'learning_rate': 1.0824058476379962e-06, 'epoch': 0.94}


 94%|█████████▎| 5430/5800 [6:54:58<21:27,  3.48s/it]

{'loss': 0.839, 'grad_norm': 0.866981565952301, 'learning_rate': 1.0265249857361436e-06, 'epoch': 0.94}


 94%|█████████▍| 5440/5800 [6:55:35<22:51,  3.81s/it]

{'loss': 0.8614, 'grad_norm': 0.7659264206886292, 'learning_rate': 9.72110121257941e-07, 'epoch': 0.94}


 94%|█████████▍| 5450/5800 [6:56:09<20:54,  3.59s/it]

{'loss': 0.8453, 'grad_norm': 0.9161050319671631, 'learning_rate': 9.191628830861831e-07, 'epoch': 0.94}


 94%|█████████▍| 5460/5800 [6:56:45<19:05,  3.37s/it]

{'loss': 0.8521, 'grad_norm': 0.900425910949707, 'learning_rate': 8.67684856170975e-07, 'epoch': 0.94}


 94%|█████████▍| 5470/5800 [6:57:23<20:51,  3.79s/it]

{'loss': 0.8649, 'grad_norm': 0.7466237545013428, 'learning_rate': 8.176775814823123e-07, 'epoch': 0.95}


 94%|█████████▍| 5480/5800 [6:57:58<18:55,  3.55s/it]

{'loss': 0.8027, 'grad_norm': 0.9621884822845459, 'learning_rate': 7.691425559639087e-07, 'epoch': 0.95}


 95%|█████████▍| 5490/5800 [6:58:33<18:13,  3.53s/it]

{'loss': 0.8445, 'grad_norm': 0.9054902791976929, 'learning_rate': 7.220812324884086e-07, 'epoch': 0.95}


 95%|█████████▍| 5500/5800 [6:59:09<17:41,  3.54s/it]

{'loss': 0.8499, 'grad_norm': 0.8950086236000061, 'learning_rate': 6.764950198139053e-07, 'epoch': 0.95}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:44,  7.07it/s][A
  1%|          | 3/313 [00:00<01:02,  4.99it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.21it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.55it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.49it/s][A
  2%|▏         | 7/313 [00:01<01:39,  3.08it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.26it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:25,  3.56it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.73it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.67it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.77it/s][A
  5%|▍         | 15/313 [00:04<01:16,  3.89it/s][A
  5%|▌         | 16/313 [00:04<01:14,  4.01it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.06it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.95it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.01it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6496924161911011, 'eval_runtime': 83.8827, 'eval_samples_per_second': 29.804, 'eval_steps_per_second': 3.731, 'epoch': 0.95}


 95%|█████████▌| 5510/5800 [7:01:09<22:40,  4.69s/it]  

{'loss': 0.8712, 'grad_norm': 1.1229912042617798, 'learning_rate': 6.323852825417476e-07, 'epoch': 0.95}


 95%|█████████▌| 5520/5800 [7:01:46<17:28,  3.74s/it]

{'loss': 0.869, 'grad_norm': 0.9790769815444946, 'learning_rate': 5.897533410757162e-07, 'epoch': 0.96}


 95%|█████████▌| 5530/5800 [7:02:21<15:31,  3.45s/it]

{'loss': 0.8412, 'grad_norm': 1.1556309461593628, 'learning_rate': 5.486004715824666e-07, 'epoch': 0.96}


 96%|█████████▌| 5540/5800 [7:02:58<16:25,  3.79s/it]

{'loss': 0.8475, 'grad_norm': 0.8884724378585815, 'learning_rate': 5.089279059533658e-07, 'epoch': 0.96}


 96%|█████████▌| 5550/5800 [7:03:32<14:42,  3.53s/it]

{'loss': 0.8204, 'grad_norm': 0.8958462476730347, 'learning_rate': 4.70736831767582e-07, 'epoch': 0.96}


 96%|█████████▌| 5560/5800 [7:04:09<14:42,  3.68s/it]

{'loss': 0.8825, 'grad_norm': 0.8758114576339722, 'learning_rate': 4.34028392256558e-07, 'epoch': 0.96}


 96%|█████████▌| 5570/5800 [7:04:45<13:41,  3.57s/it]

{'loss': 0.8319, 'grad_norm': 0.9355749487876892, 'learning_rate': 3.9880368626978304e-07, 'epoch': 0.96}


 96%|█████████▌| 5580/5800 [7:05:21<12:54,  3.52s/it]

{'loss': 0.8491, 'grad_norm': 0.9216390252113342, 'learning_rate': 3.650637682418856e-07, 'epoch': 0.97}


 96%|█████████▋| 5590/5800 [7:05:59<13:42,  3.92s/it]

{'loss': 0.8688, 'grad_norm': 0.8600363731384277, 'learning_rate': 3.328096481610865e-07, 'epoch': 0.97}


 97%|█████████▋| 5600/5800 [7:06:37<12:33,  3.77s/it]

{'loss': 0.8803, 'grad_norm': 0.9192337989807129, 'learning_rate': 3.0204229153895645e-07, 'epoch': 0.97}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:44,  7.06it/s][A
  1%|          | 3/313 [00:00<01:02,  4.98it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.21it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.55it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.49it/s][A
  2%|▏         | 7/313 [00:01<01:39,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.26it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.74it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.67it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:04<01:16,  3.89it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.01it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.06it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.95it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.01it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.650088906288147, 'eval_runtime': 83.8771, 'eval_samples_per_second': 29.806, 'eval_steps_per_second': 3.732, 'epoch': 0.97}


 97%|█████████▋| 5610/5800 [7:08:37<14:35,  4.61s/it]  

{'loss': 0.8085, 'grad_norm': 1.0736945867538452, 'learning_rate': 2.7276261938152803e-07, 'epoch': 0.97}


 97%|█████████▋| 5620/5800 [7:09:13<11:04,  3.69s/it]

{'loss': 0.8376, 'grad_norm': 0.8591208457946777, 'learning_rate': 2.4497150816170655e-07, 'epoch': 0.97}


 97%|█████████▋| 5630/5800 [7:09:49<10:13,  3.61s/it]

{'loss': 0.8488, 'grad_norm': 0.869664192199707, 'learning_rate': 2.1866978979303564e-07, 'epoch': 0.97}


 97%|█████████▋| 5640/5800 [7:10:28<10:36,  3.98s/it]

{'loss': 0.8718, 'grad_norm': 0.8761467933654785, 'learning_rate': 1.9385825160480598e-07, 'epoch': 0.98}


 97%|█████████▋| 5650/5800 [7:11:03<08:39,  3.47s/it]

{'loss': 0.8026, 'grad_norm': 0.8824387788772583, 'learning_rate': 1.7053763631849073e-07, 'epoch': 0.98}


 98%|█████████▊| 5660/5800 [7:11:39<08:25,  3.61s/it]

{'loss': 0.8403, 'grad_norm': 0.9064235687255859, 'learning_rate': 1.4870864202548572e-07, 'epoch': 0.98}


 98%|█████████▊| 5670/5800 [7:12:15<08:11,  3.78s/it]

{'loss': 0.8269, 'grad_norm': 0.912030816078186, 'learning_rate': 1.2837192216623717e-07, 'epoch': 0.98}


 98%|█████████▊| 5680/5800 [7:12:51<07:18,  3.65s/it]

{'loss': 0.8287, 'grad_norm': 0.8889880180358887, 'learning_rate': 1.095280855106795e-07, 'epoch': 0.98}


 98%|█████████▊| 5690/5800 [7:13:28<06:25,  3.50s/it]

{'loss': 0.8157, 'grad_norm': 0.7941980957984924, 'learning_rate': 9.217769614000004e-08, 'epoch': 0.98}


 98%|█████████▊| 5700/5800 [7:14:04<06:07,  3.68s/it]

{'loss': 0.8373, 'grad_norm': 0.8611682057380676, 'learning_rate': 7.632127342975803e-08, 'epoch': 0.99}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:43,  7.11it/s][A
  1%|          | 3/313 [00:00<01:02,  5.00it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.21it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.55it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.50it/s][A
  2%|▏         | 7/313 [00:01<01:39,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.26it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.30it/s][A
  3%|▎         | 10/313 [00:02<01:25,  3.56it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.74it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.68it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.89it/s][A
  5%|▌         | 16/313 [00:04<01:14,  4.01it/s][A
  5%|▌         | 17/313 [00:04<01:13,  4.05it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.95it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.01it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6498417258262634, 'eval_runtime': 83.8933, 'eval_samples_per_second': 29.8, 'eval_steps_per_second': 3.731, 'epoch': 0.99}


 98%|█████████▊| 5710/5800 [7:16:05<07:09,  4.77s/it]

{'loss': 0.8244, 'grad_norm': 0.8659815788269043, 'learning_rate': 6.195929203434703e-08, 'epoch': 0.99}


 99%|█████████▊| 5720/5800 [7:16:41<04:43,  3.55s/it]

{'loss': 0.8436, 'grad_norm': 0.8483352661132812, 'learning_rate': 4.909218187276743e-08, 'epoch': 0.99}


 99%|█████████▉| 5730/5800 [7:17:18<04:20,  3.72s/it]

{'loss': 0.8574, 'grad_norm': 0.8471155762672424, 'learning_rate': 3.7720328115781187e-08, 'epoch': 0.99}


 99%|█████████▉| 5740/5800 [7:17:53<03:39,  3.66s/it]

{'loss': 0.8269, 'grad_norm': 0.8730392456054688, 'learning_rate': 2.784407117435439e-08, 'epoch': 0.99}


 99%|█████████▉| 5750/5800 [7:18:29<02:56,  3.54s/it]

{'loss': 0.8333, 'grad_norm': 0.8075110912322998, 'learning_rate': 1.9463706689493154e-08, 'epoch': 0.99}


 99%|█████████▉| 5760/5800 [7:19:05<02:24,  3.61s/it]

{'loss': 0.8163, 'grad_norm': 0.7977719306945801, 'learning_rate': 1.2579485523378509e-08, 'epoch': 1.0}


 99%|█████████▉| 5770/5800 [7:19:40<01:47,  3.57s/it]

{'loss': 0.8255, 'grad_norm': 0.8898499011993408, 'learning_rate': 7.19161375185573e-09, 'epoch': 1.0}


100%|█████████▉| 5780/5800 [7:20:16<01:08,  3.40s/it]

{'loss': 0.8612, 'grad_norm': 1.1900169849395752, 'learning_rate': 3.300252658283709e-09, 'epoch': 1.0}


100%|█████████▉| 5790/5800 [7:20:54<00:36,  3.60s/it]

{'loss': 0.7519, 'grad_norm': 0.793248176574707, 'learning_rate': 9.055187286832745e-10, 'epoch': 1.0}


100%|██████████| 5800/5800 [7:21:29<00:00,  3.52s/it]

{'loss': 0.7166, 'grad_norm': 0.7854896187782288, 'learning_rate': 7.483648256645382e-12, 'epoch': 1.0}



  0%|          | 0/313 [00:00<?, ?it/s][A
  1%|          | 2/313 [00:00<00:44,  7.06it/s][A
  1%|          | 3/313 [00:00<01:02,  4.98it/s][A
  1%|▏         | 4/313 [00:00<01:13,  4.21it/s][A
  2%|▏         | 5/313 [00:01<01:26,  3.55it/s][A
  2%|▏         | 6/313 [00:01<01:27,  3.49it/s][A
  2%|▏         | 7/313 [00:01<01:39,  3.09it/s][A
  3%|▎         | 8/313 [00:02<01:33,  3.27it/s][A
  3%|▎         | 9/313 [00:02<01:31,  3.31it/s][A
  3%|▎         | 10/313 [00:02<01:24,  3.57it/s][A
  4%|▎         | 11/313 [00:02<01:20,  3.74it/s][A
  4%|▍         | 12/313 [00:03<01:21,  3.68it/s][A
  4%|▍         | 13/313 [00:03<01:20,  3.71it/s][A
  4%|▍         | 14/313 [00:03<01:19,  3.78it/s][A
  5%|▍         | 15/313 [00:03<01:16,  3.90it/s][A
  5%|▌         | 16/313 [00:04<01:13,  4.02it/s][A
  5%|▌         | 17/313 [00:04<01:12,  4.06it/s][A
  6%|▌         | 18/313 [00:04<01:14,  3.96it/s][A
  6%|▌         | 19/313 [00:04<01:13,  4.01it/s][A
  6%|▋         | 20/313 [00:

{'eval_loss': 0.6499903202056885, 'eval_runtime': 83.8473, 'eval_samples_per_second': 29.816, 'eval_steps_per_second': 3.733, 'epoch': 1.0}
{'train_runtime': 26576.4904, 'train_samples_per_second': 6.984, 'train_steps_per_second': 0.218, 'train_loss': 0.8717163933556655, 'epoch': 1.0}





TrainOutput(global_step=5800, training_loss=0.8717163933556655, metrics={'train_runtime': 26576.4904, 'train_samples_per_second': 6.984, 'train_steps_per_second': 0.218, 'total_flos': 3.1927415068648243e+18, 'train_loss': 0.8717163933556655, 'epoch': 1.0034603572818894})

In [15]:
# Upload the trained model to the Hugging Face Hub.
# NOTE: the string below is the *commit message*, not the repository name
# (the resulting CommitInfo reports commit_message='first_qwen3_14b'); the
# target repo is taken from the trainer's own configuration.
trainer.push_to_hub(commit_message='first_qwen3_14b')

100%|██████████| 2/2 [00:03<00:00,  1.67s/it]


CommitInfo(commit_url='https://huggingface.co/mika5883/qwen3-14b_rugec/commit/c715e348c7788f6cdab5ac6c0bd73d3f496562e1', commit_message='first_qwen3_14b', commit_description='', oid='c715e348c7788f6cdab5ac6c0bd73d3f496562e1', pr_url=None, repo_url=RepoUrl('https://huggingface.co/mika5883/qwen3-14b_rugec', endpoint='https://huggingface.co', repo_type='model', repo_id='mika5883/qwen3-14b_rugec'), pr_revision=None, pr_num=None)