# Preliminaries

In [None]:
%pip install --upgrade -q pip cryptography==43.0.3 nvidia-nvshmem-cu12 pandas==2.2.2 numpy==2.0.2 yandex-cloud-ml-sdk nvidia-ml-py3 fastai pydantic==2.12.3 fsspec==2025.3.0 datasets dvc soxr pyopenssl==24.2.1
%pip install --upgrade -q torchaudio torchdata torchtext torch torchvision --index-url https://download.pytorch.org/whl/cu126
%pip install --upgrade -q pybind11 sentencepiece transformers
%pip install --upgrade -q peft dotenv bitsandbytes

In [None]:
import os
import itertools
import random
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
)
from transformers.trainer_callback import EarlyStoppingCallback
from peft import LoraConfig, TaskType, get_peft_model

os.environ["HF_TOKEN"] = "YOUR_HF_TOKEN"
# os.environ["HF_HOME"] = os.environ.get("TRANSFORMERS_CACHE", "~/.cache/huggingface")

# Load the data

In [None]:
df_train = pd.read_parquet("train.parquet")[["ce", "ru"]]
df_dev = pd.read_parquet("dev.parquet")[["ce", "ru"]]

# Download the original model and its tokenizer

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "jbochi/madlad400-3b-mt"
model = T5ForConditionalGeneration.from_pretrained(
    model_name,
    dtype=torch.bfloat16,
    device_map=None,
)

model = model.to(device)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Dataset configuration

In [None]:
MAX_LENGTH = 256

In [None]:
class ParallelSentencesDataset(Dataset):
    def __init__(self, df, tokenizer, main_lang, max_length=MAX_LENGTH):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.main_lang = main_lang

        self.languages = self.df.columns.to_list()
        self.secondary_langs = list(
            filter(lambda x: x != self.main_lang, self.languages)
        )
        self.directions = list(
            itertools.product([self.main_lang], self.secondary_langs)
        ) + list(itertools.product(self.secondary_langs, [self.main_lang]))

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        # select translation direction
        direction = random.choice(self.directions)

        src_lang, tgt_lang = direction

        src_text, tgt_text = self.df.iloc[idx][[src_lang, tgt_lang]]
        tokens = self.tokenizer(
            f"<2{tgt_lang}> {src_text}",
            text_target=tgt_text,
            truncation=True,
            max_length=self.max_length,
        )
        return tokens


dataset_train = ParallelSentencesDataset(
    df=df_train,
    main_lang="ce",
    tokenizer=tokenizer,
)

dataset_eval = ParallelSentencesDataset(
    df=df_dev,
    main_lang="ce",
    tokenizer=tokenizer,
)

# Initialize LoRA adapter

Check the modules of MADLAD-400

In [None]:
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=32,
    lora_alpha=64,
    lora_dropout=0.1,
    bias="all",
    target_modules=["q", "v", "k", "o", "wi_0", "wi_1", "wo", "lm_head"], # train all linear layers with LoRA
    modules_to_save=[
        "shared", # train embeddings as they are, because no adapter may be applied to them
    ],
    use_rslora=True,  # rank-stabilized LoRA for better stability
    # use_dora=True,
    layers_to_transform=None, # apply LoRA to all the layers of the model
    init_lora_weights="pissa",
)

lora_model = get_peft_model(model, peft_config)

lora_model.print_trainable_parameters()

trainable params: 364,741,632 || all params: 3,305,114,624 || trainable%: 11.0357


# Set up the collator

In [None]:
# this object is used for proper post processing of tokenized batches before feeding them to the model during train process

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=lora_model,
    label_pad_token_id=-100,
    pad_to_multiple_of=8,
)

# Trainig arguments

In [None]:
BATCH_SIZE = 10
NUM_EPOCHS = 16

training_args = Seq2SeqTrainingArguments(
    output_dir="./madlad-lora-checkpoints",
    learning_rate=1e-4,
    optim="adamw_torch_fused",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=12,
    # it may be used for saving GPU memory, however we've failed with setting this up
    gradient_checkpointing=False,

    # must have for new languages, as the gradient is tending to be too big at the start
    max_grad_norm=0.5,


    warmup_steps=200,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    num_train_epochs=NUM_EPOCHS,

    # three must have parameters here! without these settings the gradients will very soon blow up!
    fp16=False,
    bf16=True,
    bf16_full_eval=True,

    logging_steps=20,
    save_strategy="steps",
    save_steps=200,
    eval_strategy="steps",
    eval_steps=200,
    save_total_limit=3,
    predict_with_generate=True,
    generation_max_length=MAX_LENGTH,
    dataloader_pin_memory=True,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    generation_num_beams=4,
    report_to="none",
    logging_dir="./lora_logs",
)

# Train the model

In [None]:
if training_args.gradient_checkpointing:
    lora_model.gradient_checkpointing_enable()

lora_model.train()

trainer = Seq2SeqTrainer(
    model=lora_model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_eval,
    data_collator=data_collator,
    # this callback was too harsh, it's better to set it as early_stopping_patience=5, early_stopping_threshold=0.001
    callbacks=[
        EarlyStoppingCallback(
            early_stopping_patience=3, 
            early_stopping_threshold=0.01,
        )
    ],
)


trainer.train()

lora_model.save_pretrained("./madlad-lora-final")
tokenizer.save_pretrained("./madlad-lora-final")

  0%|          | 20/28720 [02:34<60:28:37,  7.59s/it]

{'loss': 5.9047, 'grad_norm': 7.833569526672363, 'learning_rate': 9.5e-06, 'epoch': 0.01}


  0%|          | 40/28720 [05:07<60:46:35,  7.63s/it]

{'loss': 5.2804, 'grad_norm': 4.561985015869141, 'learning_rate': 1.9500000000000003e-05, 'epoch': 0.03}


  0%|          | 60/28720 [07:40<60:55:42,  7.65s/it]

{'loss': 4.9519, 'grad_norm': 5.66672945022583, 'learning_rate': 2.95e-05, 'epoch': 0.04}


  0%|          | 80/28720 [10:12<60:42:26,  7.63s/it]

{'loss': 4.5295, 'grad_norm': 5.562053680419922, 'learning_rate': 3.9500000000000005e-05, 'epoch': 0.06}


  0%|          | 100/28720 [12:45<60:53:55,  7.66s/it]

{'loss': 4.3562, 'grad_norm': 5.696022987365723, 'learning_rate': 4.9500000000000004e-05, 'epoch': 0.07}


  0%|          | 120/28720 [15:18<60:57:38,  7.67s/it]

{'loss': 4.1224, 'grad_norm': 5.313017845153809, 'learning_rate': 5.95e-05, 'epoch': 0.08}


  0%|          | 140/28720 [17:51<60:39:28,  7.64s/it]

{'loss': 3.8324, 'grad_norm': 5.816738605499268, 'learning_rate': 6.95e-05, 'epoch': 0.1}


  1%|          | 160/28720 [20:24<60:33:31,  7.63s/it]

{'loss': 3.6137, 'grad_norm': 4.466238975524902, 'learning_rate': 7.950000000000001e-05, 'epoch': 0.11}


  1%|          | 180/28720 [22:57<60:52:08,  7.68s/it]

{'loss': 3.5809, 'grad_norm': 4.824036121368408, 'learning_rate': 8.950000000000001e-05, 'epoch': 0.13}


  1%|          | 200/28720 [25:31<61:04:34,  7.71s/it]

{'loss': 3.5315, 'grad_norm': 5.123225212097168, 'learning_rate': 9.95e-05, 'epoch': 0.14}



  0%|          | 0/36 [00:00<?, ?it/s][A
  6%|▌         | 2/36 [00:00<00:03,  9.76it/s][A
  8%|▊         | 3/36 [00:00<00:04,  6.86it/s][A
 11%|█         | 4/36 [00:00<00:05,  5.89it/s][A
 14%|█▍        | 5/36 [00:00<00:05,  5.51it/s][A
 17%|█▋        | 6/36 [00:01<00:05,  5.22it/s][A
 19%|█▉        | 7/36 [00:01<00:05,  5.10it/s][A
 22%|██▏       | 8/36 [00:01<00:05,  4.96it/s][A
 25%|██▌       | 9/36 [00:01<00:05,  4.93it/s][A
 28%|██▊       | 10/36 [00:01<00:05,  4.90it/s][A
 31%|███       | 11/36 [00:02<00:05,  4.88it/s][A
 33%|███▎      | 12/36 [00:02<00:04,  4.87it/s][A
 36%|███▌      | 13/36 [00:02<00:04,  4.80it/s][A
 39%|███▉      | 14/36 [00:02<00:04,  4.80it/s][A
 42%|████▏     | 15/36 [00:02<00:04,  4.78it/s][A
 44%|████▍     | 16/36 [00:03<00:04,  4.80it/s][A
 47%|████▋     | 17/36 [00:03<00:03,  4.80it/s][A
 50%|█████     | 18/36 [00:03<00:03,  4.78it/s][A
 53%|█████▎    | 19/36 [00:03<00:03,  4.74it/s][A
 56%|█████▌    | 20/36 [00:03<00:03,  4.78it/s]

{'eval_loss': 3.338717222213745, 'eval_runtime': 7.5364, 'eval_samples_per_second': 47.768, 'eval_steps_per_second': 4.777, 'epoch': 0.14}


  1%|          | 220/28720 [29:45<60:57:59,  7.70s/it] 

{'loss': 3.4729, 'grad_norm': 23.802146911621094, 'learning_rate': 9.999989049151896e-05, 'epoch': 0.15}


  1%|          | 240/28720 [32:18<59:40:27,  7.54s/it]

{'loss': 3.3863, 'grad_norm': 5.235472679138184, 'learning_rate': 9.999953860885233e-05, 'epoch': 0.17}


  1%|          | 260/28720 [34:47<59:06:39,  7.48s/it]

{'loss': 3.2626, 'grad_norm': 4.8367156982421875, 'learning_rate': 9.999894405036032e-05, 'epoch': 0.18}


  1%|          | 280/28720 [37:16<58:42:41,  7.43s/it]

{'loss': 3.2074, 'grad_norm': 5.291466236114502, 'learning_rate': 9.999810681892863e-05, 'epoch': 0.2}


  1%|          | 300/28720 [39:45<58:43:52,  7.44s/it]

{'loss': 3.1849, 'grad_norm': 5.873101234436035, 'learning_rate': 9.999702691862085e-05, 'epoch': 0.21}


  1%|          | 320/28720 [42:14<58:56:03,  7.47s/it]

{'loss': 2.9894, 'grad_norm': 4.606978416442871, 'learning_rate': 9.999570435467831e-05, 'epoch': 0.22}


  1%|          | 340/28720 [44:43<58:46:20,  7.46s/it]

{'loss': 3.0, 'grad_norm': 4.842965126037598, 'learning_rate': 9.999413913352015e-05, 'epoch': 0.24}


  1%|▏         | 360/28720 [47:14<59:53:16,  7.60s/it]

{'loss': 2.9286, 'grad_norm': 6.023223400115967, 'learning_rate': 9.999233126274331e-05, 'epoch': 0.25}


  1%|▏         | 380/28720 [49:46<59:57:12,  7.62s/it]

{'loss': 2.8827, 'grad_norm': 4.243201732635498, 'learning_rate': 9.999028075112237e-05, 'epoch': 0.26}


  1%|▏         | 400/28720 [52:16<58:27:52,  7.43s/it]

{'loss': 2.8569, 'grad_norm': 4.349167823791504, 'learning_rate': 9.998798760860962e-05, 'epoch': 0.28}



  0%|          | 0/36 [00:00<?, ?it/s][A
  6%|▌         | 2/36 [00:00<00:03,  9.75it/s][A
  8%|▊         | 3/36 [00:00<00:04,  6.84it/s][A
 11%|█         | 4/36 [00:00<00:05,  5.88it/s][A
 14%|█▍        | 5/36 [00:00<00:05,  5.43it/s][A
 17%|█▋        | 6/36 [00:01<00:05,  5.19it/s][A
 19%|█▉        | 7/36 [00:01<00:05,  5.08it/s][A
 22%|██▏       | 8/36 [00:01<00:05,  5.00it/s][A
 25%|██▌       | 9/36 [00:01<00:05,  4.96it/s][A
 28%|██▊       | 10/36 [00:01<00:05,  4.92it/s][A
 31%|███       | 11/36 [00:02<00:05,  4.89it/s][A
 33%|███▎      | 12/36 [00:02<00:04,  4.86it/s][A
 36%|███▌      | 13/36 [00:02<00:04,  4.85it/s][A
 39%|███▉      | 14/36 [00:02<00:04,  4.80it/s][A
 42%|████▏     | 15/36 [00:02<00:04,  4.80it/s][A
 44%|████▍     | 16/36 [00:03<00:04,  4.81it/s][A
 47%|████▋     | 17/36 [00:03<00:03,  4.83it/s][A
 50%|█████     | 18/36 [00:03<00:03,  4.81it/s][A
 53%|█████▎    | 19/36 [00:03<00:03,  4.77it/s][A
 56%|█████▌    | 20/36 [00:03<00:03,  4.80it/s]

{'eval_loss': 2.9613823890686035, 'eval_runtime': 7.5092, 'eval_samples_per_second': 47.941, 'eval_steps_per_second': 4.794, 'epoch': 0.28}


  1%|▏         | 420/28720 [56:33<58:20:45,  7.42s/it] 

{'loss': 2.7484, 'grad_norm': 6.857264041900635, 'learning_rate': 9.998545184633494e-05, 'epoch': 0.29}


  2%|▏         | 440/28720 [59:01<58:08:17,  7.40s/it]

{'loss': 2.7754, 'grad_norm': 4.832244396209717, 'learning_rate': 9.998267347660586e-05, 'epoch': 0.31}


  2%|▏         | 460/28720 [1:01:30<58:23:48,  7.44s/it]

{'loss': 2.7182, 'grad_norm': 7.148543834686279, 'learning_rate': 9.997965251290734e-05, 'epoch': 0.32}


  2%|▏         | 480/28720 [1:03:59<58:17:55,  7.43s/it]

{'loss': 2.6969, 'grad_norm': 4.621957302093506, 'learning_rate': 9.99763889699018e-05, 'epoch': 0.33}


  2%|▏         | 500/28720 [1:06:28<58:20:10,  7.44s/it]

{'loss': 2.6075, 'grad_norm': 5.154018402099609, 'learning_rate': 9.997288286342908e-05, 'epoch': 0.35}


  2%|▏         | 520/28720 [1:08:56<58:01:43,  7.41s/it]

{'loss': 2.6099, 'grad_norm': 4.564573764801025, 'learning_rate': 9.996913421050624e-05, 'epoch': 0.36}


  2%|▏         | 540/28720 [1:11:25<58:04:16,  7.42s/it]

{'loss': 2.5802, 'grad_norm': 5.201085567474365, 'learning_rate': 9.996514302932765e-05, 'epoch': 0.38}


  2%|▏         | 560/28720 [1:13:54<58:19:02,  7.46s/it]

{'loss': 2.6243, 'grad_norm': 5.008784770965576, 'learning_rate': 9.996090933926469e-05, 'epoch': 0.39}


  2%|▏         | 580/28720 [1:16:23<58:13:48,  7.45s/it]

{'loss': 2.529, 'grad_norm': 5.026313781738281, 'learning_rate': 9.995643316086589e-05, 'epoch': 0.4}


  2%|▏         | 600/28720 [1:18:52<58:14:12,  7.46s/it]

{'loss': 2.4525, 'grad_norm': 4.653861045837402, 'learning_rate': 9.995171451585662e-05, 'epoch': 0.42}



  0%|          | 0/36 [00:00<?, ?it/s][A
  6%|▌         | 2/36 [00:00<00:03,  9.63it/s][A
  8%|▊         | 3/36 [00:00<00:04,  6.81it/s][A
 11%|█         | 4/36 [00:00<00:05,  5.84it/s][A
 14%|█▍        | 5/36 [00:00<00:05,  5.48it/s][A
 17%|█▋        | 6/36 [00:01<00:05,  5.21it/s][A
 19%|█▉        | 7/36 [00:01<00:05,  5.09it/s][A
 22%|██▏       | 8/36 [00:01<00:05,  5.01it/s][A
 25%|██▌       | 9/36 [00:01<00:05,  4.95it/s][A
 28%|██▊       | 10/36 [00:01<00:05,  4.91it/s][A
 31%|███       | 11/36 [00:02<00:05,  4.89it/s][A
 33%|███▎      | 12/36 [00:02<00:04,  4.83it/s][A
 36%|███▌      | 13/36 [00:02<00:04,  4.82it/s][A
 39%|███▉      | 14/36 [00:02<00:04,  4.78it/s][A
 42%|████▏     | 15/36 [00:02<00:04,  4.76it/s][A
 44%|████▍     | 16/36 [00:03<00:04,  4.72it/s][A
 47%|████▋     | 17/36 [00:03<00:03,  4.76it/s][A
 50%|█████     | 18/36 [00:03<00:03,  4.80it/s][A
 53%|█████▎    | 19/36 [00:03<00:03,  4.78it/s][A
 56%|█████▌    | 20/36 [00:03<00:03,  4.82it/s]

{'eval_loss': 2.567532777786255, 'eval_runtime': 7.5127, 'eval_samples_per_second': 47.919, 'eval_steps_per_second': 4.792, 'epoch': 0.42}


  2%|▏         | 620/28720 [1:23:10<58:30:07,  7.49s/it] 

{'loss': 2.4623, 'grad_norm': 5.109018802642822, 'learning_rate': 9.994675342713913e-05, 'epoch': 0.43}


  2%|▏         | 640/28720 [1:25:39<58:08:23,  7.45s/it]

{'loss': 2.4865, 'grad_norm': 5.814417839050293, 'learning_rate': 9.994154991879237e-05, 'epoch': 0.45}


  2%|▏         | 660/28720 [1:28:08<58:15:47,  7.47s/it]

{'loss': 2.4308, 'grad_norm': 4.240172863006592, 'learning_rate': 9.993610401607188e-05, 'epoch': 0.46}


  2%|▏         | 680/28720 [1:30:36<57:45:58,  7.42s/it]

{'loss': 2.4249, 'grad_norm': 4.276431083679199, 'learning_rate': 9.993041574540967e-05, 'epoch': 0.47}


  2%|▏         | 700/28720 [1:33:05<57:58:04,  7.45s/it]

{'loss': 2.4384, 'grad_norm': 4.151264190673828, 'learning_rate': 9.992448513441413e-05, 'epoch': 0.49}


  3%|▎         | 720/28720 [1:35:34<57:51:13,  7.44s/it]

{'loss': 2.4269, 'grad_norm': 4.409785270690918, 'learning_rate': 9.991831221186983e-05, 'epoch': 0.5}


  3%|▎         | 740/28720 [1:38:03<57:47:01,  7.43s/it]

{'loss': 2.3954, 'grad_norm': 4.608275413513184, 'learning_rate': 9.991189700773744e-05, 'epoch': 0.52}


  3%|▎         | 760/28720 [1:40:32<57:57:18,  7.46s/it]

{'loss': 2.3787, 'grad_norm': 4.035380840301514, 'learning_rate': 9.990523955315355e-05, 'epoch': 0.53}


  3%|▎         | 780/28720 [1:43:01<57:57:21,  7.47s/it]

{'loss': 2.3457, 'grad_norm': 3.9581987857818604, 'learning_rate': 9.989833988043052e-05, 'epoch': 0.54}


  3%|▎         | 800/28720 [1:45:31<58:02:01,  7.48s/it]

{'loss': 2.2415, 'grad_norm': 3.9160547256469727, 'learning_rate': 9.989119802305631e-05, 'epoch': 0.56}



  0%|          | 0/36 [00:00<?, ?it/s][A
  6%|▌         | 2/36 [00:00<00:03,  9.76it/s][A
  8%|▊         | 3/36 [00:00<00:04,  6.88it/s][A
 11%|█         | 4/36 [00:00<00:05,  5.88it/s][A
 14%|█▍        | 5/36 [00:00<00:05,  5.48it/s][A
 17%|█▋        | 6/36 [00:01<00:05,  5.21it/s][A
 19%|█▉        | 7/36 [00:01<00:05,  5.10it/s][A
 22%|██▏       | 8/36 [00:01<00:05,  5.01it/s][A
 25%|██▌       | 9/36 [00:01<00:05,  4.96it/s][A
 28%|██▊       | 10/36 [00:01<00:05,  4.93it/s][A
 31%|███       | 11/36 [00:02<00:05,  4.85it/s][A
 33%|███▎      | 12/36 [00:02<00:04,  4.85it/s][A
 36%|███▌      | 13/36 [00:02<00:04,  4.84it/s][A
 39%|███▉      | 14/36 [00:02<00:04,  4.85it/s][A
 42%|████▏     | 15/36 [00:02<00:04,  4.82it/s][A
 44%|████▍     | 16/36 [00:03<00:04,  4.78it/s][A
 47%|████▋     | 17/36 [00:03<00:03,  4.79it/s][A
 50%|█████     | 18/36 [00:03<00:03,  4.81it/s][A
 53%|█████▎    | 19/36 [00:03<00:03,  4.78it/s][A
 56%|█████▌    | 20/36 [00:03<00:03,  4.79it/s]

{'eval_loss': 2.381009817123413, 'eval_runtime': 7.5519, 'eval_samples_per_second': 47.67, 'eval_steps_per_second': 4.767, 'epoch': 0.56}


  3%|▎         | 820/28720 [1:49:43<59:11:30,  7.64s/it] 

{'loss': 2.2693, 'grad_norm': 4.432214736938477, 'learning_rate': 9.988381401569439e-05, 'epoch': 0.57}


  3%|▎         | 840/28720 [1:52:15<58:35:08,  7.56s/it]

{'loss': 2.2617, 'grad_norm': 5.434338569641113, 'learning_rate': 9.987618789418349e-05, 'epoch': 0.59}


  3%|▎         | 860/28720 [1:54:46<58:33:21,  7.57s/it]

{'loss': 2.2699, 'grad_norm': 4.182005882263184, 'learning_rate': 9.986831969553744e-05, 'epoch': 0.6}


  3%|▎         | 880/28720 [1:57:18<59:07:02,  7.64s/it]

{'loss': 2.2111, 'grad_norm': 4.449812412261963, 'learning_rate': 9.986020945794506e-05, 'epoch': 0.61}


  3%|▎         | 900/28720 [1:59:50<58:43:16,  7.60s/it]

{'loss': 2.2472, 'grad_norm': 3.9677953720092773, 'learning_rate': 9.985185722076983e-05, 'epoch': 0.63}


  3%|▎         | 920/28720 [2:02:22<58:21:05,  7.56s/it]

{'loss': 2.2377, 'grad_norm': 4.524963855743408, 'learning_rate': 9.984326302454992e-05, 'epoch': 0.64}


  3%|▎         | 940/28720 [2:04:51<57:32:19,  7.46s/it]

{'loss': 2.1865, 'grad_norm': 4.613027095794678, 'learning_rate': 9.983442691099772e-05, 'epoch': 0.65}


  3%|▎         | 960/28720 [2:07:21<57:36:47,  7.47s/it]

{'loss': 2.162, 'grad_norm': 7.155683517456055, 'learning_rate': 9.982534892299988e-05, 'epoch': 0.67}


  3%|▎         | 980/28720 [2:09:51<57:52:33,  7.51s/it]

{'loss': 2.1709, 'grad_norm': 4.11006498336792, 'learning_rate': 9.981602910461696e-05, 'epoch': 0.68}


  3%|▎         | 1000/28720 [2:12:24<59:29:19,  7.73s/it]

{'loss': 2.0952, 'grad_norm': 3.618540048599243, 'learning_rate': 9.98064675010833e-05, 'epoch': 0.7}



  0%|          | 0/36 [00:00<?, ?it/s][A
  6%|▌         | 2/36 [00:00<00:03,  9.74it/s][A
  8%|▊         | 3/36 [00:00<00:04,  6.85it/s][A
 11%|█         | 4/36 [00:00<00:05,  5.88it/s][A
 14%|█▍        | 5/36 [00:00<00:05,  5.47it/s][A
 17%|█▋        | 6/36 [00:01<00:05,  5.10it/s][A
 19%|█▉        | 7/36 [00:01<00:05,  4.99it/s][A
 22%|██▏       | 8/36 [00:01<00:05,  4.80it/s][A
 25%|██▌       | 9/36 [00:01<00:05,  4.74it/s][A
 28%|██▊       | 10/36 [00:01<00:05,  4.66it/s][A
 31%|███       | 11/36 [00:02<00:05,  4.68it/s][A
 33%|███▎      | 12/36 [00:02<00:05,  4.69it/s][A
 36%|███▌      | 13/36 [00:02<00:04,  4.62it/s][A
 39%|███▉      | 14/36 [00:02<00:04,  4.67it/s][A
 42%|████▏     | 15/36 [00:03<00:04,  4.60it/s][A
 44%|████▍     | 16/36 [00:03<00:04,  4.64it/s][A
 47%|████▋     | 17/36 [00:03<00:04,  4.63it/s][A
 50%|█████     | 18/36 [00:03<00:03,  4.63it/s][A
 53%|█████▎    | 19/36 [00:03<00:03,  4.54it/s][A
 56%|█████▌    | 20/36 [00:04<00:03,  4.62it/s]

{'eval_loss': 2.2976226806640625, 'eval_runtime': 7.7702, 'eval_samples_per_second': 46.331, 'eval_steps_per_second': 4.633, 'epoch': 0.7}


  4%|▎         | 1020/28720 [2:16:39<59:06:20,  7.68s/it] 

{'loss': 2.1466, 'grad_norm': 3.839866876602173, 'learning_rate': 9.979666415880666e-05, 'epoch': 0.71}


  4%|▎         | 1040/28720 [2:19:11<58:35:08,  7.62s/it]

{'loss': 2.1783, 'grad_norm': 3.886914014816284, 'learning_rate': 9.978661912536824e-05, 'epoch': 0.72}


  4%|▎         | 1060/28720 [2:21:43<58:33:31,  7.62s/it]

{'loss': 2.1079, 'grad_norm': 4.6306562423706055, 'learning_rate': 9.977633244952219e-05, 'epoch': 0.74}


  4%|▍         | 1080/28720 [2:24:16<58:38:00,  7.64s/it]

{'loss': 2.1306, 'grad_norm': 5.187190055847168, 'learning_rate': 9.976580418119552e-05, 'epoch': 0.75}


  4%|▍         | 1100/28720 [2:26:49<58:49:21,  7.67s/it]

{'loss': 2.0885, 'grad_norm': 8.629783630371094, 'learning_rate': 9.975503437148783e-05, 'epoch': 0.77}


  4%|▍         | 1120/28720 [2:29:22<58:45:47,  7.66s/it]

{'loss': 2.0563, 'grad_norm': 4.301577091217041, 'learning_rate': 9.974402307267105e-05, 'epoch': 0.78}


  4%|▍         | 1140/28720 [2:31:55<58:33:03,  7.64s/it]

{'loss': 2.1004, 'grad_norm': 3.910496711730957, 'learning_rate': 9.973277033818919e-05, 'epoch': 0.79}


  4%|▍         | 1160/28720 [2:34:28<58:28:22,  7.64s/it]

{'loss': 2.0258, 'grad_norm': 24.51922607421875, 'learning_rate': 9.972127622265813e-05, 'epoch': 0.81}


  4%|▍         | 1180/28720 [2:37:01<58:34:50,  7.66s/it]

{'loss': 2.1085, 'grad_norm': 7.250113010406494, 'learning_rate': 9.970954078186519e-05, 'epoch': 0.82}


  4%|▍         | 1200/28720 [2:39:34<58:17:15,  7.62s/it]

{'loss': 2.0442, 'grad_norm': 5.049782752990723, 'learning_rate': 9.969756407276909e-05, 'epoch': 0.84}



  0%|          | 0/36 [00:00<?, ?it/s][A
  6%|▌         | 2/36 [00:00<00:03,  9.72it/s][A
  8%|▊         | 3/36 [00:00<00:04,  6.86it/s][A
 11%|█         | 4/36 [00:00<00:05,  5.77it/s][A
 14%|█▍        | 5/36 [00:00<00:05,  5.41it/s][A
 17%|█▋        | 6/36 [00:01<00:05,  5.18it/s][A
 19%|█▉        | 7/36 [00:01<00:05,  5.06it/s][A
 22%|██▏       | 8/36 [00:01<00:05,  4.96it/s][A
 25%|██▌       | 9/36 [00:01<00:05,  4.92it/s][A
 28%|██▊       | 10/36 [00:01<00:05,  4.90it/s][A
 31%|███       | 11/36 [00:02<00:05,  4.87it/s][A
 33%|███▎      | 12/36 [00:02<00:04,  4.86it/s][A
 36%|███▌      | 13/36 [00:02<00:04,  4.84it/s][A
 39%|███▉      | 14/36 [00:02<00:04,  4.83it/s][A
 42%|████▏     | 15/36 [00:02<00:04,  4.80it/s][A
 44%|████▍     | 16/36 [00:03<00:04,  4.80it/s][A
 47%|████▋     | 17/36 [00:03<00:03,  4.81it/s][A
 50%|█████     | 18/36 [00:03<00:03,  4.82it/s][A
 53%|█████▎    | 19/36 [00:03<00:03,  4.77it/s][A
 56%|█████▌    | 20/36 [00:03<00:03,  4.80it/s]

{'eval_loss': 2.1879780292510986, 'eval_runtime': 7.5481, 'eval_samples_per_second': 47.694, 'eval_steps_per_second': 4.769, 'epoch': 0.84}


  4%|▍         | 1220/28720 [2:43:52<58:50:58,  7.70s/it] 

{'loss': 2.0468, 'grad_norm': 4.131608486175537, 'learning_rate': 9.968534615349952e-05, 'epoch': 0.85}


  4%|▍         | 1240/28720 [2:46:25<58:19:22,  7.64s/it]

{'loss': 2.0069, 'grad_norm': 4.2672319412231445, 'learning_rate': 9.96728870833569e-05, 'epoch': 0.86}


  4%|▍         | 1260/28720 [2:48:58<58:28:34,  7.67s/it]

{'loss': 2.0518, 'grad_norm': 4.051925182342529, 'learning_rate': 9.96601869228121e-05, 'epoch': 0.88}


  4%|▍         | 1280/28720 [2:51:32<58:41:15,  7.70s/it]

{'loss': 1.9841, 'grad_norm': 4.279064178466797, 'learning_rate': 9.96472457335061e-05, 'epoch': 0.89}


  5%|▍         | 1300/28720 [2:54:06<58:39:51,  7.70s/it]

{'loss': 2.0339, 'grad_norm': 4.3438334465026855, 'learning_rate': 9.963406357824978e-05, 'epoch': 0.91}


  5%|▍         | 1320/28720 [2:56:40<58:41:43,  7.71s/it]

{'loss': 2.0176, 'grad_norm': 4.358459949493408, 'learning_rate': 9.962064052102355e-05, 'epoch': 0.92}


  5%|▍         | 1340/28720 [2:59:14<58:16:48,  7.66s/it]

{'loss': 2.0084, 'grad_norm': 4.5173115730285645, 'learning_rate': 9.9606976626977e-05, 'epoch': 0.93}


  5%|▍         | 1360/28720 [3:01:47<58:06:10,  7.65s/it]

{'loss': 1.957, 'grad_norm': 36.750492095947266, 'learning_rate': 9.959307196242871e-05, 'epoch': 0.95}


  5%|▍         | 1380/28720 [3:04:21<58:24:56,  7.69s/it]

{'loss': 1.968, 'grad_norm': 3.6332614421844482, 'learning_rate': 9.95789265948658e-05, 'epoch': 0.96}


  5%|▍         | 1400/28720 [3:06:55<58:15:15,  7.68s/it]

{'loss': 1.9395, 'grad_norm': 3.7323031425476074, 'learning_rate': 9.956454059294371e-05, 'epoch': 0.98}



  0%|          | 0/36 [00:00<?, ?it/s][A
  6%|▌         | 2/36 [00:00<00:03,  9.65it/s][A
  8%|▊         | 3/36 [00:00<00:04,  6.78it/s][A
 11%|█         | 4/36 [00:00<00:05,  5.83it/s][A
 14%|█▍        | 5/36 [00:00<00:05,  5.36it/s][A
 17%|█▋        | 6/36 [00:01<00:05,  5.12it/s][A
 19%|█▉        | 7/36 [00:01<00:05,  5.02it/s][A
 22%|██▏       | 8/36 [00:01<00:05,  4.92it/s][A
 25%|██▌       | 9/36 [00:01<00:05,  4.88it/s][A
 28%|██▊       | 10/36 [00:01<00:05,  4.82it/s][A
 31%|███       | 11/36 [00:02<00:05,  4.81it/s][A
 33%|███▎      | 12/36 [00:02<00:04,  4.82it/s][A
 36%|███▌      | 13/36 [00:02<00:04,  4.80it/s][A
 39%|███▉      | 14/36 [00:02<00:04,  4.74it/s][A
 42%|████▏     | 15/36 [00:02<00:04,  4.74it/s][A
 44%|████▍     | 16/36 [00:03<00:04,  4.76it/s][A
 47%|████▋     | 17/36 [00:03<00:03,  4.77it/s][A
 50%|█████     | 18/36 [00:03<00:03,  4.78it/s][A
 53%|█████▎    | 19/36 [00:03<00:03,  4.72it/s][A
 56%|█████▌    | 20/36 [00:04<00:03,  4.75it/s]

{'eval_loss': 2.115324020385742, 'eval_runtime': 7.6719, 'eval_samples_per_second': 46.924, 'eval_steps_per_second': 4.692, 'epoch': 0.98}


  5%|▍         | 1420/28720 [3:11:12<59:12:19,  7.81s/it] 

{'loss': 1.8985, 'grad_norm': 4.139351844787598, 'learning_rate': 9.954991402648574e-05, 'epoch': 0.99}


  5%|▌         | 1440/28720 [3:13:43<56:49:00,  7.50s/it]

{'loss': 1.9288, 'grad_norm': 4.328453063964844, 'learning_rate': 9.953504696648288e-05, 'epoch': 1.0}


  5%|▌         | 1460/28720 [3:16:16<57:49:10,  7.64s/it]

{'loss': 1.8598, 'grad_norm': 3.7819244861602783, 'learning_rate': 9.951993948509326e-05, 'epoch': 1.02}


  5%|▌         | 1480/28720 [3:18:49<57:43:15,  7.63s/it]

{'loss': 1.8856, 'grad_norm': 3.586270809173584, 'learning_rate': 9.950459165564202e-05, 'epoch': 1.03}


  5%|▌         | 1500/28720 [3:21:22<57:41:40,  7.63s/it]

{'loss': 1.9193, 'grad_norm': 4.226072788238525, 'learning_rate': 9.948900355262073e-05, 'epoch': 1.04}


  5%|▌         | 1520/28720 [3:23:54<57:43:34,  7.64s/it]

{'loss': 1.8666, 'grad_norm': 3.7151453495025635, 'learning_rate': 9.947317525168724e-05, 'epoch': 1.06}


  5%|▌         | 1540/28720 [3:26:27<58:16:09,  7.72s/it]

{'loss': 1.9106, 'grad_norm': 3.678696632385254, 'learning_rate': 9.94571068296652e-05, 'epoch': 1.07}


  5%|▌         | 1560/28720 [3:29:00<57:38:27,  7.64s/it]

{'loss': 1.8778, 'grad_norm': 4.655883312225342, 'learning_rate': 9.944079836454365e-05, 'epoch': 1.09}


  6%|▌         | 1580/28720 [3:31:33<57:49:55,  7.67s/it]

{'loss': 1.859, 'grad_norm': 4.043760299682617, 'learning_rate': 9.942424993547672e-05, 'epoch': 1.1}


  6%|▌         | 1600/28720 [3:34:06<57:43:39,  7.66s/it]

{'loss': 1.8338, 'grad_norm': 3.975090265274048, 'learning_rate': 9.940746162278324e-05, 'epoch': 1.11}



  0%|          | 0/36 [00:00<?, ?it/s][A
  6%|▌         | 2/36 [00:00<00:03,  9.63it/s][A
  8%|▊         | 3/36 [00:00<00:04,  6.75it/s][A
 11%|█         | 4/36 [00:00<00:05,  5.83it/s][A
 14%|█▍        | 5/36 [00:00<00:05,  5.44it/s][A
 17%|█▋        | 6/36 [00:01<00:05,  5.19it/s][A
 19%|█▉        | 7/36 [00:01<00:05,  5.06it/s][A
 22%|██▏       | 8/36 [00:01<00:05,  4.95it/s][A
 25%|██▌       | 9/36 [00:01<00:05,  4.91it/s][A
 28%|██▊       | 10/36 [00:01<00:05,  4.88it/s][A
 31%|███       | 11/36 [00:02<00:05,  4.85it/s][A
 33%|███▎      | 12/36 [00:02<00:04,  4.83it/s][A
 36%|███▌      | 13/36 [00:02<00:04,  4.82it/s][A
 39%|███▉      | 14/36 [00:02<00:04,  4.81it/s][A
 42%|████▏     | 15/36 [00:02<00:04,  4.79it/s][A
 44%|████▍     | 16/36 [00:03<00:04,  4.79it/s][A
 47%|████▋     | 17/36 [00:03<00:03,  4.80it/s][A
 50%|█████     | 18/36 [00:03<00:03,  4.81it/s][A
 53%|█████▎    | 19/36 [00:03<00:03,  4.75it/s][A
 56%|█████▌    | 20/36 [00:03<00:03,  4.77it/s]

{'eval_loss': 1.9707907438278198, 'eval_runtime': 7.5754, 'eval_samples_per_second': 47.522, 'eval_steps_per_second': 4.752, 'epoch': 1.11}


  6%|▌         | 1620/28720 [3:38:23<57:51:38,  7.69s/it] 

{'loss': 1.8152, 'grad_norm': 4.025564193725586, 'learning_rate': 9.939043350794633e-05, 'epoch': 1.13}


  6%|▌         | 1640/28720 [3:40:56<56:31:44,  7.51s/it]

{'loss': 1.8052, 'grad_norm': 3.527029037475586, 'learning_rate': 9.937316567361297e-05, 'epoch': 1.14}


  6%|▌         | 1660/28720 [3:43:26<56:21:18,  7.50s/it]

{'loss': 1.8586, 'grad_norm': 3.9588372707366943, 'learning_rate': 9.935565820359365e-05, 'epoch': 1.16}


  6%|▌         | 1680/28720 [3:45:56<56:59:58,  7.59s/it]

{'loss': 1.8175, 'grad_norm': 3.245211601257324, 'learning_rate': 9.933791118286194e-05, 'epoch': 1.17}


  6%|▌         | 1700/28720 [3:48:30<57:29:06,  7.66s/it]

{'loss': 1.813, 'grad_norm': 3.948108673095703, 'learning_rate': 9.931992469755411e-05, 'epoch': 1.18}


  6%|▌         | 1720/28720 [3:51:03<57:28:10,  7.66s/it]

{'loss': 1.8086, 'grad_norm': 4.580286979675293, 'learning_rate': 9.930169883496867e-05, 'epoch': 1.2}


  6%|▌         | 1740/28720 [3:53:36<57:28:05,  7.67s/it]

{'loss': 1.7778, 'grad_norm': 3.49418306350708, 'learning_rate': 9.928323368356595e-05, 'epoch': 1.21}


  6%|▌         | 1760/28720 [3:56:10<57:16:19,  7.65s/it]

{'loss': 1.7677, 'grad_norm': 4.166897773742676, 'learning_rate': 9.926452933296771e-05, 'epoch': 1.23}


  6%|▌         | 1780/28720 [3:58:42<57:00:39,  7.62s/it]

{'loss': 1.7932, 'grad_norm': 3.779027223587036, 'learning_rate': 9.924558587395665e-05, 'epoch': 1.24}


  6%|▋         | 1800/28720 [4:01:14<56:50:00,  7.60s/it]

{'loss': 1.7847, 'grad_norm': 3.952899694442749, 'learning_rate': 9.9226403398476e-05, 'epoch': 1.25}



  0%|          | 0/36 [00:00<?, ?it/s][A
  6%|▌         | 2/36 [00:00<00:03,  9.66it/s][A
  8%|▊         | 3/36 [00:00<00:04,  6.79it/s][A
 11%|█         | 4/36 [00:00<00:05,  5.84it/s][A
 14%|█▍        | 5/36 [00:00<00:05,  5.46it/s][A
 17%|█▋        | 6/36 [00:01<00:05,  5.18it/s][A
 19%|█▉        | 7/36 [00:01<00:05,  5.07it/s][A
 22%|██▏       | 8/36 [00:01<00:05,  4.97it/s][A
 25%|██▌       | 9/36 [00:01<00:05,  4.90it/s][A
 28%|██▊       | 10/36 [00:01<00:05,  4.84it/s][A
 31%|███       | 11/36 [00:02<00:05,  4.83it/s][A
 33%|███▎      | 12/36 [00:02<00:04,  4.83it/s][A
 36%|███▌      | 13/36 [00:02<00:04,  4.82it/s][A
 39%|███▉      | 14/36 [00:02<00:04,  4.83it/s][A
 42%|████▏     | 15/36 [00:02<00:04,  4.78it/s][A
 44%|████▍     | 16/36 [00:03<00:04,  4.78it/s][A
 47%|████▋     | 17/36 [00:03<00:03,  4.79it/s][A
 50%|█████     | 18/36 [00:03<00:03,  4.80it/s][A
 53%|█████▎    | 19/36 [00:03<00:03,  4.75it/s][A
 56%|█████▌    | 20/36 [00:03<00:03,  4.78it/s]

{'eval_loss': 2.0018465518951416, 'eval_runtime': 7.5589, 'eval_samples_per_second': 47.626, 'eval_steps_per_second': 4.763, 'epoch': 1.25}


  6%|▋         | 1820/28720 [4:05:38<57:33:52,  7.70s/it] 

{'loss': 1.7915, 'grad_norm': 3.656069755554199, 'learning_rate': 9.92069819996291e-05, 'epoch': 1.27}


  6%|▋         | 1840/28720 [4:08:11<57:05:07,  7.65s/it]

{'loss': 1.7921, 'grad_norm': 3.8725852966308594, 'learning_rate': 9.918732177167889e-05, 'epoch': 1.28}


  6%|▋         | 1860/28720 [4:10:44<56:12:46,  7.53s/it]

{'loss': 1.7496, 'grad_norm': 3.148010492324829, 'learning_rate': 9.916742281004749e-05, 'epoch': 1.3}


  7%|▋         | 1880/28720 [4:13:13<55:41:07,  7.47s/it]

{'loss': 1.8019, 'grad_norm': 3.8715054988861084, 'learning_rate': 9.914728521131574e-05, 'epoch': 1.31}


  7%|▋         | 1900/28720 [4:15:43<55:40:53,  7.47s/it]

{'loss': 1.7711, 'grad_norm': 4.788589954376221, 'learning_rate': 9.91269090732227e-05, 'epoch': 1.32}


  7%|▋         | 1920/28720 [4:18:12<55:30:22,  7.46s/it]

{'loss': 1.7748, 'grad_norm': 6.797977924346924, 'learning_rate': 9.910629449466521e-05, 'epoch': 1.34}


  7%|▋         | 1940/28720 [4:20:42<55:44:17,  7.49s/it]

{'loss': 1.8129, 'grad_norm': 3.462165594100952, 'learning_rate': 9.908544157569738e-05, 'epoch': 1.35}


  7%|▋         | 1960/28720 [4:23:12<55:44:28,  7.50s/it]

{'loss': 1.744, 'grad_norm': 3.50915789604187, 'learning_rate': 9.906435041753016e-05, 'epoch': 1.36}


  7%|▋         | 1980/28720 [4:25:41<55:26:13,  7.46s/it]

{'loss': 1.7333, 'grad_norm': 3.618558645248413, 'learning_rate': 9.904302112253075e-05, 'epoch': 1.38}


  7%|▋         | 2000/28720 [4:28:11<56:00:30,  7.55s/it]

{'loss': 1.7949, 'grad_norm': 3.82189679145813, 'learning_rate': 9.902145379422218e-05, 'epoch': 1.39}



  0%|          | 0/36 [00:00<?, ?it/s][A
  6%|▌         | 2/36 [00:00<00:03,  9.74it/s][A
  8%|▊         | 3/36 [00:00<00:04,  6.84it/s][A
 11%|█         | 4/36 [00:00<00:05,  5.87it/s][A
 14%|█▍        | 5/36 [00:00<00:05,  5.47it/s][A
 17%|█▋        | 6/36 [00:01<00:05,  5.19it/s][A
 19%|█▉        | 7/36 [00:01<00:05,  5.07it/s][A
 22%|██▏       | 8/36 [00:01<00:05,  4.98it/s][A
 25%|██▌       | 9/36 [00:01<00:05,  4.93it/s][A
 28%|██▊       | 10/36 [00:01<00:05,  4.90it/s][A
 31%|███       | 11/36 [00:02<00:05,  4.86it/s][A
 33%|███▎      | 12/36 [00:02<00:04,  4.84it/s][A
 36%|███▌      | 13/36 [00:02<00:04,  4.82it/s][A
 39%|███▉      | 14/36 [00:02<00:04,  4.83it/s][A
 42%|████▏     | 15/36 [00:02<00:04,  4.81it/s][A
 44%|████▍     | 16/36 [00:03<00:04,  4.80it/s][A
 47%|████▋     | 17/36 [00:03<00:03,  4.79it/s][A
 50%|█████     | 18/36 [00:03<00:03,  4.80it/s][A
 53%|█████▎    | 19/36 [00:03<00:03,  4.76it/s][A
 56%|█████▌    | 20/36 [00:03<00:03,  4.78it/s]

{'eval_loss': 1.8946475982666016, 'eval_runtime': 7.5612, 'eval_samples_per_second': 47.611, 'eval_steps_per_second': 4.761, 'epoch': 1.39}


  7%|▋         | 2020/28720 [4:32:35<57:03:57,  7.69s/it] 

{'loss': 1.7631, 'grad_norm': 4.134222507476807, 'learning_rate': 9.899964853728283e-05, 'epoch': 1.41}


  7%|▋         | 2040/28720 [4:35:08<56:47:03,  7.66s/it]

{'loss': 1.7541, 'grad_norm': 3.3652257919311523, 'learning_rate': 9.897760545754585e-05, 'epoch': 1.42}


  7%|▋         | 2060/28720 [4:37:42<56:43:47,  7.66s/it]

{'loss': 1.769, 'grad_norm': 3.7884178161621094, 'learning_rate': 9.895532466199866e-05, 'epoch': 1.43}


  7%|▋         | 2080/28720 [4:40:15<56:34:06,  7.64s/it]

{'loss': 1.7678, 'grad_norm': 3.6816370487213135, 'learning_rate': 9.893280625878247e-05, 'epoch': 1.45}


  7%|▋         | 2100/28720 [4:42:48<56:53:57,  7.69s/it]

{'loss': 1.7551, 'grad_norm': 3.4257683753967285, 'learning_rate': 9.891005035719174e-05, 'epoch': 1.46}


  7%|▋         | 2120/28720 [4:45:21<56:40:15,  7.67s/it]

{'loss': 1.7335, 'grad_norm': 3.744096517562866, 'learning_rate': 9.888705706767364e-05, 'epoch': 1.48}


  7%|▋         | 2140/28720 [4:47:54<56:18:37,  7.63s/it]

{'loss': 1.7073, 'grad_norm': 9.18854808807373, 'learning_rate': 9.886382650182749e-05, 'epoch': 1.49}


  8%|▊         | 2160/28720 [4:50:27<56:24:36,  7.65s/it]

{'loss': 1.7248, 'grad_norm': 3.4030368328094482, 'learning_rate': 9.884035877240428e-05, 'epoch': 1.5}


  8%|▊         | 2180/28720 [4:53:00<56:21:02,  7.64s/it]

{'loss': 1.7274, 'grad_norm': 5.847539901733398, 'learning_rate': 9.881665399330605e-05, 'epoch': 1.52}


  8%|▊         | 2200/28720 [4:55:33<56:23:03,  7.65s/it]

{'loss': 1.6996, 'grad_norm': 3.52908992767334, 'learning_rate': 9.879271227958544e-05, 'epoch': 1.53}



  0%|          | 0/36 [00:00<?, ?it/s][A
  6%|▌         | 2/36 [00:00<00:03,  9.70it/s][A
  8%|▊         | 3/36 [00:00<00:04,  6.83it/s][A
 11%|█         | 4/36 [00:00<00:05,  5.85it/s][A
 14%|█▍        | 5/36 [00:00<00:05,  5.48it/s][A
 17%|█▋        | 6/36 [00:01<00:05,  5.20it/s][A
 19%|█▉        | 7/36 [00:01<00:05,  5.07it/s][A
 22%|██▏       | 8/36 [00:01<00:05,  4.99it/s][A
 25%|██▌       | 9/36 [00:01<00:05,  4.93it/s][A
 28%|██▊       | 10/36 [00:01<00:05,  4.90it/s][A
 31%|███       | 11/36 [00:02<00:05,  4.87it/s][A
 33%|███▎      | 12/36 [00:02<00:04,  4.86it/s][A
 36%|███▌      | 13/36 [00:02<00:04,  4.84it/s][A
 39%|███▉      | 14/36 [00:02<00:04,  4.82it/s][A
 42%|████▏     | 15/36 [00:02<00:04,  4.80it/s][A
 44%|████▍     | 16/36 [00:03<00:04,  4.80it/s][A
 47%|████▋     | 17/36 [00:03<00:03,  4.80it/s][A
 50%|█████     | 18/36 [00:03<00:03,  4.81it/s][A
 53%|█████▎    | 19/36 [00:03<00:03,  4.75it/s][A
 56%|█████▌    | 20/36 [00:03<00:03,  4.78it/s]

{'eval_loss': 1.8503642082214355, 'eval_runtime': 7.542, 'eval_samples_per_second': 47.733, 'eval_steps_per_second': 4.773, 'epoch': 1.53}


  8%|▊         | 2220/28720 [4:59:53<56:30:11,  7.68s/it] 

{'loss': 1.7114, 'grad_norm': 3.48752760887146, 'learning_rate': 9.876853374744498e-05, 'epoch': 1.55}


  8%|▊         | 2240/28720 [5:02:26<56:19:56,  7.66s/it]

{'loss': 1.6803, 'grad_norm': 3.35085391998291, 'learning_rate': 9.874411851423669e-05, 'epoch': 1.56}


  8%|▊         | 2260/28720 [5:04:59<56:16:43,  7.66s/it]

{'loss': 1.7051, 'grad_norm': 3.3698246479034424, 'learning_rate': 9.87194666984614e-05, 'epoch': 1.57}


  8%|▊         | 2280/28720 [5:07:32<56:14:07,  7.66s/it]

{'loss': 1.6726, 'grad_norm': 3.3705992698669434, 'learning_rate': 9.869457841976816e-05, 'epoch': 1.59}


  8%|▊         | 2300/28720 [5:10:05<55:59:59,  7.63s/it]

{'loss': 1.6771, 'grad_norm': 3.5190014839172363, 'learning_rate': 9.866945379895383e-05, 'epoch': 1.6}


  8%|▊         | 2320/28720 [5:12:38<56:05:36,  7.65s/it]

{'loss': 1.6477, 'grad_norm': 3.2467048168182373, 'learning_rate': 9.864409295796225e-05, 'epoch': 1.62}


  8%|▊         | 2340/28720 [5:15:11<56:06:18,  7.66s/it]

{'loss': 1.655, 'grad_norm': 3.732212543487549, 'learning_rate': 9.861849601988383e-05, 'epoch': 1.63}


  8%|▊         | 2360/28720 [5:17:44<55:58:23,  7.64s/it]

{'loss': 1.6422, 'grad_norm': 3.5311572551727295, 'learning_rate': 9.859266310895488e-05, 'epoch': 1.64}


  8%|▊         | 2380/28720 [5:20:17<55:58:16,  7.65s/it]

{'loss': 1.641, 'grad_norm': 3.8681583404541016, 'learning_rate': 9.856659435055702e-05, 'epoch': 1.66}


  8%|▊         | 2400/28720 [5:22:50<56:08:27,  7.68s/it]

{'loss': 1.6632, 'grad_norm': 3.5935487747192383, 'learning_rate': 9.854028987121654e-05, 'epoch': 1.67}



  0%|          | 0/36 [00:00<?, ?it/s][A
  6%|▌         | 2/36 [00:00<00:03,  9.63it/s][A
  8%|▊         | 3/36 [00:00<00:04,  6.71it/s][A
 11%|█         | 4/36 [00:00<00:05,  5.80it/s][A
 14%|█▍        | 5/36 [00:00<00:05,  5.42it/s][A
 17%|█▋        | 6/36 [00:01<00:05,  5.19it/s][A
 19%|█▉        | 7/36 [00:01<00:05,  5.06it/s][A
 22%|██▏       | 8/36 [00:01<00:05,  4.95it/s][A
 25%|██▌       | 9/36 [00:01<00:05,  4.91it/s][A
 28%|██▊       | 10/36 [00:01<00:05,  4.88it/s][A
 31%|███       | 11/36 [00:02<00:05,  4.85it/s][A
 33%|███▎      | 12/36 [00:02<00:04,  4.81it/s][A
 36%|███▌      | 13/36 [00:02<00:04,  4.80it/s][A
 39%|███▉      | 14/36 [00:02<00:04,  4.79it/s][A
 42%|████▏     | 15/36 [00:02<00:04,  4.78it/s][A
 44%|████▍     | 16/36 [00:03<00:04,  4.79it/s][A
 47%|████▋     | 17/36 [00:03<00:03,  4.78it/s][A
 50%|█████     | 18/36 [00:03<00:03,  4.80it/s][A
 53%|█████▎    | 19/36 [00:03<00:03,  4.75it/s][A
 56%|█████▌    | 20/36 [00:03<00:03,  4.78it/s]

{'eval_loss': 1.8363616466522217, 'eval_runtime': 7.5798, 'eval_samples_per_second': 47.495, 'eval_steps_per_second': 4.749, 'epoch': 1.67}


  8%|▊         | 2420/28720 [5:27:05<55:59:10,  7.66s/it] 

{'loss': 1.6656, 'grad_norm': 3.017007827758789, 'learning_rate': 9.851374979860387e-05, 'epoch': 1.69}


  8%|▊         | 2440/28720 [5:29:38<55:45:06,  7.64s/it]

{'loss': 1.6803, 'grad_norm': 3.3520734310150146, 'learning_rate': 9.848697426153288e-05, 'epoch': 1.7}


  9%|▊         | 2460/28720 [5:32:10<55:56:32,  7.67s/it]

{'loss': 1.6549, 'grad_norm': 3.3779561519622803, 'learning_rate': 9.845996338996027e-05, 'epoch': 1.71}


  9%|▊         | 2480/28720 [5:34:45<56:09:15,  7.70s/it]

{'loss': 1.6485, 'grad_norm': 3.5913422107696533, 'learning_rate': 9.843271731498494e-05, 'epoch': 1.73}


  9%|▊         | 2500/28720 [5:37:18<55:49:01,  7.66s/it]

{'loss': 1.6727, 'grad_norm': 3.687265634536743, 'learning_rate': 9.840523616884742e-05, 'epoch': 1.74}


  9%|▉         | 2520/28720 [5:39:51<55:45:26,  7.66s/it]

{'loss': 1.6521, 'grad_norm': 3.2821123600006104, 'learning_rate': 9.837752008492914e-05, 'epoch': 1.76}


  9%|▉         | 2540/28720 [5:42:24<55:40:57,  7.66s/it]

{'loss': 1.644, 'grad_norm': 3.286233425140381, 'learning_rate': 9.834956919775179e-05, 'epoch': 1.77}


  9%|▉         | 2560/28720 [5:44:57<55:41:36,  7.66s/it]

{'loss': 1.6507, 'grad_norm': 3.4685497283935547, 'learning_rate': 9.832138364297672e-05, 'epoch': 1.78}


  9%|▉         | 2580/28720 [5:47:30<55:41:59,  7.67s/it]

{'loss': 1.6369, 'grad_norm': 3.7872934341430664, 'learning_rate': 9.829296355740425e-05, 'epoch': 1.8}


  9%|▉         | 2600/28720 [5:50:03<55:21:17,  7.63s/it]

{'loss': 1.6436, 'grad_norm': 3.4841055870056152, 'learning_rate': 9.8264309078973e-05, 'epoch': 1.81}



  0%|          | 0/36 [00:00<?, ?it/s][A
  6%|▌         | 2/36 [00:00<00:03,  9.75it/s][A
  8%|▊         | 3/36 [00:00<00:04,  6.84it/s][A
 11%|█         | 4/36 [00:00<00:05,  5.89it/s][A
 14%|█▍        | 5/36 [00:00<00:05,  5.50it/s][A
 17%|█▋        | 6/36 [00:01<00:05,  5.25it/s][A
 19%|█▉        | 7/36 [00:01<00:05,  5.11it/s][A
 22%|██▏       | 8/36 [00:01<00:05,  5.00it/s][A
 25%|██▌       | 9/36 [00:01<00:05,  4.95it/s][A
 28%|██▊       | 10/36 [00:01<00:05,  4.91it/s][A
 31%|███       | 11/36 [00:02<00:05,  4.89it/s][A
 33%|███▎      | 12/36 [00:02<00:04,  4.88it/s][A
 36%|███▌      | 13/36 [00:02<00:04,  4.87it/s][A
 39%|███▉      | 14/36 [00:02<00:04,  4.86it/s][A
 42%|████▏     | 15/36 [00:02<00:04,  4.82it/s][A
 44%|████▍     | 16/36 [00:03<00:04,  4.82it/s][A
 47%|████▋     | 17/36 [00:03<00:03,  4.81it/s][A
 50%|█████     | 18/36 [00:03<00:03,  4.81it/s][A
 53%|█████▎    | 19/36 [00:03<00:03,  4.78it/s][A
 56%|█████▌    | 20/36 [00:03<00:03,  4.81it/s]

{'eval_loss': 1.773193359375, 'eval_runtime': 7.5206, 'eval_samples_per_second': 47.869, 'eval_steps_per_second': 4.787, 'epoch': 1.81}


  9%|▉         | 2620/28720 [5:54:18<56:01:12,  7.73s/it] 

{'loss': 1.6778, 'grad_norm': 3.3202075958251953, 'learning_rate': 9.823542034675926e-05, 'epoch': 1.82}


  9%|▉         | 2640/28720 [5:56:51<55:24:05,  7.65s/it]

{'loss': 1.639, 'grad_norm': 4.493223667144775, 'learning_rate': 9.820629750097623e-05, 'epoch': 1.84}


  9%|▉         | 2660/28720 [5:59:24<55:30:27,  7.67s/it]

{'loss': 1.6829, 'grad_norm': 3.238520383834839, 'learning_rate': 9.817694068297345e-05, 'epoch': 1.85}


  9%|▉         | 2680/28720 [6:01:57<55:21:57,  7.65s/it]

{'loss': 1.6131, 'grad_norm': 3.4307475090026855, 'learning_rate': 9.814735003523603e-05, 'epoch': 1.87}


  9%|▉         | 2700/28720 [6:04:31<55:26:45,  7.67s/it]

{'loss': 1.6611, 'grad_norm': 4.830063343048096, 'learning_rate': 9.811752570138398e-05, 'epoch': 1.88}


  9%|▉         | 2720/28720 [6:07:04<55:15:55,  7.65s/it]

{'loss': 1.6219, 'grad_norm': 3.327212333679199, 'learning_rate': 9.808746782617156e-05, 'epoch': 1.89}


 10%|▉         | 2740/28720 [6:09:38<55:12:29,  7.65s/it]

{'loss': 1.588, 'grad_norm': 3.7200095653533936, 'learning_rate': 9.805717655548648e-05, 'epoch': 1.91}


 10%|▉         | 2760/28720 [6:12:11<54:55:43,  7.62s/it]

{'loss': 1.6021, 'grad_norm': 3.3549087047576904, 'learning_rate': 9.802665203634931e-05, 'epoch': 1.92}


 10%|▉         | 2780/28720 [6:14:43<55:01:56,  7.64s/it]

{'loss': 1.6094, 'grad_norm': 3.792928457260132, 'learning_rate': 9.799589441691263e-05, 'epoch': 1.94}


 10%|▉         | 2800/28720 [6:17:16<54:51:27,  7.62s/it]

{'loss': 1.6045, 'grad_norm': 3.7489278316497803, 'learning_rate': 9.796490384646048e-05, 'epoch': 1.95}



  0%|          | 0/36 [00:00<?, ?it/s][A
  6%|▌         | 2/36 [00:00<00:03,  9.42it/s][A
  8%|▊         | 3/36 [00:00<00:04,  6.73it/s][A
 11%|█         | 4/36 [00:00<00:05,  5.83it/s][A
 14%|█▍        | 5/36 [00:00<00:05,  5.46it/s][A
 17%|█▋        | 6/36 [00:01<00:05,  5.18it/s][A
 19%|█▉        | 7/36 [00:01<00:05,  5.07it/s][A
 22%|██▏       | 8/36 [00:01<00:05,  4.91it/s][A
 25%|██▌       | 9/36 [00:01<00:05,  4.78it/s][A
 28%|██▊       | 10/36 [00:01<00:05,  4.81it/s][A
 31%|███       | 11/36 [00:02<00:05,  4.81it/s][A
 33%|███▎      | 12/36 [00:02<00:04,  4.81it/s][A
 36%|███▌      | 13/36 [00:02<00:04,  4.80it/s][A
 39%|███▉      | 14/36 [00:02<00:04,  4.81it/s][A
 42%|████▏     | 15/36 [00:02<00:04,  4.81it/s][A
 44%|████▍     | 16/36 [00:03<00:04,  4.78it/s][A
 47%|████▋     | 17/36 [00:03<00:03,  4.79it/s][A
 50%|█████     | 18/36 [00:03<00:03,  4.75it/s][A
 53%|█████▎    | 19/36 [00:03<00:03,  4.73it/s][A
 56%|█████▌    | 20/36 [00:03<00:03,  4.75it/s]

{'eval_loss': 1.7485276460647583, 'eval_runtime': 7.5751, 'eval_samples_per_second': 47.524, 'eval_steps_per_second': 4.752, 'epoch': 1.95}


 10%|▉         | 2820/28720 [6:21:19<54:07:55,  7.52s/it] 

{'loss': 1.6295, 'grad_norm': 4.080222129821777, 'learning_rate': 9.793368047540748e-05, 'epoch': 1.96}


 10%|▉         | 2840/28720 [6:23:49<53:38:17,  7.46s/it]

{'loss': 1.6171, 'grad_norm': 3.6085405349731445, 'learning_rate': 9.790222445529815e-05, 'epoch': 1.98}


 10%|▉         | 2860/28720 [6:26:18<53:38:45,  7.47s/it]

{'loss': 1.608, 'grad_norm': 3.2492475509643555, 'learning_rate': 9.787053593880624e-05, 'epoch': 1.99}


 10%|█         | 2880/28720 [6:28:45<53:37:03,  7.47s/it]

{'loss': 1.6344, 'grad_norm': 2.9009759426116943, 'learning_rate': 9.78386150797339e-05, 'epoch': 2.01}


 10%|█         | 2900/28720 [6:31:19<55:36:54,  7.75s/it]

{'loss': 1.5216, 'grad_norm': 3.4290828704833984, 'learning_rate': 9.780646203301096e-05, 'epoch': 2.02}


 10%|█         | 2920/28720 [6:33:53<54:46:55,  7.64s/it]

{'loss': 1.5492, 'grad_norm': 3.1367573738098145, 'learning_rate': 9.777407695469423e-05, 'epoch': 2.03}


 10%|█         | 2940/28720 [6:36:25<54:33:19,  7.62s/it]

{'loss': 1.5005, 'grad_norm': 3.5165863037109375, 'learning_rate': 9.774146000196665e-05, 'epoch': 2.05}


 10%|█         | 2960/28720 [6:38:56<53:11:36,  7.43s/it]

{'loss': 1.5177, 'grad_norm': 5.072206974029541, 'learning_rate': 9.77086113331366e-05, 'epoch': 2.06}


 10%|█         | 2980/28720 [6:41:25<53:23:06,  7.47s/it]

{'loss': 1.5374, 'grad_norm': 4.203068733215332, 'learning_rate': 9.767553110763712e-05, 'epoch': 2.08}


 10%|█         | 3000/28720 [6:43:55<53:15:28,  7.45s/it]

{'loss': 1.5659, 'grad_norm': 3.6268041133880615, 'learning_rate': 9.764221948602511e-05, 'epoch': 2.09}



  0%|          | 0/36 [00:00<?, ?it/s][A
  6%|▌         | 2/36 [00:00<00:03,  9.73it/s][A
  8%|▊         | 3/36 [00:00<00:04,  6.83it/s][A
 11%|█         | 4/36 [00:00<00:05,  5.86it/s][A
 14%|█▍        | 5/36 [00:00<00:05,  5.47it/s][A
 17%|█▋        | 6/36 [00:01<00:05,  5.24it/s][A
 19%|█▉        | 7/36 [00:01<00:05,  5.10it/s][A
 22%|██▏       | 8/36 [00:01<00:05,  5.01it/s][A
 25%|██▌       | 9/36 [00:01<00:05,  4.96it/s][A
 28%|██▊       | 10/36 [00:01<00:05,  4.92it/s][A
 31%|███       | 11/36 [00:02<00:05,  4.89it/s][A
 33%|███▎      | 12/36 [00:02<00:04,  4.87it/s][A
 36%|███▌      | 13/36 [00:02<00:04,  4.86it/s][A
 39%|███▉      | 14/36 [00:02<00:04,  4.85it/s][A
 42%|████▏     | 15/36 [00:02<00:04,  4.83it/s][A
 44%|████▍     | 16/36 [00:03<00:04,  4.82it/s][A
 47%|████▋     | 17/36 [00:03<00:03,  4.82it/s][A
 50%|█████     | 18/36 [00:03<00:03,  4.84it/s][A
 53%|█████▎    | 19/36 [00:03<00:03,  4.78it/s][A
 56%|█████▌    | 20/36 [00:03<00:03,  4.79it/s]

{'eval_loss': 1.8018817901611328, 'eval_runtime': 7.5034, 'eval_samples_per_second': 47.978, 'eval_steps_per_second': 4.798, 'epoch': 2.09}



100%|██████████| 36/36 [00:07<00:00,  4.88it/s][A
 11%|█         | 3020/28720 [6:48:11<54:46:07,  7.67s/it] 

{'loss': 1.585, 'grad_norm': 3.7122347354888916, 'learning_rate': 9.760867662998057e-05, 'epoch': 2.1}


 11%|█         | 3040/28720 [6:50:45<55:02:20,  7.72s/it]

{'loss': 1.5718, 'grad_norm': 3.6239070892333984, 'learning_rate': 9.757490270230578e-05, 'epoch': 2.12}


 11%|█         | 3060/28720 [6:53:19<54:58:41,  7.71s/it]

{'loss': 1.5156, 'grad_norm': 3.202385663986206, 'learning_rate': 9.754089786692461e-05, 'epoch': 2.13}


 11%|█         | 3080/28720 [6:55:53<54:42:22,  7.68s/it]

{'loss': 1.577, 'grad_norm': 3.140993356704712, 'learning_rate': 9.750666228888158e-05, 'epoch': 2.14}


 11%|█         | 3100/28720 [6:58:27<54:51:21,  7.71s/it]

{'loss': 1.5813, 'grad_norm': 7.078457355499268, 'learning_rate': 9.74721961343412e-05, 'epoch': 2.16}


 11%|█         | 3120/28720 [7:01:01<54:44:38,  7.70s/it]

{'loss': 1.5457, 'grad_norm': 3.267578363418579, 'learning_rate': 9.743749957058703e-05, 'epoch': 2.17}


 11%|█         | 3140/28720 [7:03:35<54:28:45,  7.67s/it]

{'loss': 1.5352, 'grad_norm': 4.0500288009643555, 'learning_rate': 9.7402572766021e-05, 'epoch': 2.19}


 11%|█         | 3160/28720 [7:06:08<54:20:04,  7.65s/it]

{'loss': 1.5642, 'grad_norm': 3.528367280960083, 'learning_rate': 9.736741589016246e-05, 'epoch': 2.2}


 11%|█         | 3180/28720 [7:08:41<54:23:37,  7.67s/it]

{'loss': 1.5479, 'grad_norm': 4.203574180603027, 'learning_rate': 9.73320291136475e-05, 'epoch': 2.21}


 11%|█         | 3200/28720 [7:11:16<54:45:22,  7.72s/it]

{'loss': 1.524, 'grad_norm': 3.677737236022949, 'learning_rate': 9.729641260822798e-05, 'epoch': 2.23}



  0%|          | 0/36 [00:00<?, ?it/s][A
  6%|▌         | 2/36 [00:00<00:03,  9.43it/s][A
  8%|▊         | 3/36 [00:00<00:04,  6.74it/s][A
 11%|█         | 4/36 [00:00<00:05,  5.70it/s][A
 14%|█▍        | 5/36 [00:00<00:05,  5.36it/s][A
 17%|█▋        | 6/36 [00:01<00:05,  5.12it/s][A
 19%|█▉        | 7/36 [00:01<00:05,  5.02it/s][A
 22%|██▏       | 8/36 [00:01<00:05,  4.88it/s][A
 25%|██▌       | 9/36 [00:01<00:05,  4.86it/s][A
 28%|██▊       | 10/36 [00:01<00:05,  4.83it/s][A
 31%|███       | 11/36 [00:02<00:05,  4.82it/s][A
 33%|███▎      | 12/36 [00:02<00:04,  4.83it/s][A
 36%|███▌      | 13/36 [00:02<00:04,  4.80it/s][A
 39%|███▉      | 14/36 [00:02<00:04,  4.81it/s][A
 42%|████▏     | 15/36 [00:02<00:04,  4.75it/s][A
 44%|████▍     | 16/36 [00:03<00:04,  4.75it/s][A
 47%|████▋     | 17/36 [00:03<00:03,  4.77it/s][A
 50%|█████     | 18/36 [00:03<00:03,  4.75it/s][A
 53%|█████▎    | 19/36 [00:03<00:03,  4.68it/s][A
 56%|█████▌    | 20/36 [00:04<00:03,  4.67it/s]

{'eval_loss': 1.7582255601882935, 'eval_runtime': 7.6659, 'eval_samples_per_second': 46.961, 'eval_steps_per_second': 4.696, 'epoch': 2.23}


 11%|█         | 3220/28720 [7:15:32<54:45:11,  7.73s/it] 

{'loss': 1.5299, 'grad_norm': 3.512570858001709, 'learning_rate': 9.726056654677079e-05, 'epoch': 2.24}


 11%|█▏        | 3240/28720 [7:18:05<54:16:34,  7.67s/it]

{'loss': 1.5212, 'grad_norm': 3.1014668941497803, 'learning_rate': 9.722449110325701e-05, 'epoch': 2.26}


 11%|█▏        | 3260/28720 [7:20:39<54:19:35,  7.68s/it]

{'loss': 1.487, 'grad_norm': 3.5714640617370605, 'learning_rate': 9.718818645278101e-05, 'epoch': 2.27}


 11%|█▏        | 3280/28720 [7:23:13<54:23:01,  7.70s/it]

{'loss': 1.5246, 'grad_norm': 3.2348053455352783, 'learning_rate': 9.715165277154963e-05, 'epoch': 2.28}


 11%|█▏        | 3300/28720 [7:25:47<54:21:02,  7.70s/it]

{'loss': 1.4972, 'grad_norm': 3.3077573776245117, 'learning_rate': 9.711489023688132e-05, 'epoch': 2.3}


 12%|█▏        | 3320/28720 [7:28:20<54:11:39,  7.68s/it]

{'loss': 1.5069, 'grad_norm': 3.5279715061187744, 'learning_rate': 9.707789902720532e-05, 'epoch': 2.31}


 12%|█▏        | 3340/28720 [7:30:54<54:05:39,  7.67s/it]

{'loss': 1.4966, 'grad_norm': 3.794558048248291, 'learning_rate': 9.704067932206071e-05, 'epoch': 2.33}


 12%|█▏        | 3360/28720 [7:33:28<54:08:59,  7.69s/it]

{'loss': 1.5474, 'grad_norm': 3.3143839836120605, 'learning_rate': 9.700323130209565e-05, 'epoch': 2.34}


 12%|█▏        | 3380/28720 [7:36:02<54:12:48,  7.70s/it]

{'loss': 1.4419, 'grad_norm': 3.1643288135528564, 'learning_rate': 9.696555514906634e-05, 'epoch': 2.35}


 12%|█▏        | 3400/28720 [7:38:35<53:49:49,  7.65s/it]

{'loss': 1.5015, 'grad_norm': 3.138195276260376, 'learning_rate': 9.692765104583634e-05, 'epoch': 2.37}



  0%|          | 0/36 [00:00<?, ?it/s][A
  6%|▌         | 2/36 [00:00<00:03,  9.69it/s][A
  8%|▊         | 3/36 [00:00<00:04,  6.81it/s][A
 11%|█         | 4/36 [00:00<00:05,  5.84it/s][A
 14%|█▍        | 5/36 [00:00<00:05,  5.46it/s][A
 17%|█▋        | 6/36 [00:01<00:05,  5.21it/s][A
 19%|█▉        | 7/36 [00:01<00:05,  5.06it/s][A
 22%|██▏       | 8/36 [00:01<00:05,  4.94it/s][A
 25%|██▌       | 9/36 [00:01<00:05,  4.89it/s][A
 28%|██▊       | 10/36 [00:01<00:05,  4.87it/s][A
 31%|███       | 11/36 [00:02<00:05,  4.84it/s][A
 33%|███▎      | 12/36 [00:02<00:04,  4.83it/s][A
 36%|███▌      | 13/36 [00:02<00:04,  4.80it/s][A
 39%|███▉      | 14/36 [00:02<00:04,  4.80it/s][A
 42%|████▏     | 15/36 [00:02<00:04,  4.78it/s][A
 44%|████▍     | 16/36 [00:03<00:04,  4.78it/s][A
 47%|████▋     | 17/36 [00:03<00:03,  4.79it/s][A
 50%|█████     | 18/36 [00:03<00:03,  4.79it/s][A
 53%|█████▎    | 19/36 [00:03<00:03,  4.77it/s][A
 56%|█████▌    | 20/36 [00:03<00:03,  4.77it/s]

{'eval_loss': 1.7060545682907104, 'eval_runtime': 7.5483, 'eval_samples_per_second': 47.693, 'eval_steps_per_second': 4.769, 'epoch': 2.37}


 12%|█▏        | 3420/28720 [7:42:54<54:17:35,  7.73s/it] 

{'loss': 1.5215, 'grad_norm': 2.9270832538604736, 'learning_rate': 9.688951917637554e-05, 'epoch': 2.38}


 12%|█▏        | 3440/28720 [7:45:27<53:39:08,  7.64s/it]

{'loss': 1.5434, 'grad_norm': 3.5750668048858643, 'learning_rate': 9.685115972575928e-05, 'epoch': 2.4}


 12%|█▏        | 3460/28720 [7:48:00<53:28:27,  7.62s/it]

{'loss': 1.4565, 'grad_norm': 3.2922792434692383, 'learning_rate': 9.681257288016755e-05, 'epoch': 2.41}


 12%|█▏        | 3480/28720 [7:50:33<53:32:11,  7.64s/it]

{'loss': 1.489, 'grad_norm': 3.0215282440185547, 'learning_rate': 9.677375882688391e-05, 'epoch': 2.42}


 12%|█▏        | 3500/28720 [7:53:06<53:45:44,  7.67s/it]

{'loss': 1.4693, 'grad_norm': 3.387080669403076, 'learning_rate': 9.673471775429479e-05, 'epoch': 2.44}


 12%|█▏        | 3520/28720 [7:55:36<53:10:54,  7.60s/it]

{'loss': 1.5377, 'grad_norm': 4.164060592651367, 'learning_rate': 9.669544985188841e-05, 'epoch': 2.45}


 12%|█▏        | 3540/28720 [7:58:06<52:29:28,  7.50s/it]

{'loss': 1.478, 'grad_norm': 3.334707736968994, 'learning_rate': 9.665595531025396e-05, 'epoch': 2.47}


 12%|█▏        | 3560/28720 [8:00:36<52:53:37,  7.57s/it]

{'loss': 1.5322, 'grad_norm': 3.2418627738952637, 'learning_rate': 9.661623432108058e-05, 'epoch': 2.48}


 12%|█▏        | 3580/28720 [8:03:09<53:24:30,  7.65s/it]

{'loss': 1.5181, 'grad_norm': 3.319058418273926, 'learning_rate': 9.657628707715655e-05, 'epoch': 2.49}


 13%|█▎        | 3600/28720 [8:05:43<53:23:56,  7.65s/it]

{'loss': 1.5091, 'grad_norm': 3.309311866760254, 'learning_rate': 9.65361137723683e-05, 'epoch': 2.51}



  0%|          | 0/36 [00:00<?, ?it/s][A
  6%|▌         | 2/36 [00:00<00:03,  9.71it/s][A
  8%|▊         | 3/36 [00:00<00:04,  6.81it/s][A
 11%|█         | 4/36 [00:00<00:05,  5.86it/s][A
 14%|█▍        | 5/36 [00:00<00:05,  5.50it/s][A
 17%|█▋        | 6/36 [00:01<00:05,  5.22it/s][A
 19%|█▉        | 7/36 [00:01<00:05,  5.10it/s][A
 22%|██▏       | 8/36 [00:01<00:05,  4.98it/s][A
 25%|██▌       | 9/36 [00:01<00:05,  4.94it/s][A
 28%|██▊       | 10/36 [00:01<00:05,  4.90it/s][A
 31%|███       | 11/36 [00:02<00:05,  4.85it/s][A
 33%|███▎      | 12/36 [00:02<00:04,  4.84it/s][A
 36%|███▌      | 13/36 [00:02<00:04,  4.83it/s][A
 39%|███▉      | 14/36 [00:02<00:04,  4.85it/s][A
 42%|████▏     | 15/36 [00:02<00:04,  4.83it/s][A
 44%|████▍     | 16/36 [00:03<00:04,  4.83it/s][A
 47%|████▋     | 17/36 [00:03<00:03,  4.83it/s][A
 50%|█████     | 18/36 [00:03<00:03,  4.84it/s][A
 53%|█████▎    | 19/36 [00:03<00:03,  4.78it/s][A
 56%|█████▌    | 20/36 [00:03<00:03,  4.78it/s]

{'eval_loss': 1.713065266609192, 'eval_runtime': 7.5438, 'eval_samples_per_second': 47.722, 'eval_steps_per_second': 4.772, 'epoch': 2.51}


 13%|█▎        | 3620/28720 [8:09:59<53:36:00,  7.69s/it] 

{'loss': 1.4698, 'grad_norm': 3.304373025894165, 'learning_rate': 9.649571460169938e-05, 'epoch': 2.52}


 13%|█▎        | 3640/28720 [8:12:32<53:15:27,  7.64s/it]

{'loss': 1.5014, 'grad_norm': 9.316969871520996, 'learning_rate': 9.645508976122966e-05, 'epoch': 2.53}


 13%|█▎        | 3660/28720 [8:15:05<53:09:32,  7.64s/it]

{'loss': 1.5121, 'grad_norm': 3.454190492630005, 'learning_rate': 9.641423944813428e-05, 'epoch': 2.55}


 13%|█▎        | 3680/28720 [8:17:39<53:16:09,  7.66s/it]

{'loss': 1.5081, 'grad_norm': 3.6629507541656494, 'learning_rate': 9.637316386068275e-05, 'epoch': 2.56}


 13%|█▎        | 3700/28720 [8:20:12<53:07:05,  7.64s/it]

{'loss': 1.5119, 'grad_norm': 3.489668607711792, 'learning_rate': 9.633186319823796e-05, 'epoch': 2.58}


 13%|█▎        | 3720/28720 [8:22:45<53:05:32,  7.65s/it]

{'loss': 1.5337, 'grad_norm': 3.3878164291381836, 'learning_rate': 9.629033766125519e-05, 'epoch': 2.59}


 13%|█▎        | 3740/28720 [8:25:17<52:56:33,  7.63s/it]

{'loss': 1.5027, 'grad_norm': 3.30159592628479, 'learning_rate': 9.624858745128119e-05, 'epoch': 2.6}


 13%|█▎        | 3760/28720 [8:27:50<52:58:56,  7.64s/it]

{'loss': 1.4907, 'grad_norm': 3.141054391860962, 'learning_rate': 9.620661277095313e-05, 'epoch': 2.62}


 13%|█▎        | 3780/28720 [8:30:23<52:13:34,  7.54s/it]

{'loss': 1.4884, 'grad_norm': 2.8711767196655273, 'learning_rate': 9.616441382399774e-05, 'epoch': 2.63}


 13%|█▎        | 3800/28720 [8:32:52<51:32:40,  7.45s/it]

{'loss': 1.4786, 'grad_norm': 3.7589612007141113, 'learning_rate': 9.61219908152302e-05, 'epoch': 2.65}



  0%|          | 0/36 [00:00<?, ?it/s][A
  6%|▌         | 2/36 [00:00<00:03,  9.71it/s][A
  8%|▊         | 3/36 [00:00<00:04,  6.80it/s][A
 11%|█         | 4/36 [00:00<00:05,  5.84it/s][A
 14%|█▍        | 5/36 [00:00<00:05,  5.47it/s][A
 17%|█▋        | 6/36 [00:01<00:05,  5.19it/s][A
 19%|█▉        | 7/36 [00:01<00:05,  5.06it/s][A
 22%|██▏       | 8/36 [00:01<00:05,  4.93it/s][A
 25%|██▌       | 9/36 [00:01<00:05,  4.90it/s][A
 28%|██▊       | 10/36 [00:01<00:05,  4.88it/s][A
 31%|███       | 11/36 [00:02<00:05,  4.85it/s][A
 33%|███▎      | 12/36 [00:02<00:04,  4.85it/s][A
 36%|███▌      | 13/36 [00:02<00:04,  4.84it/s][A
 39%|███▉      | 14/36 [00:02<00:04,  4.83it/s][A
 42%|████▏     | 15/36 [00:02<00:04,  4.81it/s][A
 44%|████▍     | 16/36 [00:03<00:04,  4.80it/s][A
 47%|████▋     | 17/36 [00:03<00:03,  4.80it/s][A
 50%|█████     | 18/36 [00:03<00:03,  4.81it/s][A
 53%|█████▎    | 19/36 [00:03<00:03,  4.74it/s][A
 56%|█████▌    | 20/36 [00:03<00:03,  4.74it/s]

{'eval_loss': 1.6515618562698364, 'eval_runtime': 7.5614, 'eval_samples_per_second': 47.61, 'eval_steps_per_second': 4.761, 'epoch': 2.65}


 13%|█▎        | 3820/28720 [8:37:08<53:17:29,  7.70s/it] 

{'loss': 1.4817, 'grad_norm': 3.0826575756073, 'learning_rate': 9.607934395055315e-05, 'epoch': 2.66}


 13%|█▎        | 3840/28720 [8:39:42<53:26:55,  7.73s/it]

{'loss': 1.4996, 'grad_norm': 3.472005844116211, 'learning_rate': 9.603647343695576e-05, 'epoch': 2.67}


 13%|█▎        | 3860/28720 [8:42:15<52:47:21,  7.64s/it]

{'loss': 1.442, 'grad_norm': 3.375777006149292, 'learning_rate': 9.599337948251273e-05, 'epoch': 2.69}


 14%|█▎        | 3880/28720 [8:44:48<52:47:32,  7.65s/it]

{'loss': 1.4379, 'grad_norm': 3.569967746734619, 'learning_rate': 9.595006229638317e-05, 'epoch': 2.7}


 14%|█▎        | 3900/28720 [8:47:22<52:48:23,  7.66s/it]

{'loss': 1.4507, 'grad_norm': 3.2118980884552, 'learning_rate': 9.590652208880972e-05, 'epoch': 2.72}


 14%|█▎        | 3920/28720 [8:49:55<52:46:12,  7.66s/it]

{'loss': 1.4591, 'grad_norm': 3.683055877685547, 'learning_rate': 9.586275907111746e-05, 'epoch': 2.73}


 14%|█▎        | 3940/28720 [8:52:28<52:49:54,  7.68s/it]

{'loss': 1.4958, 'grad_norm': 3.0828654766082764, 'learning_rate': 9.581877345571284e-05, 'epoch': 2.74}


 14%|█▍        | 3960/28720 [8:55:01<52:35:43,  7.65s/it]

{'loss': 1.4879, 'grad_norm': 3.3878097534179688, 'learning_rate': 9.577456545608276e-05, 'epoch': 2.76}


 14%|█▍        | 3980/28720 [8:57:34<52:45:45,  7.68s/it]

{'loss': 1.4831, 'grad_norm': 3.2465264797210693, 'learning_rate': 9.573013528679348e-05, 'epoch': 2.77}


 14%|█▍        | 4000/28720 [9:00:07<52:32:09,  7.65s/it]

{'loss': 1.4601, 'grad_norm': 3.6202073097229004, 'learning_rate': 9.568548316348951e-05, 'epoch': 2.79}



  0%|          | 0/36 [00:00<?, ?it/s][A
  6%|▌         | 2/36 [00:00<00:03,  9.62it/s][A
  8%|▊         | 3/36 [00:00<00:04,  6.80it/s][A
 11%|█         | 4/36 [00:00<00:05,  5.86it/s][A
 14%|█▍        | 5/36 [00:00<00:05,  5.43it/s][A
 17%|█▋        | 6/36 [00:01<00:05,  5.16it/s][A
 19%|█▉        | 7/36 [00:01<00:05,  5.04it/s][A
 22%|██▏       | 8/36 [00:01<00:05,  4.94it/s][A
 25%|██▌       | 9/36 [00:01<00:05,  4.90it/s][A
 28%|██▊       | 10/36 [00:01<00:05,  4.88it/s][A
 31%|███       | 11/36 [00:02<00:05,  4.85it/s][A
 33%|███▎      | 12/36 [00:02<00:04,  4.84it/s][A
 36%|███▌      | 13/36 [00:02<00:04,  4.83it/s][A
 39%|███▉      | 14/36 [00:02<00:04,  4.84it/s][A
 42%|████▏     | 15/36 [00:02<00:04,  4.82it/s][A
 44%|████▍     | 16/36 [00:03<00:04,  4.80it/s][A
 47%|████▋     | 17/36 [00:03<00:03,  4.75it/s][A
 50%|█████     | 18/36 [00:03<00:03,  4.78it/s][A
 53%|█████▎    | 19/36 [00:03<00:03,  4.67it/s][A
 56%|█████▌    | 20/36 [00:03<00:03,  4.70it/s]

{'eval_loss': 1.6346986293792725, 'eval_runtime': 7.5649, 'eval_samples_per_second': 47.588, 'eval_steps_per_second': 4.759, 'epoch': 2.79}


 14%|█▍        | 4020/28720 [9:04:21<52:44:32,  7.69s/it] 

{'loss': 1.4645, 'grad_norm': 2.95229172706604, 'learning_rate': 9.56406093028927e-05, 'epoch': 2.8}


 14%|█▍        | 4040/28720 [9:06:54<52:25:15,  7.65s/it]

{'loss': 1.4453, 'grad_norm': 3.4241323471069336, 'learning_rate': 9.559551392280107e-05, 'epoch': 2.81}


 14%|█▍        | 4060/28720 [9:09:27<52:32:55,  7.67s/it]

{'loss': 1.478, 'grad_norm': 3.123868465423584, 'learning_rate': 9.555019724208778e-05, 'epoch': 2.83}


 14%|█▍        | 4080/28720 [9:12:02<52:41:23,  7.70s/it]

{'loss': 1.5125, 'grad_norm': 2.9910426139831543, 'learning_rate': 9.550465948070018e-05, 'epoch': 2.84}


 14%|█▍        | 4100/28720 [9:14:36<52:30:49,  7.68s/it]

{'loss': 1.5148, 'grad_norm': 3.6954901218414307, 'learning_rate': 9.545890085965854e-05, 'epoch': 2.86}


 14%|█▍        | 4120/28720 [9:17:09<52:15:43,  7.65s/it]

{'loss': 1.481, 'grad_norm': 3.067256212234497, 'learning_rate': 9.541292160105516e-05, 'epoch': 2.87}


 14%|█▍        | 4140/28720 [9:19:42<52:05:58,  7.63s/it]

{'loss': 1.4561, 'grad_norm': 3.1063790321350098, 'learning_rate': 9.536672192805314e-05, 'epoch': 2.88}


 14%|█▍        | 4160/28720 [9:22:15<52:18:23,  7.67s/it]

{'loss': 1.4585, 'grad_norm': 3.277224540710449, 'learning_rate': 9.532030206488549e-05, 'epoch': 2.9}


 15%|█▍        | 4180/28720 [9:24:48<52:38:33,  7.72s/it]

{'loss': 1.477, 'grad_norm': 3.3748228549957275, 'learning_rate': 9.527366223685382e-05, 'epoch': 2.91}


 15%|█▍        | 4200/28720 [9:27:22<52:20:28,  7.68s/it]

{'loss': 1.4153, 'grad_norm': 3.4725189208984375, 'learning_rate': 9.522680267032742e-05, 'epoch': 2.93}



  0%|          | 0/36 [00:00<?, ?it/s][A
  6%|▌         | 2/36 [00:00<00:03,  9.70it/s][A
  8%|▊         | 3/36 [00:00<00:04,  6.83it/s][A
 11%|█         | 4/36 [00:00<00:05,  5.87it/s][A
 14%|█▍        | 5/36 [00:00<00:05,  5.48it/s][A
 17%|█▋        | 6/36 [00:01<00:05,  5.21it/s][A
 19%|█▉        | 7/36 [00:01<00:05,  5.08it/s][A
 22%|██▏       | 8/36 [00:01<00:05,  4.98it/s][A
 25%|██▌       | 9/36 [00:01<00:05,  4.93it/s][A
 28%|██▊       | 10/36 [00:01<00:05,  4.90it/s][A
 31%|███       | 11/36 [00:02<00:05,  4.87it/s][A
 33%|███▎      | 12/36 [00:02<00:04,  4.85it/s][A
 36%|███▌      | 13/36 [00:02<00:04,  4.84it/s][A
 39%|███▉      | 14/36 [00:02<00:04,  4.84it/s][A
 42%|████▏     | 15/36 [00:02<00:04,  4.81it/s][A
 44%|████▍     | 16/36 [00:03<00:04,  4.80it/s][A
 47%|████▋     | 17/36 [00:03<00:03,  4.81it/s][A
 50%|█████     | 18/36 [00:03<00:03,  4.81it/s][A
 53%|█████▎    | 19/36 [00:03<00:03,  4.76it/s][A
 56%|█████▌    | 20/36 [00:03<00:03,  4.72it/s]

{'eval_loss': 1.5959805250167847, 'eval_runtime': 7.5404, 'eval_samples_per_second': 47.743, 'eval_steps_per_second': 4.774, 'epoch': 2.93}


 15%|█▍        | 4220/28720 [9:31:34<52:33:48,  7.72s/it] 

{'loss': 1.4951, 'grad_norm': 3.2429275512695312, 'learning_rate': 9.517972359274203e-05, 'epoch': 2.94}


 15%|█▍        | 4240/28720 [9:34:09<52:50:16,  7.77s/it]

{'loss': 1.4896, 'grad_norm': 3.2576711177825928, 'learning_rate': 9.513242523259889e-05, 'epoch': 2.95}


 15%|█▍        | 4260/28720 [9:36:42<52:04:58,  7.67s/it]

{'loss': 1.4461, 'grad_norm': 3.3985910415649414, 'learning_rate': 9.508490781946344e-05, 'epoch': 2.97}


 15%|█▍        | 4280/28720 [9:39:16<52:06:08,  7.67s/it]

{'loss': 1.4545, 'grad_norm': 3.0984551906585693, 'learning_rate': 9.50371715839644e-05, 'epoch': 2.98}


 15%|█▍        | 4300/28720 [9:41:49<51:54:26,  7.65s/it]

{'loss': 1.3941, 'grad_norm': 3.4362032413482666, 'learning_rate': 9.498921675779248e-05, 'epoch': 2.99}


 15%|█▌        | 4320/28720 [9:44:20<51:51:06,  7.65s/it]

{'loss': 1.4067, 'grad_norm': 3.485450506210327, 'learning_rate': 9.49410435736994e-05, 'epoch': 3.01}


 15%|█▌        | 4340/28720 [9:46:53<51:35:03,  7.62s/it]

{'loss': 1.3963, 'grad_norm': 3.351193904876709, 'learning_rate': 9.489265226549663e-05, 'epoch': 3.02}


 15%|█▌        | 4360/28720 [9:49:25<51:25:44,  7.60s/it]

{'loss': 1.3885, 'grad_norm': 3.3001747131347656, 'learning_rate': 9.484404306805436e-05, 'epoch': 3.04}


 15%|█▌        | 4380/28720 [9:51:57<51:16:36,  7.58s/it]

{'loss': 1.3863, 'grad_norm': 3.348989248275757, 'learning_rate': 9.479521621730035e-05, 'epoch': 3.05}


 15%|█▌        | 4400/28720 [9:54:29<51:19:46,  7.60s/it]

{'loss': 1.3808, 'grad_norm': 3.1232473850250244, 'learning_rate': 9.474617195021865e-05, 'epoch': 3.06}



  0%|          | 0/36 [00:00<?, ?it/s][A
  6%|▌         | 2/36 [00:00<00:03,  9.68it/s][A
  8%|▊         | 3/36 [00:00<00:04,  6.82it/s][A
 11%|█         | 4/36 [00:00<00:05,  5.84it/s][A
 14%|█▍        | 5/36 [00:00<00:05,  5.45it/s][A
 17%|█▋        | 6/36 [00:01<00:05,  5.23it/s][A
 19%|█▉        | 7/36 [00:01<00:05,  5.05it/s][A
 22%|██▏       | 8/36 [00:01<00:05,  4.96it/s][A
 25%|██▌       | 9/36 [00:01<00:05,  4.91it/s][A
 28%|██▊       | 10/36 [00:01<00:05,  4.87it/s][A
 31%|███       | 11/36 [00:02<00:05,  4.85it/s][A
 33%|███▎      | 12/36 [00:02<00:04,  4.84it/s][A
 36%|███▌      | 13/36 [00:02<00:04,  4.81it/s][A
 39%|███▉      | 14/36 [00:02<00:04,  4.82it/s][A
 42%|████▏     | 15/36 [00:02<00:04,  4.81it/s][A
 44%|████▍     | 16/36 [00:03<00:04,  4.81it/s][A
 47%|████▋     | 17/36 [00:03<00:03,  4.82it/s][A
 50%|█████     | 18/36 [00:03<00:03,  4.83it/s][A
 53%|█████▎    | 19/36 [00:03<00:03,  4.76it/s][A
 56%|█████▌    | 20/36 [00:03<00:03,  4.79it/s]

{'eval_loss': 1.5929573774337769, 'eval_runtime': 7.5401, 'eval_samples_per_second': 47.745, 'eval_steps_per_second': 4.774, 'epoch': 3.06}


 15%|█▌        | 4420/28720 [9:58:44<51:52:40,  7.69s/it] 

{'loss': 1.403, 'grad_norm': 3.564481258392334, 'learning_rate': 9.469691050484863e-05, 'epoch': 3.08}


 15%|█▌        | 4440/28720 [10:01:18<51:40:30,  7.66s/it]

{'loss': 1.3797, 'grad_norm': 3.046065330505371, 'learning_rate': 9.464743212028377e-05, 'epoch': 3.09}


 16%|█▌        | 4460/28720 [10:03:51<51:30:13,  7.64s/it]

{'loss': 1.3802, 'grad_norm': 3.217961549758911, 'learning_rate': 9.459773703667043e-05, 'epoch': 3.11}


 16%|█▌        | 4480/28720 [10:06:24<51:26:56,  7.64s/it]

{'loss': 1.3569, 'grad_norm': 3.330310821533203, 'learning_rate': 9.454782549520674e-05, 'epoch': 3.12}


 16%|█▌        | 4500/28720 [10:08:57<51:16:33,  7.62s/it]

{'loss': 1.3338, 'grad_norm': 2.976837635040283, 'learning_rate': 9.44976977381414e-05, 'epoch': 3.13}


 16%|█▌        | 4520/28720 [10:11:29<51:13:16,  7.62s/it]

{'loss': 1.3711, 'grad_norm': 3.344744920730591, 'learning_rate': 9.444735400877259e-05, 'epoch': 3.15}


 16%|█▌        | 4540/28720 [10:14:02<51:12:38,  7.62s/it]

{'loss': 1.393, 'grad_norm': 3.4091951847076416, 'learning_rate': 9.439679455144666e-05, 'epoch': 3.16}


 16%|█▌        | 4560/28720 [10:16:35<51:13:54,  7.63s/it]

{'loss': 1.3848, 'grad_norm': 4.761604309082031, 'learning_rate': 9.434601961155704e-05, 'epoch': 3.18}


 16%|█▌        | 4580/28720 [10:19:08<51:12:39,  7.64s/it]

{'loss': 1.3682, 'grad_norm': 2.922485828399658, 'learning_rate': 9.429502943554302e-05, 'epoch': 3.19}


 16%|█▌        | 4600/28720 [10:21:40<51:07:18,  7.63s/it]

{'loss': 1.3876, 'grad_norm': 3.6845085620880127, 'learning_rate': 9.424382427088856e-05, 'epoch': 3.2}



  0%|          | 0/36 [00:00<?, ?it/s][A
  6%|▌         | 2/36 [00:00<00:03,  9.46it/s][A
  8%|▊         | 3/36 [00:00<00:04,  6.78it/s][A
 11%|█         | 4/36 [00:00<00:05,  5.83it/s][A
 14%|█▍        | 5/36 [00:00<00:05,  5.45it/s][A
 17%|█▋        | 6/36 [00:01<00:05,  5.21it/s][A
 19%|█▉        | 7/36 [00:01<00:05,  5.09it/s][A
 22%|██▏       | 8/36 [00:01<00:05,  4.98it/s][A
 25%|██▌       | 9/36 [00:01<00:05,  4.94it/s][A
 28%|██▊       | 10/36 [00:01<00:05,  4.89it/s][A
 31%|███       | 11/36 [00:02<00:05,  4.85it/s][A
 33%|███▎      | 12/36 [00:02<00:04,  4.82it/s][A
 36%|███▌      | 13/36 [00:02<00:04,  4.82it/s][A
 39%|███▉      | 14/36 [00:02<00:04,  4.83it/s][A
 42%|████▏     | 15/36 [00:02<00:04,  4.81it/s][A
 44%|████▍     | 16/36 [00:03<00:04,  4.82it/s][A
 47%|████▋     | 17/36 [00:03<00:03,  4.82it/s][A
 50%|█████     | 18/36 [00:03<00:03,  4.82it/s][A
 53%|█████▎    | 19/36 [00:03<00:03,  4.79it/s][A
 56%|█████▌    | 20/36 [00:03<00:03,  4.81it/s]

{'eval_loss': 1.6405799388885498, 'eval_runtime': 7.54, 'eval_samples_per_second': 47.746, 'eval_steps_per_second': 4.775, 'epoch': 3.2}


 16%|█▌        | 4620/28720 [10:25:50<51:30:58,  7.70s/it] 

{'loss': 1.3748, 'grad_norm': 3.2915751934051514, 'learning_rate': 9.419240436612103e-05, 'epoch': 3.22}


 16%|█▌        | 4640/28720 [10:28:22<51:21:40,  7.68s/it]

{'loss': 1.3622, 'grad_norm': 3.9795773029327393, 'learning_rate': 9.414076997081012e-05, 'epoch': 3.23}


 16%|█▌        | 4660/28720 [10:30:55<51:02:42,  7.64s/it]

{'loss': 1.3785, 'grad_norm': 3.3884992599487305, 'learning_rate': 9.40889213355665e-05, 'epoch': 3.25}


 16%|█▋        | 4680/28720 [10:33:27<50:56:47,  7.63s/it]

{'loss': 1.3886, 'grad_norm': 2.953395366668701, 'learning_rate': 9.403685871204073e-05, 'epoch': 3.26}


 16%|█▋        | 4700/28720 [10:36:00<50:58:13,  7.64s/it]

{'loss': 1.3628, 'grad_norm': 3.0544145107269287, 'learning_rate': 9.398458235292195e-05, 'epoch': 3.27}


 16%|█▋        | 4720/28720 [10:38:32<50:56:12,  7.64s/it]

{'loss': 1.3816, 'grad_norm': 3.1548447608947754, 'learning_rate': 9.393209251193664e-05, 'epoch': 3.29}


 17%|█▋        | 4740/28720 [10:41:05<50:57:28,  7.65s/it]

{'loss': 1.3781, 'grad_norm': 3.7517712116241455, 'learning_rate': 9.387938944384749e-05, 'epoch': 3.3}


 17%|█▋        | 4760/28720 [10:43:38<50:58:15,  7.66s/it]

{'loss': 1.4058, 'grad_norm': 3.686467170715332, 'learning_rate': 9.382647340445205e-05, 'epoch': 3.31}


 17%|█▋        | 4780/28720 [10:46:12<50:52:33,  7.65s/it]

{'loss': 1.3818, 'grad_norm': 3.010509967803955, 'learning_rate': 9.377334465058156e-05, 'epoch': 3.33}


 17%|█▋        | 4800/28720 [10:48:45<50:50:10,  7.65s/it]

{'loss': 1.3651, 'grad_norm': 3.032909393310547, 'learning_rate': 9.372000344009972e-05, 'epoch': 3.34}



  0%|          | 0/36 [00:00<?, ?it/s][A
  6%|▌         | 2/36 [00:00<00:03,  9.76it/s][A
  8%|▊         | 3/36 [00:00<00:04,  6.87it/s][A
 11%|█         | 4/36 [00:00<00:05,  5.91it/s][A
 14%|█▍        | 5/36 [00:00<00:05,  5.49it/s][A
 17%|█▋        | 6/36 [00:01<00:05,  5.21it/s][A
 19%|█▉        | 7/36 [00:01<00:05,  5.09it/s][A
 22%|██▏       | 8/36 [00:01<00:05,  5.01it/s][A
 25%|██▌       | 9/36 [00:01<00:05,  4.96it/s][A
 28%|██▊       | 10/36 [00:01<00:05,  4.92it/s][A
 31%|███       | 11/36 [00:02<00:05,  4.89it/s][A
 33%|███▎      | 12/36 [00:02<00:04,  4.88it/s][A
 36%|███▌      | 13/36 [00:02<00:04,  4.86it/s][A
 39%|███▉      | 14/36 [00:02<00:04,  4.85it/s][A
 42%|████▏     | 15/36 [00:02<00:04,  4.84it/s][A
 44%|████▍     | 16/36 [00:03<00:04,  4.83it/s][A
 47%|████▋     | 17/36 [00:03<00:03,  4.83it/s][A
 50%|█████     | 18/36 [00:03<00:03,  4.84it/s][A
 53%|█████▎    | 19/36 [00:03<00:03,  4.80it/s][A
 56%|█████▌    | 20/36 [00:03<00:03,  4.81it/s]

{'eval_loss': 1.6028015613555908, 'eval_runtime': 7.5206, 'eval_samples_per_second': 47.868, 'eval_steps_per_second': 4.787, 'epoch': 3.34}


 17%|█▋        | 4800/28720 [10:51:05<50:50:10,  7.65s/it]

{'train_runtime': 39065.3069, 'train_samples_per_second': 88.198, 'train_steps_per_second': 0.735, 'train_loss': 1.906110961039861, 'epoch': 3.34}


 17%|█▋        | 4800/28720 [10:51:05<54:04:35,  8.14s/it]


('./madlad-lora-final/tokenizer_config.json',
 './madlad-lora-final/special_tokens_map.json',
 './madlad-lora-final/spiece.model',
 './madlad-lora-final/added_tokens.json')

# Check if it provides translations

In [12]:
lora_model.eval()

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): ModulesToSaveWrapper(
        (original_module): Embedding(256000, 1024)
        (modules_to_save): ModuleDict(
          (default): Embedding(256000, 1024)
        )
      )
      (encoder): T5Stack(
        (embed_tokens): Embedding(256000, 1024)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear(
                    (base_layer): Linear(in_features=1024, out_features=2048, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=1024, out_features=32, bias=False)
                    )
                    (lora_B): ModuleDict(
                   

In [None]:
inputs = tokenizer("<2ce> I love Mary, she cooks very well!", return_tensors="pt").to(
    device
)
outputs = lora_model.generate(**inputs)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

['Суна Марем дукха веза, цо чӀогӀа дика яа йо!']


In [None]:
inputs = tokenizer(
    "<2en> Стигална кӀел къахьоьгуш, ша мел динчу хӀуманах буьсун болу хӀун пайда оьцу адамо?",
    return_tensors="pt",
).to(device)
outputs = lora_model.generate(**inputs)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

['What profit has man, when he tries under the heaven, and when he tries, and']


In [None]:
inputs = tokenizer(
    "<2ce> Когда я зашел в пещеру, Павел вышел наружу?", return_tensors="pt"
).to(device)
outputs = lora_model.generate(**inputs)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

['Со хьех чу ваьлча, ПахӀал араваьлла.']
