In [1]:
!pip install datasets evaluate sacrebleu





In [2]:
# Required imports
import pandas as pd
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, pipeline, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset, DatasetDict
import evaluate

# Load the Tydi XOR dataset from Hugging Face and filter for languages
dataset = load_dataset("coastalcph/tydi_xor_rc")

# Filter the dataset for Finnish, Japanese, and Russian languages
languages = ['fi', 'ja', 'ru']
train_data = dataset["train"].filter(lambda x: x['lang'] in languages)
valid_data = dataset["validation"].filter(lambda x: x['lang'] in languages)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Instantiate tokenizer and model
model_name = "google/flan-t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model.to(device)

Using device: cuda


T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): 

In [5]:
input_text = "Question: ビスカヤ県で初めて進出した大規模鉱業会社は何? Context: Another consequence of the Carlist defeat and ensuing abolition of the Basque institutional system was the Liberalization of the industries on the Basque Provinces, especially in Biscay. The liberalization of the mines, industries and ports attracted many companies, specially British Mining Companies, that established in Biscay along with small local societies, such as Ybarra-Mier y Compañía, creating a big industrial society, based on iron mining and industry. These expansion created very big mining companies, such as Orconera Iron Ore Company Limited and Societé Franco-Belge des Mines de Somorrostro."
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)

outputs = model.generate(input_ids, max_new_tokens=128)
print(tokenizer.decode(outputs[0]))

<pad> mining</s>


In [43]:
# Preprocessing function for tokenization
def preprocess_data(batch):
    inputs = []
    targets = []

    # For each question-context-answer pair
    for question, context, lang, answer, answerable in zip(
        batch["question"], batch["context"], batch["lang"], batch["answer"], batch["answerable"]
    ):
        # Construct input as "question: <question> context: <context>"
        input_text = f"Question: {question} Context: {context}"
        inputs.append(input_text)
        target_text = answer
        targets.append(target_text)

    # Tokenize input and target pairs
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing to the train and validation sets
tokenized_train = train_data.map(preprocess_data, batched=True)
tokenized_valid = valid_data.map(preprocess_data, batched=True)

tokenized_train_shuffled = tokenized_train.shuffle(seed=10)
tokenized_valid_shuffled = tokenized_valid.shuffle(seed=10)

In [36]:
# Define metric for evaluation
sacrebleu = evaluate.load("sacrebleu")

# Custom evaluation function to compute BLEU scores
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    references = [[label] for label in decoded_labels]

    # Compute scores
    sacrebleu_score = sacrebleu.compute(predictions=decoded_preds, references=references)

    return {
        "sacrebleu": sacrebleu_score["score"]
    }

In [37]:
# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./resultsFlanT5_trial2",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=10,
)

# Initialize the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_shuffled.select(range(8)),
    eval_dataset=tokenized_valid.select(range(8)),
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Start training
trainer.train()

                                             
 33%|███▎      | 2/6 [00:02<00:03,  1.23it/s]

{'eval_loss': 45.62352752685547, 'eval_sacrebleu': 2.2999414544192485, 'eval_runtime': 1.2134, 'eval_samples_per_second': 6.593, 'eval_steps_per_second': 1.648, 'epoch': 1.0}


                                             
 67%|██████▋   | 4/6 [00:05<00:02,  1.10s/it]

{'eval_loss': 44.80213165283203, 'eval_sacrebleu': 2.2999414544192485, 'eval_runtime': 1.1799, 'eval_samples_per_second': 6.78, 'eval_steps_per_second': 1.695, 'epoch': 2.0}


                                             
100%|██████████| 6/6 [00:09<00:00,  1.57s/it]

{'eval_loss': 44.439456939697266, 'eval_sacrebleu': 2.2999414544192485, 'eval_runtime': 1.1789, 'eval_samples_per_second': 6.786, 'eval_steps_per_second': 1.696, 'epoch': 3.0}
{'train_runtime': 9.4397, 'train_samples_per_second': 2.542, 'train_steps_per_second': 0.636, 'train_loss': 45.65257263183594, 'epoch': 3.0}





TrainOutput(global_step=6, training_loss=45.65257263183594, metrics={'train_runtime': 9.4397, 'train_samples_per_second': 2.542, 'train_steps_per_second': 0.636, 'total_flos': 4461372112896.0, 'train_loss': 45.65257263183594, 'epoch': 3.0})

In [44]:
# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./resultsFlanT5_trial2",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=10,
)

# Initialize the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_shuffled.select(range(100)),
    eval_dataset=tokenized_valid_shuffled.select(range(80)),
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Start training
trainer.train()

 13%|█▎        | 10/75 [00:06<00:44,  1.45it/s]

{'loss': 41.3672, 'grad_norm': 108.16876983642578, 'learning_rate': 2.6000000000000002e-05, 'epoch': 0.4}


 27%|██▋       | 20/75 [00:14<00:42,  1.29it/s]

{'loss': 36.9554, 'grad_norm': 126.0171890258789, 'learning_rate': 2.2e-05, 'epoch': 0.8}



 33%|███▎      | 25/75 [00:29<00:39,  1.28it/s]

{'eval_loss': 34.143638610839844, 'eval_sacrebleu': 0.33009840697807663, 'eval_runtime': 11.8269, 'eval_samples_per_second': 6.764, 'eval_steps_per_second': 1.691, 'epoch': 1.0}


 40%|████      | 30/75 [00:33<01:12,  1.61s/it]

{'loss': 34.6899, 'grad_norm': 97.27703857421875, 'learning_rate': 1.8e-05, 'epoch': 1.2}


 53%|█████▎    | 40/75 [00:41<00:27,  1.28it/s]

{'loss': 32.1704, 'grad_norm': 96.74372863769531, 'learning_rate': 1.4e-05, 'epoch': 1.6}


 67%|██████▋   | 50/75 [00:48<00:19,  1.30it/s]

{'loss': 30.1755, 'grad_norm': 104.20182037353516, 'learning_rate': 9.999999999999999e-06, 'epoch': 2.0}



 67%|██████▋   | 50/75 [01:00<00:19,  1.30it/s]

{'eval_loss': 28.867650985717773, 'eval_sacrebleu': 4.1004707348396074, 'eval_runtime': 11.9187, 'eval_samples_per_second': 6.712, 'eval_steps_per_second': 1.678, 'epoch': 2.0}


 80%|████████  | 60/75 [01:08<00:13,  1.11it/s]

{'loss': 29.2766, 'grad_norm': 92.80297088623047, 'learning_rate': 6e-06, 'epoch': 2.4}


 93%|█████████▎| 70/75 [01:15<00:03,  1.29it/s]

{'loss': 28.6698, 'grad_norm': 91.4073486328125, 'learning_rate': 2e-06, 'epoch': 2.8}



100%|██████████| 75/75 [01:32<00:00,  1.23s/it]

{'eval_loss': 27.107858657836914, 'eval_sacrebleu': 4.695020828639544, 'eval_runtime': 11.3024, 'eval_samples_per_second': 7.078, 'eval_steps_per_second': 1.77, 'epoch': 3.0}
{'train_runtime': 92.3626, 'train_samples_per_second': 3.248, 'train_steps_per_second': 0.812, 'train_loss': 32.96554321289062, 'epoch': 3.0}





TrainOutput(global_step=75, training_loss=32.96554321289062, metrics={'train_runtime': 92.3626, 'train_samples_per_second': 3.248, 'train_steps_per_second': 0.812, 'total_flos': 55767151411200.0, 'train_loss': 32.96554321289062, 'epoch': 3.0})

In [47]:
# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./resultsFlanT5_trial2",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=10,
)

# Initialize the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_shuffled,
    eval_dataset=tokenized_valid_shuffled,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Start training
trainer.train()

  0%|          | 10/4809 [00:06<56:02,  1.43it/s]

{'loss': 27.2423, 'grad_norm': 90.99745178222656, 'learning_rate': 2.9937616968184656e-05, 'epoch': 0.01}


  0%|          | 20/4809 [00:12<44:03,  1.81it/s]  

{'loss': 24.2288, 'grad_norm': 100.7487564086914, 'learning_rate': 2.9875233936369307e-05, 'epoch': 0.01}


  1%|          | 30/4809 [00:18<46:44,  1.70it/s]

{'loss': 20.3587, 'grad_norm': 72.30579376220703, 'learning_rate': 2.9812850904553962e-05, 'epoch': 0.02}


  1%|          | 40/4809 [00:26<1:06:15,  1.20it/s]

{'loss': 17.6171, 'grad_norm': 71.92424774169922, 'learning_rate': 2.9750467872738617e-05, 'epoch': 0.02}


  1%|          | 50/4809 [00:34<1:03:37,  1.25it/s]

{'loss': 15.4959, 'grad_norm': 77.10942840576172, 'learning_rate': 2.968808484092327e-05, 'epoch': 0.03}


  1%|          | 60/4809 [00:42<1:04:20,  1.23it/s]

{'loss': 12.271, 'grad_norm': 61.339595794677734, 'learning_rate': 2.9625701809107924e-05, 'epoch': 0.04}


  1%|▏         | 70/4809 [00:50<1:03:21,  1.25it/s]

{'loss': 10.1263, 'grad_norm': 60.74800491333008, 'learning_rate': 2.956331877729258e-05, 'epoch': 0.04}


  2%|▏         | 80/4809 [00:59<1:03:31,  1.24it/s]

{'loss': 8.5595, 'grad_norm': 43.09628677368164, 'learning_rate': 2.950093574547723e-05, 'epoch': 0.05}


  2%|▏         | 90/4809 [01:07<1:02:39,  1.26it/s]

{'loss': 7.2319, 'grad_norm': 33.469505310058594, 'learning_rate': 2.9438552713661885e-05, 'epoch': 0.06}


  2%|▏         | 100/4809 [01:14<1:01:19,  1.28it/s]

{'loss': 6.542, 'grad_norm': 27.909311294555664, 'learning_rate': 2.937616968184654e-05, 'epoch': 0.06}


  2%|▏         | 110/4809 [01:22<1:01:09,  1.28it/s]

{'loss': 5.881, 'grad_norm': 22.034198760986328, 'learning_rate': 2.9313786650031192e-05, 'epoch': 0.07}


  2%|▏         | 120/4809 [01:30<59:01,  1.32it/s]  

{'loss': 5.4469, 'grad_norm': 24.208585739135742, 'learning_rate': 2.9251403618215847e-05, 'epoch': 0.07}


  3%|▎         | 130/4809 [01:38<1:02:02,  1.26it/s]

{'loss': 4.983, 'grad_norm': 21.955699920654297, 'learning_rate': 2.9189020586400502e-05, 'epoch': 0.08}


  3%|▎         | 140/4809 [01:46<1:00:13,  1.29it/s]

{'loss': 4.6796, 'grad_norm': 21.231557846069336, 'learning_rate': 2.9126637554585154e-05, 'epoch': 0.09}


  3%|▎         | 150/4809 [01:53<58:19,  1.33it/s]  

{'loss': 4.4252, 'grad_norm': 20.622066497802734, 'learning_rate': 2.906425452276981e-05, 'epoch': 0.09}


  3%|▎         | 160/4809 [02:01<59:04,  1.31it/s]  

{'loss': 4.1011, 'grad_norm': 22.8644962310791, 'learning_rate': 2.9001871490954464e-05, 'epoch': 0.1}


  4%|▎         | 170/4809 [02:08<57:57,  1.33it/s]

{'loss': 3.9151, 'grad_norm': 22.62520980834961, 'learning_rate': 2.8939488459139115e-05, 'epoch': 0.11}


  4%|▎         | 180/4809 [02:16<1:01:24,  1.26it/s]

{'loss': 3.7803, 'grad_norm': 23.402334213256836, 'learning_rate': 2.887710542732377e-05, 'epoch': 0.11}


  4%|▍         | 190/4809 [02:24<58:26,  1.32it/s]  

{'loss': 3.5271, 'grad_norm': 24.825143814086914, 'learning_rate': 2.8814722395508425e-05, 'epoch': 0.12}


  4%|▍         | 200/4809 [02:31<57:47,  1.33it/s]  

{'loss': 3.4299, 'grad_norm': 24.859169006347656, 'learning_rate': 2.8752339363693077e-05, 'epoch': 0.12}


  4%|▍         | 210/4809 [02:39<57:56,  1.32it/s]

{'loss': 3.215, 'grad_norm': 24.616464614868164, 'learning_rate': 2.8689956331877732e-05, 'epoch': 0.13}


  5%|▍         | 220/4809 [02:47<59:01,  1.30it/s]

{'loss': 3.065, 'grad_norm': 24.2674617767334, 'learning_rate': 2.8627573300062387e-05, 'epoch': 0.14}


  5%|▍         | 230/4809 [02:54<1:02:36,  1.22it/s]

{'loss': 2.8828, 'grad_norm': 25.957332611083984, 'learning_rate': 2.856519026824704e-05, 'epoch': 0.14}


  5%|▍         | 240/4809 [03:02<1:02:29,  1.22it/s]

{'loss': 2.7298, 'grad_norm': 24.884878158569336, 'learning_rate': 2.850280723643169e-05, 'epoch': 0.15}


  5%|▌         | 250/4809 [03:10<59:32,  1.28it/s]  

{'loss': 2.525, 'grad_norm': 27.246824264526367, 'learning_rate': 2.8440424204616345e-05, 'epoch': 0.16}


  5%|▌         | 260/4809 [03:18<1:00:02,  1.26it/s]

{'loss': 2.371, 'grad_norm': 24.770191192626953, 'learning_rate': 2.8378041172800997e-05, 'epoch': 0.16}


  6%|▌         | 270/4809 [03:25<48:23,  1.56it/s]  

{'loss': 2.2333, 'grad_norm': 25.911584854125977, 'learning_rate': 2.831565814098565e-05, 'epoch': 0.17}


  6%|▌         | 280/4809 [03:30<38:22,  1.97it/s]

{'loss': 2.0971, 'grad_norm': 28.533281326293945, 'learning_rate': 2.8253275109170307e-05, 'epoch': 0.17}


  6%|▌         | 290/4809 [03:38<57:12,  1.32it/s]

{'loss': 1.9284, 'grad_norm': 26.120807647705078, 'learning_rate': 2.8190892077354958e-05, 'epoch': 0.18}


  6%|▌         | 300/4809 [03:46<58:27,  1.29it/s]  

{'loss': 1.7799, 'grad_norm': 25.743793487548828, 'learning_rate': 2.8128509045539613e-05, 'epoch': 0.19}


  6%|▋         | 310/4809 [03:54<1:00:13,  1.25it/s]

{'loss': 1.6226, 'grad_norm': 27.25642967224121, 'learning_rate': 2.8066126013724268e-05, 'epoch': 0.19}


  7%|▋         | 320/4809 [04:00<37:40,  1.99it/s]  

{'loss': 1.4715, 'grad_norm': 27.60928726196289, 'learning_rate': 2.800374298190892e-05, 'epoch': 0.2}


  7%|▋         | 330/4809 [04:04<34:52,  2.14it/s]

{'loss': 1.4431, 'grad_norm': 25.1280517578125, 'learning_rate': 2.7941359950093575e-05, 'epoch': 0.21}


  7%|▋         | 340/4809 [04:09<35:19,  2.11it/s]

{'loss': 1.2291, 'grad_norm': 23.15038299560547, 'learning_rate': 2.787897691827823e-05, 'epoch': 0.21}


  7%|▋         | 350/4809 [04:14<34:41,  2.14it/s]

{'loss': 1.1681, 'grad_norm': 24.23664665222168, 'learning_rate': 2.781659388646288e-05, 'epoch': 0.22}


  7%|▋         | 360/4809 [04:18<34:32,  2.15it/s]

{'loss': 1.0877, 'grad_norm': 21.635183334350586, 'learning_rate': 2.7754210854647536e-05, 'epoch': 0.22}


  8%|▊         | 370/4809 [04:23<34:38,  2.14it/s]

{'loss': 0.9734, 'grad_norm': 20.682432174682617, 'learning_rate': 2.769182782283219e-05, 'epoch': 0.23}


  8%|▊         | 380/4809 [04:28<34:25,  2.14it/s]

{'loss': 0.9039, 'grad_norm': 20.05738067626953, 'learning_rate': 2.7629444791016843e-05, 'epoch': 0.24}


  8%|▊         | 390/4809 [04:33<34:19,  2.15it/s]

{'loss': 0.8262, 'grad_norm': 18.887842178344727, 'learning_rate': 2.7567061759201498e-05, 'epoch': 0.24}


  8%|▊         | 400/4809 [04:37<34:51,  2.11it/s]

{'loss': 0.7761, 'grad_norm': 18.453298568725586, 'learning_rate': 2.7504678727386153e-05, 'epoch': 0.25}


  9%|▊         | 410/4809 [04:42<34:20,  2.13it/s]

{'loss': 0.6871, 'grad_norm': 16.216567993164062, 'learning_rate': 2.7442295695570804e-05, 'epoch': 0.26}


  9%|▊         | 420/4809 [04:47<34:10,  2.14it/s]

{'loss': 0.6283, 'grad_norm': 15.300580978393555, 'learning_rate': 2.737991266375546e-05, 'epoch': 0.26}


  9%|▉         | 430/4809 [04:51<34:17,  2.13it/s]

{'loss': 0.591, 'grad_norm': 13.093653678894043, 'learning_rate': 2.731752963194011e-05, 'epoch': 0.27}


  9%|▉         | 440/4809 [04:56<34:03,  2.14it/s]

{'loss': 0.5184, 'grad_norm': 12.442919731140137, 'learning_rate': 2.7255146600124766e-05, 'epoch': 0.27}


  9%|▉         | 450/4809 [05:01<34:33,  2.10it/s]

{'loss': 0.4671, 'grad_norm': 11.15705680847168, 'learning_rate': 2.719276356830942e-05, 'epoch': 0.28}


 10%|▉         | 460/4809 [05:05<34:27,  2.10it/s]

{'loss': 0.4444, 'grad_norm': 11.004963874816895, 'learning_rate': 2.7130380536494073e-05, 'epoch': 0.29}


 10%|▉         | 470/4809 [05:10<34:21,  2.11it/s]

{'loss': 0.4072, 'grad_norm': 9.691847801208496, 'learning_rate': 2.7067997504678728e-05, 'epoch': 0.29}


 10%|▉         | 480/4809 [05:15<34:05,  2.12it/s]

{'loss': 0.3723, 'grad_norm': 8.832423210144043, 'learning_rate': 2.7005614472863383e-05, 'epoch': 0.3}


 10%|█         | 490/4809 [05:20<34:04,  2.11it/s]

{'loss': 0.3317, 'grad_norm': 7.220129489898682, 'learning_rate': 2.6943231441048034e-05, 'epoch': 0.31}


 10%|█         | 500/4809 [05:24<33:54,  2.12it/s]

{'loss': 0.3284, 'grad_norm': 7.216028690338135, 'learning_rate': 2.688084840923269e-05, 'epoch': 0.31}


 11%|█         | 510/4809 [05:31<35:41,  2.01it/s]  

{'loss': 0.3086, 'grad_norm': 6.563718318939209, 'learning_rate': 2.6818465377417344e-05, 'epoch': 0.32}


 11%|█         | 520/4809 [05:35<34:44,  2.06it/s]

{'loss': 0.2742, 'grad_norm': 6.532592296600342, 'learning_rate': 2.6756082345601996e-05, 'epoch': 0.32}


 11%|█         | 530/4809 [05:40<33:34,  2.12it/s]

{'loss': 0.2963, 'grad_norm': 5.802054405212402, 'learning_rate': 2.669369931378665e-05, 'epoch': 0.33}


 11%|█         | 540/4809 [05:45<33:28,  2.13it/s]

{'loss': 0.2201, 'grad_norm': 5.115708351135254, 'learning_rate': 2.6631316281971306e-05, 'epoch': 0.34}


 11%|█▏        | 550/4809 [05:49<33:08,  2.14it/s]

{'loss': 0.2128, 'grad_norm': 4.428168296813965, 'learning_rate': 2.6568933250155957e-05, 'epoch': 0.34}


 12%|█▏        | 560/4809 [05:54<33:32,  2.11it/s]

{'loss': 0.2105, 'grad_norm': 4.729807376861572, 'learning_rate': 2.6506550218340612e-05, 'epoch': 0.35}


 12%|█▏        | 570/4809 [05:59<33:14,  2.13it/s]

{'loss': 0.1707, 'grad_norm': 4.132705211639404, 'learning_rate': 2.6444167186525267e-05, 'epoch': 0.36}


 12%|█▏        | 580/4809 [06:04<33:17,  2.12it/s]

{'loss': 0.18, 'grad_norm': 2.986055612564087, 'learning_rate': 2.638178415470992e-05, 'epoch': 0.36}


 12%|█▏        | 590/4809 [06:08<33:00,  2.13it/s]

{'loss': 0.1647, 'grad_norm': 2.8585803508758545, 'learning_rate': 2.6319401122894574e-05, 'epoch': 0.37}


 12%|█▏        | 600/4809 [06:13<33:02,  2.12it/s]

{'loss': 0.1525, 'grad_norm': 2.6038336753845215, 'learning_rate': 2.625701809107923e-05, 'epoch': 0.37}


 13%|█▎        | 610/4809 [06:18<33:03,  2.12it/s]

{'loss': 0.151, 'grad_norm': 2.635577917098999, 'learning_rate': 2.619463505926388e-05, 'epoch': 0.38}


 13%|█▎        | 620/4809 [06:22<33:10,  2.10it/s]

{'loss': 0.1503, 'grad_norm': 2.578784465789795, 'learning_rate': 2.6132252027448536e-05, 'epoch': 0.39}


 13%|█▎        | 630/4809 [06:27<32:35,  2.14it/s]

{'loss': 0.1364, 'grad_norm': 1.8428678512573242, 'learning_rate': 2.606986899563319e-05, 'epoch': 0.39}


 13%|█▎        | 640/4809 [06:32<32:48,  2.12it/s]

{'loss': 0.124, 'grad_norm': 2.0807738304138184, 'learning_rate': 2.6007485963817842e-05, 'epoch': 0.4}


 14%|█▎        | 650/4809 [06:37<32:43,  2.12it/s]

{'loss': 0.114, 'grad_norm': 1.9096308946609497, 'learning_rate': 2.5945102932002497e-05, 'epoch': 0.41}


 14%|█▎        | 660/4809 [06:41<32:46,  2.11it/s]

{'loss': 0.1083, 'grad_norm': 1.5349746942520142, 'learning_rate': 2.5882719900187152e-05, 'epoch': 0.41}


 14%|█▍        | 670/4809 [06:46<32:21,  2.13it/s]

{'loss': 0.1109, 'grad_norm': 1.3394591808319092, 'learning_rate': 2.5820336868371804e-05, 'epoch': 0.42}


 14%|█▍        | 680/4809 [06:51<32:13,  2.14it/s]

{'loss': 0.0966, 'grad_norm': 1.3263989686965942, 'learning_rate': 2.575795383655646e-05, 'epoch': 0.42}


 14%|█▍        | 690/4809 [06:55<31:59,  2.15it/s]

{'loss': 0.084, 'grad_norm': 1.1627918481826782, 'learning_rate': 2.5695570804741114e-05, 'epoch': 0.43}


 15%|█▍        | 700/4809 [07:00<31:53,  2.15it/s]

{'loss': 0.0771, 'grad_norm': 0.9439238905906677, 'learning_rate': 2.5633187772925762e-05, 'epoch': 0.44}


 15%|█▍        | 710/4809 [07:05<31:52,  2.14it/s]

{'loss': 0.0974, 'grad_norm': 1.1174700260162354, 'learning_rate': 2.5570804741110417e-05, 'epoch': 0.44}


 15%|█▍        | 720/4809 [07:10<31:52,  2.14it/s]

{'loss': 0.0871, 'grad_norm': 0.9527707695960999, 'learning_rate': 2.5508421709295072e-05, 'epoch': 0.45}


 15%|█▌        | 730/4809 [07:14<32:07,  2.12it/s]

{'loss': 0.0865, 'grad_norm': 1.3006795644760132, 'learning_rate': 2.5446038677479723e-05, 'epoch': 0.46}


 15%|█▌        | 740/4809 [07:19<31:40,  2.14it/s]

{'loss': 0.0969, 'grad_norm': 0.8623494505882263, 'learning_rate': 2.538365564566438e-05, 'epoch': 0.46}


 16%|█▌        | 750/4809 [07:24<31:50,  2.12it/s]

{'loss': 0.0794, 'grad_norm': 0.855465829372406, 'learning_rate': 2.5321272613849033e-05, 'epoch': 0.47}


 16%|█▌        | 760/4809 [07:28<31:40,  2.13it/s]

{'loss': 0.0823, 'grad_norm': 0.7056512236595154, 'learning_rate': 2.5258889582033685e-05, 'epoch': 0.47}


 16%|█▌        | 770/4809 [07:33<31:41,  2.12it/s]

{'loss': 0.0668, 'grad_norm': 0.7678564786911011, 'learning_rate': 2.519650655021834e-05, 'epoch': 0.48}


 16%|█▌        | 780/4809 [07:38<31:20,  2.14it/s]

{'loss': 0.069, 'grad_norm': 0.6135876774787903, 'learning_rate': 2.5134123518402995e-05, 'epoch': 0.49}


 16%|█▋        | 790/4809 [07:42<31:17,  2.14it/s]

{'loss': 0.0724, 'grad_norm': 0.6549837589263916, 'learning_rate': 2.5071740486587647e-05, 'epoch': 0.49}


 17%|█▋        | 800/4809 [07:47<31:32,  2.12it/s]

{'loss': 0.0757, 'grad_norm': 1.4589420557022095, 'learning_rate': 2.50093574547723e-05, 'epoch': 0.5}


 17%|█▋        | 810/4809 [07:52<31:17,  2.13it/s]

{'loss': 0.0853, 'grad_norm': 15.681805610656738, 'learning_rate': 2.4946974422956957e-05, 'epoch': 0.51}


 17%|█▋        | 820/4809 [07:56<31:07,  2.14it/s]

{'loss': 0.075, 'grad_norm': 0.6952224969863892, 'learning_rate': 2.4884591391141608e-05, 'epoch': 0.51}


 17%|█▋        | 830/4809 [08:01<30:56,  2.14it/s]

{'loss': 0.0756, 'grad_norm': 0.6395836472511292, 'learning_rate': 2.4822208359326263e-05, 'epoch': 0.52}


 17%|█▋        | 840/4809 [08:06<30:52,  2.14it/s]

{'loss': 0.0787, 'grad_norm': 0.5270175933837891, 'learning_rate': 2.4759825327510918e-05, 'epoch': 0.52}


 18%|█▊        | 850/4809 [08:11<30:49,  2.14it/s]

{'loss': 0.0718, 'grad_norm': 0.6006007790565491, 'learning_rate': 2.469744229569557e-05, 'epoch': 0.53}


 18%|█▊        | 860/4809 [08:15<30:54,  2.13it/s]

{'loss': 0.0703, 'grad_norm': 0.6481507420539856, 'learning_rate': 2.4635059263880225e-05, 'epoch': 0.54}


 18%|█▊        | 870/4809 [08:20<32:10,  2.04it/s]

{'loss': 0.0646, 'grad_norm': 0.5660766959190369, 'learning_rate': 2.457267623206488e-05, 'epoch': 0.54}


 18%|█▊        | 880/4809 [08:27<49:44,  1.32it/s]

{'loss': 0.0615, 'grad_norm': 0.458453506231308, 'learning_rate': 2.451029320024953e-05, 'epoch': 0.55}


 19%|█▊        | 890/4809 [08:33<33:16,  1.96it/s]

{'loss': 0.0661, 'grad_norm': 0.39964696764945984, 'learning_rate': 2.4447910168434186e-05, 'epoch': 0.56}


 19%|█▊        | 900/4809 [08:38<31:04,  2.10it/s]

{'loss': 0.0619, 'grad_norm': 0.4716455936431885, 'learning_rate': 2.438552713661884e-05, 'epoch': 0.56}


 19%|█▉        | 910/4809 [08:43<30:58,  2.10it/s]

{'loss': 0.0675, 'grad_norm': 6.093461990356445, 'learning_rate': 2.4323144104803493e-05, 'epoch': 0.57}


 19%|█▉        | 920/4809 [08:48<30:57,  2.09it/s]

{'loss': 0.0705, 'grad_norm': 0.6540942788124084, 'learning_rate': 2.4260761072988148e-05, 'epoch': 0.57}


 19%|█▉        | 930/4809 [08:52<30:45,  2.10it/s]

{'loss': 0.0512, 'grad_norm': 0.40196236968040466, 'learning_rate': 2.4198378041172803e-05, 'epoch': 0.58}


 20%|█▉        | 940/4809 [08:57<30:27,  2.12it/s]

{'loss': 0.0572, 'grad_norm': 0.6145085096359253, 'learning_rate': 2.4135995009357454e-05, 'epoch': 0.59}


 20%|█▉        | 950/4809 [09:02<30:11,  2.13it/s]

{'loss': 0.0773, 'grad_norm': 0.7424068450927734, 'learning_rate': 2.407361197754211e-05, 'epoch': 0.59}


 20%|█▉        | 960/4809 [09:07<30:17,  2.12it/s]

{'loss': 0.0541, 'grad_norm': 0.6715793013572693, 'learning_rate': 2.4011228945726764e-05, 'epoch': 0.6}


 20%|██        | 970/4809 [09:11<30:15,  2.11it/s]

{'loss': 0.0675, 'grad_norm': 0.4110478460788727, 'learning_rate': 2.3948845913911416e-05, 'epoch': 0.61}


 20%|██        | 980/4809 [09:16<30:01,  2.13it/s]

{'loss': 0.0691, 'grad_norm': 0.5415746569633484, 'learning_rate': 2.388646288209607e-05, 'epoch': 0.61}


 21%|██        | 990/4809 [09:21<30:07,  2.11it/s]

{'loss': 0.0664, 'grad_norm': 0.40490007400512695, 'learning_rate': 2.3824079850280726e-05, 'epoch': 0.62}


 21%|██        | 1000/4809 [09:25<30:04,  2.11it/s]

{'loss': 0.0555, 'grad_norm': 0.37321215867996216, 'learning_rate': 2.3761696818465378e-05, 'epoch': 0.62}


 21%|██        | 1010/4809 [09:32<31:28,  2.01it/s]

{'loss': 0.0678, 'grad_norm': 0.8753653764724731, 'learning_rate': 2.3699313786650033e-05, 'epoch': 0.63}


 21%|██        | 1020/4809 [09:37<30:33,  2.07it/s]

{'loss': 0.0587, 'grad_norm': 0.42629265785217285, 'learning_rate': 2.3636930754834688e-05, 'epoch': 0.64}


 21%|██▏       | 1030/4809 [09:41<30:05,  2.09it/s]

{'loss': 0.0642, 'grad_norm': 0.49161532521247864, 'learning_rate': 2.357454772301934e-05, 'epoch': 0.64}


 22%|██▏       | 1040/4809 [09:46<29:37,  2.12it/s]

{'loss': 0.0622, 'grad_norm': 0.48323628306388855, 'learning_rate': 2.3512164691203994e-05, 'epoch': 0.65}


 22%|██▏       | 1050/4809 [09:51<29:54,  2.10it/s]

{'loss': 0.0572, 'grad_norm': 0.3899824917316437, 'learning_rate': 2.344978165938865e-05, 'epoch': 0.66}


 22%|██▏       | 1060/4809 [09:56<29:29,  2.12it/s]

{'loss': 0.0659, 'grad_norm': 0.6025416254997253, 'learning_rate': 2.33873986275733e-05, 'epoch': 0.66}


 22%|██▏       | 1070/4809 [10:00<29:49,  2.09it/s]

{'loss': 0.0896, 'grad_norm': 0.42370298504829407, 'learning_rate': 2.3325015595757956e-05, 'epoch': 0.67}


 22%|██▏       | 1080/4809 [10:05<29:47,  2.09it/s]

{'loss': 0.074, 'grad_norm': 0.4440394639968872, 'learning_rate': 2.326263256394261e-05, 'epoch': 0.67}


 23%|██▎       | 1090/4809 [10:10<29:32,  2.10it/s]

{'loss': 0.048, 'grad_norm': 0.38542065024375916, 'learning_rate': 2.3200249532127262e-05, 'epoch': 0.68}


 23%|██▎       | 1100/4809 [10:15<29:29,  2.10it/s]

{'loss': 0.0538, 'grad_norm': 0.4099353849887848, 'learning_rate': 2.3137866500311917e-05, 'epoch': 0.69}


 23%|██▎       | 1110/4809 [10:19<28:53,  2.13it/s]

{'loss': 0.0624, 'grad_norm': 0.6218278408050537, 'learning_rate': 2.3075483468496572e-05, 'epoch': 0.69}


 23%|██▎       | 1120/4809 [10:24<29:05,  2.11it/s]

{'loss': 0.0612, 'grad_norm': 0.32824525237083435, 'learning_rate': 2.3013100436681224e-05, 'epoch': 0.7}


 23%|██▎       | 1130/4809 [10:29<29:15,  2.10it/s]

{'loss': 0.0529, 'grad_norm': 0.3911222815513611, 'learning_rate': 2.295071740486588e-05, 'epoch': 0.7}


 24%|██▎       | 1140/4809 [10:34<29:12,  2.09it/s]

{'loss': 0.0589, 'grad_norm': 0.4738723635673523, 'learning_rate': 2.2888334373050534e-05, 'epoch': 0.71}


 24%|██▍       | 1150/4809 [10:38<29:18,  2.08it/s]

{'loss': 0.0659, 'grad_norm': 0.5136259198188782, 'learning_rate': 2.2825951341235186e-05, 'epoch': 0.72}


 24%|██▍       | 1160/4809 [10:43<28:43,  2.12it/s]

{'loss': 0.0594, 'grad_norm': 0.3901600241661072, 'learning_rate': 2.2763568309419837e-05, 'epoch': 0.72}


 24%|██▍       | 1170/4809 [10:48<28:42,  2.11it/s]

{'loss': 0.059, 'grad_norm': 0.35076314210891724, 'learning_rate': 2.2701185277604492e-05, 'epoch': 0.73}


 25%|██▍       | 1180/4809 [10:53<29:21,  2.06it/s]

{'loss': 0.0603, 'grad_norm': 1.390964150428772, 'learning_rate': 2.2638802245789144e-05, 'epoch': 0.74}


 25%|██▍       | 1190/4809 [10:57<28:48,  2.09it/s]

{'loss': 0.0517, 'grad_norm': 0.31100836396217346, 'learning_rate': 2.25764192139738e-05, 'epoch': 0.74}


 25%|██▍       | 1200/4809 [11:02<29:48,  2.02it/s]

{'loss': 0.05, 'grad_norm': 0.47953206300735474, 'learning_rate': 2.2514036182158454e-05, 'epoch': 0.75}


 25%|██▌       | 1210/4809 [11:10<45:29,  1.32it/s]

{'loss': 0.0704, 'grad_norm': 0.4182959794998169, 'learning_rate': 2.2451653150343105e-05, 'epoch': 0.75}


 25%|██▌       | 1220/4809 [11:17<45:12,  1.32it/s]

{'loss': 0.0562, 'grad_norm': 0.5505791902542114, 'learning_rate': 2.238927011852776e-05, 'epoch': 0.76}


 26%|██▌       | 1230/4809 [11:25<44:48,  1.33it/s]

{'loss': 0.0554, 'grad_norm': 0.4082561135292053, 'learning_rate': 2.2326887086712415e-05, 'epoch': 0.77}


 26%|██▌       | 1240/4809 [11:32<44:35,  1.33it/s]

{'loss': 0.0622, 'grad_norm': 0.35641977190971375, 'learning_rate': 2.2264504054897067e-05, 'epoch': 0.77}


 26%|██▌       | 1250/4809 [11:40<44:38,  1.33it/s]

{'loss': 0.0553, 'grad_norm': 0.5319871306419373, 'learning_rate': 2.2202121023081722e-05, 'epoch': 0.78}


 26%|██▌       | 1260/4809 [11:47<44:34,  1.33it/s]

{'loss': 0.0661, 'grad_norm': 0.3815642297267914, 'learning_rate': 2.2139737991266377e-05, 'epoch': 0.79}


 26%|██▋       | 1270/4809 [11:55<44:40,  1.32it/s]

{'loss': 0.0709, 'grad_norm': 1.6178721189498901, 'learning_rate': 2.207735495945103e-05, 'epoch': 0.79}


 27%|██▋       | 1280/4809 [12:03<44:14,  1.33it/s]

{'loss': 0.06, 'grad_norm': 0.3216295540332794, 'learning_rate': 2.2014971927635683e-05, 'epoch': 0.8}


 27%|██▋       | 1290/4809 [12:10<44:09,  1.33it/s]

{'loss': 0.0503, 'grad_norm': 0.4869525730609894, 'learning_rate': 2.1952588895820335e-05, 'epoch': 0.8}


 27%|██▋       | 1300/4809 [12:18<43:50,  1.33it/s]

{'loss': 0.062, 'grad_norm': 0.5316475629806519, 'learning_rate': 2.189020586400499e-05, 'epoch': 0.81}


 27%|██▋       | 1310/4809 [12:25<43:44,  1.33it/s]

{'loss': 0.0562, 'grad_norm': 0.5594882369041443, 'learning_rate': 2.1827822832189645e-05, 'epoch': 0.82}


 27%|██▋       | 1320/4809 [12:33<43:18,  1.34it/s]

{'loss': 0.0517, 'grad_norm': 0.6030540466308594, 'learning_rate': 2.1765439800374297e-05, 'epoch': 0.82}


 28%|██▊       | 1330/4809 [12:40<43:30,  1.33it/s]

{'loss': 0.0599, 'grad_norm': 0.7506543397903442, 'learning_rate': 2.170305676855895e-05, 'epoch': 0.83}


 28%|██▊       | 1340/4809 [12:48<43:51,  1.32it/s]

{'loss': 0.0724, 'grad_norm': 0.2489054948091507, 'learning_rate': 2.1640673736743607e-05, 'epoch': 0.84}


 28%|██▊       | 1350/4809 [12:56<43:59,  1.31it/s]

{'loss': 0.0408, 'grad_norm': 0.25788626074790955, 'learning_rate': 2.1578290704928258e-05, 'epoch': 0.84}


 28%|██▊       | 1360/4809 [13:03<44:24,  1.29it/s]

{'loss': 0.0634, 'grad_norm': 0.46892106533050537, 'learning_rate': 2.1515907673112913e-05, 'epoch': 0.85}


 28%|██▊       | 1370/4809 [13:11<43:17,  1.32it/s]

{'loss': 0.0558, 'grad_norm': 0.4066108465194702, 'learning_rate': 2.1453524641297568e-05, 'epoch': 0.85}


 29%|██▊       | 1380/4809 [13:18<42:49,  1.33it/s]

{'loss': 0.0553, 'grad_norm': 0.3242902457714081, 'learning_rate': 2.139114160948222e-05, 'epoch': 0.86}


 29%|██▉       | 1390/4809 [13:26<42:50,  1.33it/s]

{'loss': 0.0524, 'grad_norm': 0.37799152731895447, 'learning_rate': 2.1328758577666875e-05, 'epoch': 0.87}


 29%|██▉       | 1400/4809 [13:33<42:30,  1.34it/s]

{'loss': 0.0696, 'grad_norm': 0.3781944811344147, 'learning_rate': 2.126637554585153e-05, 'epoch': 0.87}


 29%|██▉       | 1410/4809 [13:41<42:40,  1.33it/s]

{'loss': 0.0791, 'grad_norm': 0.4406336545944214, 'learning_rate': 2.120399251403618e-05, 'epoch': 0.88}


 30%|██▉       | 1420/4809 [13:48<42:16,  1.34it/s]

{'loss': 0.0522, 'grad_norm': 0.9837760329246521, 'learning_rate': 2.1141609482220836e-05, 'epoch': 0.89}


 30%|██▉       | 1430/4809 [13:56<42:28,  1.33it/s]

{'loss': 0.0554, 'grad_norm': 0.2703597843647003, 'learning_rate': 2.107922645040549e-05, 'epoch': 0.89}


 30%|██▉       | 1440/4809 [14:03<42:11,  1.33it/s]

{'loss': 0.0585, 'grad_norm': 0.34961605072021484, 'learning_rate': 2.1016843418590143e-05, 'epoch': 0.9}


 30%|███       | 1450/4809 [14:11<42:12,  1.33it/s]

{'loss': 0.0534, 'grad_norm': 0.37299469113349915, 'learning_rate': 2.0954460386774798e-05, 'epoch': 0.9}


 30%|███       | 1460/4809 [14:19<43:11,  1.29it/s]

{'loss': 0.0517, 'grad_norm': 0.501522421836853, 'learning_rate': 2.0892077354959453e-05, 'epoch': 0.91}


 31%|███       | 1470/4809 [14:26<42:52,  1.30it/s]

{'loss': 0.0596, 'grad_norm': 0.483264297246933, 'learning_rate': 2.0829694323144105e-05, 'epoch': 0.92}


 31%|███       | 1480/4809 [14:34<41:43,  1.33it/s]

{'loss': 0.0623, 'grad_norm': 0.6507223844528198, 'learning_rate': 2.076731129132876e-05, 'epoch': 0.92}


 31%|███       | 1490/4809 [14:41<41:27,  1.33it/s]

{'loss': 0.0543, 'grad_norm': 0.2649420201778412, 'learning_rate': 2.0704928259513414e-05, 'epoch': 0.93}


 31%|███       | 1500/4809 [14:49<41:29,  1.33it/s]

{'loss': 0.058, 'grad_norm': 0.409466952085495, 'learning_rate': 2.0642545227698066e-05, 'epoch': 0.94}


 31%|███▏      | 1510/4809 [14:58<42:15,  1.30it/s]  

{'loss': 0.0542, 'grad_norm': 0.2862611413002014, 'learning_rate': 2.058016219588272e-05, 'epoch': 0.94}


 32%|███▏      | 1520/4809 [15:05<41:04,  1.33it/s]

{'loss': 0.0499, 'grad_norm': 0.4281299412250519, 'learning_rate': 2.0517779164067376e-05, 'epoch': 0.95}


 32%|███▏      | 1530/4809 [15:13<41:13,  1.33it/s]

{'loss': 0.0536, 'grad_norm': 0.35286471247673035, 'learning_rate': 2.0455396132252028e-05, 'epoch': 0.95}


 32%|███▏      | 1540/4809 [15:21<40:49,  1.33it/s]

{'loss': 0.0506, 'grad_norm': 0.27324172854423523, 'learning_rate': 2.0393013100436683e-05, 'epoch': 0.96}


 32%|███▏      | 1550/4809 [15:28<40:42,  1.33it/s]

{'loss': 0.0535, 'grad_norm': 0.47402945160865784, 'learning_rate': 2.0330630068621338e-05, 'epoch': 0.97}


 32%|███▏      | 1560/4809 [15:36<40:37,  1.33it/s]

{'loss': 0.0619, 'grad_norm': 0.36122894287109375, 'learning_rate': 2.026824703680599e-05, 'epoch': 0.97}


 33%|███▎      | 1570/4809 [15:43<40:27,  1.33it/s]

{'loss': 0.0529, 'grad_norm': 0.33016151189804077, 'learning_rate': 2.0205864004990644e-05, 'epoch': 0.98}


 33%|███▎      | 1580/4809 [15:51<40:23,  1.33it/s]

{'loss': 0.0542, 'grad_norm': 0.8423780202865601, 'learning_rate': 2.01434809731753e-05, 'epoch': 0.99}


 33%|███▎      | 1590/4809 [15:58<41:27,  1.29it/s]

{'loss': 0.0525, 'grad_norm': 0.38234585523605347, 'learning_rate': 2.008109794135995e-05, 'epoch': 0.99}


 33%|███▎      | 1600/4809 [16:06<41:34,  1.29it/s]

{'loss': 0.05, 'grad_norm': 0.7934027314186096, 'learning_rate': 2.0018714909544606e-05, 'epoch': 1.0}



 33%|███▎      | 1603/4809 [19:26<36:36,  1.46it/s]

{'eval_loss': 0.07020910829305649, 'eval_sacrebleu': 11.610413085965229, 'eval_runtime': 198.3782, 'eval_samples_per_second': 6.956, 'eval_steps_per_second': 1.739, 'epoch': 1.0}


 33%|███▎      | 1610/4809 [19:32<6:53:30,  7.76s/it] 

{'loss': 0.0473, 'grad_norm': 0.4630511403083801, 'learning_rate': 1.995633187772926e-05, 'epoch': 1.0}


 34%|███▎      | 1620/4809 [19:39<50:23,  1.05it/s]  

{'loss': 0.0572, 'grad_norm': 0.3906794488430023, 'learning_rate': 1.9893948845913912e-05, 'epoch': 1.01}


 34%|███▍      | 1630/4809 [19:47<40:31,  1.31it/s]

{'loss': 0.0499, 'grad_norm': 0.3770335912704468, 'learning_rate': 1.9831565814098564e-05, 'epoch': 1.02}


 34%|███▍      | 1640/4809 [19:55<39:38,  1.33it/s]

{'loss': 0.0555, 'grad_norm': 0.32056865096092224, 'learning_rate': 1.976918278228322e-05, 'epoch': 1.02}


 34%|███▍      | 1650/4809 [20:02<39:31,  1.33it/s]

{'loss': 0.0498, 'grad_norm': 0.4679020345211029, 'learning_rate': 1.970679975046787e-05, 'epoch': 1.03}


 35%|███▍      | 1660/4809 [20:10<39:26,  1.33it/s]

{'loss': 0.0441, 'grad_norm': 0.45959773659706116, 'learning_rate': 1.9644416718652526e-05, 'epoch': 1.04}


 35%|███▍      | 1670/4809 [20:17<39:11,  1.33it/s]

{'loss': 0.0579, 'grad_norm': 0.2701535224914551, 'learning_rate': 1.958203368683718e-05, 'epoch': 1.04}


 35%|███▍      | 1680/4809 [20:25<39:14,  1.33it/s]

{'loss': 0.0601, 'grad_norm': 0.35505664348602295, 'learning_rate': 1.9519650655021832e-05, 'epoch': 1.05}


 35%|███▌      | 1690/4809 [20:32<39:16,  1.32it/s]

{'loss': 0.0508, 'grad_norm': 0.29232171177864075, 'learning_rate': 1.9457267623206487e-05, 'epoch': 1.05}


 35%|███▌      | 1700/4809 [20:40<38:54,  1.33it/s]

{'loss': 0.0492, 'grad_norm': 0.4922715425491333, 'learning_rate': 1.9394884591391142e-05, 'epoch': 1.06}


 36%|███▌      | 1710/4809 [20:47<38:45,  1.33it/s]

{'loss': 0.0616, 'grad_norm': 0.6182212233543396, 'learning_rate': 1.9332501559575794e-05, 'epoch': 1.07}


 36%|███▌      | 1720/4809 [20:55<38:52,  1.32it/s]

{'loss': 0.054, 'grad_norm': 0.25438472628593445, 'learning_rate': 1.927011852776045e-05, 'epoch': 1.07}


 36%|███▌      | 1730/4809 [21:02<38:30,  1.33it/s]

{'loss': 0.0377, 'grad_norm': 0.38257643580436707, 'learning_rate': 1.9207735495945104e-05, 'epoch': 1.08}


 36%|███▌      | 1740/4809 [21:10<38:30,  1.33it/s]

{'loss': 0.0419, 'grad_norm': 0.5568053126335144, 'learning_rate': 1.9145352464129755e-05, 'epoch': 1.09}


 36%|███▋      | 1750/4809 [21:17<38:22,  1.33it/s]

{'loss': 0.0558, 'grad_norm': 0.4392715394496918, 'learning_rate': 1.908296943231441e-05, 'epoch': 1.09}


 37%|███▋      | 1760/4809 [21:25<38:14,  1.33it/s]

{'loss': 0.0577, 'grad_norm': 0.46203792095184326, 'learning_rate': 1.9020586400499065e-05, 'epoch': 1.1}


 37%|███▋      | 1770/4809 [21:33<38:08,  1.33it/s]

{'loss': 0.0677, 'grad_norm': 0.47827035188674927, 'learning_rate': 1.8958203368683717e-05, 'epoch': 1.1}


 37%|███▋      | 1780/4809 [21:40<38:00,  1.33it/s]

{'loss': 0.058, 'grad_norm': 0.32760074734687805, 'learning_rate': 1.8895820336868372e-05, 'epoch': 1.11}


 37%|███▋      | 1790/4809 [21:48<37:41,  1.34it/s]

{'loss': 0.0441, 'grad_norm': 0.31088903546333313, 'learning_rate': 1.8833437305053027e-05, 'epoch': 1.12}


 37%|███▋      | 1800/4809 [21:55<37:30,  1.34it/s]

{'loss': 0.0487, 'grad_norm': 0.2738548517227173, 'learning_rate': 1.877105427323768e-05, 'epoch': 1.12}


 38%|███▊      | 1810/4809 [22:03<37:42,  1.33it/s]

{'loss': 0.0434, 'grad_norm': 0.3153046667575836, 'learning_rate': 1.8708671241422333e-05, 'epoch': 1.13}


 38%|███▊      | 1820/4809 [22:10<37:21,  1.33it/s]

{'loss': 0.0515, 'grad_norm': 0.5168128609657288, 'learning_rate': 1.864628820960699e-05, 'epoch': 1.14}


 38%|███▊      | 1830/4809 [22:18<37:08,  1.34it/s]

{'loss': 0.0499, 'grad_norm': 0.4153248369693756, 'learning_rate': 1.858390517779164e-05, 'epoch': 1.14}


 38%|███▊      | 1840/4809 [22:25<36:56,  1.34it/s]

{'loss': 0.0514, 'grad_norm': 0.4258531928062439, 'learning_rate': 1.8521522145976295e-05, 'epoch': 1.15}


 38%|███▊      | 1850/4809 [22:33<36:55,  1.34it/s]

{'loss': 0.047, 'grad_norm': 0.30229824781417847, 'learning_rate': 1.845913911416095e-05, 'epoch': 1.15}


 39%|███▊      | 1860/4809 [22:40<36:47,  1.34it/s]

{'loss': 0.0675, 'grad_norm': 0.36775854229927063, 'learning_rate': 1.83967560823456e-05, 'epoch': 1.16}


 39%|███▉      | 1870/4809 [22:48<36:48,  1.33it/s]

{'loss': 0.0453, 'grad_norm': 0.3046451508998871, 'learning_rate': 1.8334373050530257e-05, 'epoch': 1.17}


 39%|███▉      | 1880/4809 [22:55<36:33,  1.34it/s]

{'loss': 0.0467, 'grad_norm': 0.3010392487049103, 'learning_rate': 1.827199001871491e-05, 'epoch': 1.17}


 39%|███▉      | 1890/4809 [23:03<36:37,  1.33it/s]

{'loss': 0.0437, 'grad_norm': 0.6142290234565735, 'learning_rate': 1.8209606986899563e-05, 'epoch': 1.18}


 40%|███▉      | 1900/4809 [23:10<36:24,  1.33it/s]

{'loss': 0.0694, 'grad_norm': 0.37350502610206604, 'learning_rate': 1.8147223955084218e-05, 'epoch': 1.19}


 40%|███▉      | 1910/4809 [23:18<36:27,  1.33it/s]

{'loss': 0.0591, 'grad_norm': 0.5594028830528259, 'learning_rate': 1.8084840923268873e-05, 'epoch': 1.19}


 40%|███▉      | 1920/4809 [23:25<36:03,  1.34it/s]

{'loss': 0.0542, 'grad_norm': 0.3945244550704956, 'learning_rate': 1.8022457891453525e-05, 'epoch': 1.2}


 40%|████      | 1930/4809 [23:33<36:09,  1.33it/s]

{'loss': 0.0451, 'grad_norm': 0.3405929207801819, 'learning_rate': 1.796007485963818e-05, 'epoch': 1.2}


 40%|████      | 1940/4809 [23:40<35:56,  1.33it/s]

{'loss': 0.0609, 'grad_norm': 0.37838760018348694, 'learning_rate': 1.7897691827822835e-05, 'epoch': 1.21}


 41%|████      | 1950/4809 [23:48<36:12,  1.32it/s]

{'loss': 0.0501, 'grad_norm': 0.4816477298736572, 'learning_rate': 1.7835308796007486e-05, 'epoch': 1.22}


 41%|████      | 1960/4809 [23:56<35:40,  1.33it/s]

{'loss': 0.0561, 'grad_norm': 0.36887848377227783, 'learning_rate': 1.777292576419214e-05, 'epoch': 1.22}


 41%|████      | 1970/4809 [24:03<35:31,  1.33it/s]

{'loss': 0.0486, 'grad_norm': 0.5161802172660828, 'learning_rate': 1.7710542732376796e-05, 'epoch': 1.23}


 41%|████      | 1980/4809 [24:11<35:26,  1.33it/s]

{'loss': 0.0533, 'grad_norm': 0.5800255537033081, 'learning_rate': 1.7648159700561448e-05, 'epoch': 1.24}


 41%|████▏     | 1990/4809 [24:18<35:26,  1.33it/s]

{'loss': 0.0453, 'grad_norm': 0.41463613510131836, 'learning_rate': 1.7585776668746103e-05, 'epoch': 1.24}


 42%|████▏     | 2000/4809 [24:26<35:13,  1.33it/s]

{'loss': 0.0499, 'grad_norm': 0.3049864172935486, 'learning_rate': 1.7523393636930758e-05, 'epoch': 1.25}


 42%|████▏     | 2010/4809 [24:32<22:54,  2.04it/s]

{'loss': 0.0576, 'grad_norm': 0.540787935256958, 'learning_rate': 1.746101060511541e-05, 'epoch': 1.25}


 42%|████▏     | 2020/4809 [24:36<21:42,  2.14it/s]

{'loss': 0.0447, 'grad_norm': 0.3959307372570038, 'learning_rate': 1.7398627573300065e-05, 'epoch': 1.26}


 42%|████▏     | 2030/4809 [24:41<21:32,  2.15it/s]

{'loss': 0.0422, 'grad_norm': 0.2778511643409729, 'learning_rate': 1.733624454148472e-05, 'epoch': 1.27}


 42%|████▏     | 2040/4809 [24:46<21:27,  2.15it/s]

{'loss': 0.0525, 'grad_norm': 0.32884830236434937, 'learning_rate': 1.727386150966937e-05, 'epoch': 1.27}


 43%|████▎     | 2050/4809 [24:50<21:28,  2.14it/s]

{'loss': 0.0525, 'grad_norm': 0.6591536998748779, 'learning_rate': 1.7211478477854026e-05, 'epoch': 1.28}


 43%|████▎     | 2060/4809 [24:55<21:20,  2.15it/s]

{'loss': 0.0555, 'grad_norm': 0.40245577692985535, 'learning_rate': 1.714909544603868e-05, 'epoch': 1.29}


 43%|████▎     | 2070/4809 [25:00<21:20,  2.14it/s]

{'loss': 0.0575, 'grad_norm': 0.16724343597888947, 'learning_rate': 1.7086712414223333e-05, 'epoch': 1.29}


 43%|████▎     | 2080/4809 [25:04<21:11,  2.15it/s]

{'loss': 0.0493, 'grad_norm': 0.3220089375972748, 'learning_rate': 1.7024329382407984e-05, 'epoch': 1.3}


 43%|████▎     | 2090/4809 [25:09<21:08,  2.14it/s]

{'loss': 0.061, 'grad_norm': 0.5013978481292725, 'learning_rate': 1.696194635059264e-05, 'epoch': 1.3}


 44%|████▎     | 2100/4809 [25:14<21:01,  2.15it/s]

{'loss': 0.0439, 'grad_norm': 1.1533199548721313, 'learning_rate': 1.689956331877729e-05, 'epoch': 1.31}


 44%|████▍     | 2110/4809 [25:18<21:04,  2.13it/s]

{'loss': 0.0487, 'grad_norm': 0.2556804120540619, 'learning_rate': 1.6837180286961946e-05, 'epoch': 1.32}


 44%|████▍     | 2120/4809 [25:23<21:06,  2.12it/s]

{'loss': 0.0618, 'grad_norm': 0.3316275477409363, 'learning_rate': 1.6774797255146597e-05, 'epoch': 1.32}


 44%|████▍     | 2130/4809 [25:28<20:47,  2.15it/s]

{'loss': 0.0468, 'grad_norm': 0.4193956255912781, 'learning_rate': 1.6712414223331252e-05, 'epoch': 1.33}


 44%|████▍     | 2140/4809 [25:32<20:51,  2.13it/s]

{'loss': 0.052, 'grad_norm': 1.561191439628601, 'learning_rate': 1.6650031191515907e-05, 'epoch': 1.33}


 45%|████▍     | 2150/4809 [25:37<20:44,  2.14it/s]

{'loss': 0.0402, 'grad_norm': 0.2674272060394287, 'learning_rate': 1.658764815970056e-05, 'epoch': 1.34}


 45%|████▍     | 2160/4809 [25:42<20:33,  2.15it/s]

{'loss': 0.065, 'grad_norm': 0.39131879806518555, 'learning_rate': 1.6525265127885214e-05, 'epoch': 1.35}


 45%|████▌     | 2170/4809 [25:46<20:25,  2.15it/s]

{'loss': 0.0441, 'grad_norm': 0.42992693185806274, 'learning_rate': 1.646288209606987e-05, 'epoch': 1.35}


 45%|████▌     | 2180/4809 [25:51<20:26,  2.14it/s]

{'loss': 0.0842, 'grad_norm': 0.42844557762145996, 'learning_rate': 1.640049906425452e-05, 'epoch': 1.36}


 46%|████▌     | 2190/4809 [25:56<20:19,  2.15it/s]

{'loss': 0.0526, 'grad_norm': 0.47627556324005127, 'learning_rate': 1.6338116032439176e-05, 'epoch': 1.37}


 46%|████▌     | 2200/4809 [26:01<20:43,  2.10it/s]

{'loss': 0.0494, 'grad_norm': 0.33705222606658936, 'learning_rate': 1.627573300062383e-05, 'epoch': 1.37}


 46%|████▌     | 2210/4809 [26:07<29:09,  1.49it/s]

{'loss': 0.0486, 'grad_norm': 0.7116111516952515, 'learning_rate': 1.6213349968808482e-05, 'epoch': 1.38}


 46%|████▌     | 2220/4809 [26:14<30:41,  1.41it/s]

{'loss': 0.0643, 'grad_norm': 0.378670871257782, 'learning_rate': 1.6150966936993137e-05, 'epoch': 1.38}


 46%|████▋     | 2230/4809 [26:21<30:36,  1.40it/s]

{'loss': 0.0462, 'grad_norm': 0.3461899161338806, 'learning_rate': 1.6088583905177792e-05, 'epoch': 1.39}


 47%|████▋     | 2240/4809 [26:28<30:35,  1.40it/s]

{'loss': 0.048, 'grad_norm': 0.2937355935573578, 'learning_rate': 1.6026200873362444e-05, 'epoch': 1.4}


 47%|████▋     | 2250/4809 [26:35<30:26,  1.40it/s]

{'loss': 0.0441, 'grad_norm': 0.3787110447883606, 'learning_rate': 1.59638178415471e-05, 'epoch': 1.4}


 47%|████▋     | 2260/4809 [26:42<30:15,  1.40it/s]

{'loss': 0.0614, 'grad_norm': 0.5420246720314026, 'learning_rate': 1.5901434809731754e-05, 'epoch': 1.41}


 47%|████▋     | 2270/4809 [26:50<30:10,  1.40it/s]

{'loss': 0.0463, 'grad_norm': 0.23159541189670563, 'learning_rate': 1.5839051777916405e-05, 'epoch': 1.42}


 47%|████▋     | 2280/4809 [26:57<30:04,  1.40it/s]

{'loss': 0.058, 'grad_norm': 0.30931001901626587, 'learning_rate': 1.577666874610106e-05, 'epoch': 1.42}


 48%|████▊     | 2290/4809 [27:04<29:50,  1.41it/s]

{'loss': 0.0515, 'grad_norm': 0.30717581510543823, 'learning_rate': 1.5714285714285715e-05, 'epoch': 1.43}


 48%|████▊     | 2300/4809 [27:11<29:48,  1.40it/s]

{'loss': 0.0587, 'grad_norm': 0.3855573534965515, 'learning_rate': 1.5651902682470367e-05, 'epoch': 1.43}


 48%|████▊     | 2310/4809 [27:18<29:39,  1.40it/s]

{'loss': 0.0437, 'grad_norm': 0.32048094272613525, 'learning_rate': 1.5589519650655022e-05, 'epoch': 1.44}


 48%|████▊     | 2320/4809 [27:25<29:31,  1.41it/s]

{'loss': 0.0633, 'grad_norm': 0.3947415053844452, 'learning_rate': 1.5527136618839677e-05, 'epoch': 1.45}


 48%|████▊     | 2330/4809 [27:32<29:25,  1.40it/s]

{'loss': 0.0461, 'grad_norm': 0.4342047870159149, 'learning_rate': 1.546475358702433e-05, 'epoch': 1.45}


 49%|████▊     | 2340/4809 [27:39<29:18,  1.40it/s]

{'loss': 0.0519, 'grad_norm': 0.2641584277153015, 'learning_rate': 1.5402370555208983e-05, 'epoch': 1.46}


 49%|████▉     | 2350/4809 [27:47<29:10,  1.41it/s]

{'loss': 0.0599, 'grad_norm': 0.3743475079536438, 'learning_rate': 1.533998752339364e-05, 'epoch': 1.47}


 49%|████▉     | 2360/4809 [27:54<29:02,  1.41it/s]

{'loss': 0.0469, 'grad_norm': 0.371653288602829, 'learning_rate': 1.527760449157829e-05, 'epoch': 1.47}


 49%|████▉     | 2370/4809 [28:01<29:20,  1.39it/s]

{'loss': 0.0576, 'grad_norm': 0.47059884667396545, 'learning_rate': 1.5215221459762945e-05, 'epoch': 1.48}


 49%|████▉     | 2380/4809 [28:08<28:51,  1.40it/s]

{'loss': 0.0518, 'grad_norm': 0.4519332945346832, 'learning_rate': 1.51528384279476e-05, 'epoch': 1.48}


 50%|████▉     | 2390/4809 [28:15<28:46,  1.40it/s]

{'loss': 0.0574, 'grad_norm': 0.2929302752017975, 'learning_rate': 1.5090455396132252e-05, 'epoch': 1.49}


 50%|████▉     | 2400/4809 [28:22<28:36,  1.40it/s]

{'loss': 0.0585, 'grad_norm': 0.4932253360748291, 'learning_rate': 1.5028072364316907e-05, 'epoch': 1.5}


 50%|█████     | 2410/4809 [28:29<28:26,  1.41it/s]

{'loss': 0.0585, 'grad_norm': 0.44645601511001587, 'learning_rate': 1.496568933250156e-05, 'epoch': 1.5}


 50%|█████     | 2420/4809 [28:37<28:20,  1.40it/s]

{'loss': 0.0448, 'grad_norm': 0.32150694727897644, 'learning_rate': 1.4903306300686215e-05, 'epoch': 1.51}


 51%|█████     | 2430/4809 [28:44<28:18,  1.40it/s]

{'loss': 0.0518, 'grad_norm': 0.3998996913433075, 'learning_rate': 1.4840923268870867e-05, 'epoch': 1.52}


 51%|█████     | 2440/4809 [28:51<28:07,  1.40it/s]

{'loss': 0.0465, 'grad_norm': 0.2652006149291992, 'learning_rate': 1.477854023705552e-05, 'epoch': 1.52}


 51%|█████     | 2450/4809 [28:58<28:03,  1.40it/s]

{'loss': 0.0461, 'grad_norm': 0.49493932723999023, 'learning_rate': 1.4716157205240175e-05, 'epoch': 1.53}


 51%|█████     | 2460/4809 [29:05<27:56,  1.40it/s]

{'loss': 0.04, 'grad_norm': 0.36907854676246643, 'learning_rate': 1.4653774173424828e-05, 'epoch': 1.53}


 51%|█████▏    | 2470/4809 [29:12<27:43,  1.41it/s]

{'loss': 0.0457, 'grad_norm': 0.292263388633728, 'learning_rate': 1.4591391141609481e-05, 'epoch': 1.54}


 52%|█████▏    | 2480/4809 [29:19<27:40,  1.40it/s]

{'loss': 0.0614, 'grad_norm': 0.4359634220600128, 'learning_rate': 1.4529008109794136e-05, 'epoch': 1.55}


 52%|█████▏    | 2490/4809 [29:26<27:32,  1.40it/s]

{'loss': 0.0514, 'grad_norm': 0.4371730387210846, 'learning_rate': 1.446662507797879e-05, 'epoch': 1.55}


 52%|█████▏    | 2500/4809 [29:34<27:24,  1.40it/s]

{'loss': 0.0545, 'grad_norm': 0.4212033748626709, 'learning_rate': 1.4404242046163443e-05, 'epoch': 1.56}


 52%|█████▏    | 2510/4809 [29:42<27:56,  1.37it/s]

{'loss': 0.0614, 'grad_norm': 0.44431403279304504, 'learning_rate': 1.4341859014348098e-05, 'epoch': 1.57}


 52%|█████▏    | 2520/4809 [29:49<27:09,  1.40it/s]

{'loss': 0.0557, 'grad_norm': 0.293131947517395, 'learning_rate': 1.4279475982532751e-05, 'epoch': 1.57}


 53%|█████▎    | 2530/4809 [29:56<27:07,  1.40it/s]

{'loss': 0.0506, 'grad_norm': 0.29705050587654114, 'learning_rate': 1.4217092950717405e-05, 'epoch': 1.58}


 53%|█████▎    | 2540/4809 [30:03<26:55,  1.40it/s]

{'loss': 0.0833, 'grad_norm': 0.26772913336753845, 'learning_rate': 1.415470991890206e-05, 'epoch': 1.58}


 53%|█████▎    | 2550/4809 [30:11<26:48,  1.40it/s]

{'loss': 0.0502, 'grad_norm': 0.47159990668296814, 'learning_rate': 1.4092326887086713e-05, 'epoch': 1.59}


 53%|█████▎    | 2560/4809 [30:18<26:41,  1.40it/s]

{'loss': 0.0458, 'grad_norm': 0.4385506808757782, 'learning_rate': 1.4029943855271366e-05, 'epoch': 1.6}


 53%|█████▎    | 2570/4809 [30:25<26:34,  1.40it/s]

{'loss': 0.0616, 'grad_norm': 0.3164421319961548, 'learning_rate': 1.3967560823456021e-05, 'epoch': 1.6}


 54%|█████▎    | 2580/4809 [30:32<26:27,  1.40it/s]

{'loss': 0.0496, 'grad_norm': 0.22770893573760986, 'learning_rate': 1.3905177791640674e-05, 'epoch': 1.61}


 54%|█████▍    | 2590/4809 [30:39<26:21,  1.40it/s]

{'loss': 0.0592, 'grad_norm': 0.2919440269470215, 'learning_rate': 1.3842794759825328e-05, 'epoch': 1.62}


 54%|█████▍    | 2600/4809 [30:46<26:12,  1.40it/s]

{'loss': 0.0508, 'grad_norm': 0.36004552245140076, 'learning_rate': 1.3780411728009983e-05, 'epoch': 1.62}


 54%|█████▍    | 2610/4809 [30:53<26:03,  1.41it/s]

{'loss': 0.0499, 'grad_norm': 0.30560293793678284, 'learning_rate': 1.3718028696194636e-05, 'epoch': 1.63}


 54%|█████▍    | 2620/4809 [31:00<25:58,  1.40it/s]

{'loss': 0.0448, 'grad_norm': 0.3719046413898468, 'learning_rate': 1.365564566437929e-05, 'epoch': 1.63}


 55%|█████▍    | 2630/4809 [31:08<25:49,  1.41it/s]

{'loss': 0.0611, 'grad_norm': 0.3755300045013428, 'learning_rate': 1.3593262632563944e-05, 'epoch': 1.64}


 55%|█████▍    | 2640/4809 [31:15<25:42,  1.41it/s]

{'loss': 0.0443, 'grad_norm': 0.3545854389667511, 'learning_rate': 1.3530879600748598e-05, 'epoch': 1.65}


 55%|█████▌    | 2650/4809 [31:22<25:38,  1.40it/s]

{'loss': 0.0616, 'grad_norm': 0.20440590381622314, 'learning_rate': 1.3468496568933251e-05, 'epoch': 1.65}


 55%|█████▌    | 2660/4809 [31:29<25:31,  1.40it/s]

{'loss': 0.0431, 'grad_norm': 0.25277256965637207, 'learning_rate': 1.3406113537117904e-05, 'epoch': 1.66}


 56%|█████▌    | 2670/4809 [31:36<25:26,  1.40it/s]

{'loss': 0.0427, 'grad_norm': 0.2813083231449127, 'learning_rate': 1.3343730505302557e-05, 'epoch': 1.67}


 56%|█████▌    | 2680/4809 [31:43<25:18,  1.40it/s]

{'loss': 0.0461, 'grad_norm': 0.5050594210624695, 'learning_rate': 1.328134747348721e-05, 'epoch': 1.67}


 56%|█████▌    | 2690/4809 [31:50<25:07,  1.41it/s]

{'loss': 0.05, 'grad_norm': 0.369850754737854, 'learning_rate': 1.3218964441671866e-05, 'epoch': 1.68}


 56%|█████▌    | 2700/4809 [31:57<25:00,  1.41it/s]

{'loss': 0.0591, 'grad_norm': 0.25228381156921387, 'learning_rate': 1.3156581409856519e-05, 'epoch': 1.68}


 56%|█████▋    | 2710/4809 [32:05<24:54,  1.40it/s]

{'loss': 0.0487, 'grad_norm': 0.3117560148239136, 'learning_rate': 1.3094198378041172e-05, 'epoch': 1.69}


 57%|█████▋    | 2720/4809 [32:12<24:47,  1.40it/s]

{'loss': 0.039, 'grad_norm': 0.3110908269882202, 'learning_rate': 1.3031815346225827e-05, 'epoch': 1.7}


 57%|█████▋    | 2730/4809 [32:19<24:39,  1.40it/s]

{'loss': 0.0535, 'grad_norm': 0.33742067217826843, 'learning_rate': 1.296943231441048e-05, 'epoch': 1.7}


 57%|█████▋    | 2740/4809 [32:26<24:31,  1.41it/s]

{'loss': 0.0471, 'grad_norm': 0.5064645409584045, 'learning_rate': 1.2907049282595134e-05, 'epoch': 1.71}


 57%|█████▋    | 2750/4809 [32:33<24:25,  1.40it/s]

{'loss': 0.0518, 'grad_norm': 0.44979605078697205, 'learning_rate': 1.2844666250779787e-05, 'epoch': 1.72}


 57%|█████▋    | 2760/4809 [32:40<24:17,  1.41it/s]

{'loss': 0.0493, 'grad_norm': 0.20076382160186768, 'learning_rate': 1.2782283218964442e-05, 'epoch': 1.72}


 58%|█████▊    | 2770/4809 [32:47<24:11,  1.40it/s]

{'loss': 0.0498, 'grad_norm': 0.37733110785484314, 'learning_rate': 1.2719900187149095e-05, 'epoch': 1.73}


 58%|█████▊    | 2780/4809 [32:54<24:06,  1.40it/s]

{'loss': 0.0409, 'grad_norm': 0.22488558292388916, 'learning_rate': 1.2657517155333749e-05, 'epoch': 1.73}


 58%|█████▊    | 2790/4809 [33:02<23:59,  1.40it/s]

{'loss': 0.0571, 'grad_norm': 0.33359602093696594, 'learning_rate': 1.2595134123518404e-05, 'epoch': 1.74}


 58%|█████▊    | 2800/4809 [33:09<23:52,  1.40it/s]

{'loss': 0.0497, 'grad_norm': 0.32335901260375977, 'learning_rate': 1.2532751091703057e-05, 'epoch': 1.75}


 58%|█████▊    | 2810/4809 [33:16<23:44,  1.40it/s]

{'loss': 0.0562, 'grad_norm': 0.4498320519924164, 'learning_rate': 1.247036805988771e-05, 'epoch': 1.75}


 59%|█████▊    | 2820/4809 [33:23<23:35,  1.41it/s]

{'loss': 0.0532, 'grad_norm': 0.2778667211532593, 'learning_rate': 1.2407985028072365e-05, 'epoch': 1.76}


 59%|█████▉    | 2830/4809 [33:30<23:30,  1.40it/s]

{'loss': 0.0421, 'grad_norm': 0.4286208748817444, 'learning_rate': 1.2345601996257019e-05, 'epoch': 1.77}


 59%|█████▉    | 2840/4809 [33:37<23:20,  1.41it/s]

{'loss': 0.0553, 'grad_norm': 0.3495928943157196, 'learning_rate': 1.2283218964441672e-05, 'epoch': 1.77}


 59%|█████▉    | 2850/4809 [33:44<23:20,  1.40it/s]

{'loss': 0.0513, 'grad_norm': 0.41307100653648376, 'learning_rate': 1.2220835932626327e-05, 'epoch': 1.78}


 59%|█████▉    | 2860/4809 [33:52<23:07,  1.41it/s]

{'loss': 0.043, 'grad_norm': 0.48511141538619995, 'learning_rate': 1.215845290081098e-05, 'epoch': 1.78}


 60%|█████▉    | 2870/4809 [33:59<23:01,  1.40it/s]

{'loss': 0.0516, 'grad_norm': 0.49847957491874695, 'learning_rate': 1.2096069868995634e-05, 'epoch': 1.79}


 60%|█████▉    | 2880/4809 [34:06<22:54,  1.40it/s]

{'loss': 0.0666, 'grad_norm': 0.3490190804004669, 'learning_rate': 1.2033686837180289e-05, 'epoch': 1.8}


 60%|██████    | 2890/4809 [34:13<22:45,  1.41it/s]

{'loss': 0.0479, 'grad_norm': 0.33696866035461426, 'learning_rate': 1.1971303805364942e-05, 'epoch': 1.8}


 60%|██████    | 2900/4809 [34:20<22:39,  1.40it/s]

{'loss': 0.0444, 'grad_norm': 0.26884615421295166, 'learning_rate': 1.1908920773549593e-05, 'epoch': 1.81}


 61%|██████    | 2910/4809 [34:27<22:33,  1.40it/s]

{'loss': 0.0455, 'grad_norm': 0.3950278162956238, 'learning_rate': 1.1846537741734248e-05, 'epoch': 1.82}


 61%|██████    | 2920/4809 [34:34<22:24,  1.40it/s]

{'loss': 0.0466, 'grad_norm': 0.5690175294876099, 'learning_rate': 1.1784154709918902e-05, 'epoch': 1.82}


 61%|██████    | 2930/4809 [34:41<22:17,  1.41it/s]

{'loss': 0.0478, 'grad_norm': 1.0572519302368164, 'learning_rate': 1.1721771678103555e-05, 'epoch': 1.83}


 61%|██████    | 2940/4809 [34:49<22:10,  1.40it/s]

{'loss': 0.0536, 'grad_norm': 0.43662723898887634, 'learning_rate': 1.165938864628821e-05, 'epoch': 1.83}


 61%|██████▏   | 2950/4809 [34:56<22:03,  1.40it/s]

{'loss': 0.0435, 'grad_norm': 0.40427905321121216, 'learning_rate': 1.1597005614472863e-05, 'epoch': 1.84}


 62%|██████▏   | 2960/4809 [35:03<21:56,  1.40it/s]

{'loss': 0.0433, 'grad_norm': 0.3228379189968109, 'learning_rate': 1.1534622582657517e-05, 'epoch': 1.85}


 62%|██████▏   | 2970/4809 [35:10<21:49,  1.40it/s]

{'loss': 0.051, 'grad_norm': 0.33898892998695374, 'learning_rate': 1.1472239550842172e-05, 'epoch': 1.85}


 62%|██████▏   | 2980/4809 [35:17<21:43,  1.40it/s]

{'loss': 0.0427, 'grad_norm': 0.34776705503463745, 'learning_rate': 1.1409856519026825e-05, 'epoch': 1.86}


 62%|██████▏   | 2990/4809 [35:24<21:35,  1.40it/s]

{'loss': 0.0502, 'grad_norm': 0.43681013584136963, 'learning_rate': 1.1347473487211478e-05, 'epoch': 1.87}


 62%|██████▏   | 3000/4809 [35:31<21:29,  1.40it/s]

{'loss': 0.0488, 'grad_norm': 0.28003761172294617, 'learning_rate': 1.1285090455396133e-05, 'epoch': 1.87}


 63%|██████▎   | 3010/4809 [35:40<21:51,  1.37it/s]

{'loss': 0.0399, 'grad_norm': 0.8620200753211975, 'learning_rate': 1.1222707423580786e-05, 'epoch': 1.88}


 63%|██████▎   | 3020/4809 [35:47<21:14,  1.40it/s]

{'loss': 0.0503, 'grad_norm': 0.32582905888557434, 'learning_rate': 1.116032439176544e-05, 'epoch': 1.88}


 63%|██████▎   | 3030/4809 [35:54<21:05,  1.41it/s]

{'loss': 0.0666, 'grad_norm': 0.35426265001296997, 'learning_rate': 1.1097941359950095e-05, 'epoch': 1.89}


 63%|██████▎   | 3040/4809 [36:01<20:58,  1.41it/s]

{'loss': 0.0567, 'grad_norm': 0.38720202445983887, 'learning_rate': 1.1035558328134748e-05, 'epoch': 1.9}


 63%|██████▎   | 3050/4809 [36:08<20:52,  1.40it/s]

{'loss': 0.0504, 'grad_norm': 0.3549478054046631, 'learning_rate': 1.0973175296319401e-05, 'epoch': 1.9}


 64%|██████▎   | 3060/4809 [36:15<20:45,  1.40it/s]

{'loss': 0.0476, 'grad_norm': 0.45907384157180786, 'learning_rate': 1.0910792264504056e-05, 'epoch': 1.91}


 64%|██████▍   | 3070/4809 [36:22<20:36,  1.41it/s]

{'loss': 0.0561, 'grad_norm': 0.6468528509140015, 'learning_rate': 1.084840923268871e-05, 'epoch': 1.92}


 64%|██████▍   | 3080/4809 [36:30<20:29,  1.41it/s]

{'loss': 0.0408, 'grad_norm': 0.3045956492424011, 'learning_rate': 1.0786026200873363e-05, 'epoch': 1.92}


 64%|██████▍   | 3090/4809 [36:37<20:24,  1.40it/s]

{'loss': 0.0502, 'grad_norm': 0.4132218658924103, 'learning_rate': 1.0723643169058018e-05, 'epoch': 1.93}


 64%|██████▍   | 3100/4809 [36:44<20:15,  1.41it/s]

{'loss': 0.0497, 'grad_norm': 0.3024570047855377, 'learning_rate': 1.0661260137242671e-05, 'epoch': 1.93}


 65%|██████▍   | 3110/4809 [36:51<20:09,  1.40it/s]

{'loss': 0.056, 'grad_norm': 0.7941246628761292, 'learning_rate': 1.0598877105427324e-05, 'epoch': 1.94}


 65%|██████▍   | 3120/4809 [36:58<20:01,  1.41it/s]

{'loss': 0.0457, 'grad_norm': 0.29685690999031067, 'learning_rate': 1.0536494073611978e-05, 'epoch': 1.95}


 65%|██████▌   | 3130/4809 [37:05<19:54,  1.41it/s]

{'loss': 0.066, 'grad_norm': 0.44127535820007324, 'learning_rate': 1.0474111041796631e-05, 'epoch': 1.95}


 65%|██████▌   | 3140/4809 [37:12<19:47,  1.41it/s]

{'loss': 0.04, 'grad_norm': 0.30513396859169006, 'learning_rate': 1.0411728009981284e-05, 'epoch': 1.96}


 66%|██████▌   | 3150/4809 [37:19<19:40,  1.40it/s]

{'loss': 0.0457, 'grad_norm': 0.6547472476959229, 'learning_rate': 1.034934497816594e-05, 'epoch': 1.97}


 66%|██████▌   | 3160/4809 [37:27<19:33,  1.40it/s]

{'loss': 0.0564, 'grad_norm': 0.3100929260253906, 'learning_rate': 1.0286961946350593e-05, 'epoch': 1.97}


 66%|██████▌   | 3170/4809 [37:34<19:26,  1.40it/s]

{'loss': 0.052, 'grad_norm': 0.37868112325668335, 'learning_rate': 1.0224578914535246e-05, 'epoch': 1.98}


 66%|██████▌   | 3180/4809 [37:41<19:19,  1.41it/s]

{'loss': 0.051, 'grad_norm': 0.29649534821510315, 'learning_rate': 1.01621958827199e-05, 'epoch': 1.98}


 66%|██████▋   | 3190/4809 [37:48<19:12,  1.40it/s]

{'loss': 0.047, 'grad_norm': 0.442263662815094, 'learning_rate': 1.0099812850904554e-05, 'epoch': 1.99}


 67%|██████▋   | 3200/4809 [37:55<19:05,  1.40it/s]

{'loss': 0.0517, 'grad_norm': 0.36049315333366394, 'learning_rate': 1.0037429819089207e-05, 'epoch': 2.0}



 67%|██████▋   | 3206/4809 [41:04<16:45,  1.59it/s]

{'eval_loss': 0.06665567308664322, 'eval_sacrebleu': 11.487157538986812, 'eval_runtime': 184.8328, 'eval_samples_per_second': 7.466, 'eval_steps_per_second': 1.867, 'epoch': 2.0}


 67%|██████▋   | 3210/4809 [41:07<8:45:52, 19.73s/it] 

{'loss': 0.054, 'grad_norm': 0.3243762254714966, 'learning_rate': 9.97504678727386e-06, 'epoch': 2.0}


 67%|██████▋   | 3220/4809 [41:14<33:05,  1.25s/it]  

{'loss': 0.048, 'grad_norm': 0.3021610379219055, 'learning_rate': 9.912663755458516e-06, 'epoch': 2.01}


 67%|██████▋   | 3230/4809 [41:21<19:07,  1.38it/s]

{'loss': 0.0447, 'grad_norm': 0.4096018970012665, 'learning_rate': 9.850280723643169e-06, 'epoch': 2.01}


 67%|██████▋   | 3240/4809 [41:28<18:39,  1.40it/s]

{'loss': 0.0536, 'grad_norm': 0.35429975390434265, 'learning_rate': 9.787897691827822e-06, 'epoch': 2.02}


 68%|██████▊   | 3250/4809 [41:35<18:27,  1.41it/s]

{'loss': 0.0527, 'grad_norm': 0.4872100055217743, 'learning_rate': 9.725514660012477e-06, 'epoch': 2.03}


 68%|██████▊   | 3260/4809 [41:43<18:25,  1.40it/s]

{'loss': 0.0538, 'grad_norm': 0.4310190975666046, 'learning_rate': 9.66313162819713e-06, 'epoch': 2.03}


 68%|██████▊   | 3270/4809 [41:50<18:15,  1.41it/s]

{'loss': 0.0609, 'grad_norm': 0.2597830891609192, 'learning_rate': 9.600748596381784e-06, 'epoch': 2.04}


 68%|██████▊   | 3280/4809 [41:57<18:10,  1.40it/s]

{'loss': 0.0458, 'grad_norm': 0.4501512348651886, 'learning_rate': 9.538365564566439e-06, 'epoch': 2.05}


 68%|██████▊   | 3290/4809 [42:04<18:01,  1.40it/s]

{'loss': 0.0467, 'grad_norm': 0.36244460940361023, 'learning_rate': 9.475982532751092e-06, 'epoch': 2.05}


 69%|██████▊   | 3300/4809 [42:11<17:53,  1.41it/s]

{'loss': 0.0433, 'grad_norm': 0.220920130610466, 'learning_rate': 9.413599500935746e-06, 'epoch': 2.06}


 69%|██████▉   | 3310/4809 [42:18<17:47,  1.40it/s]

{'loss': 0.0536, 'grad_norm': 0.4513571262359619, 'learning_rate': 9.3512164691204e-06, 'epoch': 2.06}


 69%|██████▉   | 3320/4809 [42:25<17:40,  1.40it/s]

{'loss': 0.0474, 'grad_norm': 0.3091931939125061, 'learning_rate': 9.288833437305054e-06, 'epoch': 2.07}


 69%|██████▉   | 3330/4809 [42:33<17:35,  1.40it/s]

{'loss': 0.0462, 'grad_norm': 0.23023942112922668, 'learning_rate': 9.226450405489707e-06, 'epoch': 2.08}


 69%|██████▉   | 3340/4809 [42:40<17:26,  1.40it/s]

{'loss': 0.043, 'grad_norm': 0.2543829381465912, 'learning_rate': 9.164067373674362e-06, 'epoch': 2.08}


 70%|██████▉   | 3350/4809 [42:47<17:19,  1.40it/s]

{'loss': 0.0528, 'grad_norm': 0.511995255947113, 'learning_rate': 9.101684341859015e-06, 'epoch': 2.09}


 70%|██████▉   | 3360/4809 [42:54<17:13,  1.40it/s]

{'loss': 0.0422, 'grad_norm': 0.3775237202644348, 'learning_rate': 9.039301310043667e-06, 'epoch': 2.1}


 70%|███████   | 3370/4809 [43:01<17:05,  1.40it/s]

{'loss': 0.0493, 'grad_norm': 0.28479379415512085, 'learning_rate': 8.976918278228322e-06, 'epoch': 2.1}


 70%|███████   | 3380/4809 [43:08<16:59,  1.40it/s]

{'loss': 0.0564, 'grad_norm': 0.3439125716686249, 'learning_rate': 8.914535246412975e-06, 'epoch': 2.11}


 70%|███████   | 3390/4809 [43:15<16:52,  1.40it/s]

{'loss': 0.048, 'grad_norm': 0.28387460112571716, 'learning_rate': 8.852152214597629e-06, 'epoch': 2.11}


 71%|███████   | 3400/4809 [43:22<16:41,  1.41it/s]

{'loss': 0.0543, 'grad_norm': 0.19544242322444916, 'learning_rate': 8.789769182782284e-06, 'epoch': 2.12}


 71%|███████   | 3410/4809 [43:30<16:36,  1.40it/s]

{'loss': 0.0403, 'grad_norm': 0.40634533762931824, 'learning_rate': 8.727386150966937e-06, 'epoch': 2.13}


 71%|███████   | 3420/4809 [43:37<16:30,  1.40it/s]

{'loss': 0.0513, 'grad_norm': 0.3705398440361023, 'learning_rate': 8.66500311915159e-06, 'epoch': 2.13}


 71%|███████▏  | 3430/4809 [43:44<16:21,  1.40it/s]

{'loss': 0.0572, 'grad_norm': 0.4731425344944, 'learning_rate': 8.602620087336245e-06, 'epoch': 2.14}


 72%|███████▏  | 3440/4809 [43:51<16:15,  1.40it/s]

{'loss': 0.0484, 'grad_norm': 0.40706709027290344, 'learning_rate': 8.540237055520898e-06, 'epoch': 2.15}


 72%|███████▏  | 3450/4809 [43:58<16:11,  1.40it/s]

{'loss': 0.048, 'grad_norm': 0.46629977226257324, 'learning_rate': 8.477854023705552e-06, 'epoch': 2.15}


 72%|███████▏  | 3460/4809 [44:05<16:03,  1.40it/s]

{'loss': 0.064, 'grad_norm': 0.4450732171535492, 'learning_rate': 8.415470991890207e-06, 'epoch': 2.16}


 72%|███████▏  | 3470/4809 [44:12<15:54,  1.40it/s]

{'loss': 0.0509, 'grad_norm': 0.5368961691856384, 'learning_rate': 8.35308796007486e-06, 'epoch': 2.16}


 72%|███████▏  | 3480/4809 [44:20<15:47,  1.40it/s]

{'loss': 0.0419, 'grad_norm': 0.3572223484516144, 'learning_rate': 8.290704928259513e-06, 'epoch': 2.17}


 73%|███████▎  | 3490/4809 [44:27<15:38,  1.41it/s]

{'loss': 0.0414, 'grad_norm': 0.2723168432712555, 'learning_rate': 8.228321896444168e-06, 'epoch': 2.18}


 73%|███████▎  | 3500/4809 [44:34<15:32,  1.40it/s]

{'loss': 0.0463, 'grad_norm': 0.4843517243862152, 'learning_rate': 8.165938864628822e-06, 'epoch': 2.18}


 73%|███████▎  | 3510/4809 [44:42<15:50,  1.37it/s]

{'loss': 0.0393, 'grad_norm': 0.4054289162158966, 'learning_rate': 8.103555832813475e-06, 'epoch': 2.19}


 73%|███████▎  | 3520/4809 [44:49<15:19,  1.40it/s]

{'loss': 0.0469, 'grad_norm': 0.2674046456813812, 'learning_rate': 8.04117280099813e-06, 'epoch': 2.2}


 73%|███████▎  | 3530/4809 [44:57<15:11,  1.40it/s]

{'loss': 0.0474, 'grad_norm': 0.46166983246803284, 'learning_rate': 7.978789769182783e-06, 'epoch': 2.2}


 74%|███████▎  | 3540/4809 [45:04<15:05,  1.40it/s]

{'loss': 0.0458, 'grad_norm': 0.47941532731056213, 'learning_rate': 7.916406737367436e-06, 'epoch': 2.21}


 74%|███████▍  | 3550/4809 [45:11<14:56,  1.40it/s]

{'loss': 0.056, 'grad_norm': 0.7843883037567139, 'learning_rate': 7.854023705552091e-06, 'epoch': 2.21}


 74%|███████▍  | 3560/4809 [45:18<14:50,  1.40it/s]

{'loss': 0.0459, 'grad_norm': 0.4447408616542816, 'learning_rate': 7.791640673736745e-06, 'epoch': 2.22}


 74%|███████▍  | 3570/4809 [45:25<14:42,  1.40it/s]

{'loss': 0.0496, 'grad_norm': 0.6902630925178528, 'learning_rate': 7.729257641921398e-06, 'epoch': 2.23}


 74%|███████▍  | 3580/4809 [45:32<14:33,  1.41it/s]

{'loss': 0.0375, 'grad_norm': 0.3003169596195221, 'learning_rate': 7.666874610106053e-06, 'epoch': 2.23}


 75%|███████▍  | 3590/4809 [45:39<14:26,  1.41it/s]

{'loss': 0.0453, 'grad_norm': 0.3141941726207733, 'learning_rate': 7.6044915782907055e-06, 'epoch': 2.24}


 75%|███████▍  | 3600/4809 [45:46<14:22,  1.40it/s]

{'loss': 0.0412, 'grad_norm': 0.5222563147544861, 'learning_rate': 7.542108546475359e-06, 'epoch': 2.25}


 75%|███████▌  | 3610/4809 [45:54<14:13,  1.41it/s]

{'loss': 0.0403, 'grad_norm': 0.709987461566925, 'learning_rate': 7.479725514660013e-06, 'epoch': 2.25}


 75%|███████▌  | 3620/4809 [46:01<14:06,  1.40it/s]

{'loss': 0.0503, 'grad_norm': 0.3175961375236511, 'learning_rate': 7.417342482844667e-06, 'epoch': 2.26}


 75%|███████▌  | 3630/4809 [46:08<14:01,  1.40it/s]

{'loss': 0.0488, 'grad_norm': 0.5091413855552673, 'learning_rate': 7.35495945102932e-06, 'epoch': 2.26}


 76%|███████▌  | 3640/4809 [46:15<13:56,  1.40it/s]

{'loss': 0.042, 'grad_norm': 0.5647861361503601, 'learning_rate': 7.2925764192139745e-06, 'epoch': 2.27}


 76%|███████▌  | 3650/4809 [46:22<13:45,  1.40it/s]

{'loss': 0.0492, 'grad_norm': 0.36196500062942505, 'learning_rate': 7.230193387398628e-06, 'epoch': 2.28}


 76%|███████▌  | 3660/4809 [46:29<13:38,  1.40it/s]

{'loss': 0.0459, 'grad_norm': 0.5923359394073486, 'learning_rate': 7.167810355583281e-06, 'epoch': 2.28}


 76%|███████▋  | 3670/4809 [46:36<13:32,  1.40it/s]

{'loss': 0.05, 'grad_norm': 0.4074265956878662, 'learning_rate': 7.105427323767935e-06, 'epoch': 2.29}


 77%|███████▋  | 3680/4809 [46:43<13:23,  1.40it/s]

{'loss': 0.059, 'grad_norm': 0.3730238974094391, 'learning_rate': 7.043044291952589e-06, 'epoch': 2.3}


 77%|███████▋  | 3690/4809 [46:51<13:17,  1.40it/s]

{'loss': 0.0562, 'grad_norm': 0.3777116537094116, 'learning_rate': 6.980661260137243e-06, 'epoch': 2.3}


 77%|███████▋  | 3700/4809 [46:58<13:09,  1.40it/s]

{'loss': 0.0456, 'grad_norm': 0.2666313052177429, 'learning_rate': 6.918278228321897e-06, 'epoch': 2.31}


 77%|███████▋  | 3710/4809 [47:05<13:02,  1.41it/s]

{'loss': 0.0537, 'grad_norm': 0.6459601521492004, 'learning_rate': 6.85589519650655e-06, 'epoch': 2.31}


 77%|███████▋  | 3720/4809 [47:12<12:55,  1.40it/s]

{'loss': 0.048, 'grad_norm': 0.3457947075366974, 'learning_rate': 6.793512164691204e-06, 'epoch': 2.32}


 78%|███████▊  | 3730/4809 [47:19<12:47,  1.41it/s]

{'loss': 0.0483, 'grad_norm': 0.20804035663604736, 'learning_rate': 6.731129132875858e-06, 'epoch': 2.33}


 78%|███████▊  | 3740/4809 [47:26<12:40,  1.41it/s]

{'loss': 0.0465, 'grad_norm': 0.5911889672279358, 'learning_rate': 6.668746101060512e-06, 'epoch': 2.33}


 78%|███████▊  | 3750/4809 [47:33<12:33,  1.41it/s]

{'loss': 0.0355, 'grad_norm': 0.5219283103942871, 'learning_rate': 6.606363069245166e-06, 'epoch': 2.34}


 78%|███████▊  | 3760/4809 [47:40<12:26,  1.40it/s]

{'loss': 0.0481, 'grad_norm': 0.2197079211473465, 'learning_rate': 6.543980037429819e-06, 'epoch': 2.35}


 78%|███████▊  | 3770/4809 [47:48<12:19,  1.41it/s]

{'loss': 0.0418, 'grad_norm': 0.35655829310417175, 'learning_rate': 6.481597005614472e-06, 'epoch': 2.35}


 79%|███████▊  | 3780/4809 [47:55<12:13,  1.40it/s]

{'loss': 0.045, 'grad_norm': 0.3696802258491516, 'learning_rate': 6.4192139737991265e-06, 'epoch': 2.36}


 79%|███████▉  | 3790/4809 [48:02<12:04,  1.41it/s]

{'loss': 0.055, 'grad_norm': 0.5522945523262024, 'learning_rate': 6.356830941983781e-06, 'epoch': 2.36}


 79%|███████▉  | 3800/4809 [48:09<11:58,  1.40it/s]

{'loss': 0.0363, 'grad_norm': 0.28516843914985657, 'learning_rate': 6.294447910168434e-06, 'epoch': 2.37}


 79%|███████▉  | 3810/4809 [48:16<11:50,  1.41it/s]

{'loss': 0.0452, 'grad_norm': 0.3548103868961334, 'learning_rate': 6.232064878353088e-06, 'epoch': 2.38}


 79%|███████▉  | 3820/4809 [48:23<11:45,  1.40it/s]

{'loss': 0.0351, 'grad_norm': 0.35888174176216125, 'learning_rate': 6.169681846537742e-06, 'epoch': 2.38}


 80%|███████▉  | 3830/4809 [48:30<11:37,  1.40it/s]

{'loss': 0.0534, 'grad_norm': 0.41219958662986755, 'learning_rate': 6.1072988147223955e-06, 'epoch': 2.39}


 80%|███████▉  | 3840/4809 [48:37<11:29,  1.41it/s]

{'loss': 0.0513, 'grad_norm': 0.5407086610794067, 'learning_rate': 6.04491578290705e-06, 'epoch': 2.4}


 80%|████████  | 3850/4809 [48:45<11:22,  1.41it/s]

{'loss': 0.0531, 'grad_norm': 0.2527744174003601, 'learning_rate': 5.982532751091704e-06, 'epoch': 2.4}


 80%|████████  | 3860/4809 [48:52<11:16,  1.40it/s]

{'loss': 0.0488, 'grad_norm': 0.3934636116027832, 'learning_rate': 5.920149719276357e-06, 'epoch': 2.41}


 80%|████████  | 3870/4809 [48:59<11:08,  1.40it/s]

{'loss': 0.0524, 'grad_norm': 0.43449729681015015, 'learning_rate': 5.857766687461011e-06, 'epoch': 2.41}


 81%|████████  | 3880/4809 [49:06<11:01,  1.40it/s]

{'loss': 0.0624, 'grad_norm': 0.3087119162082672, 'learning_rate': 5.7953836556456646e-06, 'epoch': 2.42}


 81%|████████  | 3890/4809 [49:13<10:54,  1.40it/s]

{'loss': 0.0571, 'grad_norm': 0.3041447401046753, 'learning_rate': 5.733000623830318e-06, 'epoch': 2.43}


 81%|████████  | 3900/4809 [49:20<10:49,  1.40it/s]

{'loss': 0.0481, 'grad_norm': 0.3084196150302887, 'learning_rate': 5.670617592014972e-06, 'epoch': 2.43}


 81%|████████▏ | 3910/4809 [49:27<10:41,  1.40it/s]

{'loss': 0.0471, 'grad_norm': 0.26624077558517456, 'learning_rate': 5.608234560199625e-06, 'epoch': 2.44}


 82%|████████▏ | 3920/4809 [49:35<10:32,  1.41it/s]

{'loss': 0.0601, 'grad_norm': 0.3117295801639557, 'learning_rate': 5.545851528384279e-06, 'epoch': 2.45}


 82%|████████▏ | 3930/4809 [49:42<10:25,  1.40it/s]

{'loss': 0.0408, 'grad_norm': 0.3573516309261322, 'learning_rate': 5.4834684965689336e-06, 'epoch': 2.45}


 82%|████████▏ | 3940/4809 [49:49<10:17,  1.41it/s]

{'loss': 0.0461, 'grad_norm': 0.3393855690956116, 'learning_rate': 5.421085464753587e-06, 'epoch': 2.46}


 82%|████████▏ | 3950/4809 [49:56<10:11,  1.40it/s]

{'loss': 0.0327, 'grad_norm': 0.3497965335845947, 'learning_rate': 5.358702432938241e-06, 'epoch': 2.46}


 82%|████████▏ | 3960/4809 [50:03<10:06,  1.40it/s]

{'loss': 0.0437, 'grad_norm': 1.0286684036254883, 'learning_rate': 5.296319401122895e-06, 'epoch': 2.47}


 83%|████████▎ | 3970/4809 [50:10<09:58,  1.40it/s]

{'loss': 0.0477, 'grad_norm': 0.4346436560153961, 'learning_rate': 5.2339363693075484e-06, 'epoch': 2.48}


 83%|████████▎ | 3980/4809 [50:17<09:50,  1.40it/s]

{'loss': 0.0366, 'grad_norm': 0.3197268545627594, 'learning_rate': 5.171553337492203e-06, 'epoch': 2.48}


 83%|████████▎ | 3990/4809 [50:24<09:44,  1.40it/s]

{'loss': 0.0504, 'grad_norm': 0.7176608443260193, 'learning_rate': 5.109170305676856e-06, 'epoch': 2.49}


 83%|████████▎ | 4000/4809 [50:32<09:35,  1.40it/s]

{'loss': 0.0483, 'grad_norm': 0.347301185131073, 'learning_rate': 5.046787273861509e-06, 'epoch': 2.5}


 83%|████████▎ | 4010/4809 [50:40<09:43,  1.37it/s]

{'loss': 0.0495, 'grad_norm': 0.8400896787643433, 'learning_rate': 4.984404242046163e-06, 'epoch': 2.5}


 84%|████████▎ | 4020/4809 [50:47<09:23,  1.40it/s]

{'loss': 0.0367, 'grad_norm': 0.3348332643508911, 'learning_rate': 4.9220212102308175e-06, 'epoch': 2.51}


 84%|████████▍ | 4030/4809 [50:54<09:13,  1.41it/s]

{'loss': 0.0536, 'grad_norm': 0.3380401134490967, 'learning_rate': 4.859638178415471e-06, 'epoch': 2.51}


 84%|████████▍ | 4040/4809 [51:01<09:07,  1.40it/s]

{'loss': 0.0447, 'grad_norm': 0.36563271284103394, 'learning_rate': 4.797255146600125e-06, 'epoch': 2.52}


 84%|████████▍ | 4050/4809 [51:09<09:00,  1.40it/s]

{'loss': 0.0509, 'grad_norm': 0.25403493642807007, 'learning_rate': 4.734872114784779e-06, 'epoch': 2.53}


 84%|████████▍ | 4060/4809 [51:16<08:53,  1.40it/s]

{'loss': 0.0406, 'grad_norm': 0.2419433444738388, 'learning_rate': 4.672489082969432e-06, 'epoch': 2.53}


 85%|████████▍ | 4070/4809 [51:23<08:46,  1.40it/s]

{'loss': 0.0502, 'grad_norm': 0.2846861779689789, 'learning_rate': 4.6101060511540865e-06, 'epoch': 2.54}


 85%|████████▍ | 4080/4809 [51:30<08:38,  1.40it/s]

{'loss': 0.0457, 'grad_norm': 0.32617688179016113, 'learning_rate': 4.547723019338741e-06, 'epoch': 2.55}


 85%|████████▌ | 4090/4809 [51:37<08:31,  1.41it/s]

{'loss': 0.0483, 'grad_norm': 0.368587464094162, 'learning_rate': 4.485339987523394e-06, 'epoch': 2.55}


 85%|████████▌ | 4100/4809 [51:44<08:25,  1.40it/s]

{'loss': 0.0519, 'grad_norm': 0.39509978890419006, 'learning_rate': 4.422956955708048e-06, 'epoch': 2.56}


 85%|████████▌ | 4110/4809 [51:51<08:17,  1.40it/s]

{'loss': 0.0479, 'grad_norm': 0.34884995222091675, 'learning_rate': 4.360573923892701e-06, 'epoch': 2.56}


 86%|████████▌ | 4120/4809 [51:58<08:10,  1.41it/s]

{'loss': 0.0495, 'grad_norm': 0.41471388936042786, 'learning_rate': 4.298190892077355e-06, 'epoch': 2.57}


 86%|████████▌ | 4130/4809 [52:06<08:04,  1.40it/s]

{'loss': 0.0493, 'grad_norm': 0.3985145092010498, 'learning_rate': 4.235807860262009e-06, 'epoch': 2.58}


 86%|████████▌ | 4140/4809 [52:13<07:56,  1.40it/s]

{'loss': 0.0485, 'grad_norm': 0.46557652950286865, 'learning_rate': 4.173424828446662e-06, 'epoch': 2.58}


 86%|████████▋ | 4150/4809 [52:20<07:49,  1.40it/s]

{'loss': 0.0542, 'grad_norm': 0.3958267867565155, 'learning_rate': 4.111041796631316e-06, 'epoch': 2.59}


 87%|████████▋ | 4160/4809 [52:27<07:42,  1.40it/s]

{'loss': 0.0492, 'grad_norm': 0.6355127096176147, 'learning_rate': 4.04865876481597e-06, 'epoch': 2.6}


 87%|████████▋ | 4170/4809 [52:34<07:34,  1.40it/s]

{'loss': 0.0555, 'grad_norm': 0.7790703177452087, 'learning_rate': 3.986275733000624e-06, 'epoch': 2.6}


 87%|████████▋ | 4180/4809 [52:41<07:28,  1.40it/s]

{'loss': 0.0487, 'grad_norm': 0.3646567761898041, 'learning_rate': 3.923892701185278e-06, 'epoch': 2.61}


 87%|████████▋ | 4190/4809 [52:48<07:20,  1.40it/s]

{'loss': 0.0392, 'grad_norm': 0.24412734806537628, 'learning_rate': 3.861509669369932e-06, 'epoch': 2.61}


 87%|████████▋ | 4200/4809 [52:56<07:13,  1.40it/s]

{'loss': 0.0555, 'grad_norm': 0.3761933445930481, 'learning_rate': 3.799126637554585e-06, 'epoch': 2.62}


 88%|████████▊ | 4210/4809 [53:03<07:07,  1.40it/s]

{'loss': 0.0519, 'grad_norm': 0.36525166034698486, 'learning_rate': 3.736743605739239e-06, 'epoch': 2.63}


 88%|████████▊ | 4220/4809 [53:10<06:59,  1.40it/s]

{'loss': 0.0464, 'grad_norm': 0.3192504644393921, 'learning_rate': 3.6743605739238927e-06, 'epoch': 2.63}


 88%|████████▊ | 4230/4809 [53:17<06:51,  1.41it/s]

{'loss': 0.0409, 'grad_norm': 0.47930338978767395, 'learning_rate': 3.611977542108547e-06, 'epoch': 2.64}


 88%|████████▊ | 4240/4809 [53:24<06:45,  1.40it/s]

{'loss': 0.0429, 'grad_norm': 0.3686308264732361, 'learning_rate': 3.5495945102932005e-06, 'epoch': 2.65}


 88%|████████▊ | 4250/4809 [53:31<06:38,  1.40it/s]

{'loss': 0.042, 'grad_norm': 0.3008618950843811, 'learning_rate': 3.487211478477854e-06, 'epoch': 2.65}


 89%|████████▊ | 4260/4809 [53:38<06:30,  1.40it/s]

{'loss': 0.0434, 'grad_norm': 0.9695006608963013, 'learning_rate': 3.424828446662508e-06, 'epoch': 2.66}


 89%|████████▉ | 4270/4809 [53:45<06:23,  1.40it/s]

{'loss': 0.0341, 'grad_norm': 0.21862034499645233, 'learning_rate': 3.3624454148471617e-06, 'epoch': 2.66}


 89%|████████▉ | 4280/4809 [53:53<06:18,  1.40it/s]

{'loss': 0.045, 'grad_norm': 0.36868470907211304, 'learning_rate': 3.3000623830318154e-06, 'epoch': 2.67}


 89%|████████▉ | 4290/4809 [54:00<06:10,  1.40it/s]

{'loss': 0.0629, 'grad_norm': 0.42184510827064514, 'learning_rate': 3.237679351216469e-06, 'epoch': 2.68}


 89%|████████▉ | 4300/4809 [54:07<06:02,  1.40it/s]

{'loss': 0.05, 'grad_norm': 0.32881537079811096, 'learning_rate': 3.1752963194011233e-06, 'epoch': 2.68}


 90%|████████▉ | 4310/4809 [54:14<05:55,  1.40it/s]

{'loss': 0.0417, 'grad_norm': 0.3831768035888672, 'learning_rate': 3.1129132875857765e-06, 'epoch': 2.69}


 90%|████████▉ | 4320/4809 [54:21<05:48,  1.40it/s]

{'loss': 0.0458, 'grad_norm': 0.5568768978118896, 'learning_rate': 3.0505302557704303e-06, 'epoch': 2.69}


 90%|█████████ | 4330/4809 [54:28<05:41,  1.40it/s]

{'loss': 0.0678, 'grad_norm': 0.33063483238220215, 'learning_rate': 2.9881472239550844e-06, 'epoch': 2.7}


 90%|█████████ | 4340/4809 [54:35<05:34,  1.40it/s]

{'loss': 0.0536, 'grad_norm': 0.5208762884140015, 'learning_rate': 2.925764192139738e-06, 'epoch': 2.71}


 90%|█████████ | 4350/4809 [54:42<05:27,  1.40it/s]

{'loss': 0.0482, 'grad_norm': 0.3153918385505676, 'learning_rate': 2.863381160324392e-06, 'epoch': 2.71}


 91%|█████████ | 4360/4809 [54:50<05:19,  1.40it/s]

{'loss': 0.039, 'grad_norm': 0.2101368010044098, 'learning_rate': 2.800998128509046e-06, 'epoch': 2.72}


 91%|█████████ | 4370/4809 [54:57<05:12,  1.40it/s]

{'loss': 0.0413, 'grad_norm': 0.2862531244754791, 'learning_rate': 2.7386150966936993e-06, 'epoch': 2.73}


 91%|█████████ | 4380/4809 [55:04<05:05,  1.40it/s]

{'loss': 0.0519, 'grad_norm': 0.1993807852268219, 'learning_rate': 2.676232064878353e-06, 'epoch': 2.73}


 91%|█████████▏| 4390/4809 [55:11<04:58,  1.41it/s]

{'loss': 0.0336, 'grad_norm': 0.34536412358283997, 'learning_rate': 2.6138490330630067e-06, 'epoch': 2.74}


 91%|█████████▏| 4400/4809 [55:18<04:50,  1.41it/s]

{'loss': 0.0569, 'grad_norm': 0.4548404812812805, 'learning_rate': 2.551466001247661e-06, 'epoch': 2.74}


 92%|█████████▏| 4410/4809 [55:25<04:44,  1.40it/s]

{'loss': 0.0461, 'grad_norm': 0.4837590157985687, 'learning_rate': 2.4890829694323146e-06, 'epoch': 2.75}


 92%|█████████▏| 4420/4809 [55:32<04:37,  1.40it/s]

{'loss': 0.0465, 'grad_norm': 0.4105735123157501, 'learning_rate': 2.4266999376169683e-06, 'epoch': 2.76}


 92%|█████████▏| 4430/4809 [55:39<04:29,  1.40it/s]

{'loss': 0.0385, 'grad_norm': 0.2153993844985962, 'learning_rate': 2.364316905801622e-06, 'epoch': 2.76}


 92%|█████████▏| 4440/4809 [55:47<04:23,  1.40it/s]

{'loss': 0.051, 'grad_norm': 0.43269869685173035, 'learning_rate': 2.3019338739862757e-06, 'epoch': 2.77}


 93%|█████████▎| 4450/4809 [55:54<04:15,  1.41it/s]

{'loss': 0.0452, 'grad_norm': 0.41292545199394226, 'learning_rate': 2.2395508421709294e-06, 'epoch': 2.78}


 93%|█████████▎| 4460/4809 [56:01<04:08,  1.40it/s]

{'loss': 0.0393, 'grad_norm': 0.40457478165626526, 'learning_rate': 2.1771678103555836e-06, 'epoch': 2.78}


 93%|█████████▎| 4470/4809 [56:08<04:02,  1.40it/s]

{'loss': 0.0739, 'grad_norm': 0.5864883065223694, 'learning_rate': 2.1147847785402373e-06, 'epoch': 2.79}


 93%|█████████▎| 4480/4809 [56:15<03:54,  1.40it/s]

{'loss': 0.0363, 'grad_norm': 0.9649804830551147, 'learning_rate': 2.0524017467248906e-06, 'epoch': 2.79}


 93%|█████████▎| 4490/4809 [56:22<03:46,  1.41it/s]

{'loss': 0.0454, 'grad_norm': 0.3183889091014862, 'learning_rate': 1.9900187149095443e-06, 'epoch': 2.8}


 94%|█████████▎| 4500/4809 [56:29<03:39,  1.40it/s]

{'loss': 0.0401, 'grad_norm': 0.23499085009098053, 'learning_rate': 1.9276356830941985e-06, 'epoch': 2.81}


 94%|█████████▍| 4510/4809 [56:38<03:38,  1.37it/s]

{'loss': 0.0521, 'grad_norm': 0.34458082914352417, 'learning_rate': 1.8652526512788522e-06, 'epoch': 2.81}


 94%|█████████▍| 4520/4809 [56:45<03:25,  1.40it/s]

{'loss': 0.0469, 'grad_norm': 0.3997359275817871, 'learning_rate': 1.8028696194635061e-06, 'epoch': 2.82}


 94%|█████████▍| 4530/4809 [56:52<03:19,  1.40it/s]

{'loss': 0.0427, 'grad_norm': 0.4639826714992523, 'learning_rate': 1.7404865876481596e-06, 'epoch': 2.83}


 94%|█████████▍| 4540/4809 [56:59<03:11,  1.40it/s]

{'loss': 0.0431, 'grad_norm': 0.2705443799495697, 'learning_rate': 1.6781035558328135e-06, 'epoch': 2.83}


 95%|█████████▍| 4550/4809 [57:06<03:04,  1.40it/s]

{'loss': 0.0349, 'grad_norm': 0.29035744071006775, 'learning_rate': 1.6157205240174675e-06, 'epoch': 2.84}


 95%|█████████▍| 4560/4809 [57:14<02:57,  1.40it/s]

{'loss': 0.041, 'grad_norm': 0.45639631152153015, 'learning_rate': 1.553337492202121e-06, 'epoch': 2.84}


 95%|█████████▌| 4570/4809 [57:21<02:50,  1.40it/s]

{'loss': 0.0483, 'grad_norm': 0.3912714123725891, 'learning_rate': 1.490954460386775e-06, 'epoch': 2.85}


 95%|█████████▌| 4580/4809 [57:28<02:42,  1.41it/s]

{'loss': 0.0557, 'grad_norm': 0.35992541909217834, 'learning_rate': 1.4285714285714286e-06, 'epoch': 2.86}


 95%|█████████▌| 4590/4809 [57:35<02:35,  1.40it/s]

{'loss': 0.0452, 'grad_norm': 0.47452086210250854, 'learning_rate': 1.3661883967560823e-06, 'epoch': 2.86}


 96%|█████████▌| 4600/4809 [57:42<02:28,  1.41it/s]

{'loss': 0.0493, 'grad_norm': 0.4546333849430084, 'learning_rate': 1.3038053649407363e-06, 'epoch': 2.87}


 96%|█████████▌| 4610/4809 [57:49<02:21,  1.40it/s]

{'loss': 0.0547, 'grad_norm': 0.45974040031433105, 'learning_rate': 1.2414223331253898e-06, 'epoch': 2.88}


 96%|█████████▌| 4620/4809 [57:56<02:14,  1.40it/s]

{'loss': 0.0468, 'grad_norm': 0.4849410653114319, 'learning_rate': 1.1790393013100437e-06, 'epoch': 2.88}


 96%|█████████▋| 4630/4809 [58:03<02:07,  1.40it/s]

{'loss': 0.0536, 'grad_norm': 0.4355476498603821, 'learning_rate': 1.1166562694946974e-06, 'epoch': 2.89}


 96%|█████████▋| 4640/4809 [58:11<02:00,  1.40it/s]

{'loss': 0.0431, 'grad_norm': 0.3124575614929199, 'learning_rate': 1.0542732376793511e-06, 'epoch': 2.89}


 97%|█████████▋| 4650/4809 [58:18<01:53,  1.40it/s]

{'loss': 0.0447, 'grad_norm': 0.2670326232910156, 'learning_rate': 9.91890205864005e-07, 'epoch': 2.9}


 97%|█████████▋| 4660/4809 [58:25<01:46,  1.40it/s]

{'loss': 0.0535, 'grad_norm': 0.5848234295845032, 'learning_rate': 9.295071740486588e-07, 'epoch': 2.91}


 97%|█████████▋| 4670/4809 [58:32<01:38,  1.41it/s]

{'loss': 0.0442, 'grad_norm': 0.3675306737422943, 'learning_rate': 8.671241422333126e-07, 'epoch': 2.91}


 97%|█████████▋| 4680/4809 [58:39<01:32,  1.40it/s]

{'loss': 0.0478, 'grad_norm': 0.39675259590148926, 'learning_rate': 8.047411104179663e-07, 'epoch': 2.92}


 98%|█████████▊| 4690/4809 [58:46<01:24,  1.41it/s]

{'loss': 0.0427, 'grad_norm': 0.27359411120414734, 'learning_rate': 7.423580786026201e-07, 'epoch': 2.93}


 98%|█████████▊| 4700/4809 [58:53<01:17,  1.40it/s]

{'loss': 0.0498, 'grad_norm': 0.14930666983127594, 'learning_rate': 6.799750467872739e-07, 'epoch': 2.93}


 98%|█████████▊| 4710/4809 [59:01<01:10,  1.40it/s]

{'loss': 0.055, 'grad_norm': 0.24389535188674927, 'learning_rate': 6.175920149719276e-07, 'epoch': 2.94}


 98%|█████████▊| 4720/4809 [59:08<01:03,  1.40it/s]

{'loss': 0.0541, 'grad_norm': 0.43170586228370667, 'learning_rate': 5.552089831565814e-07, 'epoch': 2.94}


 98%|█████████▊| 4730/4809 [59:15<00:56,  1.40it/s]

{'loss': 0.0509, 'grad_norm': 0.4575198292732239, 'learning_rate': 4.928259513412352e-07, 'epoch': 2.95}


 99%|█████████▊| 4740/4809 [59:22<00:49,  1.41it/s]

{'loss': 0.0415, 'grad_norm': 0.2971765398979187, 'learning_rate': 4.3044291952588896e-07, 'epoch': 2.96}


 99%|█████████▉| 4750/4809 [59:29<00:42,  1.40it/s]

{'loss': 0.0465, 'grad_norm': 0.40704604983329773, 'learning_rate': 3.680598877105428e-07, 'epoch': 2.96}


 99%|█████████▉| 4760/4809 [59:36<00:34,  1.40it/s]

{'loss': 0.0444, 'grad_norm': 0.7961022853851318, 'learning_rate': 3.056768558951965e-07, 'epoch': 2.97}


 99%|█████████▉| 4770/4809 [59:43<00:27,  1.40it/s]

{'loss': 0.0509, 'grad_norm': 0.5515265464782715, 'learning_rate': 2.432938240798503e-07, 'epoch': 2.98}


 99%|█████████▉| 4780/4809 [59:50<00:20,  1.40it/s]

{'loss': 0.0456, 'grad_norm': 0.8681116700172424, 'learning_rate': 1.8091079226450407e-07, 'epoch': 2.98}


100%|█████████▉| 4790/4809 [59:58<00:13,  1.41it/s]

{'loss': 0.0672, 'grad_norm': 0.33275267481803894, 'learning_rate': 1.1852776044915782e-07, 'epoch': 2.99}


100%|█████████▉| 4800/4809 [1:00:05<00:06,  1.40it/s]

{'loss': 0.0509, 'grad_norm': 0.41079390048980713, 'learning_rate': 5.6144728633811603e-08, 'epoch': 2.99}



100%|██████████| 4809/4809 [1:03:23<00:00,  1.26it/s]

{'eval_loss': 0.06662019342184067, 'eval_sacrebleu': 13.536063222017306, 'eval_runtime': 191.2446, 'eval_samples_per_second': 7.216, 'eval_steps_per_second': 1.804, 'epoch': 3.0}
{'train_runtime': 3803.9463, 'train_samples_per_second': 5.055, 'train_steps_per_second': 1.264, 'train_loss': 0.5405642928353812, 'epoch': 3.0}





TrainOutput(global_step=4809, training_loss=0.5405642928353812, metrics={'train_runtime': 3803.9463, 'train_samples_per_second': 5.055, 'train_steps_per_second': 1.264, 'total_flos': 3574674405457920.0, 'train_loss': 0.5405642928353812, 'epoch': 3.0})

In [52]:
# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./resultsFlanT5_trial2/epoch5",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=2,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=10,
)

# Initialize the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_shuffled,
    eval_dataset=tokenized_valid_shuffled,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Start training
trainer.train()

  0%|          | 10/3206 [00:04<25:02,  2.13it/s]

{'loss': 0.0412, 'grad_norm': 0.2835259735584259, 'learning_rate': 2.990642545227698e-05, 'epoch': 0.01}


  1%|          | 20/3206 [00:09<25:09,  2.11it/s]

{'loss': 0.0523, 'grad_norm': 0.25933638215065, 'learning_rate': 2.9812850904553962e-05, 'epoch': 0.01}


  1%|          | 30/3206 [00:14<24:39,  2.15it/s]

{'loss': 0.046, 'grad_norm': 0.3346374034881592, 'learning_rate': 2.971927635683094e-05, 'epoch': 0.02}


  1%|          | 40/3206 [00:18<24:35,  2.15it/s]

{'loss': 0.0637, 'grad_norm': 0.4124918580055237, 'learning_rate': 2.9625701809107924e-05, 'epoch': 0.02}


  2%|▏         | 50/3206 [00:23<24:35,  2.14it/s]

{'loss': 0.0453, 'grad_norm': 0.4266037046909332, 'learning_rate': 2.9532127261384903e-05, 'epoch': 0.03}


  2%|▏         | 60/3206 [00:28<24:24,  2.15it/s]

{'loss': 0.0434, 'grad_norm': 0.2510928809642792, 'learning_rate': 2.9438552713661885e-05, 'epoch': 0.04}


  2%|▏         | 70/3206 [00:32<24:41,  2.12it/s]

{'loss': 0.0431, 'grad_norm': 0.5661092400550842, 'learning_rate': 2.9344978165938865e-05, 'epoch': 0.04}


  2%|▏         | 80/3206 [00:37<24:36,  2.12it/s]

{'loss': 0.0505, 'grad_norm': 0.33784252405166626, 'learning_rate': 2.9251403618215847e-05, 'epoch': 0.05}


  3%|▎         | 90/3206 [00:42<24:19,  2.14it/s]

{'loss': 0.0612, 'grad_norm': 0.24509982764720917, 'learning_rate': 2.9157829070492826e-05, 'epoch': 0.06}


  3%|▎         | 100/3206 [00:47<24:23,  2.12it/s]

{'loss': 0.0457, 'grad_norm': 0.36395108699798584, 'learning_rate': 2.906425452276981e-05, 'epoch': 0.06}


  3%|▎         | 110/3206 [00:51<24:06,  2.14it/s]

{'loss': 0.0494, 'grad_norm': 0.36481648683547974, 'learning_rate': 2.8970679975046788e-05, 'epoch': 0.07}


  4%|▎         | 120/3206 [00:56<23:59,  2.14it/s]

{'loss': 0.0475, 'grad_norm': 0.4453220069408417, 'learning_rate': 2.887710542732377e-05, 'epoch': 0.07}


  4%|▍         | 130/3206 [01:01<24:13,  2.12it/s]

{'loss': 0.0462, 'grad_norm': 0.5915471315383911, 'learning_rate': 2.878353087960075e-05, 'epoch': 0.08}


  4%|▍         | 140/3206 [01:05<24:01,  2.13it/s]

{'loss': 0.0403, 'grad_norm': 0.21127240359783173, 'learning_rate': 2.8689956331877732e-05, 'epoch': 0.09}


  5%|▍         | 150/3206 [01:10<24:08,  2.11it/s]

{'loss': 0.0565, 'grad_norm': 0.4657142758369446, 'learning_rate': 2.859638178415471e-05, 'epoch': 0.09}


  5%|▍         | 160/3206 [01:15<23:59,  2.12it/s]

{'loss': 0.0451, 'grad_norm': 0.6044858694076538, 'learning_rate': 2.850280723643169e-05, 'epoch': 0.1}


  5%|▌         | 170/3206 [01:20<23:53,  2.12it/s]

{'loss': 0.0513, 'grad_norm': 0.2527850270271301, 'learning_rate': 2.840923268870867e-05, 'epoch': 0.11}


  6%|▌         | 180/3206 [01:24<23:46,  2.12it/s]

{'loss': 0.063, 'grad_norm': 0.24249215424060822, 'learning_rate': 2.831565814098565e-05, 'epoch': 0.11}


  6%|▌         | 190/3206 [01:29<23:33,  2.13it/s]

{'loss': 0.0391, 'grad_norm': 0.6034140586853027, 'learning_rate': 2.822208359326263e-05, 'epoch': 0.12}


  6%|▌         | 200/3206 [01:34<23:29,  2.13it/s]

{'loss': 0.0483, 'grad_norm': 0.5725790858268738, 'learning_rate': 2.8128509045539613e-05, 'epoch': 0.12}


  7%|▋         | 210/3206 [01:38<23:44,  2.10it/s]

{'loss': 0.0384, 'grad_norm': 0.31487661600112915, 'learning_rate': 2.8034934497816592e-05, 'epoch': 0.13}


  7%|▋         | 220/3206 [01:43<23:53,  2.08it/s]

{'loss': 0.0446, 'grad_norm': 0.3246872127056122, 'learning_rate': 2.7941359950093575e-05, 'epoch': 0.14}


  7%|▋         | 230/3206 [01:48<24:42,  2.01it/s]

{'loss': 0.0489, 'grad_norm': 0.2756957709789276, 'learning_rate': 2.7847785402370554e-05, 'epoch': 0.14}


  7%|▋         | 240/3206 [01:55<36:22,  1.36it/s]

{'loss': 0.0355, 'grad_norm': 0.40881311893463135, 'learning_rate': 2.7754210854647536e-05, 'epoch': 0.15}


  8%|▊         | 250/3206 [02:03<36:43,  1.34it/s]

{'loss': 0.0486, 'grad_norm': 0.5187987685203552, 'learning_rate': 2.7660636306924515e-05, 'epoch': 0.16}


  8%|▊         | 260/3206 [02:09<25:39,  1.91it/s]

{'loss': 0.0501, 'grad_norm': 0.7014827132225037, 'learning_rate': 2.7567061759201498e-05, 'epoch': 0.16}


  8%|▊         | 270/3206 [02:14<23:02,  2.12it/s]

{'loss': 0.0586, 'grad_norm': 0.5710322260856628, 'learning_rate': 2.7473487211478477e-05, 'epoch': 0.17}


  9%|▊         | 280/3206 [02:18<22:49,  2.14it/s]

{'loss': 0.0455, 'grad_norm': 0.4239213466644287, 'learning_rate': 2.737991266375546e-05, 'epoch': 0.17}


  9%|▉         | 290/3206 [02:23<22:53,  2.12it/s]

{'loss': 0.0524, 'grad_norm': 0.3608964681625366, 'learning_rate': 2.728633811603244e-05, 'epoch': 0.18}


  9%|▉         | 300/3206 [02:28<22:43,  2.13it/s]

{'loss': 0.0446, 'grad_norm': 0.4938013553619385, 'learning_rate': 2.719276356830942e-05, 'epoch': 0.19}


 10%|▉         | 310/3206 [02:33<22:29,  2.15it/s]

{'loss': 0.0392, 'grad_norm': 0.2232765555381775, 'learning_rate': 2.70991890205864e-05, 'epoch': 0.19}


 10%|▉         | 320/3206 [02:37<22:34,  2.13it/s]

{'loss': 0.042, 'grad_norm': 0.35945308208465576, 'learning_rate': 2.7005614472863383e-05, 'epoch': 0.2}


 10%|█         | 330/3206 [02:42<22:40,  2.11it/s]

{'loss': 0.0672, 'grad_norm': 0.6513182520866394, 'learning_rate': 2.6912039925140362e-05, 'epoch': 0.21}


 11%|█         | 340/3206 [02:47<22:22,  2.13it/s]

{'loss': 0.0343, 'grad_norm': 0.26504582166671753, 'learning_rate': 2.6818465377417344e-05, 'epoch': 0.21}


 11%|█         | 350/3206 [02:51<22:24,  2.12it/s]

{'loss': 0.0474, 'grad_norm': 0.36894136667251587, 'learning_rate': 2.6724890829694323e-05, 'epoch': 0.22}


 11%|█         | 360/3206 [02:56<22:21,  2.12it/s]

{'loss': 0.0421, 'grad_norm': 0.7065690159797668, 'learning_rate': 2.6631316281971306e-05, 'epoch': 0.22}


 12%|█▏        | 370/3206 [03:01<22:05,  2.14it/s]

{'loss': 0.0412, 'grad_norm': 0.2987544536590576, 'learning_rate': 2.6537741734248285e-05, 'epoch': 0.23}


 12%|█▏        | 380/3206 [03:05<22:05,  2.13it/s]

{'loss': 0.044, 'grad_norm': 0.23202110826969147, 'learning_rate': 2.6444167186525267e-05, 'epoch': 0.24}


 12%|█▏        | 390/3206 [03:10<22:13,  2.11it/s]

{'loss': 0.0407, 'grad_norm': 0.3795394003391266, 'learning_rate': 2.6350592638802246e-05, 'epoch': 0.24}


 12%|█▏        | 400/3206 [03:15<22:20,  2.09it/s]

{'loss': 0.0649, 'grad_norm': 0.4803023636341095, 'learning_rate': 2.625701809107923e-05, 'epoch': 0.25}


 13%|█▎        | 410/3206 [03:20<21:53,  2.13it/s]

{'loss': 0.0387, 'grad_norm': 0.34596866369247437, 'learning_rate': 2.6163443543356208e-05, 'epoch': 0.26}


 13%|█▎        | 420/3206 [03:24<21:44,  2.14it/s]

{'loss': 0.0607, 'grad_norm': 0.21563419699668884, 'learning_rate': 2.606986899563319e-05, 'epoch': 0.26}


 13%|█▎        | 430/3206 [03:29<21:48,  2.12it/s]

{'loss': 0.0509, 'grad_norm': 0.44093018770217896, 'learning_rate': 2.597629444791017e-05, 'epoch': 0.27}


 14%|█▎        | 440/3206 [03:34<21:43,  2.12it/s]

{'loss': 0.0507, 'grad_norm': 0.3343141973018646, 'learning_rate': 2.5882719900187152e-05, 'epoch': 0.27}


 14%|█▍        | 450/3206 [03:38<21:25,  2.14it/s]

{'loss': 0.0402, 'grad_norm': 0.36723411083221436, 'learning_rate': 2.578914535246413e-05, 'epoch': 0.28}


 14%|█▍        | 460/3206 [03:43<21:35,  2.12it/s]

{'loss': 0.0426, 'grad_norm': 0.19117769598960876, 'learning_rate': 2.5695570804741114e-05, 'epoch': 0.29}


 15%|█▍        | 470/3206 [03:48<21:41,  2.10it/s]

{'loss': 0.0523, 'grad_norm': 0.4054264426231384, 'learning_rate': 2.560199625701809e-05, 'epoch': 0.29}


 15%|█▍        | 480/3206 [03:53<21:22,  2.12it/s]

{'loss': 0.0522, 'grad_norm': 0.4457609951496124, 'learning_rate': 2.5508421709295072e-05, 'epoch': 0.3}


 15%|█▌        | 490/3206 [03:57<21:12,  2.13it/s]

{'loss': 0.043, 'grad_norm': 0.93934565782547, 'learning_rate': 2.541484716157205e-05, 'epoch': 0.31}


 16%|█▌        | 500/3206 [04:02<21:10,  2.13it/s]

{'loss': 0.0508, 'grad_norm': 0.7465210556983948, 'learning_rate': 2.5321272613849033e-05, 'epoch': 0.31}


 16%|█▌        | 510/3206 [04:08<22:12,  2.02it/s]

{'loss': 0.0483, 'grad_norm': 0.3267180025577545, 'learning_rate': 2.5227698066126013e-05, 'epoch': 0.32}


 16%|█▌        | 520/3206 [04:13<21:20,  2.10it/s]

{'loss': 0.0525, 'grad_norm': 0.24176950752735138, 'learning_rate': 2.5134123518402995e-05, 'epoch': 0.32}


 17%|█▋        | 530/3206 [04:18<20:54,  2.13it/s]

{'loss': 0.0709, 'grad_norm': 0.2722904682159424, 'learning_rate': 2.5040548970679974e-05, 'epoch': 0.33}


 17%|█▋        | 540/3206 [04:22<20:46,  2.14it/s]

{'loss': 0.0396, 'grad_norm': 0.24306170642375946, 'learning_rate': 2.4946974422956957e-05, 'epoch': 0.34}


 17%|█▋        | 550/3206 [04:27<20:50,  2.12it/s]

{'loss': 0.0477, 'grad_norm': 0.38481560349464417, 'learning_rate': 2.4853399875233936e-05, 'epoch': 0.34}


 17%|█▋        | 560/3206 [04:32<20:50,  2.12it/s]

{'loss': 0.0557, 'grad_norm': 0.34431201219558716, 'learning_rate': 2.4759825327510918e-05, 'epoch': 0.35}


 18%|█▊        | 570/3206 [04:37<20:46,  2.11it/s]

{'loss': 0.037, 'grad_norm': 0.2454601377248764, 'learning_rate': 2.4666250779787897e-05, 'epoch': 0.36}


 18%|█▊        | 580/3206 [04:41<20:30,  2.13it/s]

{'loss': 0.0454, 'grad_norm': 0.4473062753677368, 'learning_rate': 2.457267623206488e-05, 'epoch': 0.36}


 18%|█▊        | 590/3206 [04:46<20:33,  2.12it/s]

{'loss': 0.0373, 'grad_norm': 0.3031476140022278, 'learning_rate': 2.447910168434186e-05, 'epoch': 0.37}


 19%|█▊        | 600/3206 [04:51<20:30,  2.12it/s]

{'loss': 0.0413, 'grad_norm': 0.28271034359931946, 'learning_rate': 2.438552713661884e-05, 'epoch': 0.37}


 19%|█▉        | 610/3206 [04:56<20:30,  2.11it/s]

{'loss': 0.0427, 'grad_norm': 0.5156816244125366, 'learning_rate': 2.429195258889582e-05, 'epoch': 0.38}


 19%|█▉        | 620/3206 [05:00<20:12,  2.13it/s]

{'loss': 0.0467, 'grad_norm': 0.3465268313884735, 'learning_rate': 2.4198378041172803e-05, 'epoch': 0.39}


 20%|█▉        | 630/3206 [05:05<20:07,  2.13it/s]

{'loss': 0.0424, 'grad_norm': 0.5089059472084045, 'learning_rate': 2.4104803493449782e-05, 'epoch': 0.39}


 20%|█▉        | 640/3206 [05:10<20:17,  2.11it/s]

{'loss': 0.0405, 'grad_norm': 0.40161240100860596, 'learning_rate': 2.4011228945726764e-05, 'epoch': 0.4}


 20%|██        | 650/3206 [05:14<19:59,  2.13it/s]

{'loss': 0.0352, 'grad_norm': 0.3061951696872711, 'learning_rate': 2.3917654398003744e-05, 'epoch': 0.41}


 21%|██        | 660/3206 [05:19<19:57,  2.13it/s]

{'loss': 0.0383, 'grad_norm': 0.3275967538356781, 'learning_rate': 2.3824079850280726e-05, 'epoch': 0.41}


 21%|██        | 670/3206 [05:24<19:47,  2.14it/s]

{'loss': 0.0467, 'grad_norm': 0.29036861658096313, 'learning_rate': 2.3730505302557705e-05, 'epoch': 0.42}


 21%|██        | 680/3206 [05:29<19:56,  2.11it/s]

{'loss': 0.0394, 'grad_norm': 0.4664570391178131, 'learning_rate': 2.3636930754834688e-05, 'epoch': 0.42}


 22%|██▏       | 690/3206 [05:33<19:43,  2.13it/s]

{'loss': 0.0292, 'grad_norm': 0.41158127784729004, 'learning_rate': 2.3543356207111667e-05, 'epoch': 0.43}


 22%|██▏       | 700/3206 [05:38<19:41,  2.12it/s]

{'loss': 0.0307, 'grad_norm': 0.29797449707984924, 'learning_rate': 2.344978165938865e-05, 'epoch': 0.44}


 22%|██▏       | 710/3206 [05:43<19:44,  2.11it/s]

{'loss': 0.0452, 'grad_norm': 0.443950355052948, 'learning_rate': 2.3356207111665628e-05, 'epoch': 0.44}


 22%|██▏       | 720/3206 [05:47<19:25,  2.13it/s]

{'loss': 0.0409, 'grad_norm': 0.32571107149124146, 'learning_rate': 2.326263256394261e-05, 'epoch': 0.45}


 23%|██▎       | 730/3206 [05:52<19:29,  2.12it/s]

{'loss': 0.0385, 'grad_norm': 0.4434262216091156, 'learning_rate': 2.316905801621959e-05, 'epoch': 0.46}


 23%|██▎       | 740/3206 [05:57<19:19,  2.13it/s]

{'loss': 0.0494, 'grad_norm': 0.35490337014198303, 'learning_rate': 2.3075483468496572e-05, 'epoch': 0.46}


 23%|██▎       | 750/3206 [06:02<19:24,  2.11it/s]

{'loss': 0.0439, 'grad_norm': 0.40174782276153564, 'learning_rate': 2.298190892077355e-05, 'epoch': 0.47}


 24%|██▎       | 760/3206 [06:06<19:17,  2.11it/s]

{'loss': 0.0422, 'grad_norm': 0.27619996666908264, 'learning_rate': 2.2888334373050534e-05, 'epoch': 0.47}


 24%|██▍       | 770/3206 [06:11<19:12,  2.11it/s]

{'loss': 0.0337, 'grad_norm': 0.3425427973270416, 'learning_rate': 2.2794759825327513e-05, 'epoch': 0.48}


 24%|██▍       | 780/3206 [06:16<19:00,  2.13it/s]

{'loss': 0.0315, 'grad_norm': 0.2238396555185318, 'learning_rate': 2.2701185277604492e-05, 'epoch': 0.49}


 25%|██▍       | 790/3206 [06:20<18:55,  2.13it/s]

{'loss': 0.0376, 'grad_norm': 0.37793436646461487, 'learning_rate': 2.260761072988147e-05, 'epoch': 0.49}


 25%|██▍       | 800/3206 [06:25<18:41,  2.15it/s]

{'loss': 0.0396, 'grad_norm': 0.6177851557731628, 'learning_rate': 2.2514036182158454e-05, 'epoch': 0.5}


 25%|██▌       | 810/3206 [06:30<19:08,  2.09it/s]

{'loss': 0.0446, 'grad_norm': 0.2617768347263336, 'learning_rate': 2.2420461634435433e-05, 'epoch': 0.51}


 26%|██▌       | 820/3206 [06:35<18:40,  2.13it/s]

{'loss': 0.045, 'grad_norm': 0.2909237742424011, 'learning_rate': 2.2326887086712415e-05, 'epoch': 0.51}


 26%|██▌       | 830/3206 [06:39<18:34,  2.13it/s]

{'loss': 0.042, 'grad_norm': 0.3884228765964508, 'learning_rate': 2.2233312538989394e-05, 'epoch': 0.52}


 26%|██▌       | 840/3206 [06:44<18:23,  2.14it/s]

{'loss': 0.0404, 'grad_norm': 0.1740584671497345, 'learning_rate': 2.2139737991266377e-05, 'epoch': 0.52}


 27%|██▋       | 850/3206 [06:49<18:22,  2.14it/s]

{'loss': 0.0378, 'grad_norm': 0.3526846170425415, 'learning_rate': 2.2046163443543356e-05, 'epoch': 0.53}


 27%|██▋       | 860/3206 [06:53<18:13,  2.14it/s]

{'loss': 0.0389, 'grad_norm': 0.33493879437446594, 'learning_rate': 2.1952588895820335e-05, 'epoch': 0.54}


 27%|██▋       | 870/3206 [06:58<18:18,  2.13it/s]

{'loss': 0.033, 'grad_norm': 0.44067293405532837, 'learning_rate': 2.1859014348097318e-05, 'epoch': 0.54}


 27%|██▋       | 880/3206 [07:03<18:21,  2.11it/s]

{'loss': 0.0382, 'grad_norm': 0.30381566286087036, 'learning_rate': 2.1765439800374297e-05, 'epoch': 0.55}


 28%|██▊       | 890/3206 [07:07<18:29,  2.09it/s]

{'loss': 0.0373, 'grad_norm': 0.30765268206596375, 'learning_rate': 2.167186525265128e-05, 'epoch': 0.56}


 28%|██▊       | 900/3206 [07:12<18:06,  2.12it/s]

{'loss': 0.035, 'grad_norm': 0.2609866261482239, 'learning_rate': 2.1578290704928258e-05, 'epoch': 0.56}


 28%|██▊       | 910/3206 [07:17<17:52,  2.14it/s]

{'loss': 0.0416, 'grad_norm': 0.4900684058666229, 'learning_rate': 2.148471615720524e-05, 'epoch': 0.57}


 29%|██▊       | 920/3206 [07:22<17:46,  2.14it/s]

{'loss': 0.0374, 'grad_norm': 0.5410414934158325, 'learning_rate': 2.139114160948222e-05, 'epoch': 0.57}


 29%|██▉       | 930/3206 [07:26<17:58,  2.11it/s]

{'loss': 0.0291, 'grad_norm': 0.31253933906555176, 'learning_rate': 2.1297567061759202e-05, 'epoch': 0.58}


 29%|██▉       | 940/3206 [07:31<18:42,  2.02it/s]

{'loss': 0.0294, 'grad_norm': 0.19041332602500916, 'learning_rate': 2.120399251403618e-05, 'epoch': 0.59}


 30%|██▉       | 950/3206 [07:39<28:06,  1.34it/s]

{'loss': 0.0449, 'grad_norm': 0.5130828022956848, 'learning_rate': 2.1110417966313164e-05, 'epoch': 0.59}


 30%|██▉       | 960/3206 [07:46<29:12,  1.28it/s]

{'loss': 0.031, 'grad_norm': 0.45246291160583496, 'learning_rate': 2.1016843418590143e-05, 'epoch': 0.6}


 30%|███       | 970/3206 [07:54<28:27,  1.31it/s]

{'loss': 0.0407, 'grad_norm': 0.24854052066802979, 'learning_rate': 2.0923268870867125e-05, 'epoch': 0.61}


 31%|███       | 980/3206 [08:02<28:06,  1.32it/s]

{'loss': 0.0417, 'grad_norm': 0.4252491295337677, 'learning_rate': 2.0829694323144105e-05, 'epoch': 0.61}


 31%|███       | 990/3206 [08:09<27:57,  1.32it/s]

{'loss': 0.0411, 'grad_norm': 0.3586950898170471, 'learning_rate': 2.0736119775421087e-05, 'epoch': 0.62}


 31%|███       | 1000/3206 [08:17<28:10,  1.31it/s]

{'loss': 0.0337, 'grad_norm': 0.19548946619033813, 'learning_rate': 2.0642545227698066e-05, 'epoch': 0.62}


 32%|███▏      | 1010/3206 [08:26<29:10,  1.25it/s]

{'loss': 0.0395, 'grad_norm': 0.32551127672195435, 'learning_rate': 2.054897067997505e-05, 'epoch': 0.63}


 32%|███▏      | 1020/3206 [08:34<27:29,  1.33it/s]

{'loss': 0.037, 'grad_norm': 0.3121154010295868, 'learning_rate': 2.0455396132252028e-05, 'epoch': 0.64}


 32%|███▏      | 1030/3206 [08:42<28:13,  1.28it/s]

{'loss': 0.0373, 'grad_norm': 0.2505495846271515, 'learning_rate': 2.036182158452901e-05, 'epoch': 0.64}


 32%|███▏      | 1040/3206 [08:49<28:05,  1.29it/s]

{'loss': 0.04, 'grad_norm': 0.3753795921802521, 'learning_rate': 2.026824703680599e-05, 'epoch': 0.65}


 33%|███▎      | 1050/3206 [08:57<28:33,  1.26it/s]

{'loss': 0.0328, 'grad_norm': 0.27988386154174805, 'learning_rate': 2.0174672489082972e-05, 'epoch': 0.66}


 33%|███▎      | 1060/3206 [09:05<28:09,  1.27it/s]

{'loss': 0.0431, 'grad_norm': 0.4707167148590088, 'learning_rate': 2.008109794135995e-05, 'epoch': 0.66}


 33%|███▎      | 1070/3206 [09:13<28:23,  1.25it/s]

{'loss': 0.0591, 'grad_norm': 0.3371438980102539, 'learning_rate': 1.9987523393636933e-05, 'epoch': 0.67}


 34%|███▎      | 1080/3206 [09:21<28:22,  1.25it/s]

{'loss': 0.0512, 'grad_norm': 0.3934256434440613, 'learning_rate': 1.9893948845913912e-05, 'epoch': 0.67}


 34%|███▍      | 1090/3206 [09:29<28:04,  1.26it/s]

{'loss': 0.0297, 'grad_norm': 0.413288950920105, 'learning_rate': 1.980037429819089e-05, 'epoch': 0.68}


 34%|███▍      | 1100/3206 [09:37<27:28,  1.28it/s]

{'loss': 0.0339, 'grad_norm': 0.2918906807899475, 'learning_rate': 1.970679975046787e-05, 'epoch': 0.69}


 35%|███▍      | 1110/3206 [09:45<27:27,  1.27it/s]

{'loss': 0.041, 'grad_norm': 0.3609526753425598, 'learning_rate': 1.9613225202744853e-05, 'epoch': 0.69}


 35%|███▍      | 1120/3206 [09:53<27:00,  1.29it/s]

{'loss': 0.0408, 'grad_norm': 0.23136086761951447, 'learning_rate': 1.9519650655021832e-05, 'epoch': 0.7}


 35%|███▌      | 1130/3206 [09:58<16:25,  2.11it/s]

{'loss': 0.0333, 'grad_norm': 0.29055047035217285, 'learning_rate': 1.9426076107298815e-05, 'epoch': 0.7}


 36%|███▌      | 1140/3206 [10:02<16:11,  2.13it/s]

{'loss': 0.0372, 'grad_norm': 0.26972246170043945, 'learning_rate': 1.9332501559575794e-05, 'epoch': 0.71}


 36%|███▌      | 1150/3206 [10:07<16:01,  2.14it/s]

{'loss': 0.0463, 'grad_norm': 0.298904687166214, 'learning_rate': 1.9238927011852776e-05, 'epoch': 0.72}


 36%|███▌      | 1160/3206 [10:12<15:52,  2.15it/s]

{'loss': 0.0371, 'grad_norm': 0.31128090620040894, 'learning_rate': 1.9145352464129755e-05, 'epoch': 0.72}


 36%|███▋      | 1170/3206 [10:16<15:47,  2.15it/s]

{'loss': 0.0367, 'grad_norm': 0.41819536685943604, 'learning_rate': 1.9051777916406738e-05, 'epoch': 0.73}


 37%|███▋      | 1180/3206 [10:21<15:44,  2.14it/s]

{'loss': 0.043, 'grad_norm': 0.4576278626918793, 'learning_rate': 1.8958203368683717e-05, 'epoch': 0.74}


 37%|███▋      | 1190/3206 [10:26<15:50,  2.12it/s]

{'loss': 0.0322, 'grad_norm': 0.38730645179748535, 'learning_rate': 1.88646288209607e-05, 'epoch': 0.74}


 37%|███▋      | 1200/3206 [10:30<15:35,  2.14it/s]

{'loss': 0.0336, 'grad_norm': 0.34048542380332947, 'learning_rate': 1.877105427323768e-05, 'epoch': 0.75}


 38%|███▊      | 1210/3206 [10:35<15:29,  2.15it/s]

{'loss': 0.0487, 'grad_norm': 0.4433746039867401, 'learning_rate': 1.867747972551466e-05, 'epoch': 0.75}


 38%|███▊      | 1220/3206 [10:40<15:33,  2.13it/s]

{'loss': 0.0373, 'grad_norm': 0.4571499228477478, 'learning_rate': 1.858390517779164e-05, 'epoch': 0.76}


 38%|███▊      | 1230/3206 [10:44<15:20,  2.15it/s]

{'loss': 0.0407, 'grad_norm': 0.4628845453262329, 'learning_rate': 1.8490330630068623e-05, 'epoch': 0.77}


 39%|███▊      | 1240/3206 [10:49<15:18,  2.14it/s]

{'loss': 0.0402, 'grad_norm': 0.2851603031158447, 'learning_rate': 1.83967560823456e-05, 'epoch': 0.77}


 39%|███▉      | 1250/3206 [10:54<15:12,  2.14it/s]

{'loss': 0.0343, 'grad_norm': 0.34913086891174316, 'learning_rate': 1.8303181534622584e-05, 'epoch': 0.78}


 39%|███▉      | 1260/3206 [10:58<15:09,  2.14it/s]

{'loss': 0.0443, 'grad_norm': 0.49291926622390747, 'learning_rate': 1.8209606986899563e-05, 'epoch': 0.79}


 40%|███▉      | 1270/3206 [11:03<15:07,  2.13it/s]

{'loss': 0.0496, 'grad_norm': 0.2846435308456421, 'learning_rate': 1.8116032439176546e-05, 'epoch': 0.79}


 40%|███▉      | 1280/3206 [11:08<14:55,  2.15it/s]

{'loss': 0.0391, 'grad_norm': 0.3189795911312103, 'learning_rate': 1.8022457891453525e-05, 'epoch': 0.8}


 40%|████      | 1290/3206 [11:12<15:01,  2.13it/s]

{'loss': 0.0326, 'grad_norm': 0.49035024642944336, 'learning_rate': 1.7928883343730507e-05, 'epoch': 0.8}


 41%|████      | 1300/3206 [11:17<14:47,  2.15it/s]

{'loss': 0.0416, 'grad_norm': 0.9132699370384216, 'learning_rate': 1.7835308796007486e-05, 'epoch': 0.81}


 41%|████      | 1310/3206 [11:22<14:49,  2.13it/s]

{'loss': 0.0383, 'grad_norm': 0.4876684546470642, 'learning_rate': 1.774173424828447e-05, 'epoch': 0.82}


 41%|████      | 1320/3206 [11:27<14:37,  2.15it/s]

{'loss': 0.0324, 'grad_norm': 0.3270654082298279, 'learning_rate': 1.7648159700561448e-05, 'epoch': 0.82}


 41%|████▏     | 1330/3206 [11:31<14:38,  2.13it/s]

{'loss': 0.0402, 'grad_norm': 0.48449450731277466, 'learning_rate': 1.755458515283843e-05, 'epoch': 0.83}


 42%|████▏     | 1340/3206 [11:36<14:29,  2.15it/s]

{'loss': 0.0498, 'grad_norm': 0.240003302693367, 'learning_rate': 1.746101060511541e-05, 'epoch': 0.84}


 42%|████▏     | 1350/3206 [11:41<14:26,  2.14it/s]

{'loss': 0.0268, 'grad_norm': 0.20964522659778595, 'learning_rate': 1.7367436057392392e-05, 'epoch': 0.84}


 42%|████▏     | 1360/3206 [11:45<14:22,  2.14it/s]

{'loss': 0.0434, 'grad_norm': 0.43760284781455994, 'learning_rate': 1.727386150966937e-05, 'epoch': 0.85}


 43%|████▎     | 1370/3206 [11:50<14:22,  2.13it/s]

{'loss': 0.0388, 'grad_norm': 0.34139880537986755, 'learning_rate': 1.7180286961946354e-05, 'epoch': 0.85}


 43%|████▎     | 1380/3206 [11:55<14:14,  2.14it/s]

{'loss': 0.0401, 'grad_norm': 0.30279240012168884, 'learning_rate': 1.7086712414223333e-05, 'epoch': 0.86}


 43%|████▎     | 1390/3206 [11:59<14:11,  2.13it/s]

{'loss': 0.0366, 'grad_norm': 0.32028016448020935, 'learning_rate': 1.6993137866500312e-05, 'epoch': 0.87}


 44%|████▎     | 1400/3206 [12:04<14:28,  2.08it/s]

{'loss': 0.0482, 'grad_norm': 0.305671364068985, 'learning_rate': 1.689956331877729e-05, 'epoch': 0.87}


 44%|████▍     | 1410/3206 [12:09<13:55,  2.15it/s]

{'loss': 0.0526, 'grad_norm': 0.35928529500961304, 'learning_rate': 1.6805988771054273e-05, 'epoch': 0.88}


 44%|████▍     | 1420/3206 [12:13<13:52,  2.15it/s]

{'loss': 0.0389, 'grad_norm': 0.3998653292655945, 'learning_rate': 1.6712414223331252e-05, 'epoch': 0.89}


 45%|████▍     | 1430/3206 [12:18<13:54,  2.13it/s]

{'loss': 0.034, 'grad_norm': 0.21304093301296234, 'learning_rate': 1.6618839675608235e-05, 'epoch': 0.89}


 45%|████▍     | 1440/3206 [12:23<13:48,  2.13it/s]

{'loss': 0.0408, 'grad_norm': 0.2955358922481537, 'learning_rate': 1.6525265127885214e-05, 'epoch': 0.9}


 45%|████▌     | 1450/3206 [12:27<13:51,  2.11it/s]

{'loss': 0.0354, 'grad_norm': 0.37409842014312744, 'learning_rate': 1.6431690580162197e-05, 'epoch': 0.9}


 46%|████▌     | 1460/3206 [12:32<13:34,  2.14it/s]

{'loss': 0.0337, 'grad_norm': 0.4834647476673126, 'learning_rate': 1.6338116032439176e-05, 'epoch': 0.91}


 46%|████▌     | 1470/3206 [12:37<13:37,  2.12it/s]

{'loss': 0.0437, 'grad_norm': 0.4438875913619995, 'learning_rate': 1.6244541484716158e-05, 'epoch': 0.92}


 46%|████▌     | 1480/3206 [12:42<13:29,  2.13it/s]

{'loss': 0.0409, 'grad_norm': 0.35479554533958435, 'learning_rate': 1.6150966936993137e-05, 'epoch': 0.92}


 46%|████▋     | 1490/3206 [12:46<13:25,  2.13it/s]

{'loss': 0.0355, 'grad_norm': 0.22187037765979767, 'learning_rate': 1.605739238927012e-05, 'epoch': 0.93}


 47%|████▋     | 1500/3206 [12:51<13:23,  2.12it/s]

{'loss': 0.0381, 'grad_norm': 0.2942262887954712, 'learning_rate': 1.59638178415471e-05, 'epoch': 0.94}


 47%|████▋     | 1510/3206 [12:57<13:40,  2.07it/s]

{'loss': 0.0416, 'grad_norm': 0.37326887249946594, 'learning_rate': 1.587024329382408e-05, 'epoch': 0.94}


 47%|████▋     | 1520/3206 [13:02<13:05,  2.15it/s]

{'loss': 0.0365, 'grad_norm': 0.4255453646183014, 'learning_rate': 1.577666874610106e-05, 'epoch': 0.95}


 48%|████▊     | 1530/3206 [13:06<13:05,  2.13it/s]

{'loss': 0.042, 'grad_norm': 0.33353036642074585, 'learning_rate': 1.5683094198378043e-05, 'epoch': 0.95}


 48%|████▊     | 1540/3206 [13:11<13:01,  2.13it/s]

{'loss': 0.0364, 'grad_norm': 0.17124487459659576, 'learning_rate': 1.5589519650655022e-05, 'epoch': 0.96}


 48%|████▊     | 1550/3206 [13:16<12:51,  2.15it/s]

{'loss': 0.0353, 'grad_norm': 0.4207940101623535, 'learning_rate': 1.5495945102932004e-05, 'epoch': 0.97}


 49%|████▊     | 1560/3206 [13:20<12:46,  2.15it/s]

{'loss': 0.0491, 'grad_norm': 0.35897523164749146, 'learning_rate': 1.5402370555208983e-05, 'epoch': 0.97}


 49%|████▉     | 1570/3206 [13:25<12:43,  2.14it/s]

{'loss': 0.0368, 'grad_norm': 0.3122720718383789, 'learning_rate': 1.5308796007485966e-05, 'epoch': 0.98}


 49%|████▉     | 1580/3206 [13:30<12:38,  2.14it/s]

{'loss': 0.0371, 'grad_norm': 0.31697142124176025, 'learning_rate': 1.5215221459762945e-05, 'epoch': 0.99}


 50%|████▉     | 1590/3206 [13:34<12:40,  2.12it/s]

{'loss': 0.0412, 'grad_norm': 0.33036094903945923, 'learning_rate': 1.5121646912039928e-05, 'epoch': 0.99}


 50%|████▉     | 1600/3206 [13:39<12:34,  2.13it/s]

{'loss': 0.0338, 'grad_norm': 0.25635838508605957, 'learning_rate': 1.5028072364316907e-05, 'epoch': 1.0}



 50%|█████     | 1603/3206 [15:59<10:59,  2.43it/s]

{'eval_loss': 0.06567298620939255, 'eval_sacrebleu': 13.581528156831043, 'eval_runtime': 139.0233, 'eval_samples_per_second': 9.926, 'eval_steps_per_second': 2.482, 'epoch': 1.0}


 50%|█████     | 1610/3206 [16:03<2:23:01,  5.38s/it] 

{'loss': 0.0328, 'grad_norm': 0.38752487301826477, 'learning_rate': 1.4934497816593887e-05, 'epoch': 1.0}


 51%|█████     | 1620/3206 [16:08<15:58,  1.65it/s]  

{'loss': 0.0379, 'grad_norm': 0.3466513752937317, 'learning_rate': 1.4840923268870867e-05, 'epoch': 1.01}


 51%|█████     | 1630/3206 [16:12<12:21,  2.13it/s]

{'loss': 0.0344, 'grad_norm': 0.2667334973812103, 'learning_rate': 1.4747348721147847e-05, 'epoch': 1.02}


 51%|█████     | 1640/3206 [16:17<12:08,  2.15it/s]

{'loss': 0.0401, 'grad_norm': 0.47689709067344666, 'learning_rate': 1.4653774173424828e-05, 'epoch': 1.02}


 51%|█████▏    | 1650/3206 [16:22<12:04,  2.15it/s]

{'loss': 0.0355, 'grad_norm': 0.4500890076160431, 'learning_rate': 1.4560199625701809e-05, 'epoch': 1.03}


 52%|█████▏    | 1660/3206 [16:26<12:10,  2.12it/s]

{'loss': 0.032, 'grad_norm': 0.4428517818450928, 'learning_rate': 1.446662507797879e-05, 'epoch': 1.04}


 52%|█████▏    | 1670/3206 [16:31<11:56,  2.14it/s]

{'loss': 0.0411, 'grad_norm': 0.3011291027069092, 'learning_rate': 1.437305053025577e-05, 'epoch': 1.04}


 52%|█████▏    | 1680/3206 [16:36<11:51,  2.15it/s]

{'loss': 0.0396, 'grad_norm': 0.32629719376564026, 'learning_rate': 1.4279475982532751e-05, 'epoch': 1.05}


 53%|█████▎    | 1690/3206 [16:40<11:51,  2.13it/s]

{'loss': 0.0367, 'grad_norm': 0.19298797845840454, 'learning_rate': 1.4185901434809732e-05, 'epoch': 1.05}


 53%|█████▎    | 1700/3206 [16:45<11:43,  2.14it/s]

{'loss': 0.035, 'grad_norm': 0.48727717995643616, 'learning_rate': 1.4092326887086713e-05, 'epoch': 1.06}


 53%|█████▎    | 1710/3206 [16:50<11:36,  2.15it/s]

{'loss': 0.0465, 'grad_norm': 0.5801207423210144, 'learning_rate': 1.3998752339363694e-05, 'epoch': 1.07}


 54%|█████▎    | 1720/3206 [16:54<11:33,  2.14it/s]

{'loss': 0.041, 'grad_norm': 0.2312019169330597, 'learning_rate': 1.3905177791640674e-05, 'epoch': 1.07}


 54%|█████▍    | 1730/3206 [16:59<11:31,  2.14it/s]

{'loss': 0.0242, 'grad_norm': 0.35995006561279297, 'learning_rate': 1.3811603243917655e-05, 'epoch': 1.08}


 54%|█████▍    | 1740/3206 [17:04<11:32,  2.12it/s]

{'loss': 0.0278, 'grad_norm': 0.3529146611690521, 'learning_rate': 1.3718028696194636e-05, 'epoch': 1.09}


 55%|█████▍    | 1750/3206 [17:09<11:18,  2.15it/s]

{'loss': 0.0401, 'grad_norm': 0.7309990525245667, 'learning_rate': 1.3624454148471617e-05, 'epoch': 1.09}


 55%|█████▍    | 1760/3206 [17:13<11:13,  2.15it/s]

{'loss': 0.0414, 'grad_norm': 0.401713490486145, 'learning_rate': 1.3530879600748598e-05, 'epoch': 1.1}


 55%|█████▌    | 1770/3206 [17:18<11:08,  2.15it/s]

{'loss': 0.0499, 'grad_norm': 0.45108574628829956, 'learning_rate': 1.3437305053025578e-05, 'epoch': 1.1}


 56%|█████▌    | 1780/3206 [17:23<11:07,  2.14it/s]

{'loss': 0.039, 'grad_norm': 0.3853246569633484, 'learning_rate': 1.3343730505302557e-05, 'epoch': 1.11}


 56%|█████▌    | 1790/3206 [17:27<11:06,  2.13it/s]

{'loss': 0.0314, 'grad_norm': 0.2608006000518799, 'learning_rate': 1.3250155957579538e-05, 'epoch': 1.12}


 56%|█████▌    | 1800/3206 [17:32<10:57,  2.14it/s]

{'loss': 0.036, 'grad_norm': 0.325930118560791, 'learning_rate': 1.3156581409856519e-05, 'epoch': 1.12}


 56%|█████▋    | 1810/3206 [17:37<10:53,  2.14it/s]

{'loss': 0.0322, 'grad_norm': 0.25116172432899475, 'learning_rate': 1.30630068621335e-05, 'epoch': 1.13}


 57%|█████▋    | 1820/3206 [17:41<11:05,  2.08it/s]

{'loss': 0.0345, 'grad_norm': 0.36790743470191956, 'learning_rate': 1.296943231441048e-05, 'epoch': 1.14}


 57%|█████▋    | 1830/3206 [17:46<12:17,  1.87it/s]

{'loss': 0.0368, 'grad_norm': 0.3964082896709442, 'learning_rate': 1.2875857766687461e-05, 'epoch': 1.14}


 57%|█████▋    | 1840/3206 [17:51<10:42,  2.13it/s]

{'loss': 0.0393, 'grad_norm': 0.43764641880989075, 'learning_rate': 1.2782283218964442e-05, 'epoch': 1.15}


 58%|█████▊    | 1850/3206 [17:56<10:31,  2.15it/s]

{'loss': 0.0316, 'grad_norm': 0.3200012445449829, 'learning_rate': 1.2688708671241423e-05, 'epoch': 1.15}


 58%|█████▊    | 1860/3206 [18:01<10:33,  2.12it/s]

{'loss': 0.049, 'grad_norm': 0.3956458270549774, 'learning_rate': 1.2595134123518404e-05, 'epoch': 1.16}


 58%|█████▊    | 1870/3206 [18:05<10:24,  2.14it/s]

{'loss': 0.0329, 'grad_norm': 0.35504037141799927, 'learning_rate': 1.2501559575795385e-05, 'epoch': 1.17}


 59%|█████▊    | 1880/3206 [18:10<10:18,  2.14it/s]

{'loss': 0.0317, 'grad_norm': 0.32773494720458984, 'learning_rate': 1.2407985028072365e-05, 'epoch': 1.17}


 59%|█████▉    | 1890/3206 [18:15<10:19,  2.12it/s]

{'loss': 0.031, 'grad_norm': 0.6851794719696045, 'learning_rate': 1.2314410480349346e-05, 'epoch': 1.18}


 59%|█████▉    | 1900/3206 [18:19<10:10,  2.14it/s]

{'loss': 0.0496, 'grad_norm': 0.34321483969688416, 'learning_rate': 1.2220835932626327e-05, 'epoch': 1.19}


 60%|█████▉    | 1910/3206 [18:24<10:09,  2.13it/s]

{'loss': 0.0441, 'grad_norm': 0.42673107981681824, 'learning_rate': 1.2127261384903308e-05, 'epoch': 1.19}


 60%|█████▉    | 1920/3206 [18:29<09:59,  2.14it/s]

{'loss': 0.0374, 'grad_norm': 0.37522950768470764, 'learning_rate': 1.2033686837180289e-05, 'epoch': 1.2}


 60%|██████    | 1930/3206 [18:33<09:56,  2.14it/s]

{'loss': 0.0327, 'grad_norm': 0.29787734150886536, 'learning_rate': 1.1940112289457268e-05, 'epoch': 1.2}


 61%|██████    | 1940/3206 [18:38<09:55,  2.12it/s]

{'loss': 0.0431, 'grad_norm': 0.3979950547218323, 'learning_rate': 1.1846537741734248e-05, 'epoch': 1.21}


 61%|██████    | 1950/3206 [18:43<09:46,  2.14it/s]

{'loss': 0.0369, 'grad_norm': 0.510367214679718, 'learning_rate': 1.175296319401123e-05, 'epoch': 1.22}


 61%|██████    | 1960/3206 [18:47<09:42,  2.14it/s]

{'loss': 0.0394, 'grad_norm': 0.34004589915275574, 'learning_rate': 1.165938864628821e-05, 'epoch': 1.22}


 61%|██████▏   | 1970/3206 [18:52<09:41,  2.13it/s]

{'loss': 0.0341, 'grad_norm': 0.44589537382125854, 'learning_rate': 1.156581409856519e-05, 'epoch': 1.23}


 62%|██████▏   | 1980/3206 [18:57<09:35,  2.13it/s]

{'loss': 0.0374, 'grad_norm': 0.3999633193016052, 'learning_rate': 1.1472239550842172e-05, 'epoch': 1.24}


 62%|██████▏   | 1990/3206 [19:01<09:32,  2.12it/s]

{'loss': 0.0318, 'grad_norm': 0.30992525815963745, 'learning_rate': 1.1378665003119152e-05, 'epoch': 1.24}


 62%|██████▏   | 2000/3206 [19:06<09:27,  2.13it/s]

{'loss': 0.0354, 'grad_norm': 0.32379618287086487, 'learning_rate': 1.1285090455396133e-05, 'epoch': 1.25}


 63%|██████▎   | 2010/3206 [19:12<09:42,  2.05it/s]

{'loss': 0.0454, 'grad_norm': 0.43276283144950867, 'learning_rate': 1.1191515907673114e-05, 'epoch': 1.25}


 63%|██████▎   | 2020/3206 [19:17<09:16,  2.13it/s]

{'loss': 0.031, 'grad_norm': 0.419127881526947, 'learning_rate': 1.1097941359950095e-05, 'epoch': 1.26}


 63%|██████▎   | 2030/3206 [19:22<09:07,  2.15it/s]

{'loss': 0.0294, 'grad_norm': 0.27264198660850525, 'learning_rate': 1.1004366812227075e-05, 'epoch': 1.27}


 64%|██████▎   | 2040/3206 [19:26<09:05,  2.14it/s]

{'loss': 0.0399, 'grad_norm': 0.24941875040531158, 'learning_rate': 1.0910792264504056e-05, 'epoch': 1.27}


 64%|██████▍   | 2050/3206 [19:31<09:05,  2.12it/s]

{'loss': 0.0373, 'grad_norm': 0.3525792360305786, 'learning_rate': 1.0817217716781037e-05, 'epoch': 1.28}


 64%|██████▍   | 2060/3206 [19:36<08:54,  2.15it/s]

{'loss': 0.042, 'grad_norm': 0.34496068954467773, 'learning_rate': 1.0723643169058018e-05, 'epoch': 1.29}


 65%|██████▍   | 2070/3206 [19:40<08:51,  2.14it/s]

{'loss': 0.0441, 'grad_norm': 0.15256530046463013, 'learning_rate': 1.0630068621334999e-05, 'epoch': 1.29}


 65%|██████▍   | 2080/3206 [19:45<08:47,  2.14it/s]

{'loss': 0.0369, 'grad_norm': 0.3526161015033722, 'learning_rate': 1.0536494073611978e-05, 'epoch': 1.3}


 65%|██████▌   | 2090/3206 [19:50<08:44,  2.13it/s]

{'loss': 0.0453, 'grad_norm': 0.33044958114624023, 'learning_rate': 1.0442919525888959e-05, 'epoch': 1.3}


 66%|██████▌   | 2100/3206 [19:54<08:36,  2.14it/s]

{'loss': 0.0318, 'grad_norm': 0.4829869270324707, 'learning_rate': 1.034934497816594e-05, 'epoch': 1.31}


 66%|██████▌   | 2110/3206 [19:59<08:34,  2.13it/s]

{'loss': 0.0373, 'grad_norm': 0.28208231925964355, 'learning_rate': 1.0255770430442918e-05, 'epoch': 1.32}


 66%|██████▌   | 2120/3206 [20:04<08:31,  2.12it/s]

{'loss': 0.0482, 'grad_norm': 0.31710952520370483, 'learning_rate': 1.01621958827199e-05, 'epoch': 1.32}


 66%|██████▋   | 2130/3206 [20:08<08:22,  2.14it/s]

{'loss': 0.0339, 'grad_norm': 0.31666994094848633, 'learning_rate': 1.006862133499688e-05, 'epoch': 1.33}


 67%|██████▋   | 2140/3206 [20:13<08:16,  2.14it/s]

{'loss': 0.0385, 'grad_norm': 0.279032826423645, 'learning_rate': 9.97504678727386e-06, 'epoch': 1.33}


 67%|██████▋   | 2150/3206 [20:18<08:16,  2.13it/s]

{'loss': 0.0292, 'grad_norm': 0.25463467836380005, 'learning_rate': 9.881472239550842e-06, 'epoch': 1.34}


 67%|██████▋   | 2160/3206 [20:23<08:12,  2.13it/s]

{'loss': 0.0483, 'grad_norm': 0.3640073239803314, 'learning_rate': 9.787897691827822e-06, 'epoch': 1.35}


 68%|██████▊   | 2170/3206 [20:27<08:04,  2.14it/s]

{'loss': 0.0345, 'grad_norm': 0.49997788667678833, 'learning_rate': 9.694323144104803e-06, 'epoch': 1.35}


 68%|██████▊   | 2180/3206 [20:32<07:58,  2.14it/s]

{'loss': 0.0636, 'grad_norm': 0.43118715286254883, 'learning_rate': 9.600748596381784e-06, 'epoch': 1.36}


 68%|██████▊   | 2190/3206 [20:37<07:56,  2.13it/s]

{'loss': 0.0372, 'grad_norm': 0.35874879360198975, 'learning_rate': 9.507174048658765e-06, 'epoch': 1.37}


 69%|██████▊   | 2200/3206 [20:41<07:54,  2.12it/s]

{'loss': 0.0379, 'grad_norm': 0.35795891284942627, 'learning_rate': 9.413599500935746e-06, 'epoch': 1.37}


 69%|██████▉   | 2210/3206 [20:46<07:44,  2.14it/s]

{'loss': 0.0357, 'grad_norm': 0.5544427037239075, 'learning_rate': 9.320024953212726e-06, 'epoch': 1.38}


 69%|██████▉   | 2220/3206 [20:51<07:41,  2.14it/s]

{'loss': 0.0452, 'grad_norm': 0.39143112301826477, 'learning_rate': 9.226450405489707e-06, 'epoch': 1.38}


 70%|██████▉   | 2230/3206 [20:55<07:35,  2.14it/s]

{'loss': 0.0318, 'grad_norm': 0.3172743022441864, 'learning_rate': 9.132875857766688e-06, 'epoch': 1.39}


 70%|██████▉   | 2240/3206 [21:00<07:31,  2.14it/s]

{'loss': 0.0365, 'grad_norm': 0.2758018672466278, 'learning_rate': 9.039301310043667e-06, 'epoch': 1.4}


 70%|███████   | 2250/3206 [21:05<07:26,  2.14it/s]

{'loss': 0.0326, 'grad_norm': 0.3407478630542755, 'learning_rate': 8.945726762320648e-06, 'epoch': 1.4}


 70%|███████   | 2260/3206 [21:09<07:21,  2.14it/s]

{'loss': 0.0473, 'grad_norm': 0.3854328393936157, 'learning_rate': 8.852152214597629e-06, 'epoch': 1.41}


 71%|███████   | 2270/3206 [21:14<07:20,  2.12it/s]

{'loss': 0.0355, 'grad_norm': 0.26202815771102905, 'learning_rate': 8.75857766687461e-06, 'epoch': 1.42}


 71%|███████   | 2280/3206 [21:19<07:15,  2.13it/s]

{'loss': 0.0426, 'grad_norm': 0.4292384088039398, 'learning_rate': 8.66500311915159e-06, 'epoch': 1.42}


 71%|███████▏  | 2290/3206 [21:23<07:12,  2.12it/s]

{'loss': 0.0399, 'grad_norm': 0.7266135215759277, 'learning_rate': 8.571428571428571e-06, 'epoch': 1.43}


 72%|███████▏  | 2300/3206 [21:28<07:03,  2.14it/s]

{'loss': 0.0435, 'grad_norm': 0.3269810974597931, 'learning_rate': 8.477854023705552e-06, 'epoch': 1.43}


 72%|███████▏  | 2310/3206 [21:33<06:57,  2.14it/s]

{'loss': 0.032, 'grad_norm': 0.3308592438697815, 'learning_rate': 8.384279475982532e-06, 'epoch': 1.44}


 72%|███████▏  | 2320/3206 [21:37<06:52,  2.15it/s]

{'loss': 0.0461, 'grad_norm': 0.36070066690444946, 'learning_rate': 8.290704928259513e-06, 'epoch': 1.45}


 73%|███████▎  | 2330/3206 [21:42<06:49,  2.14it/s]

{'loss': 0.0311, 'grad_norm': 0.43563082814216614, 'learning_rate': 8.197130380536494e-06, 'epoch': 1.45}


 73%|███████▎  | 2340/3206 [21:47<06:44,  2.14it/s]

{'loss': 0.0412, 'grad_norm': 0.3333953022956848, 'learning_rate': 8.103555832813475e-06, 'epoch': 1.46}


 73%|███████▎  | 2350/3206 [21:52<06:41,  2.13it/s]

{'loss': 0.0463, 'grad_norm': 0.35607975721359253, 'learning_rate': 8.009981285090456e-06, 'epoch': 1.47}


 74%|███████▎  | 2360/3206 [21:56<06:35,  2.14it/s]

{'loss': 0.0352, 'grad_norm': 0.38385316729545593, 'learning_rate': 7.916406737367436e-06, 'epoch': 1.47}


 74%|███████▍  | 2370/3206 [22:01<06:34,  2.12it/s]

{'loss': 0.0422, 'grad_norm': 0.3409155607223511, 'learning_rate': 7.822832189644417e-06, 'epoch': 1.48}


 74%|███████▍  | 2380/3206 [22:06<06:27,  2.13it/s]

{'loss': 0.0356, 'grad_norm': 0.3586655557155609, 'learning_rate': 7.729257641921398e-06, 'epoch': 1.48}


 75%|███████▍  | 2390/3206 [22:10<06:23,  2.13it/s]

{'loss': 0.0434, 'grad_norm': 0.3260762095451355, 'learning_rate': 7.635683094198377e-06, 'epoch': 1.49}


 75%|███████▍  | 2400/3206 [22:15<06:16,  2.14it/s]

{'loss': 0.0435, 'grad_norm': 0.3648258447647095, 'learning_rate': 7.542108546475359e-06, 'epoch': 1.5}


 75%|███████▌  | 2410/3206 [22:20<06:15,  2.12it/s]

{'loss': 0.0444, 'grad_norm': 0.4548003673553467, 'learning_rate': 7.4485339987523395e-06, 'epoch': 1.5}


 75%|███████▌  | 2420/3206 [22:24<06:07,  2.14it/s]

{'loss': 0.0339, 'grad_norm': 0.3015728294849396, 'learning_rate': 7.35495945102932e-06, 'epoch': 1.51}


 76%|███████▌  | 2430/3206 [22:29<06:03,  2.13it/s]

{'loss': 0.039, 'grad_norm': 0.37603121995925903, 'learning_rate': 7.2613849033063e-06, 'epoch': 1.52}


 76%|███████▌  | 2440/3206 [22:34<06:05,  2.09it/s]

{'loss': 0.0363, 'grad_norm': 0.36490398645401, 'learning_rate': 7.167810355583281e-06, 'epoch': 1.52}


 76%|███████▋  | 2450/3206 [22:39<05:56,  2.12it/s]

{'loss': 0.0346, 'grad_norm': 0.5528420209884644, 'learning_rate': 7.074235807860262e-06, 'epoch': 1.53}


 77%|███████▋  | 2460/3206 [22:43<05:51,  2.12it/s]

{'loss': 0.0311, 'grad_norm': 0.4052344560623169, 'learning_rate': 6.980661260137243e-06, 'epoch': 1.53}


 77%|███████▋  | 2470/3206 [22:48<06:07,  2.00it/s]

{'loss': 0.035, 'grad_norm': 0.34740057587623596, 'learning_rate': 6.8870867124142234e-06, 'epoch': 1.54}


 77%|███████▋  | 2480/3206 [22:55<08:47,  1.38it/s]

{'loss': 0.0492, 'grad_norm': 0.41318950057029724, 'learning_rate': 6.793512164691204e-06, 'epoch': 1.55}


 78%|███████▊  | 2490/3206 [23:02<09:00,  1.33it/s]

{'loss': 0.0412, 'grad_norm': 0.4041561186313629, 'learning_rate': 6.699937616968185e-06, 'epoch': 1.55}


 78%|███████▊  | 2500/3206 [23:10<08:50,  1.33it/s]

{'loss': 0.0411, 'grad_norm': 0.4231911301612854, 'learning_rate': 6.606363069245166e-06, 'epoch': 1.56}


 78%|███████▊  | 2510/3206 [23:19<08:54,  1.30it/s]

{'loss': 0.0478, 'grad_norm': 0.4486480951309204, 'learning_rate': 6.512788521522146e-06, 'epoch': 1.57}


 79%|███████▊  | 2520/3206 [23:26<08:34,  1.33it/s]

{'loss': 0.0438, 'grad_norm': 0.35873210430145264, 'learning_rate': 6.4192139737991265e-06, 'epoch': 1.57}


 79%|███████▉  | 2530/3206 [23:34<08:27,  1.33it/s]

{'loss': 0.0347, 'grad_norm': 0.34420597553253174, 'learning_rate': 6.325639426076107e-06, 'epoch': 1.58}


 79%|███████▉  | 2540/3206 [23:41<08:18,  1.34it/s]

{'loss': 0.062, 'grad_norm': 0.24476362764835358, 'learning_rate': 6.232064878353088e-06, 'epoch': 1.58}


 80%|███████▉  | 2550/3206 [23:49<08:23,  1.30it/s]

{'loss': 0.0348, 'grad_norm': 0.36615169048309326, 'learning_rate': 6.138490330630069e-06, 'epoch': 1.59}


 80%|███████▉  | 2560/3206 [23:57<08:05,  1.33it/s]

{'loss': 0.0316, 'grad_norm': 0.3821001946926117, 'learning_rate': 6.04491578290705e-06, 'epoch': 1.6}


 80%|████████  | 2570/3206 [24:04<07:59,  1.33it/s]

{'loss': 0.0444, 'grad_norm': 0.36104488372802734, 'learning_rate': 5.9513412351840305e-06, 'epoch': 1.6}


 80%|████████  | 2580/3206 [24:12<07:49,  1.33it/s]

{'loss': 0.0356, 'grad_norm': 0.10372164100408554, 'learning_rate': 5.857766687461011e-06, 'epoch': 1.61}


 81%|████████  | 2590/3206 [24:19<07:54,  1.30it/s]

{'loss': 0.0464, 'grad_norm': 0.22012700140476227, 'learning_rate': 5.764192139737991e-06, 'epoch': 1.62}


 81%|████████  | 2600/3206 [24:27<07:46,  1.30it/s]

{'loss': 0.0424, 'grad_norm': 0.4646395444869995, 'learning_rate': 5.670617592014972e-06, 'epoch': 1.62}


 81%|████████▏ | 2610/3206 [24:35<07:45,  1.28it/s]

{'loss': 0.0388, 'grad_norm': 0.2453603893518448, 'learning_rate': 5.577043044291953e-06, 'epoch': 1.63}


 82%|████████▏ | 2620/3206 [24:42<07:22,  1.32it/s]

{'loss': 0.0342, 'grad_norm': 0.3675726354122162, 'learning_rate': 5.4834684965689336e-06, 'epoch': 1.63}


 82%|████████▏ | 2630/3206 [24:50<07:10,  1.34it/s]

{'loss': 0.0464, 'grad_norm': 0.39168548583984375, 'learning_rate': 5.389893948845914e-06, 'epoch': 1.64}


 82%|████████▏ | 2640/3206 [24:57<07:05,  1.33it/s]

{'loss': 0.0326, 'grad_norm': 0.3185268044471741, 'learning_rate': 5.296319401122895e-06, 'epoch': 1.65}


 83%|████████▎ | 2650/3206 [25:05<06:56,  1.34it/s]

{'loss': 0.0467, 'grad_norm': 0.1966874599456787, 'learning_rate': 5.202744853399876e-06, 'epoch': 1.65}


 83%|████████▎ | 2660/3206 [25:12<06:50,  1.33it/s]

{'loss': 0.0363, 'grad_norm': 0.32902792096138, 'learning_rate': 5.109170305676856e-06, 'epoch': 1.66}


 83%|████████▎ | 2670/3206 [25:20<06:40,  1.34it/s]

{'loss': 0.0332, 'grad_norm': 0.311939537525177, 'learning_rate': 5.015595757953837e-06, 'epoch': 1.67}


 84%|████████▎ | 2680/3206 [25:27<06:33,  1.34it/s]

{'loss': 0.0366, 'grad_norm': 0.40941911935806274, 'learning_rate': 4.9220212102308175e-06, 'epoch': 1.67}


 84%|████████▍ | 2690/3206 [25:35<06:41,  1.28it/s]

{'loss': 0.0405, 'grad_norm': 0.3506719470024109, 'learning_rate': 4.828446662507798e-06, 'epoch': 1.68}


 84%|████████▍ | 2700/3206 [25:43<06:28,  1.30it/s]

{'loss': 0.0467, 'grad_norm': 0.12433233112096786, 'learning_rate': 4.734872114784779e-06, 'epoch': 1.68}


 85%|████████▍ | 2710/3206 [25:50<06:27,  1.28it/s]

{'loss': 0.0377, 'grad_norm': 0.307964563369751, 'learning_rate': 4.64129756706176e-06, 'epoch': 1.69}


 85%|████████▍ | 2720/3206 [25:57<05:26,  1.49it/s]

{'loss': 0.0272, 'grad_norm': 0.305698424577713, 'learning_rate': 4.547723019338741e-06, 'epoch': 1.7}


 85%|████████▌ | 2730/3206 [26:05<06:13,  1.28it/s]

{'loss': 0.0424, 'grad_norm': 0.3110799789428711, 'learning_rate': 4.454148471615721e-06, 'epoch': 1.7}


 85%|████████▌ | 2740/3206 [26:13<05:51,  1.33it/s]

{'loss': 0.0356, 'grad_norm': 0.36221906542778015, 'learning_rate': 4.360573923892701e-06, 'epoch': 1.71}


 86%|████████▌ | 2750/3206 [26:20<05:40,  1.34it/s]

{'loss': 0.0369, 'grad_norm': 0.2956308424472809, 'learning_rate': 4.266999376169681e-06, 'epoch': 1.72}


 86%|████████▌ | 2760/3206 [26:28<05:34,  1.34it/s]

{'loss': 0.0394, 'grad_norm': 0.19982388615608215, 'learning_rate': 4.173424828446662e-06, 'epoch': 1.72}


 86%|████████▋ | 2770/3206 [26:35<05:27,  1.33it/s]

{'loss': 0.037, 'grad_norm': 0.2895731031894684, 'learning_rate': 4.079850280723643e-06, 'epoch': 1.73}


 87%|████████▋ | 2780/3206 [26:42<04:19,  1.64it/s]

{'loss': 0.0297, 'grad_norm': 0.21136042475700378, 'learning_rate': 3.986275733000624e-06, 'epoch': 1.73}


 87%|████████▋ | 2790/3206 [26:50<05:23,  1.29it/s]

{'loss': 0.0478, 'grad_norm': 0.3804079592227936, 'learning_rate': 3.8927011852776044e-06, 'epoch': 1.74}


 87%|████████▋ | 2800/3206 [26:57<05:08,  1.32it/s]

{'loss': 0.0393, 'grad_norm': 0.32101550698280334, 'learning_rate': 3.799126637554585e-06, 'epoch': 1.75}


 88%|████████▊ | 2810/3206 [27:05<04:57,  1.33it/s]

{'loss': 0.0439, 'grad_norm': 0.38428953289985657, 'learning_rate': 3.705552089831566e-06, 'epoch': 1.75}


 88%|████████▊ | 2820/3206 [27:12<04:43,  1.36it/s]

{'loss': 0.0409, 'grad_norm': 0.2993963360786438, 'learning_rate': 3.611977542108547e-06, 'epoch': 1.76}


 88%|████████▊ | 2830/3206 [27:19<04:40,  1.34it/s]

{'loss': 0.032, 'grad_norm': 0.4733477830886841, 'learning_rate': 3.5184029943855276e-06, 'epoch': 1.77}


 89%|████████▊ | 2840/3206 [27:27<04:46,  1.28it/s]

{'loss': 0.0438, 'grad_norm': 0.3751271665096283, 'learning_rate': 3.424828446662508e-06, 'epoch': 1.77}


 89%|████████▉ | 2850/3206 [27:35<04:32,  1.31it/s]

{'loss': 0.038, 'grad_norm': 0.42964622378349304, 'learning_rate': 3.3312538989394883e-06, 'epoch': 1.78}


 89%|████████▉ | 2860/3206 [27:43<04:37,  1.25it/s]

{'loss': 0.0315, 'grad_norm': 0.5373347401618958, 'learning_rate': 3.237679351216469e-06, 'epoch': 1.78}


 90%|████████▉ | 2870/3206 [27:51<04:17,  1.30it/s]

{'loss': 0.0386, 'grad_norm': 0.2545953691005707, 'learning_rate': 3.14410480349345e-06, 'epoch': 1.79}


 90%|████████▉ | 2880/3206 [27:58<04:04,  1.34it/s]

{'loss': 0.0518, 'grad_norm': 0.34185048937797546, 'learning_rate': 3.0505302557704303e-06, 'epoch': 1.8}


 90%|█████████ | 2890/3206 [28:06<04:00,  1.31it/s]

{'loss': 0.0387, 'grad_norm': 0.3510056436061859, 'learning_rate': 2.956955708047411e-06, 'epoch': 1.8}


 90%|█████████ | 2900/3206 [28:13<03:50,  1.33it/s]

{'loss': 0.0356, 'grad_norm': 0.21868425607681274, 'learning_rate': 2.863381160324392e-06, 'epoch': 1.81}


 91%|█████████ | 2910/3206 [28:21<03:43,  1.33it/s]

{'loss': 0.0359, 'grad_norm': 0.43741169571876526, 'learning_rate': 2.769806612601372e-06, 'epoch': 1.82}


 91%|█████████ | 2920/3206 [28:28<03:33,  1.34it/s]

{'loss': 0.0363, 'grad_norm': 0.5912444591522217, 'learning_rate': 2.676232064878353e-06, 'epoch': 1.82}


 91%|█████████▏| 2930/3206 [28:36<03:25,  1.35it/s]

{'loss': 0.0379, 'grad_norm': 0.518332302570343, 'learning_rate': 2.5826575171553338e-06, 'epoch': 1.83}


 92%|█████████▏| 2940/3206 [28:43<03:18,  1.34it/s]

{'loss': 0.0396, 'grad_norm': 0.35163670778274536, 'learning_rate': 2.4890829694323146e-06, 'epoch': 1.83}


 92%|█████████▏| 2950/3206 [28:51<03:11,  1.34it/s]

{'loss': 0.0352, 'grad_norm': 0.30442675948143005, 'learning_rate': 2.395508421709295e-06, 'epoch': 1.84}


 92%|█████████▏| 2960/3206 [28:58<03:03,  1.34it/s]

{'loss': 0.0332, 'grad_norm': 0.3963707685470581, 'learning_rate': 2.3019338739862757e-06, 'epoch': 1.85}


 93%|█████████▎| 2970/3206 [29:06<02:55,  1.34it/s]

{'loss': 0.0352, 'grad_norm': 0.38529670238494873, 'learning_rate': 2.2083593262632565e-06, 'epoch': 1.85}


 93%|█████████▎| 2980/3206 [29:13<02:50,  1.33it/s]

{'loss': 0.0305, 'grad_norm': 0.3877049684524536, 'learning_rate': 2.1147847785402373e-06, 'epoch': 1.86}


 93%|█████████▎| 2990/3206 [29:21<02:40,  1.35it/s]

{'loss': 0.037, 'grad_norm': 0.484253853559494, 'learning_rate': 2.0212102308172177e-06, 'epoch': 1.87}


 94%|█████████▎| 3000/3206 [29:28<02:33,  1.34it/s]

{'loss': 0.0373, 'grad_norm': 0.27927422523498535, 'learning_rate': 1.9276356830941985e-06, 'epoch': 1.87}


 94%|█████████▍| 3010/3206 [29:37<02:30,  1.30it/s]

{'loss': 0.03, 'grad_norm': 0.4645287096500397, 'learning_rate': 1.834061135371179e-06, 'epoch': 1.88}


 94%|█████████▍| 3020/3206 [29:45<02:19,  1.33it/s]

{'loss': 0.0401, 'grad_norm': 0.318136602640152, 'learning_rate': 1.7404865876481596e-06, 'epoch': 1.88}


 95%|█████████▍| 3030/3206 [29:52<02:12,  1.33it/s]

{'loss': 0.0535, 'grad_norm': 0.32800155878067017, 'learning_rate': 1.6469120399251404e-06, 'epoch': 1.89}


 95%|█████████▍| 3040/3206 [29:59<02:03,  1.35it/s]

{'loss': 0.0439, 'grad_norm': 0.35867664217948914, 'learning_rate': 1.553337492202121e-06, 'epoch': 1.9}


 95%|█████████▌| 3050/3206 [30:07<01:55,  1.35it/s]

{'loss': 0.0394, 'grad_norm': 0.3142782747745514, 'learning_rate': 1.4597629444791018e-06, 'epoch': 1.9}


 95%|█████████▌| 3060/3206 [30:14<01:49,  1.34it/s]

{'loss': 0.0367, 'grad_norm': 0.44971710443496704, 'learning_rate': 1.3661883967560823e-06, 'epoch': 1.91}


 96%|█████████▌| 3070/3206 [30:22<01:41,  1.34it/s]

{'loss': 0.0445, 'grad_norm': 0.55035400390625, 'learning_rate': 1.2726138490330631e-06, 'epoch': 1.92}


 96%|█████████▌| 3080/3206 [30:29<01:34,  1.34it/s]

{'loss': 0.0334, 'grad_norm': 0.35953426361083984, 'learning_rate': 1.1790393013100437e-06, 'epoch': 1.92}


 96%|█████████▋| 3090/3206 [30:37<01:26,  1.34it/s]

{'loss': 0.0369, 'grad_norm': 0.3825295567512512, 'learning_rate': 1.0854647535870245e-06, 'epoch': 1.93}


 97%|█████████▋| 3100/3206 [30:44<01:19,  1.33it/s]

{'loss': 0.041, 'grad_norm': 0.26670601963996887, 'learning_rate': 9.91890205864005e-07, 'epoch': 1.93}


 97%|█████████▋| 3110/3206 [30:52<01:11,  1.34it/s]

{'loss': 0.041, 'grad_norm': 0.44013720750808716, 'learning_rate': 8.983156581409857e-07, 'epoch': 1.94}


 97%|█████████▋| 3120/3206 [30:59<01:04,  1.34it/s]

{'loss': 0.035, 'grad_norm': 0.3732270896434784, 'learning_rate': 8.047411104179663e-07, 'epoch': 1.95}


 98%|█████████▊| 3130/3206 [31:07<00:56,  1.34it/s]

{'loss': 0.0552, 'grad_norm': 0.4239342510700226, 'learning_rate': 7.11166562694947e-07, 'epoch': 1.95}


 98%|█████████▊| 3140/3206 [31:14<00:49,  1.34it/s]

{'loss': 0.031, 'grad_norm': 0.36210235953330994, 'learning_rate': 6.175920149719276e-07, 'epoch': 1.96}


 98%|█████████▊| 3150/3206 [31:22<00:41,  1.34it/s]

{'loss': 0.0345, 'grad_norm': 0.5883322358131409, 'learning_rate': 5.240174672489083e-07, 'epoch': 1.97}


 99%|█████████▊| 3160/3206 [31:29<00:34,  1.33it/s]

{'loss': 0.0456, 'grad_norm': 0.35357409715652466, 'learning_rate': 4.3044291952588896e-07, 'epoch': 1.97}


 99%|█████████▉| 3170/3206 [31:37<00:26,  1.34it/s]

{'loss': 0.0421, 'grad_norm': 0.34022054076194763, 'learning_rate': 3.3686837180286965e-07, 'epoch': 1.98}


 99%|█████████▉| 3180/3206 [31:44<00:20,  1.28it/s]

{'loss': 0.0414, 'grad_norm': 0.34464600682258606, 'learning_rate': 2.432938240798503e-07, 'epoch': 1.98}


100%|█████████▉| 3190/3206 [31:52<00:12,  1.33it/s]

{'loss': 0.0354, 'grad_norm': 0.41978803277015686, 'learning_rate': 1.4971927635683093e-07, 'epoch': 1.99}


100%|█████████▉| 3200/3206 [32:00<00:04,  1.32it/s]

{'loss': 0.0432, 'grad_norm': 0.4275466501712799, 'learning_rate': 5.6144728633811603e-08, 'epoch': 2.0}



100%|██████████| 3206/3206 [35:33<00:00,  1.50it/s]

{'eval_loss': 0.06458806991577148, 'eval_sacrebleu': 16.69781380255021, 'eval_runtime': 208.1069, 'eval_samples_per_second': 6.631, 'eval_steps_per_second': 1.658, 'epoch': 2.0}
{'train_runtime': 2133.8535, 'train_samples_per_second': 6.008, 'train_steps_per_second': 1.502, 'train_loss': 0.04059957190464975, 'epoch': 2.0}





TrainOutput(global_step=3206, training_loss=0.04059957190464975, metrics={'train_runtime': 2133.8535, 'train_samples_per_second': 6.008, 'train_steps_per_second': 1.502, 'total_flos': 2383116270305280.0, 'train_loss': 0.04059957190464975, 'epoch': 2.0})

In [68]:
# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./resultsFlanT5_trial2/epoch8",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=250,
)

# Initialize the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_shuffled,
    eval_dataset=tokenized_valid_shuffled,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Start training
trainer.train()

  5%|▌         | 250/4809 [01:57<35:51,  2.12it/s]

{'loss': 0.0349, 'grad_norm': 0.3667924702167511, 'learning_rate': 2.8440424204616345e-05, 'epoch': 0.16}


 10%|█         | 500/4809 [03:55<33:51,  2.12it/s]

{'loss': 0.0353, 'grad_norm': 0.39768895506858826, 'learning_rate': 2.688084840923269e-05, 'epoch': 0.31}


 16%|█▌        | 750/4809 [06:34<50:41,  1.33it/s]  

{'loss': 0.033, 'grad_norm': 0.42284485697746277, 'learning_rate': 2.5321272613849033e-05, 'epoch': 0.47}


 21%|██        | 1000/4809 [08:58<29:54,  2.12it/s]

{'loss': 0.0298, 'grad_norm': 0.20565322041511536, 'learning_rate': 2.3761696818465378e-05, 'epoch': 0.62}


 26%|██▌       | 1250/4809 [10:58<27:59,  2.12it/s]

{'loss': 0.0316, 'grad_norm': 0.292184978723526, 'learning_rate': 2.2202121023081722e-05, 'epoch': 0.78}


 31%|███       | 1500/4809 [12:57<26:02,  2.12it/s]

{'loss': 0.0322, 'grad_norm': 0.2682301104068756, 'learning_rate': 2.0642545227698066e-05, 'epoch': 0.94}



 33%|███▎      | 1603/4809 [16:08<22:37,  2.36it/s]

{'eval_loss': 0.06600609421730042, 'eval_sacrebleu': 16.553134747304775, 'eval_runtime': 141.2269, 'eval_samples_per_second': 9.772, 'eval_steps_per_second': 2.443, 'epoch': 1.0}


 36%|███▋      | 1750/4809 [17:18<24:23,  2.09it/s]   

{'loss': 0.0302, 'grad_norm': 0.5563925504684448, 'learning_rate': 1.908296943231441e-05, 'epoch': 1.09}


 42%|████▏     | 2000/4809 [19:16<21:57,  2.13it/s]

{'loss': 0.0301, 'grad_norm': 0.3263070583343506, 'learning_rate': 1.7523393636930758e-05, 'epoch': 1.25}


 47%|████▋     | 2250/4809 [21:16<20:06,  2.12it/s]

{'loss': 0.0323, 'grad_norm': 0.46558067202568054, 'learning_rate': 1.59638178415471e-05, 'epoch': 1.4}


 52%|█████▏    | 2500/4809 [23:14<18:13,  2.11it/s]

{'loss': 0.0334, 'grad_norm': 0.5103921890258789, 'learning_rate': 1.4404242046163443e-05, 'epoch': 1.56}


 57%|█████▋    | 2750/4809 [25:14<16:10,  2.12it/s]

{'loss': 0.0347, 'grad_norm': 0.2103230506181717, 'learning_rate': 1.2844666250779787e-05, 'epoch': 1.72}


 62%|██████▏   | 3000/4809 [27:13<14:22,  2.10it/s]

{'loss': 0.0333, 'grad_norm': 0.2724129557609558, 'learning_rate': 1.1285090455396133e-05, 'epoch': 1.87}



 67%|██████▋   | 3206/4809 [31:18<11:15,  2.37it/s]

{'eval_loss': 0.0630999282002449, 'eval_sacrebleu': 17.819058202867037, 'eval_runtime': 146.7974, 'eval_samples_per_second': 9.401, 'eval_steps_per_second': 2.35, 'epoch': 2.0}


 68%|██████▊   | 3250/4809 [31:50<19:23,  1.34it/s]   

{'loss': 0.0369, 'grad_norm': 0.637355625629425, 'learning_rate': 9.725514660012477e-06, 'epoch': 2.03}


 73%|███████▎  | 3500/4809 [34:57<16:17,  1.34it/s]

{'loss': 0.0346, 'grad_norm': 0.36163464188575745, 'learning_rate': 8.165938864628822e-06, 'epoch': 2.18}


 78%|███████▊  | 3750/4809 [38:04<13:02,  1.35it/s]

{'loss': 0.0337, 'grad_norm': 0.42815661430358887, 'learning_rate': 6.606363069245166e-06, 'epoch': 2.34}


 83%|████████▎ | 4000/4809 [41:11<10:03,  1.34it/s]

{'loss': 0.0349, 'grad_norm': 0.3734879493713379, 'learning_rate': 5.046787273861509e-06, 'epoch': 2.5}


 88%|████████▊ | 4250/4809 [44:11<04:21,  2.13it/s]

{'loss': 0.0362, 'grad_norm': 0.4236227571964264, 'learning_rate': 3.487211478477854e-06, 'epoch': 2.65}


 94%|█████████▎| 4500/4809 [46:06<02:28,  2.09it/s]

{'loss': 0.0357, 'grad_norm': 0.2036958932876587, 'learning_rate': 1.9276356830941985e-06, 'epoch': 2.81}


 99%|█████████▉| 4750/4809 [49:05<00:41,  1.42it/s]

{'loss': 0.0374, 'grad_norm': 0.5027788281440735, 'learning_rate': 3.680598877105428e-07, 'epoch': 2.96}



100%|██████████| 4809/4809 [52:23<00:00,  1.53it/s]

{'eval_loss': 0.06243009865283966, 'eval_sacrebleu': 18.525779406299115, 'eval_runtime': 155.2458, 'eval_samples_per_second': 8.889, 'eval_steps_per_second': 2.222, 'epoch': 3.0}
{'train_runtime': 3143.4428, 'train_samples_per_second': 6.117, 'train_steps_per_second': 1.53, 'train_loss': 0.03375381277167244, 'epoch': 3.0}





TrainOutput(global_step=4809, training_loss=0.03375381277167244, metrics={'train_runtime': 3143.4428, 'train_samples_per_second': 6.117, 'train_steps_per_second': 1.53, 'total_flos': 3574674405457920.0, 'train_loss': 0.03375381277167244, 'epoch': 3.0})

Eval on full validation dataset

In [39]:
# After training run 1
# Evaluate and collect results on validation set
# Change to tokenized_train to check on train data
eval_results = trainer.evaluate(tokenized_valid)
print('results:')
print(eval_results)

# Separate the validation dataset into answerable and unanswerable subsets
answerable_data = tokenized_valid.filter(lambda x: x["answerable"])
unanswerable_data = tokenized_valid.filter(lambda x: not x["answerable"])

# Evaluate on answerable examples
answerable_results = trainer.evaluate(answerable_data)
print('answerable:')
print(answerable_results)

# Evaluate on unanswerable examples
unanswerable_results = trainer.evaluate(unanswerable_data)
print('unanswerable:')
print(unanswerable_results)

100%|██████████| 345/345 [02:47<00:00,  2.06it/s]


results:
{'eval_loss': 44.9827880859375, 'eval_sacrebleu': 3.8745802256783297, 'eval_runtime': 168.0772, 'eval_samples_per_second': 8.211, 'eval_steps_per_second': 2.053, 'epoch': 3.0}


100%|██████████| 238/238 [01:49<00:00,  2.18it/s]


answerable:
{'eval_loss': 45.29914855957031, 'eval_sacrebleu': 4.58833907869167, 'eval_runtime': 109.6013, 'eval_samples_per_second': 8.677, 'eval_steps_per_second': 2.172, 'epoch': 3.0}


100%|██████████| 108/108 [01:01<00:00,  1.77it/s]

unanswerable:
{'eval_loss': 44.23857879638672, 'eval_sacrebleu': 0.25487000229940127, 'eval_runtime': 61.9218, 'eval_samples_per_second': 6.928, 'eval_steps_per_second': 1.744, 'epoch': 3.0}





In [49]:
# After training run 3 (3 full epochs in total)
# Evaluate and collect results on validation set
# Change to tokenized_train to check on train data
eval_results = trainer.evaluate(tokenized_valid)
print('results:')
print(eval_results)

# Separate the validation dataset into answerable and unanswerable subsets
answerable_data = tokenized_valid.filter(lambda x: x["answerable"])
unanswerable_data = tokenized_valid.filter(lambda x: not x["answerable"])

# Evaluate on answerable examples
answerable_results = trainer.evaluate(answerable_data)
print('answerable:')
print(answerable_results)

# Evaluate on unanswerable examples
unanswerable_results = trainer.evaluate(unanswerable_data)
print('unanswerable:')
print(unanswerable_results)

100%|██████████| 345/345 [02:41<00:00,  2.14it/s]


results:
{'eval_loss': 0.06662019342184067, 'eval_sacrebleu': 13.536063222017306, 'eval_runtime': 162.4178, 'eval_samples_per_second': 8.497, 'eval_steps_per_second': 2.124, 'epoch': 3.0}


100%|██████████| 238/238 [01:52<00:00,  2.11it/s]


answerable:
{'eval_loss': 0.04146098718047142, 'eval_sacrebleu': 15.893940819467097, 'eval_runtime': 113.3546, 'eval_samples_per_second': 8.39, 'eval_steps_per_second': 2.1, 'epoch': 3.0}


100%|██████████| 108/108 [01:01<00:00,  1.76it/s]

unanswerable:
{'eval_loss': 0.12326549738645554, 'eval_sacrebleu': 1.2359014136494892, 'eval_runtime': 61.9918, 'eval_samples_per_second': 6.92, 'eval_steps_per_second': 1.742, 'epoch': 3.0}





In [54]:
# After training run 4 (5 full epochs in total)
# Evaluate and collect results on validation set
# Change to tokenized_train to check on train data
eval_results = trainer.evaluate(tokenized_valid)
print('results:')
print(eval_results)

# Separate the validation dataset into answerable and unanswerable subsets
answerable_data = tokenized_valid.filter(lambda x: x["answerable"])
unanswerable_data = tokenized_valid.filter(lambda x: not x["answerable"])

# Evaluate on answerable examples
answerable_results = trainer.evaluate(answerable_data)
print('answerable:')
print(answerable_results)

# Evaluate on unanswerable examples
unanswerable_results = trainer.evaluate(unanswerable_data)
print('unanswerable:')
print(unanswerable_results)

100%|██████████| 345/345 [02:52<00:00,  2.00it/s]


results:
{'eval_loss': 0.06458806246519089, 'eval_sacrebleu': 16.69781380255021, 'eval_runtime': 173.6791, 'eval_samples_per_second': 7.946, 'eval_steps_per_second': 1.986, 'epoch': 2.0}


100%|██████████| 238/238 [02:19<00:00,  1.71it/s]


answerable:
{'eval_loss': 0.03925130516290665, 'eval_sacrebleu': 20.067011131231375, 'eval_runtime': 140.1183, 'eval_samples_per_second': 6.787, 'eval_steps_per_second': 1.699, 'epoch': 2.0}


100%|██████████| 108/108 [01:02<00:00,  1.73it/s]

unanswerable:
{'eval_loss': 0.12144031375646591, 'eval_sacrebleu': 1.231167138246746, 'eval_runtime': 63.0068, 'eval_samples_per_second': 6.809, 'eval_steps_per_second': 1.714, 'epoch': 2.0}





In [70]:
# After training run 5 (8 full epochs in total)
# Evaluate and collect results on validation set
# Change to tokenized_train to check on train data
eval_results = trainer.evaluate(tokenized_valid)
print('results:')
print(eval_results)

# Separate the validation dataset into answerable and unanswerable subsets
answerable_data = tokenized_valid.filter(lambda x: x["answerable"])
unanswerable_data = tokenized_valid.filter(lambda x: not x["answerable"])

# Evaluate on answerable examples
answerable_results = trainer.evaluate(answerable_data)
print('answerable:')
print(answerable_results)

# Evaluate on unanswerable examples
unanswerable_results = trainer.evaluate(unanswerable_data)
print('unanswerable:')
print(unanswerable_results)

100%|██████████| 345/345 [03:24<00:00,  1.69it/s]


results:
{'eval_loss': 0.06243009865283966, 'eval_sacrebleu': 18.525779406299115, 'eval_runtime': 205.1515, 'eval_samples_per_second': 6.727, 'eval_steps_per_second': 1.682, 'epoch': 3.0}


100%|██████████| 238/238 [02:11<00:00,  1.82it/s]


answerable:
{'eval_loss': 0.03843402862548828, 'eval_sacrebleu': 22.138882711139406, 'eval_runtime': 131.7506, 'eval_samples_per_second': 7.218, 'eval_steps_per_second': 1.806, 'epoch': 3.0}


100%|██████████| 108/108 [00:48<00:00,  2.21it/s]

unanswerable:
{'eval_loss': 0.11598038673400879, 'eval_sacrebleu': 1.8359183966947754, 'eval_runtime': 49.6105, 'eval_samples_per_second': 8.647, 'eval_steps_per_second': 2.177, 'epoch': 3.0}





Train examples

In [40]:
# After training run 1
sample_inputs = train_data[:20]  # Get samples for testing
inputs = tokenizer([f"Question: {q} Context: {c}" for q, c in zip(sample_inputs["question"], sample_inputs["context"])], return_tensors="pt", padding=True).to(device)

with torch.no_grad():
    outputs = model.generate(**inputs)
    generated = tokenizer.batch_decode(outputs, skip_special_tokens=True)

for i in range(20):
    print(f"Predicted: {generated[i]}")
    print(f"Reference: {sample_inputs['answer'][i]}")


Predicted: The communist-controlled PKWN
Reference: no
Predicted: mining
Reference: Ybarra-Mier y Compañía
Predicted: Roman
Reference: 21 April 753 BC
Predicted: Bilbao
Reference: Vitoria-Gasteiz
Predicted: Austro-Hungarian Navy
Reference: six
Predicted: Salvador Guillermo Allende Gossens
Reference: Salvador Guillermo Allende Gossens
Predicted: Oswald
Reference: murder
Predicted: Paris
Reference: Paris
Predicted: Dutch chartered companies often dictated that their possessions be kept as confined as possible in
Reference: 1795
Predicted: Albuquerque
Reference: Albuquerque
Predicted: Tkaid Shinkansen
Reference: 1964
Predicted: Adolf Hitler
Reference: no
Predicted: New York
Reference: New York
Predicted: Daz had been in power since 1876 and saw the occasion of the centen
Reference: 1810
Predicted: Saint Petersburg
Reference: Saint Petersburg
Predicted: Constantinople
Reference: Constantinople
Predicted: ethnic
Reference: no
Predicted: Protestantism is diverse, being more divided theologic

In [46]:
# After training run 2
sample_inputs = train_data[:20]  # Get samples for testing
inputs = tokenizer([f"Question: {q} Context: {c}" for q, c in zip(sample_inputs["question"], sample_inputs["context"])], return_tensors="pt", padding=True).to(device)

with torch.no_grad():
    outputs = model.generate(**inputs)
    generated = tokenizer.batch_decode(outputs, skip_special_tokens=True)

for i in range(20):
    print(f"Predicted: {generated[i]}")
    print(f"Reference: {sample_inputs['answer'][i]}")


Predicted: the Soviets
Reference: no
Predicted: mining
Reference: Ybarra-Mier y Compañía
Predicted: a wall
Reference: 21 April 753 BC
Predicted: Bilbao
Reference: Vitoria-Gasteiz
Predicted: a sailor
Reference: six
Predicted: a symphony
Reference: Salvador Guillermo Allende Gossens
Predicted: a police officer
Reference: murder
Predicted: asian
Reference: Paris
Predicted: Dutch chartered companies often dictated that their possessions be kept as confined as possible in
Reference: 1795
Predicted: Albuquerque
Reference: Albuquerque
Predicted: saskatchewan
Reference: 1964
Predicted: fascist uprisings
Reference: no
Predicted: coal
Reference: New York
Predicted: the statue of the Angel
Reference: 1810
Predicted: Saint Petersburg
Reference: Saint Petersburg
Predicted: Constantinople
Reference: Constantinople
Predicted: adolescence
Reference: no
Predicted: Protestantism
Reference: diverse, being more divided theologically and ecclesiastically than either the Roman Catholic Church
Predicted: agg

In [50]:
# After training run 3 (3 full epochs in total)
sample_inputs = train_data[:20]  # Get samples for testing
inputs = tokenizer([f"Question: {q} Context: {c}" for q, c in zip(sample_inputs["question"], sample_inputs["context"])], return_tensors="pt", padding=True).to(device)

with torch.no_grad():
    outputs = model.generate(**inputs)
    generated = tokenizer.batch_decode(outputs, skip_special_tokens=True)

for i in range(20):
    print(f"Predicted: {generated[i]}")
    print(f"Reference: {sample_inputs['answer'][i]}")


Predicted: Lublin
Reference: no
Predicted: no
Reference: Ybarra-Mier y Compañía
Predicted: 21 April 753 BC
Reference: 21 April 753 BC
Predicted: no
Reference: Vitoria-Gasteiz
Predicted: six
Reference: six
Predicted: Salvador Guillermo Allende Gossens
Reference: Salvador Guillermo Allende Gossens
Predicted: 45 minutes
Reference: murder
Predicted: Paris
Reference: Paris
Predicted: Dutch colonial empire
Reference: 1795
Predicted: Albuquerque
Reference: Albuquerque
Predicted: 1964
Reference: 1964
Predicted: no
Reference: no
Predicted: 96,000
Reference: New York
Predicted: 1810
Reference: 1810
Predicted: Saint Petersburg Governorate
Reference: Saint Petersburg
Predicted: Constantinople
Reference: Constantinople
Predicted: extremists within Judaism motivated by religious rather than ethnic or nationalistic beliefs
Reference: no
Predicted: no
Reference: diverse, being more divided theologically and ecclesiastically than either the Roman Catholic Church
Predicted: Thomas the Apostle and Mar Ad

In [55]:
# After training run 4 (5 full epochs in total)
sample_inputs = train_data[:20]  # Get samples for testing
inputs = tokenizer([f"Question: {q} Context: {c}" for q, c in zip(sample_inputs["question"], sample_inputs["context"])], return_tensors="pt", padding=True).to(device)

with torch.no_grad():
    outputs = model.generate(**inputs)
    generated = tokenizer.batch_decode(outputs, skip_special_tokens=True)

for i in range(20):
    print(f"Predicted: {generated[i]}")
    print(f"Reference: {sample_inputs['answer'][i]}")


Predicted: Lublin
Reference: no
Predicted: no
Reference: Ybarra-Mier y Compañía
Predicted: 21 April 753 BC
Reference: 21 April 753 BC
Predicted: Vitoria-Gasteiz
Reference: Vitoria-Gasteiz
Predicted: six
Reference: six
Predicted: Salvador Guillermo Allende Gossens
Reference: Salvador Guillermo Allende Gossens
Predicted: 45 minutes
Reference: murder
Predicted: Paris
Reference: Paris
Predicted: Dutch colonial empire
Reference: 1795
Predicted: Albuquerque
Reference: Albuquerque
Predicted: 1964
Reference: 1964
Predicted: no
Reference: no
Predicted: 96,000
Reference: New York
Predicted: 1810
Reference: 1810
Predicted: Saint Petersburg Governorate
Reference: Saint Petersburg
Predicted: Constantinople
Reference: Constantinople
Predicted: extremists within Judaism motivated by religious rather than ethnic or nationalistic beliefs
Reference: no
Predicted: no
Reference: diverse, being more divided theologically and ecclesiastically than either the Roman Catholic Church
Predicted: Thomas the Apost

In [71]:
# After training run 5 (8 full epochs in total)
sample_inputs = train_data[:20]  # Get samples for testing
inputs = tokenizer([f"Question: {q} Context: {c}" for q, c in zip(sample_inputs["question"], sample_inputs["context"])], return_tensors="pt", padding=True).to(device)

with torch.no_grad():
    outputs = model.generate(**inputs)
    generated = tokenizer.batch_decode(outputs, skip_special_tokens=True)

for i in range(20):
    print(f"Predicted: {generated[i]}")
    print(f"Reference: {sample_inputs['answer'][i]}")


Predicted: Lublin
Reference: no
Predicted: no
Reference: Ybarra-Mier y Compañía
Predicted: 21 April 753 BC
Reference: 21 April 753 BC
Predicted: Vitoria-Gasteiz
Reference: Vitoria-Gasteiz
Predicted: no
Reference: six
Predicted: Salvador Guillermo Allende Gossens
Reference: Salvador Guillermo Allende Gossens
Predicted: 45 minutes
Reference: murder
Predicted: 2,140,526
Reference: Paris
Predicted: trade-based system which derived most of its influence from merchant enterprise and from Dutch control of
Reference: 1795
Predicted: Albuquerque
Reference: Albuquerque
Predicted: 1964
Reference: 1964
Predicted: Benito Mussolini
Reference: no
Predicted: 96,000
Reference: New York
Predicted: 1810
Reference: 1810
Predicted: Saint Petersburg Governorate
Reference: Saint Petersburg
Predicted: Constantinople
Reference: Constantinople
Predicted: extremists within Judaism motivated by religious rather than ethnic or nationalistic beliefs
Reference: no
Predicted: no
Reference: diverse, being more divided

Validation examples

In [41]:
# After training run 1
sample_inputs = valid_data[:20]  # Get samples for testing
inputs = tokenizer([f"Question: {q} Context: {c}" for q, c in zip(sample_inputs["question"], sample_inputs["context"])], return_tensors="pt", padding=True).to(device)

with torch.no_grad():
    outputs = model.generate(**inputs)
    generated = tokenizer.batch_decode(outputs, skip_special_tokens=True)

for i in range(20):
    print(f"Predicted: {generated[i]}")
    print(f"Reference: {sample_inputs['answer'][i]}")


Predicted: Alec Fleming "Alex" Churchill
Reference: Colombo, British Ceylon
Predicted: Buddhism
Reference: Christianity
Predicted: Rachel
Reference: Rachel Barbra Berry
Predicted: "Killzone"
Reference: November 2004
Predicted: Pennsylvania
Reference: December 12, 1787
Predicted: Michael
Reference: Denville Hall
Predicted: 
Reference: 5th century BCE
Predicted: digital television
Reference: transmission of television signals, including the sound channel, using digital encoding
Predicted: Sami
Reference: no
Predicted: Rachel's journey of finding herself within the glee club
Reference: Lea Michele
Predicted: "Birdland"
Reference: Birdland
Predicted: drier mountains of southern Peru south to northern Chile south to about 30°S
Reference: no
Predicted: Tiziana
Reference: Lucca, Tuscany
Predicted: Takahashi
Reference: 1984
Predicted: Tang Dynasty
Reference: Tang Dynasty
Predicted: niin
Reference: nomen
Predicted: rak
Reference: anise
Predicted: machine-state functionalism
Reference: Hilary Pu

In [51]:
# After training run 3 (3 full epochs in total)
sample_inputs = valid_data[:20]  # Get samples for testing
inputs = tokenizer([f"Question: {q} Context: {c}" for q, c in zip(sample_inputs["question"], sample_inputs["context"])], return_tensors="pt", padding=True).to(device)

with torch.no_grad():
    outputs = model.generate(**inputs)
    generated = tokenizer.batch_decode(outputs, skip_special_tokens=True)

for i in range(20):
    print(f"Predicted: {generated[i]}")
    print(f"Reference: {sample_inputs['answer'][i]}")


Predicted: Alec Fleming "Alex" Churchill
Reference: Colombo, British Ceylon
Predicted: 5,8 billion people and 84% of the population
Reference: Christianity
Predicted: Rachel Barbra Berry
Reference: Rachel Barbra Berry
Predicted: series of first-person shooter and twin sticks shooter video games for Sony Computer Entertainment'
Reference: November 2004
Predicted: 1787
Reference: December 12, 1787
Predicted: Sheila Sim
Reference: Denville Hall
Predicted: no
Reference: 5th century BCE
Predicted: no
Reference: transmission of television signals, including the sound channel, using digital encoding
Predicted: no
Reference: no
Predicted: Rachel Barbra Berry
Reference: Lea Michele
Predicted: Birdland
Reference: Birdland
Predicted: no
Reference: no
Predicted: Lucca, Tuscany
Reference: Lucca, Tuscany
Predicted: Higashinaruse, Akita
Reference: 1984
Predicted: Zhou Dynasty
Reference: Tang Dynasty
Predicted: no
Reference: nomen
Predicted: rak, pastis and sambuca
Reference: anise
Predicted: Hilary P

In [56]:
# After training run 4 (5 full epochs in total)
sample_inputs = valid_data[:20]  # Get samples for testing
inputs = tokenizer([f"Question: {q} Context: {c}" for q, c in zip(sample_inputs["question"], sample_inputs["context"])], return_tensors="pt", padding=True).to(device)

with torch.no_grad():
    outputs = model.generate(**inputs)
    generated = tokenizer.batch_decode(outputs, skip_special_tokens=True)

for i in range(20):
    print(f"Predicted: {generated[i]}")
    print(f"Reference: {sample_inputs['answer'][i]}")


Predicted: Colombo, British Ceylon
Reference: Colombo, British Ceylon
Predicted: 5.8 billion people and 84% of the population
Reference: Christianity
Predicted: Rachel Barbra Berry
Reference: Rachel Barbra Berry
Predicted: series of first-person shooter and twin sticks shooter video games for Sony Computer Entertainment'
Reference: November 2004
Predicted: 13
Reference: December 12, 1787
Predicted: Sheila Sim
Reference: Denville Hall
Predicted: no
Reference: 5th century BCE
Predicted: transmission of television signals, including the sound channel, using digital encoding, in contrast to
Reference: transmission of television signals, including the sound channel, using digital encoding
Predicted: no
Reference: no
Predicted: Rachel Barbra Berry
Reference: Lea Michele
Predicted: Birdland
Reference: Birdland
Predicted: no
Reference: no
Predicted: Lucca, Tuscany
Reference: Lucca, Tuscany
Predicted: Higashinaruse, Akita
Reference: 1984
Predicted: Zhou Dynasty
Reference: Tang Dynasty
Predicted

In [72]:
# After training run 5 (8 full epochs in total)
sample_inputs = valid_data[:20]  # Get samples for testing
inputs = tokenizer([f"Question: {q} Context: {c}" for q, c in zip(sample_inputs["question"], sample_inputs["context"])], return_tensors="pt", padding=True).to(device)

with torch.no_grad():
    outputs = model.generate(**inputs)
    generated = tokenizer.batch_decode(outputs, skip_special_tokens=True)

for i in range(20):
    print(f"Predicted: {generated[i]}")
    print(f"Reference: {sample_inputs['answer'][i]}")


Predicted: Hong Kong
Reference: Colombo, British Ceylon
Predicted: 5.8 billion people
Reference: Christianity
Predicted: Rachel Barbra Berry
Reference: Rachel Barbra Berry
Predicted: 2004
Reference: November 2004
Predicted: 13
Reference: December 12, 1787
Predicted: Sheila Sim
Reference: Denville Hall
Predicted: 5th century BCE
Reference: 5th century BCE
Predicted: transmission of television signals, including the sound channel, using digital encoding, in contrast to
Reference: transmission of television signals, including the sound channel, using digital encoding
Predicted: no
Reference: no
Predicted: Rachel Barbra Berry
Reference: Lea Michele
Predicted: heavy Weather
Reference: Birdland
Predicted: no
Reference: no
Predicted: Lucca, Tuscany
Reference: Lucca, Tuscany
Predicted: 1953
Reference: 1984
Predicted: Zhou Dynasty
Reference: Tang Dynasty
Predicted: Latin "nomen", "name"
Reference: nomen
Predicted: rak, pastis and sambuca
Reference: anise
Predicted: Hilary Putnam
Reference: Hila

Looking at unanswerable validation data only

In [67]:
# After training run 4 (5 full epochs in total)
sample_inputs = valid_data.filter(lambda x: not x['answerable'])[:20]  # Get samples for testing
inputs = tokenizer([f"Question: {q} Context: {c}" for q, c in zip(sample_inputs["question"], sample_inputs["context"])], return_tensors="pt", padding=True).to(device)

with torch.no_grad():
    outputs = model.generate(**inputs)
    generated = tokenizer.batch_decode(outputs, skip_special_tokens=True)

for i in range(20):
    print(f"Questiion: {sample_inputs['question'][i]}")
    print(f"Context: {sample_inputs['context'][i]}")
    print(f"Predicted: {generated[i]}")
    print(f"Reference: {sample_inputs['answer'][i]}")




Questiion: Ovatko suomen kielen murteet olleet aina olemassa?
Context: Westrobothnian () is a number of closely related non-standardized Scandinavian dialects spoken natively along the coast of the historical province of Westrobothnia in co-existence with Finnish, Sami and in recent centuries, the national standard language Swedish. Westrobothnian is the northernmost dialect group of the North Germanic languages in Sweden and borders the traditional Sami-speaking Lapland to the west and Finnish-speaking Torne Valley to the north. Like all Scandinavian, the different varieties of Westrobothnian originate in Proto-Norse and dialects of Old Norse, spoken by immigrating Germanic settlers during the Viking Age.
Predicted: no
Reference: no
Questiion: Onko Andien huiput kokonaan lumen peitossa?
Context: The climate in the Andes varies greatly depending on latitude, altitude, and proximity to the sea. Temperature, atmospheric pressure and humidity decrease in higher elevations. The southern se

In [76]:
# After training run 5 (8 full epochs in total)
sample_inputs = valid_data.filter(lambda x: not x['answerable'])[:20]  # Get samples for testing
inputs = tokenizer([f"Question: {q} Context: {c}" for q, c in zip(sample_inputs["question"], sample_inputs["context"])], return_tensors="pt", padding=True).to(device)

with torch.no_grad():
    outputs = model.generate(**inputs)
    generated = tokenizer.batch_decode(outputs, skip_special_tokens=True)

for i in range(20):
    print(f"Questiion: {sample_inputs['question'][i]}")
    print(f"Context: {sample_inputs['context'][i]}")
    print(f"Predicted: {generated[i]}")
    print(f"Reference: {sample_inputs['answer'][i]}")


Questiion: Ovatko suomen kielen murteet olleet aina olemassa?
Context: Westrobothnian () is a number of closely related non-standardized Scandinavian dialects spoken natively along the coast of the historical province of Westrobothnia in co-existence with Finnish, Sami and in recent centuries, the national standard language Swedish. Westrobothnian is the northernmost dialect group of the North Germanic languages in Sweden and borders the traditional Sami-speaking Lapland to the west and Finnish-speaking Torne Valley to the north. Like all Scandinavian, the different varieties of Westrobothnian originate in Proto-Norse and dialects of Old Norse, spoken by immigrating Germanic settlers during the Viking Age.
Predicted: no
Reference: no
Questiion: Onko Andien huiput kokonaan lumen peitossa?
Context: The climate in the Andes varies greatly depending on latitude, altitude, and proximity to the sea. Temperature, atmospheric pressure and humidity decrease in higher elevations. The southern se