# Small GPT

### 1. Load Model

In [7]:
from transformers import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel

# Load pre-trained tokenizer and model
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')

ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.


In [8]:
print(model)

OpenAIGPTLMHeadModel(
  (transformer): OpenAIGPTModel(
    (tokens_embed): Embedding(40478, 768)
    (positions_embed): Embedding(512, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x Block(
        (attn): Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (lm_head): Linear(in_features=768, out_features=40478, bias=False)
)


In [9]:
# Calculate the number of parameters
total_params = sum(p.numel() for p in model.parameters())

# Print the number of parameters
print(f"Total number of parameters: {total_params}")

Total number of parameters: 116534784


### 2. Load Fine-tuning Dataset

In [10]:
from datasets import load_dataset

# Load a summarization dataset (CNN/DailyMail)
dataset = load_dataset('cnn_dailymail', '3.0.0')

In [11]:
# Set the `eos_token` as the `pad_token`
# tokenizer.pad_token = tokenizer.eos_token  # or use `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))

# Tokenization of dataset
def tokenize_data(example):
    inputs = tokenizer(
        example['article'],
        padding='max_length',  # Use padding here
        max_length=512,
        truncation=True,
    )
    labels = tokenizer(
        example['highlights'],
        padding='max_length',  # Use padding here
        max_length=512,
        truncation=True,
    )
    # print(f"Input Length: {len(inputs['input_ids'])}, Label Length: {len(labels['input_ids'])}")
    inputs['labels'] = labels['input_ids']
    return inputs


In [12]:
len(dataset['train'])

287113

In [13]:
# Tokenize dataset
train_data = dataset['train'].select(range(10000)).map(tokenize_data, batched=True)
val_data = dataset['validation'].select(range(500)).map(tokenize_data, batched=True)

Map: 100%|██████████| 10000/10000 [00:49<00:00, 203.69 examples/s]
Map: 100%|██████████| 500/500 [00:02<00:00, 196.84 examples/s]


In [14]:
len(train_data[0]['labels'])

512

In [15]:
len(val_data[0]['input_ids'])

512

### 3. Fine-tune the Model

In [16]:
from transformers import Trainer, TrainingArguments

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',            # Directory to save the model
    num_train_epochs=3,                # Number of training epochs
    per_device_train_batch_size=4,     # Batch size for training
    per_device_eval_batch_size=4,      # Batch size for evaluation
    warmup_steps=50,                  # Warmup steps
    weight_decay=0.01,                 # Weight decay
    logging_dir='./logs',              # Directory for logs
    logging_steps=10,
    evaluation_strategy="epoch"        # Evaluate after every epoch
)

# Trainer for fine-tuning
trainer = Trainer(
    model=model,                       # Pre-trained model
    args=training_args,                # Training arguments
    train_dataset=train_data,          # Training dataset
    eval_dataset=val_data              # Evaluation dataset
)

# Fine-tune the model
trainer.train()

  0%|          | 10/7500 [00:14<2:48:46,  1.35s/it]

{'loss': 7.2192, 'grad_norm': 20.54697608947754, 'learning_rate': 1e-05, 'epoch': 0.0}


  0%|          | 20/7500 [00:27<2:47:26,  1.34s/it]

{'loss': 4.2778, 'grad_norm': 17.997617721557617, 'learning_rate': 2e-05, 'epoch': 0.01}


  0%|          | 30/7500 [00:41<2:47:10,  1.34s/it]

{'loss': 1.3277, 'grad_norm': 5.197851657867432, 'learning_rate': 3e-05, 'epoch': 0.01}


  1%|          | 40/7500 [00:54<2:47:03,  1.34s/it]

{'loss': 0.9863, 'grad_norm': 0.9392551183700562, 'learning_rate': 4e-05, 'epoch': 0.02}


  1%|          | 50/7500 [01:07<2:46:36,  1.34s/it]

{'loss': 0.9619, 'grad_norm': 2.4671390056610107, 'learning_rate': 5e-05, 'epoch': 0.02}


  1%|          | 60/7500 [01:21<2:46:46,  1.34s/it]

{'loss': 0.8948, 'grad_norm': 0.9039092659950256, 'learning_rate': 4.9932885906040274e-05, 'epoch': 0.02}


  1%|          | 70/7500 [01:34<2:46:14,  1.34s/it]

{'loss': 0.8235, 'grad_norm': 0.6370143294334412, 'learning_rate': 4.986577181208054e-05, 'epoch': 0.03}


  1%|          | 80/7500 [01:48<2:46:19,  1.35s/it]

{'loss': 0.8524, 'grad_norm': 1.2789846658706665, 'learning_rate': 4.9798657718120805e-05, 'epoch': 0.03}


  1%|          | 90/7500 [02:01<2:46:43,  1.35s/it]

{'loss': 0.8705, 'grad_norm': 0.8959183096885681, 'learning_rate': 4.9731543624161077e-05, 'epoch': 0.04}


  1%|▏         | 100/7500 [02:15<2:46:45,  1.35s/it]

{'loss': 0.8589, 'grad_norm': 0.5612483620643616, 'learning_rate': 4.966442953020135e-05, 'epoch': 0.04}


  1%|▏         | 110/7500 [02:28<2:47:13,  1.36s/it]

{'loss': 0.8734, 'grad_norm': 1.0386316776275635, 'learning_rate': 4.9597315436241614e-05, 'epoch': 0.04}


  2%|▏         | 120/7500 [02:42<2:47:51,  1.36s/it]

{'loss': 0.821, 'grad_norm': 0.8375300765037537, 'learning_rate': 4.953020134228188e-05, 'epoch': 0.05}


  2%|▏         | 130/7500 [02:56<2:48:06,  1.37s/it]

{'loss': 0.861, 'grad_norm': 0.9603061079978943, 'learning_rate': 4.946308724832215e-05, 'epoch': 0.05}


  2%|▏         | 140/7500 [03:10<2:49:04,  1.38s/it]

{'loss': 0.843, 'grad_norm': 0.8582592606544495, 'learning_rate': 4.9395973154362416e-05, 'epoch': 0.06}


  2%|▏         | 150/7500 [03:25<3:07:39,  1.53s/it]

{'loss': 0.8481, 'grad_norm': 1.6298304796218872, 'learning_rate': 4.932885906040269e-05, 'epoch': 0.06}


  2%|▏         | 160/7500 [03:41<3:21:36,  1.65s/it]

{'loss': 0.8423, 'grad_norm': 0.5427634716033936, 'learning_rate': 4.926174496644296e-05, 'epoch': 0.06}


  2%|▏         | 170/7500 [03:58<3:22:43,  1.66s/it]

{'loss': 0.849, 'grad_norm': 0.5403362512588501, 'learning_rate': 4.9194630872483225e-05, 'epoch': 0.07}


  2%|▏         | 180/7500 [04:14<3:17:54,  1.62s/it]

{'loss': 0.8278, 'grad_norm': 0.560753345489502, 'learning_rate': 4.912751677852349e-05, 'epoch': 0.07}


  3%|▎         | 190/7500 [04:30<3:11:16,  1.57s/it]

{'loss': 0.8601, 'grad_norm': 1.3031898736953735, 'learning_rate': 4.906040268456376e-05, 'epoch': 0.08}


  3%|▎         | 200/7500 [04:45<3:12:26,  1.58s/it]

{'loss': 0.8777, 'grad_norm': 1.2487454414367676, 'learning_rate': 4.8993288590604034e-05, 'epoch': 0.08}


  3%|▎         | 210/7500 [05:02<3:23:50,  1.68s/it]

{'loss': 0.8598, 'grad_norm': 0.8592296838760376, 'learning_rate': 4.89261744966443e-05, 'epoch': 0.08}


  3%|▎         | 220/7500 [05:19<3:24:13,  1.68s/it]

{'loss': 0.8418, 'grad_norm': 0.48188167810440063, 'learning_rate': 4.8859060402684564e-05, 'epoch': 0.09}


  3%|▎         | 230/7500 [05:36<3:26:53,  1.71s/it]

{'loss': 0.8453, 'grad_norm': 0.5473179817199707, 'learning_rate': 4.8791946308724836e-05, 'epoch': 0.09}


  3%|▎         | 240/7500 [05:52<3:08:59,  1.56s/it]

{'loss': 0.7918, 'grad_norm': 0.5599721670150757, 'learning_rate': 4.87248322147651e-05, 'epoch': 0.1}


  3%|▎         | 250/7500 [06:07<3:08:08,  1.56s/it]

{'loss': 0.8712, 'grad_norm': 0.6982989311218262, 'learning_rate': 4.865771812080537e-05, 'epoch': 0.1}


  3%|▎         | 260/7500 [06:23<3:11:54,  1.59s/it]

{'loss': 0.8451, 'grad_norm': 1.2819366455078125, 'learning_rate': 4.859060402684564e-05, 'epoch': 0.1}


  4%|▎         | 270/7500 [06:39<3:12:24,  1.60s/it]

{'loss': 0.8385, 'grad_norm': 0.5226606726646423, 'learning_rate': 4.852348993288591e-05, 'epoch': 0.11}


  4%|▎         | 280/7500 [06:55<3:13:04,  1.60s/it]

{'loss': 0.7833, 'grad_norm': 0.5008462071418762, 'learning_rate': 4.8456375838926175e-05, 'epoch': 0.11}


  4%|▍         | 290/7500 [07:11<3:17:32,  1.64s/it]

{'loss': 0.8486, 'grad_norm': 1.2340930700302124, 'learning_rate': 4.838926174496645e-05, 'epoch': 0.12}


  4%|▍         | 300/7500 [07:28<3:15:59,  1.63s/it]

{'loss': 0.8454, 'grad_norm': 0.7215428352355957, 'learning_rate': 4.832214765100672e-05, 'epoch': 0.12}


  4%|▍         | 310/7500 [07:44<3:15:49,  1.63s/it]

{'loss': 0.8367, 'grad_norm': 0.45835378766059875, 'learning_rate': 4.825503355704698e-05, 'epoch': 0.12}


  4%|▍         | 320/7500 [08:00<3:17:43,  1.65s/it]

{'loss': 0.881, 'grad_norm': 0.5725738406181335, 'learning_rate': 4.818791946308725e-05, 'epoch': 0.13}


  4%|▍         | 330/7500 [08:17<3:23:36,  1.70s/it]

{'loss': 0.8411, 'grad_norm': 0.5172438025474548, 'learning_rate': 4.812080536912752e-05, 'epoch': 0.13}


  5%|▍         | 340/7500 [08:35<3:27:37,  1.74s/it]

{'loss': 0.8197, 'grad_norm': 0.5713397860527039, 'learning_rate': 4.8053691275167786e-05, 'epoch': 0.14}


  5%|▍         | 350/7500 [08:52<3:28:10,  1.75s/it]

{'loss': 0.8344, 'grad_norm': 0.7406358122825623, 'learning_rate': 4.798657718120805e-05, 'epoch': 0.14}


  5%|▍         | 360/7500 [09:09<3:19:10,  1.67s/it]

{'loss': 0.8614, 'grad_norm': 1.0620665550231934, 'learning_rate': 4.7919463087248323e-05, 'epoch': 0.14}


  5%|▍         | 370/7500 [09:26<3:17:13,  1.66s/it]

{'loss': 0.776, 'grad_norm': 0.5897084474563599, 'learning_rate': 4.7852348993288595e-05, 'epoch': 0.15}


  5%|▌         | 380/7500 [09:43<3:20:27,  1.69s/it]

{'loss': 0.8518, 'grad_norm': 0.9528807997703552, 'learning_rate': 4.778523489932886e-05, 'epoch': 0.15}


  5%|▌         | 390/7500 [10:00<3:31:26,  1.78s/it]

{'loss': 0.8103, 'grad_norm': 0.6772333383560181, 'learning_rate': 4.771812080536913e-05, 'epoch': 0.16}


  5%|▌         | 400/7500 [10:19<3:36:07,  1.83s/it]

{'loss': 0.8657, 'grad_norm': 0.5406872630119324, 'learning_rate': 4.76510067114094e-05, 'epoch': 0.16}


  5%|▌         | 410/7500 [10:38<3:28:11,  1.76s/it]

{'loss': 0.833, 'grad_norm': 0.4869690537452698, 'learning_rate': 4.758389261744966e-05, 'epoch': 0.16}


  6%|▌         | 420/7500 [10:55<3:39:14,  1.86s/it]

{'loss': 0.836, 'grad_norm': 0.5587607026100159, 'learning_rate': 4.7516778523489935e-05, 'epoch': 0.17}


  6%|▌         | 430/7500 [11:12<3:13:40,  1.64s/it]

{'loss': 0.8344, 'grad_norm': 0.4961508512496948, 'learning_rate': 4.7449664429530207e-05, 'epoch': 0.17}


  6%|▌         | 440/7500 [11:29<3:23:55,  1.73s/it]

{'loss': 0.8589, 'grad_norm': 0.5374148488044739, 'learning_rate': 4.738255033557047e-05, 'epoch': 0.18}


  6%|▌         | 450/7500 [11:47<3:40:09,  1.87s/it]

{'loss': 0.8162, 'grad_norm': 1.360719919204712, 'learning_rate': 4.731543624161074e-05, 'epoch': 0.18}


  6%|▌         | 460/7500 [12:07<3:56:39,  2.02s/it]

{'loss': 0.8278, 'grad_norm': 0.6096611022949219, 'learning_rate': 4.724832214765101e-05, 'epoch': 0.18}


  6%|▋         | 470/7500 [12:26<3:26:24,  1.76s/it]

{'loss': 0.8039, 'grad_norm': 0.5074437856674194, 'learning_rate': 4.718120805369128e-05, 'epoch': 0.19}


  6%|▋         | 480/7500 [12:43<3:15:42,  1.67s/it]

{'loss': 0.8367, 'grad_norm': 0.8463903665542603, 'learning_rate': 4.7114093959731546e-05, 'epoch': 0.19}


  7%|▋         | 490/7500 [13:00<3:25:57,  1.76s/it]

{'loss': 0.8085, 'grad_norm': 0.5017498135566711, 'learning_rate': 4.704697986577181e-05, 'epoch': 0.2}


  7%|▋         | 500/7500 [13:18<3:20:03,  1.71s/it]

{'loss': 0.8183, 'grad_norm': 0.9334362745285034, 'learning_rate': 4.697986577181208e-05, 'epoch': 0.2}


  7%|▋         | 510/7500 [13:36<3:23:38,  1.75s/it]

{'loss': 0.821, 'grad_norm': 0.4984315037727356, 'learning_rate': 4.691275167785235e-05, 'epoch': 0.2}


  7%|▋         | 520/7500 [13:54<3:18:03,  1.70s/it]

{'loss': 0.7701, 'grad_norm': 0.6503698229789734, 'learning_rate': 4.684563758389262e-05, 'epoch': 0.21}


  7%|▋         | 530/7500 [14:11<3:18:24,  1.71s/it]

{'loss': 0.8194, 'grad_norm': 0.7346305251121521, 'learning_rate': 4.677852348993289e-05, 'epoch': 0.21}


  7%|▋         | 540/7500 [14:28<3:29:23,  1.81s/it]

{'loss': 0.8514, 'grad_norm': 0.9758926630020142, 'learning_rate': 4.671140939597316e-05, 'epoch': 0.22}


  7%|▋         | 550/7500 [14:47<3:36:56,  1.87s/it]

{'loss': 0.8465, 'grad_norm': 0.45581871271133423, 'learning_rate': 4.664429530201342e-05, 'epoch': 0.22}


  7%|▋         | 560/7500 [15:05<3:22:35,  1.75s/it]

{'loss': 0.8748, 'grad_norm': 0.6763519644737244, 'learning_rate': 4.6577181208053694e-05, 'epoch': 0.22}


  8%|▊         | 570/7500 [15:22<3:17:41,  1.71s/it]

{'loss': 0.8135, 'grad_norm': 0.6995507478713989, 'learning_rate': 4.6510067114093966e-05, 'epoch': 0.23}


  8%|▊         | 580/7500 [15:39<3:15:45,  1.70s/it]

{'loss': 0.7993, 'grad_norm': 0.5261162519454956, 'learning_rate': 4.644295302013423e-05, 'epoch': 0.23}


  8%|▊         | 590/7500 [15:57<3:24:12,  1.77s/it]

{'loss': 0.8248, 'grad_norm': 0.4808844029903412, 'learning_rate': 4.6375838926174496e-05, 'epoch': 0.24}


  8%|▊         | 600/7500 [16:14<3:18:46,  1.73s/it]

{'loss': 0.8715, 'grad_norm': 0.49604395031929016, 'learning_rate': 4.630872483221477e-05, 'epoch': 0.24}


  8%|▊         | 610/7500 [16:32<3:21:20,  1.75s/it]

{'loss': 0.8011, 'grad_norm': 0.4924074411392212, 'learning_rate': 4.624161073825504e-05, 'epoch': 0.24}


  8%|▊         | 620/7500 [16:49<3:13:23,  1.69s/it]

{'loss': 0.8435, 'grad_norm': 0.8879619240760803, 'learning_rate': 4.6174496644295305e-05, 'epoch': 0.25}


  8%|▊         | 630/7500 [17:07<3:28:11,  1.82s/it]

{'loss': 0.847, 'grad_norm': 0.4192865490913391, 'learning_rate': 4.610738255033557e-05, 'epoch': 0.25}


  9%|▊         | 640/7500 [17:24<3:19:27,  1.74s/it]

{'loss': 0.9052, 'grad_norm': 0.9215360879898071, 'learning_rate': 4.604026845637584e-05, 'epoch': 0.26}


  9%|▊         | 650/7500 [17:43<3:34:25,  1.88s/it]

{'loss': 0.8126, 'grad_norm': 0.47187671065330505, 'learning_rate': 4.597315436241611e-05, 'epoch': 0.26}


  9%|▉         | 660/7500 [18:02<3:34:46,  1.88s/it]

{'loss': 0.8332, 'grad_norm': 0.4968216121196747, 'learning_rate': 4.590604026845638e-05, 'epoch': 0.26}


  9%|▉         | 670/7500 [18:21<4:03:58,  2.14s/it]

{'loss': 0.8075, 'grad_norm': 0.6828830242156982, 'learning_rate': 4.583892617449665e-05, 'epoch': 0.27}


  9%|▉         | 680/7500 [18:41<3:41:12,  1.95s/it]

{'loss': 0.7929, 'grad_norm': 0.565346360206604, 'learning_rate': 4.5771812080536916e-05, 'epoch': 0.27}


  9%|▉         | 690/7500 [18:59<3:27:20,  1.83s/it]

{'loss': 0.8479, 'grad_norm': 0.7092617750167847, 'learning_rate': 4.570469798657718e-05, 'epoch': 0.28}


  9%|▉         | 700/7500 [19:17<3:26:03,  1.82s/it]

{'loss': 0.8431, 'grad_norm': 0.5477826595306396, 'learning_rate': 4.5637583892617453e-05, 'epoch': 0.28}


  9%|▉         | 710/7500 [19:35<3:16:38,  1.74s/it]

{'loss': 0.8098, 'grad_norm': 1.1531317234039307, 'learning_rate': 4.5570469798657725e-05, 'epoch': 0.28}


 10%|▉         | 720/7500 [19:52<3:17:23,  1.75s/it]

{'loss': 0.8098, 'grad_norm': 0.7269076108932495, 'learning_rate': 4.5503355704697984e-05, 'epoch': 0.29}


 10%|▉         | 730/7500 [20:10<3:20:14,  1.77s/it]

{'loss': 0.802, 'grad_norm': 0.7933775782585144, 'learning_rate': 4.5436241610738256e-05, 'epoch': 0.29}


 10%|▉         | 740/7500 [20:28<3:17:46,  1.76s/it]

{'loss': 0.7892, 'grad_norm': 0.43326494097709656, 'learning_rate': 4.536912751677853e-05, 'epoch': 0.3}


 10%|█         | 750/7500 [20:46<3:20:26,  1.78s/it]

{'loss': 0.8076, 'grad_norm': 0.46539050340652466, 'learning_rate': 4.530201342281879e-05, 'epoch': 0.3}


 10%|█         | 760/7500 [21:03<3:13:55,  1.73s/it]

{'loss': 0.7772, 'grad_norm': 0.9607053399085999, 'learning_rate': 4.5234899328859065e-05, 'epoch': 0.3}


 10%|█         | 770/7500 [21:20<3:16:31,  1.75s/it]

{'loss': 0.8173, 'grad_norm': 0.46290916204452515, 'learning_rate': 4.516778523489933e-05, 'epoch': 0.31}


 10%|█         | 780/7500 [21:38<3:18:55,  1.78s/it]

{'loss': 0.8226, 'grad_norm': 0.5415942668914795, 'learning_rate': 4.51006711409396e-05, 'epoch': 0.31}


 11%|█         | 790/7500 [21:57<3:25:18,  1.84s/it]

{'loss': 0.7938, 'grad_norm': 0.6456159949302673, 'learning_rate': 4.503355704697987e-05, 'epoch': 0.32}


 11%|█         | 800/7500 [22:16<3:27:14,  1.86s/it]

{'loss': 0.8467, 'grad_norm': 0.4392617642879486, 'learning_rate': 4.496644295302014e-05, 'epoch': 0.32}


 11%|█         | 810/7500 [22:36<3:48:11,  2.05s/it]

{'loss': 0.7719, 'grad_norm': 0.5430977940559387, 'learning_rate': 4.4899328859060404e-05, 'epoch': 0.32}


 11%|█         | 820/7500 [22:56<3:25:12,  1.84s/it]

{'loss': 0.831, 'grad_norm': 0.7108575701713562, 'learning_rate': 4.483221476510067e-05, 'epoch': 0.33}


 11%|█         | 830/7500 [23:14<3:15:08,  1.76s/it]

{'loss': 0.813, 'grad_norm': 0.7025067806243896, 'learning_rate': 4.476510067114094e-05, 'epoch': 0.33}


 11%|█         | 840/7500 [23:32<3:26:56,  1.86s/it]

{'loss': 0.8457, 'grad_norm': 0.8825845718383789, 'learning_rate': 4.469798657718121e-05, 'epoch': 0.34}


 11%|█▏        | 850/7500 [23:52<3:26:01,  1.86s/it]

{'loss': 0.805, 'grad_norm': 0.507270097732544, 'learning_rate': 4.463087248322148e-05, 'epoch': 0.34}


 11%|█▏        | 860/7500 [24:10<3:13:36,  1.75s/it]

{'loss': 0.8023, 'grad_norm': 0.4108327031135559, 'learning_rate': 4.456375838926174e-05, 'epoch': 0.34}


 12%|█▏        | 870/7500 [24:28<3:24:11,  1.85s/it]

{'loss': 0.8207, 'grad_norm': 0.6144447922706604, 'learning_rate': 4.4496644295302015e-05, 'epoch': 0.35}


 12%|█▏        | 880/7500 [24:47<3:23:50,  1.85s/it]

{'loss': 0.7565, 'grad_norm': 0.9577409625053406, 'learning_rate': 4.442953020134229e-05, 'epoch': 0.35}


 12%|█▏        | 890/7500 [25:05<3:16:23,  1.78s/it]

{'loss': 0.8316, 'grad_norm': 0.8942714333534241, 'learning_rate': 4.436241610738255e-05, 'epoch': 0.36}


 12%|█▏        | 900/7500 [25:23<3:20:14,  1.82s/it]

{'loss': 0.8147, 'grad_norm': 0.631851077079773, 'learning_rate': 4.4295302013422824e-05, 'epoch': 0.36}


 12%|█▏        | 910/7500 [25:42<3:26:41,  1.88s/it]

{'loss': 0.8765, 'grad_norm': 0.45403823256492615, 'learning_rate': 4.422818791946309e-05, 'epoch': 0.36}


 12%|█▏        | 920/7500 [26:01<3:19:04,  1.82s/it]

{'loss': 0.849, 'grad_norm': 0.4697723984718323, 'learning_rate': 4.4161073825503354e-05, 'epoch': 0.37}


 12%|█▏        | 930/7500 [26:18<3:13:30,  1.77s/it]

{'loss': 0.8352, 'grad_norm': 0.5047340989112854, 'learning_rate': 4.4093959731543626e-05, 'epoch': 0.37}


 13%|█▎        | 940/7500 [26:36<3:12:54,  1.76s/it]

{'loss': 0.8076, 'grad_norm': 0.5387201905250549, 'learning_rate': 4.40268456375839e-05, 'epoch': 0.38}


 13%|█▎        | 950/7500 [26:55<3:28:41,  1.91s/it]

{'loss': 0.8104, 'grad_norm': 0.5377605557441711, 'learning_rate': 4.395973154362416e-05, 'epoch': 0.38}


 13%|█▎        | 960/7500 [27:15<3:49:12,  2.10s/it]

{'loss': 0.7981, 'grad_norm': 0.8003900051116943, 'learning_rate': 4.389261744966443e-05, 'epoch': 0.38}


 13%|█▎        | 970/7500 [27:34<3:19:41,  1.83s/it]

{'loss': 0.8553, 'grad_norm': 0.6913622617721558, 'learning_rate': 4.38255033557047e-05, 'epoch': 0.39}


 13%|█▎        | 980/7500 [27:53<3:29:40,  1.93s/it]

{'loss': 0.8242, 'grad_norm': 0.5349119305610657, 'learning_rate': 4.375838926174497e-05, 'epoch': 0.39}


 13%|█▎        | 990/7500 [28:12<3:23:08,  1.87s/it]

{'loss': 0.8196, 'grad_norm': 0.4521259367465973, 'learning_rate': 4.369127516778524e-05, 'epoch': 0.4}


 13%|█▎        | 1000/7500 [28:31<3:32:15,  1.96s/it]

{'loss': 0.795, 'grad_norm': 0.5906194448471069, 'learning_rate': 4.36241610738255e-05, 'epoch': 0.4}


 13%|█▎        | 1010/7500 [28:53<3:39:06,  2.03s/it]

{'loss': 0.8487, 'grad_norm': 0.5392061471939087, 'learning_rate': 4.3557046979865775e-05, 'epoch': 0.4}


 14%|█▎        | 1020/7500 [29:12<3:26:10,  1.91s/it]

{'loss': 0.8369, 'grad_norm': 0.7863031029701233, 'learning_rate': 4.348993288590604e-05, 'epoch': 0.41}


 14%|█▎        | 1030/7500 [29:33<3:55:23,  2.18s/it]

{'loss': 0.8549, 'grad_norm': 0.7921955585479736, 'learning_rate': 4.342281879194631e-05, 'epoch': 0.41}


 14%|█▍        | 1040/7500 [29:52<3:23:13,  1.89s/it]

{'loss': 0.8149, 'grad_norm': 0.5810761451721191, 'learning_rate': 4.335570469798658e-05, 'epoch': 0.42}


 14%|█▍        | 1050/7500 [30:13<3:19:14,  1.85s/it]

{'loss': 0.8387, 'grad_norm': 0.5386340618133545, 'learning_rate': 4.328859060402685e-05, 'epoch': 0.42}


 14%|█▍        | 1060/7500 [30:31<3:30:25,  1.96s/it]

{'loss': 0.8287, 'grad_norm': 0.5110085606575012, 'learning_rate': 4.3221476510067114e-05, 'epoch': 0.42}


 14%|█▍        | 1070/7500 [30:51<3:35:13,  2.01s/it]

{'loss': 0.7997, 'grad_norm': 0.5099552869796753, 'learning_rate': 4.3154362416107386e-05, 'epoch': 0.43}


 14%|█▍        | 1080/7500 [31:09<3:11:07,  1.79s/it]

{'loss': 0.8151, 'grad_norm': 0.4933452904224396, 'learning_rate': 4.308724832214766e-05, 'epoch': 0.43}


 15%|█▍        | 1090/7500 [31:28<3:36:04,  2.02s/it]

{'loss': 0.8255, 'grad_norm': 0.4240090548992157, 'learning_rate': 4.3020134228187916e-05, 'epoch': 0.44}


 15%|█▍        | 1100/7500 [31:47<3:23:36,  1.91s/it]

{'loss': 0.7909, 'grad_norm': 0.5352188348770142, 'learning_rate': 4.295302013422819e-05, 'epoch': 0.44}


 15%|█▍        | 1110/7500 [32:06<3:14:31,  1.83s/it]

{'loss': 0.7976, 'grad_norm': 0.7096287608146667, 'learning_rate': 4.288590604026846e-05, 'epoch': 0.44}


 15%|█▍        | 1120/7500 [32:26<4:00:56,  2.27s/it]

{'loss': 0.7904, 'grad_norm': 0.477713942527771, 'learning_rate': 4.2818791946308725e-05, 'epoch': 0.45}


 15%|█▌        | 1130/7500 [32:46<3:30:41,  1.98s/it]

{'loss': 0.8007, 'grad_norm': 0.6297976970672607, 'learning_rate': 4.2751677852349e-05, 'epoch': 0.45}


 15%|█▌        | 1140/7500 [33:05<3:11:58,  1.81s/it]

{'loss': 0.8277, 'grad_norm': 0.9824968576431274, 'learning_rate': 4.268456375838926e-05, 'epoch': 0.46}


 15%|█▌        | 1150/7500 [33:25<3:15:42,  1.85s/it]

{'loss': 0.8386, 'grad_norm': 1.2418516874313354, 'learning_rate': 4.2617449664429534e-05, 'epoch': 0.46}


 15%|█▌        | 1160/7500 [33:42<3:07:26,  1.77s/it]

{'loss': 0.8219, 'grad_norm': 0.6123254299163818, 'learning_rate': 4.25503355704698e-05, 'epoch': 0.46}


 16%|█▌        | 1170/7500 [34:00<3:08:20,  1.79s/it]

{'loss': 0.7817, 'grad_norm': 0.5539658069610596, 'learning_rate': 4.248322147651007e-05, 'epoch': 0.47}


 16%|█▌        | 1180/7500 [34:18<3:11:36,  1.82s/it]

{'loss': 0.8064, 'grad_norm': 0.5697259902954102, 'learning_rate': 4.2416107382550336e-05, 'epoch': 0.47}


 16%|█▌        | 1190/7500 [34:37<3:13:02,  1.84s/it]

{'loss': 0.846, 'grad_norm': 0.45949190855026245, 'learning_rate': 4.234899328859061e-05, 'epoch': 0.48}


 16%|█▌        | 1200/7500 [34:55<3:15:39,  1.86s/it]

{'loss': 0.8382, 'grad_norm': 0.5509850978851318, 'learning_rate': 4.228187919463087e-05, 'epoch': 0.48}


 16%|█▌        | 1210/7500 [35:14<3:12:21,  1.83s/it]

{'loss': 0.7594, 'grad_norm': 0.8826730847358704, 'learning_rate': 4.2214765100671145e-05, 'epoch': 0.48}


 16%|█▋        | 1220/7500 [35:33<3:21:34,  1.93s/it]

{'loss': 0.7956, 'grad_norm': 0.6002177000045776, 'learning_rate': 4.214765100671142e-05, 'epoch': 0.49}


 16%|█▋        | 1230/7500 [35:53<3:16:27,  1.88s/it]

{'loss': 0.8222, 'grad_norm': 0.4374740421772003, 'learning_rate': 4.2080536912751675e-05, 'epoch': 0.49}


 17%|█▋        | 1240/7500 [36:12<3:31:30,  2.03s/it]

{'loss': 0.8127, 'grad_norm': 0.4665905833244324, 'learning_rate': 4.201342281879195e-05, 'epoch': 0.5}


 17%|█▋        | 1250/7500 [36:31<3:17:53,  1.90s/it]

{'loss': 0.8377, 'grad_norm': 0.46020588278770447, 'learning_rate': 4.194630872483222e-05, 'epoch': 0.5}


 17%|█▋        | 1260/7500 [36:50<3:09:31,  1.82s/it]

{'loss': 0.7795, 'grad_norm': 0.4657531678676605, 'learning_rate': 4.1879194630872484e-05, 'epoch': 0.5}


 17%|█▋        | 1270/7500 [37:08<3:14:16,  1.87s/it]

{'loss': 0.7757, 'grad_norm': 0.49753424525260925, 'learning_rate': 4.181208053691275e-05, 'epoch': 0.51}


 17%|█▋        | 1280/7500 [37:27<3:12:28,  1.86s/it]

{'loss': 0.7842, 'grad_norm': 0.9991347193717957, 'learning_rate': 4.174496644295302e-05, 'epoch': 0.51}


 17%|█▋        | 1290/7500 [37:47<3:43:00,  2.15s/it]

{'loss': 0.806, 'grad_norm': 0.6566558480262756, 'learning_rate': 4.1677852348993293e-05, 'epoch': 0.52}


 17%|█▋        | 1300/7500 [38:07<3:22:42,  1.96s/it]

{'loss': 0.831, 'grad_norm': 0.5119073390960693, 'learning_rate': 4.161073825503356e-05, 'epoch': 0.52}


 17%|█▋        | 1310/7500 [38:26<3:23:54,  1.98s/it]

{'loss': 0.8308, 'grad_norm': 0.5444889664649963, 'learning_rate': 4.154362416107383e-05, 'epoch': 0.52}


 18%|█▊        | 1320/7500 [38:46<3:15:06,  1.89s/it]

{'loss': 0.7908, 'grad_norm': 0.3844054341316223, 'learning_rate': 4.1476510067114096e-05, 'epoch': 0.53}


 18%|█▊        | 1330/7500 [39:07<3:46:53,  2.21s/it]

{'loss': 0.7784, 'grad_norm': 0.43897467851638794, 'learning_rate': 4.140939597315436e-05, 'epoch': 0.53}


 18%|█▊        | 1340/7500 [39:26<3:08:30,  1.84s/it]

{'loss': 0.8375, 'grad_norm': 0.4445217251777649, 'learning_rate': 4.134228187919463e-05, 'epoch': 0.54}


 18%|█▊        | 1350/7500 [39:45<3:22:01,  1.97s/it]

{'loss': 0.861, 'grad_norm': 2.43528151512146, 'learning_rate': 4.1275167785234905e-05, 'epoch': 0.54}


 18%|█▊        | 1360/7500 [40:05<3:27:32,  2.03s/it]

{'loss': 0.7689, 'grad_norm': 0.6590541005134583, 'learning_rate': 4.120805369127517e-05, 'epoch': 0.54}


 18%|█▊        | 1370/7500 [40:25<3:15:57,  1.92s/it]

{'loss': 0.8107, 'grad_norm': 0.5591938495635986, 'learning_rate': 4.1140939597315435e-05, 'epoch': 0.55}


 18%|█▊        | 1380/7500 [40:44<3:13:55,  1.90s/it]

{'loss': 0.8268, 'grad_norm': 0.6588031053543091, 'learning_rate': 4.107382550335571e-05, 'epoch': 0.55}


 19%|█▊        | 1390/7500 [41:03<3:11:15,  1.88s/it]

{'loss': 0.7951, 'grad_norm': 0.5812787413597107, 'learning_rate': 4.100671140939598e-05, 'epoch': 0.56}


 19%|█▊        | 1400/7500 [41:22<3:12:28,  1.89s/it]

{'loss': 0.8123, 'grad_norm': 0.4550161063671112, 'learning_rate': 4.0939597315436244e-05, 'epoch': 0.56}


 19%|█▉        | 1410/7500 [41:43<3:13:27,  1.91s/it]

{'loss': 0.8263, 'grad_norm': 0.4159180521965027, 'learning_rate': 4.087248322147651e-05, 'epoch': 0.56}


 19%|█▉        | 1420/7500 [42:02<3:09:00,  1.87s/it]

{'loss': 0.8125, 'grad_norm': 0.3974171578884125, 'learning_rate': 4.080536912751678e-05, 'epoch': 0.57}


 19%|█▉        | 1430/7500 [42:23<3:38:31,  2.16s/it]

{'loss': 0.7863, 'grad_norm': 0.5129969120025635, 'learning_rate': 4.0738255033557046e-05, 'epoch': 0.57}


 19%|█▉        | 1440/7500 [42:46<3:31:48,  2.10s/it]

{'loss': 0.8503, 'grad_norm': 0.47771331667900085, 'learning_rate': 4.067114093959732e-05, 'epoch': 0.58}


 19%|█▉        | 1450/7500 [43:05<3:17:38,  1.96s/it]

{'loss': 0.8182, 'grad_norm': 0.47253164649009705, 'learning_rate': 4.060402684563759e-05, 'epoch': 0.58}


 19%|█▉        | 1460/7500 [43:24<3:14:18,  1.93s/it]

{'loss': 0.7965, 'grad_norm': 0.38527539372444153, 'learning_rate': 4.0536912751677855e-05, 'epoch': 0.58}


 20%|█▉        | 1470/7500 [43:44<3:17:39,  1.97s/it]

{'loss': 0.7877, 'grad_norm': 0.4188908636569977, 'learning_rate': 4.046979865771812e-05, 'epoch': 0.59}


 20%|█▉        | 1480/7500 [44:03<3:09:47,  1.89s/it]

{'loss': 0.8162, 'grad_norm': 0.44766008853912354, 'learning_rate': 4.040268456375839e-05, 'epoch': 0.59}


 20%|█▉        | 1490/7500 [44:23<3:16:08,  1.96s/it]

{'loss': 0.8027, 'grad_norm': 0.3971464931964874, 'learning_rate': 4.0335570469798664e-05, 'epoch': 0.6}


 20%|██        | 1500/7500 [44:43<3:15:28,  1.95s/it]

{'loss': 0.8185, 'grad_norm': 0.6015833020210266, 'learning_rate': 4.026845637583892e-05, 'epoch': 0.6}


 20%|██        | 1510/7500 [45:04<3:14:01,  1.94s/it]

{'loss': 0.8445, 'grad_norm': 0.5475671291351318, 'learning_rate': 4.0201342281879194e-05, 'epoch': 0.6}


 20%|██        | 1520/7500 [45:25<3:26:12,  2.07s/it]

{'loss': 0.8209, 'grad_norm': 0.6299822926521301, 'learning_rate': 4.0134228187919466e-05, 'epoch': 0.61}


 20%|██        | 1530/7500 [45:46<3:23:59,  2.05s/it]

{'loss': 0.7963, 'grad_norm': 0.4749510586261749, 'learning_rate': 4.006711409395973e-05, 'epoch': 0.61}


 21%|██        | 1540/7500 [46:06<3:28:36,  2.10s/it]

{'loss': 0.8811, 'grad_norm': 0.5322360396385193, 'learning_rate': 4e-05, 'epoch': 0.62}


 21%|██        | 1550/7500 [46:25<3:05:25,  1.87s/it]

{'loss': 0.877, 'grad_norm': 0.5978991985321045, 'learning_rate': 3.993288590604027e-05, 'epoch': 0.62}


 21%|██        | 1560/7500 [46:44<3:06:21,  1.88s/it]

{'loss': 0.8113, 'grad_norm': 0.4866475462913513, 'learning_rate': 3.986577181208054e-05, 'epoch': 0.62}


 21%|██        | 1570/7500 [47:03<3:03:58,  1.86s/it]

{'loss': 0.8165, 'grad_norm': 0.4356890618801117, 'learning_rate': 3.9798657718120805e-05, 'epoch': 0.63}


 21%|██        | 1580/7500 [47:22<3:15:52,  1.99s/it]

{'loss': 0.8451, 'grad_norm': 0.7098485827445984, 'learning_rate': 3.973154362416108e-05, 'epoch': 0.63}


 21%|██        | 1590/7500 [47:43<3:17:11,  2.00s/it]

{'loss': 0.8418, 'grad_norm': 0.7634230256080627, 'learning_rate': 3.966442953020135e-05, 'epoch': 0.64}


 21%|██▏       | 1600/7500 [48:02<3:02:17,  1.85s/it]

{'loss': 0.8242, 'grad_norm': 0.5442659258842468, 'learning_rate': 3.959731543624161e-05, 'epoch': 0.64}


 21%|██▏       | 1610/7500 [48:20<3:02:51,  1.86s/it]

{'loss': 0.7721, 'grad_norm': 0.4108196794986725, 'learning_rate': 3.953020134228188e-05, 'epoch': 0.64}


 22%|██▏       | 1620/7500 [48:39<3:01:36,  1.85s/it]

{'loss': 0.8505, 'grad_norm': 0.8936275839805603, 'learning_rate': 3.946308724832215e-05, 'epoch': 0.65}


 22%|██▏       | 1630/7500 [48:57<3:01:10,  1.85s/it]

{'loss': 0.8095, 'grad_norm': 0.6224902272224426, 'learning_rate': 3.939597315436242e-05, 'epoch': 0.65}


 22%|██▏       | 1640/7500 [49:16<3:00:29,  1.85s/it]

{'loss': 0.7734, 'grad_norm': 0.4374496638774872, 'learning_rate': 3.932885906040268e-05, 'epoch': 0.66}


 22%|██▏       | 1650/7500 [49:34<3:00:39,  1.85s/it]

{'loss': 0.813, 'grad_norm': 0.801213264465332, 'learning_rate': 3.9261744966442954e-05, 'epoch': 0.66}


 22%|██▏       | 1660/7500 [49:53<3:05:11,  1.90s/it]

{'loss': 0.8173, 'grad_norm': 0.5311410427093506, 'learning_rate': 3.9194630872483226e-05, 'epoch': 0.66}


 22%|██▏       | 1670/7500 [50:15<3:36:00,  2.22s/it]

{'loss': 0.7803, 'grad_norm': 0.5795221924781799, 'learning_rate': 3.912751677852349e-05, 'epoch': 0.67}


 22%|██▏       | 1680/7500 [50:35<3:09:53,  1.96s/it]

{'loss': 0.8122, 'grad_norm': 0.5528156161308289, 'learning_rate': 3.906040268456376e-05, 'epoch': 0.67}


 23%|██▎       | 1690/7500 [50:55<3:14:50,  2.01s/it]

{'loss': 0.7944, 'grad_norm': 0.6832247376441956, 'learning_rate': 3.899328859060403e-05, 'epoch': 0.68}


 23%|██▎       | 1700/7500 [51:15<3:25:12,  2.12s/it]

{'loss': 0.8527, 'grad_norm': 0.4384561777114868, 'learning_rate': 3.89261744966443e-05, 'epoch': 0.68}


 23%|██▎       | 1710/7500 [51:37<3:30:43,  2.18s/it]

{'loss': 0.8025, 'grad_norm': 0.9165161848068237, 'learning_rate': 3.8859060402684565e-05, 'epoch': 0.68}


 23%|██▎       | 1720/7500 [51:56<3:04:24,  1.91s/it]

{'loss': 0.824, 'grad_norm': 0.43605735898017883, 'learning_rate': 3.879194630872484e-05, 'epoch': 0.69}


 23%|██▎       | 1730/7500 [52:15<3:04:51,  1.92s/it]

{'loss': 0.8188, 'grad_norm': 0.47718727588653564, 'learning_rate': 3.87248322147651e-05, 'epoch': 0.69}


 23%|██▎       | 1740/7500 [52:35<3:02:27,  1.90s/it]

{'loss': 0.8194, 'grad_norm': 0.6776829957962036, 'learning_rate': 3.865771812080537e-05, 'epoch': 0.7}


 23%|██▎       | 1750/7500 [52:55<3:27:43,  2.17s/it]

{'loss': 0.7726, 'grad_norm': 0.6039469242095947, 'learning_rate': 3.859060402684564e-05, 'epoch': 0.7}


 23%|██▎       | 1760/7500 [53:15<3:10:04,  1.99s/it]

{'loss': 0.8164, 'grad_norm': 0.6339350342750549, 'learning_rate': 3.852348993288591e-05, 'epoch': 0.7}


 24%|██▎       | 1770/7500 [53:37<3:30:23,  2.20s/it]

{'loss': 0.7988, 'grad_norm': 0.5462363958358765, 'learning_rate': 3.8456375838926176e-05, 'epoch': 0.71}


 24%|██▎       | 1780/7500 [53:57<3:01:02,  1.90s/it]

{'loss': 0.8406, 'grad_norm': 0.8839244842529297, 'learning_rate': 3.838926174496644e-05, 'epoch': 0.71}


 24%|██▍       | 1790/7500 [54:17<2:55:39,  1.85s/it]

{'loss': 0.7906, 'grad_norm': 0.9911792874336243, 'learning_rate': 3.832214765100671e-05, 'epoch': 0.72}


 24%|██▍       | 1800/7500 [54:34<2:44:51,  1.74s/it]

{'loss': 0.8307, 'grad_norm': 0.49707940220832825, 'learning_rate': 3.8255033557046985e-05, 'epoch': 0.72}


 24%|██▍       | 1810/7500 [54:52<2:43:20,  1.72s/it]

{'loss': 0.8226, 'grad_norm': 0.4082269072532654, 'learning_rate': 3.818791946308725e-05, 'epoch': 0.72}


 24%|██▍       | 1820/7500 [55:09<2:42:31,  1.72s/it]

{'loss': 0.8152, 'grad_norm': 0.4301064908504486, 'learning_rate': 3.812080536912752e-05, 'epoch': 0.73}


 24%|██▍       | 1830/7500 [55:26<2:43:44,  1.73s/it]

{'loss': 0.825, 'grad_norm': 0.7726950645446777, 'learning_rate': 3.805369127516779e-05, 'epoch': 0.73}


 25%|██▍       | 1840/7500 [55:44<2:44:36,  1.74s/it]

{'loss': 0.8584, 'grad_norm': 0.401119589805603, 'learning_rate': 3.798657718120805e-05, 'epoch': 0.74}


 25%|██▍       | 1850/7500 [56:01<2:46:02,  1.76s/it]

{'loss': 0.7505, 'grad_norm': 0.41656574606895447, 'learning_rate': 3.7919463087248324e-05, 'epoch': 0.74}


 25%|██▍       | 1860/7500 [56:19<2:44:41,  1.75s/it]

{'loss': 0.7664, 'grad_norm': 0.4050058424472809, 'learning_rate': 3.7852348993288596e-05, 'epoch': 0.74}


 25%|██▍       | 1870/7500 [56:36<2:44:50,  1.76s/it]

{'loss': 0.7999, 'grad_norm': 0.47722405195236206, 'learning_rate': 3.778523489932886e-05, 'epoch': 0.75}


 25%|██▌       | 1880/7500 [56:54<2:44:29,  1.76s/it]

{'loss': 0.8066, 'grad_norm': 0.6129531860351562, 'learning_rate': 3.7718120805369127e-05, 'epoch': 0.75}


 25%|██▌       | 1890/7500 [57:11<2:44:25,  1.76s/it]

{'loss': 0.7985, 'grad_norm': 0.5419004559516907, 'learning_rate': 3.76510067114094e-05, 'epoch': 0.76}


 25%|██▌       | 1900/7500 [57:29<2:44:27,  1.76s/it]

{'loss': 0.771, 'grad_norm': 0.5306125283241272, 'learning_rate': 3.758389261744967e-05, 'epoch': 0.76}


 25%|██▌       | 1910/7500 [57:47<2:44:22,  1.76s/it]

{'loss': 0.8509, 'grad_norm': 0.49655261635780334, 'learning_rate': 3.7516778523489936e-05, 'epoch': 0.76}


 26%|██▌       | 1920/7500 [58:04<2:44:31,  1.77s/it]

{'loss': 0.7898, 'grad_norm': 0.5950737595558167, 'learning_rate': 3.74496644295302e-05, 'epoch': 0.77}


 26%|██▌       | 1930/7500 [58:22<2:45:14,  1.78s/it]

{'loss': 0.8582, 'grad_norm': 0.48936158418655396, 'learning_rate': 3.738255033557047e-05, 'epoch': 0.77}


 26%|██▌       | 1940/7500 [58:40<2:41:56,  1.75s/it]

{'loss': 0.8307, 'grad_norm': 0.4162445366382599, 'learning_rate': 3.731543624161074e-05, 'epoch': 0.78}


 26%|██▌       | 1950/7500 [58:57<2:44:43,  1.78s/it]

{'loss': 0.8442, 'grad_norm': 0.411698579788208, 'learning_rate': 3.724832214765101e-05, 'epoch': 0.78}


 26%|██▌       | 1960/7500 [59:15<2:44:03,  1.78s/it]

{'loss': 0.8096, 'grad_norm': 0.44956275820732117, 'learning_rate': 3.7181208053691275e-05, 'epoch': 0.78}


 26%|██▋       | 1970/7500 [59:33<2:41:25,  1.75s/it]

{'loss': 0.8389, 'grad_norm': 1.3910843133926392, 'learning_rate': 3.711409395973155e-05, 'epoch': 0.79}


 26%|██▋       | 1980/7500 [59:50<2:40:51,  1.75s/it]

{'loss': 0.8437, 'grad_norm': 0.5026639699935913, 'learning_rate': 3.704697986577181e-05, 'epoch': 0.79}


 27%|██▋       | 1990/7500 [1:00:08<2:39:22,  1.74s/it]

{'loss': 0.8186, 'grad_norm': 0.6044839024543762, 'learning_rate': 3.6979865771812084e-05, 'epoch': 0.8}


 27%|██▋       | 2000/7500 [1:00:24<2:31:34,  1.65s/it]

{'loss': 0.8055, 'grad_norm': 1.0025805234909058, 'learning_rate': 3.6912751677852356e-05, 'epoch': 0.8}


 27%|██▋       | 2010/7500 [1:00:42<2:26:53,  1.61s/it]

{'loss': 0.8177, 'grad_norm': 0.5734140872955322, 'learning_rate': 3.6845637583892614e-05, 'epoch': 0.8}


 27%|██▋       | 2020/7500 [1:00:58<2:21:59,  1.55s/it]

{'loss': 0.7823, 'grad_norm': 0.4604457914829254, 'learning_rate': 3.6778523489932886e-05, 'epoch': 0.81}


 27%|██▋       | 2030/7500 [1:01:12<2:14:21,  1.47s/it]

{'loss': 0.8179, 'grad_norm': 0.5588091015815735, 'learning_rate': 3.671140939597316e-05, 'epoch': 0.81}


 27%|██▋       | 2040/7500 [1:01:27<2:14:39,  1.48s/it]

{'loss': 0.7932, 'grad_norm': 0.5185137987136841, 'learning_rate': 3.664429530201342e-05, 'epoch': 0.82}


 27%|██▋       | 2050/7500 [1:01:41<2:11:11,  1.44s/it]

{'loss': 0.8363, 'grad_norm': 0.7579860091209412, 'learning_rate': 3.6577181208053695e-05, 'epoch': 0.82}


 27%|██▋       | 2060/7500 [1:01:56<2:13:34,  1.47s/it]

{'loss': 0.7844, 'grad_norm': 0.744287371635437, 'learning_rate': 3.651006711409396e-05, 'epoch': 0.82}


 28%|██▊       | 2070/7500 [1:02:11<2:14:13,  1.48s/it]

{'loss': 0.7722, 'grad_norm': 0.5714116096496582, 'learning_rate': 3.644295302013423e-05, 'epoch': 0.83}


 28%|██▊       | 2080/7500 [1:02:27<2:19:00,  1.54s/it]

{'loss': 0.8362, 'grad_norm': 0.5235586166381836, 'learning_rate': 3.63758389261745e-05, 'epoch': 0.83}


 28%|██▊       | 2090/7500 [1:02:42<2:16:05,  1.51s/it]

{'loss': 0.8228, 'grad_norm': 0.4743541181087494, 'learning_rate': 3.630872483221477e-05, 'epoch': 0.84}


 28%|██▊       | 2100/7500 [1:02:57<2:15:27,  1.51s/it]

{'loss': 0.8005, 'grad_norm': 0.4205895662307739, 'learning_rate': 3.6241610738255034e-05, 'epoch': 0.84}


 28%|██▊       | 2110/7500 [1:03:12<2:16:03,  1.51s/it]

{'loss': 0.8316, 'grad_norm': 0.4532695412635803, 'learning_rate': 3.61744966442953e-05, 'epoch': 0.84}


 28%|██▊       | 2120/7500 [1:03:27<2:15:11,  1.51s/it]

{'loss': 0.796, 'grad_norm': 0.4437156915664673, 'learning_rate': 3.610738255033557e-05, 'epoch': 0.85}


 28%|██▊       | 2130/7500 [1:03:42<2:14:01,  1.50s/it]

{'loss': 0.7696, 'grad_norm': 0.44007226824760437, 'learning_rate': 3.604026845637584e-05, 'epoch': 0.85}


 29%|██▊       | 2140/7500 [1:03:58<2:24:17,  1.62s/it]

{'loss': 0.8142, 'grad_norm': 0.39402613043785095, 'learning_rate': 3.597315436241611e-05, 'epoch': 0.86}


 29%|██▊       | 2150/7500 [1:04:13<2:15:43,  1.52s/it]

{'loss': 0.7867, 'grad_norm': 0.9225101470947266, 'learning_rate': 3.5906040268456373e-05, 'epoch': 0.86}


 29%|██▉       | 2160/7500 [1:04:29<2:16:50,  1.54s/it]

{'loss': 0.8032, 'grad_norm': 0.420204222202301, 'learning_rate': 3.5838926174496645e-05, 'epoch': 0.86}


 29%|██▉       | 2170/7500 [1:04:45<2:22:16,  1.60s/it]

{'loss': 0.7978, 'grad_norm': 0.48570728302001953, 'learning_rate': 3.577181208053692e-05, 'epoch': 0.87}


 29%|██▉       | 2180/7500 [1:05:02<2:32:13,  1.72s/it]

{'loss': 0.7655, 'grad_norm': 0.4699016213417053, 'learning_rate': 3.570469798657718e-05, 'epoch': 0.87}


 29%|██▉       | 2190/7500 [1:05:19<2:28:51,  1.68s/it]

{'loss': 0.8267, 'grad_norm': 0.5210456848144531, 'learning_rate': 3.563758389261745e-05, 'epoch': 0.88}


 29%|██▉       | 2200/7500 [1:05:35<2:25:39,  1.65s/it]

{'loss': 0.8799, 'grad_norm': 0.5754959583282471, 'learning_rate': 3.557046979865772e-05, 'epoch': 0.88}


 29%|██▉       | 2210/7500 [1:05:54<2:39:18,  1.81s/it]

{'loss': 0.8255, 'grad_norm': 0.4451625943183899, 'learning_rate': 3.550335570469799e-05, 'epoch': 0.88}


 30%|██▉       | 2220/7500 [1:06:14<3:11:40,  2.18s/it]

{'loss': 0.8207, 'grad_norm': 0.4658513069152832, 'learning_rate': 3.5436241610738257e-05, 'epoch': 0.89}


 30%|██▉       | 2230/7500 [1:06:37<3:18:32,  2.26s/it]

{'loss': 0.7679, 'grad_norm': 0.4933529496192932, 'learning_rate': 3.536912751677853e-05, 'epoch': 0.89}


 30%|██▉       | 2240/7500 [1:06:59<3:06:36,  2.13s/it]

{'loss': 0.8372, 'grad_norm': 0.7945248484611511, 'learning_rate': 3.5302013422818794e-05, 'epoch': 0.9}


 30%|███       | 2250/7500 [1:07:20<3:05:06,  2.12s/it]

{'loss': 0.7929, 'grad_norm': 0.5172239542007446, 'learning_rate': 3.523489932885906e-05, 'epoch': 0.9}


 30%|███       | 2260/7500 [1:07:42<3:06:53,  2.14s/it]

{'loss': 0.7447, 'grad_norm': 0.7391238212585449, 'learning_rate': 3.516778523489933e-05, 'epoch': 0.9}


 30%|███       | 2270/7500 [1:08:03<3:06:15,  2.14s/it]

{'loss': 0.7935, 'grad_norm': 0.5234575867652893, 'learning_rate': 3.51006711409396e-05, 'epoch': 0.91}


 30%|███       | 2280/7500 [1:08:25<3:12:31,  2.21s/it]

{'loss': 0.8114, 'grad_norm': 0.6008894443511963, 'learning_rate': 3.503355704697987e-05, 'epoch': 0.91}


 31%|███       | 2290/7500 [1:08:48<3:15:13,  2.25s/it]

{'loss': 0.7796, 'grad_norm': 0.6070494651794434, 'learning_rate': 3.496644295302013e-05, 'epoch': 0.92}


 31%|███       | 2300/7500 [1:09:11<3:16:17,  2.26s/it]

{'loss': 0.8334, 'grad_norm': 0.4486670196056366, 'learning_rate': 3.4899328859060405e-05, 'epoch': 0.92}


 31%|███       | 2310/7500 [1:09:33<3:12:20,  2.22s/it]

{'loss': 0.7922, 'grad_norm': 0.35800373554229736, 'learning_rate': 3.483221476510068e-05, 'epoch': 0.92}


 31%|███       | 2320/7500 [1:09:55<3:07:49,  2.18s/it]

{'loss': 0.8366, 'grad_norm': 0.38733577728271484, 'learning_rate': 3.476510067114094e-05, 'epoch': 0.93}


 31%|███       | 2330/7500 [1:10:11<2:25:21,  1.69s/it]

{'loss': 0.792, 'grad_norm': 0.5221362709999084, 'learning_rate': 3.469798657718121e-05, 'epoch': 0.93}


 31%|███       | 2340/7500 [1:10:28<2:26:06,  1.70s/it]

{'loss': 0.8099, 'grad_norm': 0.42169883847236633, 'learning_rate': 3.463087248322148e-05, 'epoch': 0.94}


 31%|███▏      | 2350/7500 [1:10:45<2:25:55,  1.70s/it]

{'loss': 0.7735, 'grad_norm': 0.44882601499557495, 'learning_rate': 3.4563758389261744e-05, 'epoch': 0.94}


 31%|███▏      | 2360/7500 [1:11:02<2:25:00,  1.69s/it]

{'loss': 0.7827, 'grad_norm': 0.3539263606071472, 'learning_rate': 3.4496644295302016e-05, 'epoch': 0.94}


 32%|███▏      | 2370/7500 [1:11:18<2:18:30,  1.62s/it]

{'loss': 0.8066, 'grad_norm': 0.47871240973472595, 'learning_rate': 3.442953020134229e-05, 'epoch': 0.95}


 32%|███▏      | 2380/7500 [1:11:34<2:17:25,  1.61s/it]

{'loss': 0.7917, 'grad_norm': 0.5123698711395264, 'learning_rate': 3.436241610738255e-05, 'epoch': 0.95}


 32%|███▏      | 2390/7500 [1:11:51<2:26:25,  1.72s/it]

{'loss': 0.7925, 'grad_norm': 0.6458567976951599, 'learning_rate': 3.429530201342282e-05, 'epoch': 0.96}


 32%|███▏      | 2400/7500 [1:12:08<2:26:05,  1.72s/it]

{'loss': 0.8813, 'grad_norm': 0.5956690311431885, 'learning_rate': 3.422818791946309e-05, 'epoch': 0.96}


 32%|███▏      | 2410/7500 [1:12:31<3:22:18,  2.38s/it]

{'loss': 0.7863, 'grad_norm': 0.4297516942024231, 'learning_rate': 3.416107382550336e-05, 'epoch': 0.96}


 32%|███▏      | 2420/7500 [1:12:54<3:14:50,  2.30s/it]

{'loss': 0.8083, 'grad_norm': 0.8957582116127014, 'learning_rate': 3.409395973154362e-05, 'epoch': 0.97}


 32%|███▏      | 2430/7500 [1:13:15<2:53:53,  2.06s/it]

{'loss': 0.8041, 'grad_norm': 0.4128551185131073, 'learning_rate': 3.402684563758389e-05, 'epoch': 0.97}


 33%|███▎      | 2440/7500 [1:13:33<2:28:33,  1.76s/it]

{'loss': 0.8118, 'grad_norm': 0.41386014223098755, 'learning_rate': 3.3959731543624164e-05, 'epoch': 0.98}


 33%|███▎      | 2450/7500 [1:13:50<2:24:01,  1.71s/it]

{'loss': 0.812, 'grad_norm': 0.36687329411506653, 'learning_rate': 3.389261744966443e-05, 'epoch': 0.98}


 33%|███▎      | 2460/7500 [1:14:07<2:22:12,  1.69s/it]

{'loss': 0.7801, 'grad_norm': 0.48532840609550476, 'learning_rate': 3.38255033557047e-05, 'epoch': 0.98}


 33%|███▎      | 2470/7500 [1:14:24<2:23:10,  1.71s/it]

{'loss': 0.7702, 'grad_norm': 0.5029432773590088, 'learning_rate': 3.3758389261744966e-05, 'epoch': 0.99}


 33%|███▎      | 2480/7500 [1:14:41<2:22:22,  1.70s/it]

{'loss': 0.8088, 'grad_norm': 0.47873228788375854, 'learning_rate': 3.369127516778524e-05, 'epoch': 0.99}


 33%|███▎      | 2490/7500 [1:14:59<2:23:51,  1.72s/it]

{'loss': 0.8291, 'grad_norm': 0.3891439735889435, 'learning_rate': 3.3624161073825504e-05, 'epoch': 1.0}


 33%|███▎      | 2500/7500 [1:15:16<2:21:06,  1.69s/it]

{'loss': 0.8101, 'grad_norm': 0.5628089308738708, 'learning_rate': 3.3557046979865775e-05, 'epoch': 1.0}


                                                       
 33%|███▎      | 2500/7500 [1:16:23<2:21:06,  1.69s/it]

{'eval_loss': 0.6651220917701721, 'eval_runtime': 64.5242, 'eval_samples_per_second': 7.749, 'eval_steps_per_second': 1.937, 'epoch': 1.0}


 33%|███▎      | 2510/7500 [1:16:40<3:34:14,  2.58s/it] 

{'loss': 0.8002, 'grad_norm': 0.6486207842826843, 'learning_rate': 3.348993288590605e-05, 'epoch': 1.0}


 34%|███▎      | 2520/7500 [1:16:58<2:26:37,  1.77s/it]

{'loss': 0.7817, 'grad_norm': 0.41100430488586426, 'learning_rate': 3.3422818791946306e-05, 'epoch': 1.01}


 34%|███▎      | 2530/7500 [1:17:15<2:23:21,  1.73s/it]

{'loss': 0.7885, 'grad_norm': 0.5309016704559326, 'learning_rate': 3.335570469798658e-05, 'epoch': 1.01}


 34%|███▍      | 2540/7500 [1:17:32<2:22:55,  1.73s/it]

{'loss': 0.7895, 'grad_norm': 0.4791862666606903, 'learning_rate': 3.328859060402685e-05, 'epoch': 1.02}


 34%|███▍      | 2550/7500 [1:17:50<2:22:15,  1.72s/it]

{'loss': 0.7717, 'grad_norm': 0.4467006027698517, 'learning_rate': 3.3221476510067115e-05, 'epoch': 1.02}


 34%|███▍      | 2560/7500 [1:18:07<2:21:33,  1.72s/it]

{'loss': 0.7473, 'grad_norm': 0.6166042685508728, 'learning_rate': 3.315436241610738e-05, 'epoch': 1.02}


 34%|███▍      | 2570/7500 [1:18:24<2:22:30,  1.73s/it]

{'loss': 0.8301, 'grad_norm': 0.46493884921073914, 'learning_rate': 3.308724832214765e-05, 'epoch': 1.03}


 34%|███▍      | 2580/7500 [1:18:43<2:30:11,  1.83s/it]

{'loss': 0.7893, 'grad_norm': 0.46080708503723145, 'learning_rate': 3.3020134228187924e-05, 'epoch': 1.03}


 35%|███▍      | 2590/7500 [1:19:02<2:34:48,  1.89s/it]

{'loss': 0.7418, 'grad_norm': 0.4527905583381653, 'learning_rate': 3.295302013422819e-05, 'epoch': 1.04}


 35%|███▍      | 2600/7500 [1:19:20<2:31:51,  1.86s/it]

{'loss': 0.7442, 'grad_norm': 0.47521650791168213, 'learning_rate': 3.288590604026846e-05, 'epoch': 1.04}


 35%|███▍      | 2610/7500 [1:19:38<2:24:50,  1.78s/it]

{'loss': 0.7889, 'grad_norm': 0.5259033441543579, 'learning_rate': 3.2818791946308726e-05, 'epoch': 1.04}


 35%|███▍      | 2620/7500 [1:19:56<2:25:38,  1.79s/it]

{'loss': 0.7947, 'grad_norm': 0.5252101421356201, 'learning_rate': 3.275167785234899e-05, 'epoch': 1.05}


 35%|███▌      | 2630/7500 [1:20:13<2:21:27,  1.74s/it]

{'loss': 0.7784, 'grad_norm': 0.5556963086128235, 'learning_rate': 3.268456375838926e-05, 'epoch': 1.05}


 35%|███▌      | 2640/7500 [1:20:30<2:18:44,  1.71s/it]

{'loss': 0.795, 'grad_norm': 0.4681684076786041, 'learning_rate': 3.2617449664429535e-05, 'epoch': 1.06}


 35%|███▌      | 2650/7500 [1:20:47<2:19:15,  1.72s/it]

{'loss': 0.7646, 'grad_norm': 0.4724542200565338, 'learning_rate': 3.25503355704698e-05, 'epoch': 1.06}


 35%|███▌      | 2660/7500 [1:21:05<2:21:34,  1.76s/it]

{'loss': 0.7682, 'grad_norm': 0.6122799515724182, 'learning_rate': 3.2483221476510065e-05, 'epoch': 1.06}


 36%|███▌      | 2670/7500 [1:21:22<2:20:46,  1.75s/it]

{'loss': 0.7915, 'grad_norm': 0.6424400210380554, 'learning_rate': 3.241610738255034e-05, 'epoch': 1.07}


 36%|███▌      | 2680/7500 [1:21:40<2:20:56,  1.75s/it]

{'loss': 0.7835, 'grad_norm': 0.5852692127227783, 'learning_rate': 3.234899328859061e-05, 'epoch': 1.07}


 36%|███▌      | 2690/7500 [1:21:57<2:21:21,  1.76s/it]

{'loss': 0.7774, 'grad_norm': 0.5488207936286926, 'learning_rate': 3.2281879194630874e-05, 'epoch': 1.08}


 36%|███▌      | 2700/7500 [1:22:15<2:21:38,  1.77s/it]

{'loss': 0.7972, 'grad_norm': 0.5656976699829102, 'learning_rate': 3.221476510067114e-05, 'epoch': 1.08}


 36%|███▌      | 2710/7500 [1:22:33<2:21:59,  1.78s/it]

{'loss': 0.7736, 'grad_norm': 0.6728845238685608, 'learning_rate': 3.214765100671141e-05, 'epoch': 1.08}


 36%|███▋      | 2720/7500 [1:22:51<2:21:04,  1.77s/it]

{'loss': 0.8161, 'grad_norm': 0.6468605399131775, 'learning_rate': 3.208053691275168e-05, 'epoch': 1.09}


 36%|███▋      | 2730/7500 [1:23:08<2:20:35,  1.77s/it]

{'loss': 0.7624, 'grad_norm': 0.5242992639541626, 'learning_rate': 3.201342281879195e-05, 'epoch': 1.09}


 37%|███▋      | 2740/7500 [1:23:27<2:22:55,  1.80s/it]

{'loss': 0.8039, 'grad_norm': 0.4925195872783661, 'learning_rate': 3.194630872483222e-05, 'epoch': 1.1}


 37%|███▋      | 2750/7500 [1:23:44<2:20:01,  1.77s/it]

{'loss': 0.7818, 'grad_norm': 0.5012977123260498, 'learning_rate': 3.1879194630872485e-05, 'epoch': 1.1}


 37%|███▋      | 2760/7500 [1:24:02<2:19:03,  1.76s/it]

{'loss': 0.8, 'grad_norm': 0.5005251169204712, 'learning_rate': 3.181208053691275e-05, 'epoch': 1.1}


 37%|███▋      | 2770/7500 [1:24:20<2:18:20,  1.75s/it]

{'loss': 0.7624, 'grad_norm': 0.47433900833129883, 'learning_rate': 3.174496644295302e-05, 'epoch': 1.11}


 37%|███▋      | 2780/7500 [1:24:37<2:18:14,  1.76s/it]

{'loss': 0.7689, 'grad_norm': 0.5272237062454224, 'learning_rate': 3.1677852348993294e-05, 'epoch': 1.11}


 37%|███▋      | 2790/7500 [1:24:55<2:22:44,  1.82s/it]

{'loss': 0.766, 'grad_norm': 0.5101783275604248, 'learning_rate': 3.161073825503356e-05, 'epoch': 1.12}


 37%|███▋      | 2800/7500 [1:25:13<2:21:02,  1.80s/it]

{'loss': 0.8127, 'grad_norm': 0.4306938946247101, 'learning_rate': 3.1543624161073825e-05, 'epoch': 1.12}


 37%|███▋      | 2810/7500 [1:25:31<2:20:15,  1.79s/it]

{'loss': 0.7885, 'grad_norm': 1.2065268754959106, 'learning_rate': 3.1476510067114096e-05, 'epoch': 1.12}


 38%|███▊      | 2820/7500 [1:25:49<2:19:59,  1.79s/it]

{'loss': 0.7772, 'grad_norm': 0.4700937271118164, 'learning_rate': 3.140939597315437e-05, 'epoch': 1.13}


 38%|███▊      | 2830/7500 [1:26:07<2:20:39,  1.81s/it]

{'loss': 0.7706, 'grad_norm': 0.5218549370765686, 'learning_rate': 3.1342281879194634e-05, 'epoch': 1.13}


 38%|███▊      | 2840/7500 [1:26:25<2:20:03,  1.80s/it]

{'loss': 0.7732, 'grad_norm': 0.5657075047492981, 'learning_rate': 3.12751677852349e-05, 'epoch': 1.14}


 38%|███▊      | 2850/7500 [1:26:43<2:20:57,  1.82s/it]

{'loss': 0.7892, 'grad_norm': 0.531923770904541, 'learning_rate': 3.120805369127517e-05, 'epoch': 1.14}


 38%|███▊      | 2860/7500 [1:27:02<2:19:06,  1.80s/it]

{'loss': 0.7627, 'grad_norm': 0.4388981759548187, 'learning_rate': 3.1140939597315436e-05, 'epoch': 1.14}


 38%|███▊      | 2870/7500 [1:27:19<2:16:37,  1.77s/it]

{'loss': 0.7687, 'grad_norm': 0.4286942481994629, 'learning_rate': 3.107382550335571e-05, 'epoch': 1.15}


 38%|███▊      | 2880/7500 [1:27:37<2:17:12,  1.78s/it]

{'loss': 0.8355, 'grad_norm': 0.457025945186615, 'learning_rate': 3.100671140939597e-05, 'epoch': 1.15}


 39%|███▊      | 2890/7500 [1:27:55<2:17:05,  1.78s/it]

{'loss': 0.7575, 'grad_norm': 0.5800076127052307, 'learning_rate': 3.0939597315436245e-05, 'epoch': 1.16}


 39%|███▊      | 2900/7500 [1:28:13<2:18:02,  1.80s/it]

{'loss': 0.7498, 'grad_norm': 0.5276281237602234, 'learning_rate': 3.087248322147651e-05, 'epoch': 1.16}


 39%|███▉      | 2910/7500 [1:28:31<2:18:29,  1.81s/it]

{'loss': 0.761, 'grad_norm': 0.7007582783699036, 'learning_rate': 3.080536912751678e-05, 'epoch': 1.16}


 39%|███▉      | 2920/7500 [1:28:49<2:18:26,  1.81s/it]

{'loss': 0.8385, 'grad_norm': 0.5294583439826965, 'learning_rate': 3.0738255033557054e-05, 'epoch': 1.17}


 39%|███▉      | 2930/7500 [1:29:07<2:17:40,  1.81s/it]

{'loss': 0.8269, 'grad_norm': 0.6260055303573608, 'learning_rate': 3.067114093959731e-05, 'epoch': 1.17}


 39%|███▉      | 2940/7500 [1:29:25<2:16:23,  1.79s/it]

{'loss': 0.8381, 'grad_norm': 0.47341206669807434, 'learning_rate': 3.0604026845637584e-05, 'epoch': 1.18}


 39%|███▉      | 2950/7500 [1:29:43<2:16:14,  1.80s/it]

{'loss': 0.7675, 'grad_norm': 0.5055795311927795, 'learning_rate': 3.0536912751677856e-05, 'epoch': 1.18}


 39%|███▉      | 2960/7500 [1:30:01<2:15:57,  1.80s/it]

{'loss': 0.7285, 'grad_norm': 0.6985388398170471, 'learning_rate': 3.0469798657718124e-05, 'epoch': 1.18}


 40%|███▉      | 2970/7500 [1:30:19<2:15:53,  1.80s/it]

{'loss': 0.8089, 'grad_norm': 0.5541196465492249, 'learning_rate': 3.0402684563758393e-05, 'epoch': 1.19}


 40%|███▉      | 2980/7500 [1:30:37<2:16:24,  1.81s/it]

{'loss': 0.7923, 'grad_norm': 0.49398207664489746, 'learning_rate': 3.0335570469798658e-05, 'epoch': 1.19}


 40%|███▉      | 2990/7500 [1:30:55<2:15:22,  1.80s/it]

{'loss': 0.7522, 'grad_norm': 0.5911165475845337, 'learning_rate': 3.0268456375838927e-05, 'epoch': 1.2}


 40%|████      | 3000/7500 [1:31:13<2:16:09,  1.82s/it]

{'loss': 0.7669, 'grad_norm': 0.4938422441482544, 'learning_rate': 3.02013422818792e-05, 'epoch': 1.2}


 40%|████      | 3010/7500 [1:31:33<2:18:31,  1.85s/it]

{'loss': 0.7641, 'grad_norm': 0.9208253026008606, 'learning_rate': 3.0134228187919467e-05, 'epoch': 1.2}


 40%|████      | 3020/7500 [1:31:52<2:16:19,  1.83s/it]

{'loss': 0.7675, 'grad_norm': 0.5167770981788635, 'learning_rate': 3.0067114093959732e-05, 'epoch': 1.21}


 40%|████      | 3030/7500 [1:32:10<2:16:00,  1.83s/it]

{'loss': 0.7652, 'grad_norm': 0.45968136191368103, 'learning_rate': 3e-05, 'epoch': 1.21}


 41%|████      | 3040/7500 [1:32:28<2:14:39,  1.81s/it]

{'loss': 0.7823, 'grad_norm': 0.5777748823165894, 'learning_rate': 2.993288590604027e-05, 'epoch': 1.22}


 41%|████      | 3050/7500 [1:32:46<2:15:23,  1.83s/it]

{'loss': 0.7655, 'grad_norm': 0.4886890649795532, 'learning_rate': 2.986577181208054e-05, 'epoch': 1.22}


 41%|████      | 3060/7500 [1:33:04<2:15:18,  1.83s/it]

{'loss': 0.797, 'grad_norm': 0.7634465098381042, 'learning_rate': 2.979865771812081e-05, 'epoch': 1.22}


 41%|████      | 3070/7500 [1:33:23<2:13:51,  1.81s/it]

{'loss': 0.785, 'grad_norm': 0.592985987663269, 'learning_rate': 2.9731543624161075e-05, 'epoch': 1.23}


 41%|████      | 3080/7500 [1:33:41<2:13:44,  1.82s/it]

{'loss': 0.7582, 'grad_norm': 0.45766276121139526, 'learning_rate': 2.9664429530201343e-05, 'epoch': 1.23}


 41%|████      | 3090/7500 [1:33:59<2:12:08,  1.80s/it]

{'loss': 0.7995, 'grad_norm': 0.4725337028503418, 'learning_rate': 2.9597315436241612e-05, 'epoch': 1.24}


 41%|████▏     | 3100/7500 [1:34:17<2:11:11,  1.79s/it]

{'loss': 0.7812, 'grad_norm': 0.49268674850463867, 'learning_rate': 2.9530201342281884e-05, 'epoch': 1.24}


 41%|████▏     | 3110/7500 [1:34:35<2:12:12,  1.81s/it]

{'loss': 0.7838, 'grad_norm': 0.5757458209991455, 'learning_rate': 2.9463087248322146e-05, 'epoch': 1.24}


 42%|████▏     | 3120/7500 [1:34:53<2:11:54,  1.81s/it]

{'loss': 0.7595, 'grad_norm': 0.4229472577571869, 'learning_rate': 2.9395973154362418e-05, 'epoch': 1.25}


 42%|████▏     | 3130/7500 [1:35:11<2:16:46,  1.88s/it]

{'loss': 0.7752, 'grad_norm': 0.46872973442077637, 'learning_rate': 2.9328859060402686e-05, 'epoch': 1.25}


 42%|████▏     | 3140/7500 [1:35:30<2:13:07,  1.83s/it]

{'loss': 0.766, 'grad_norm': 0.4384276270866394, 'learning_rate': 2.9261744966442955e-05, 'epoch': 1.26}


 42%|████▏     | 3150/7500 [1:35:48<2:11:20,  1.81s/it]

{'loss': 0.7391, 'grad_norm': 0.4464544355869293, 'learning_rate': 2.9194630872483227e-05, 'epoch': 1.26}


 42%|████▏     | 3160/7500 [1:36:06<2:10:40,  1.81s/it]

{'loss': 0.7546, 'grad_norm': 0.4408946931362152, 'learning_rate': 2.9127516778523488e-05, 'epoch': 1.26}


 42%|████▏     | 3170/7500 [1:36:24<2:09:48,  1.80s/it]

{'loss': 0.7659, 'grad_norm': 0.44755181670188904, 'learning_rate': 2.906040268456376e-05, 'epoch': 1.27}


 42%|████▏     | 3180/7500 [1:36:42<2:09:57,  1.80s/it]

{'loss': 0.8085, 'grad_norm': 0.590898871421814, 'learning_rate': 2.899328859060403e-05, 'epoch': 1.27}


 43%|████▎     | 3190/7500 [1:37:00<2:11:12,  1.83s/it]

{'loss': 0.7421, 'grad_norm': 1.0835721492767334, 'learning_rate': 2.8926174496644297e-05, 'epoch': 1.28}


 43%|████▎     | 3200/7500 [1:37:19<2:11:00,  1.83s/it]

{'loss': 0.7629, 'grad_norm': 0.47515735030174255, 'learning_rate': 2.885906040268457e-05, 'epoch': 1.28}


 43%|████▎     | 3210/7500 [1:37:37<2:10:04,  1.82s/it]

{'loss': 0.8212, 'grad_norm': 0.5067833662033081, 'learning_rate': 2.879194630872483e-05, 'epoch': 1.28}


 43%|████▎     | 3220/7500 [1:37:55<2:09:56,  1.82s/it]

{'loss': 0.7807, 'grad_norm': 0.5096706748008728, 'learning_rate': 2.8724832214765103e-05, 'epoch': 1.29}


 43%|████▎     | 3230/7500 [1:38:13<2:09:22,  1.82s/it]

{'loss': 0.7815, 'grad_norm': 0.645085871219635, 'learning_rate': 2.865771812080537e-05, 'epoch': 1.29}


 43%|████▎     | 3240/7500 [1:38:31<2:08:45,  1.81s/it]

{'loss': 0.7531, 'grad_norm': 0.5004830956459045, 'learning_rate': 2.859060402684564e-05, 'epoch': 1.3}


 43%|████▎     | 3250/7500 [1:38:49<2:09:29,  1.83s/it]

{'loss': 0.7756, 'grad_norm': 0.47578439116477966, 'learning_rate': 2.8523489932885905e-05, 'epoch': 1.3}


 43%|████▎     | 3260/7500 [1:39:08<2:07:40,  1.81s/it]

{'loss': 0.7907, 'grad_norm': 0.754500150680542, 'learning_rate': 2.8456375838926174e-05, 'epoch': 1.3}


 44%|████▎     | 3270/7500 [1:39:26<2:07:53,  1.81s/it]

{'loss': 0.7525, 'grad_norm': 0.42999786138534546, 'learning_rate': 2.8389261744966445e-05, 'epoch': 1.31}


 44%|████▎     | 3280/7500 [1:39:44<2:06:44,  1.80s/it]

{'loss': 0.7967, 'grad_norm': 0.6670849323272705, 'learning_rate': 2.8322147651006714e-05, 'epoch': 1.31}


 44%|████▍     | 3290/7500 [1:40:02<2:06:54,  1.81s/it]

{'loss': 0.7397, 'grad_norm': 0.39618411660194397, 'learning_rate': 2.8255033557046983e-05, 'epoch': 1.32}


 44%|████▍     | 3300/7500 [1:40:21<2:12:31,  1.89s/it]

{'loss': 0.7944, 'grad_norm': 0.7117746472358704, 'learning_rate': 2.8187919463087248e-05, 'epoch': 1.32}


 44%|████▍     | 3310/7500 [1:40:39<2:10:02,  1.86s/it]

{'loss': 0.8154, 'grad_norm': 0.5057651996612549, 'learning_rate': 2.8120805369127516e-05, 'epoch': 1.32}


 44%|████▍     | 3320/7500 [1:40:58<2:07:46,  1.83s/it]

{'loss': 0.7792, 'grad_norm': 0.572659432888031, 'learning_rate': 2.8053691275167788e-05, 'epoch': 1.33}


 44%|████▍     | 3330/7500 [1:41:16<2:07:19,  1.83s/it]

{'loss': 0.8186, 'grad_norm': 0.5363041162490845, 'learning_rate': 2.7986577181208057e-05, 'epoch': 1.33}


 45%|████▍     | 3340/7500 [1:41:34<2:06:23,  1.82s/it]

{'loss': 0.7786, 'grad_norm': 0.6772052645683289, 'learning_rate': 2.7919463087248322e-05, 'epoch': 1.34}


 45%|████▍     | 3350/7500 [1:41:53<2:05:14,  1.81s/it]

{'loss': 0.7811, 'grad_norm': 0.39317813515663147, 'learning_rate': 2.785234899328859e-05, 'epoch': 1.34}


 45%|████▍     | 3360/7500 [1:42:11<2:04:15,  1.80s/it]

{'loss': 0.7809, 'grad_norm': 0.47953760623931885, 'learning_rate': 2.778523489932886e-05, 'epoch': 1.34}


 45%|████▍     | 3370/7500 [1:42:29<2:03:38,  1.80s/it]

{'loss': 0.7524, 'grad_norm': 0.43806925415992737, 'learning_rate': 2.771812080536913e-05, 'epoch': 1.35}


 45%|████▌     | 3380/7500 [1:42:47<2:04:17,  1.81s/it]

{'loss': 0.756, 'grad_norm': 0.5886784195899963, 'learning_rate': 2.76510067114094e-05, 'epoch': 1.35}


 45%|████▌     | 3390/7500 [1:43:05<2:03:58,  1.81s/it]

{'loss': 0.8292, 'grad_norm': 0.4637012779712677, 'learning_rate': 2.7583892617449664e-05, 'epoch': 1.36}


 45%|████▌     | 3400/7500 [1:43:23<2:03:14,  1.80s/it]

{'loss': 0.773, 'grad_norm': 0.906657338142395, 'learning_rate': 2.7516778523489933e-05, 'epoch': 1.36}


 45%|████▌     | 3410/7500 [1:43:41<2:03:13,  1.81s/it]

{'loss': 0.7646, 'grad_norm': 0.844290018081665, 'learning_rate': 2.74496644295302e-05, 'epoch': 1.36}


 46%|████▌     | 3420/7500 [1:43:59<2:01:20,  1.78s/it]

{'loss': 0.8085, 'grad_norm': 0.8744303584098816, 'learning_rate': 2.7382550335570473e-05, 'epoch': 1.37}


 46%|████▌     | 3430/7500 [1:44:17<2:01:36,  1.79s/it]

{'loss': 0.7821, 'grad_norm': 0.5245136022567749, 'learning_rate': 2.7315436241610742e-05, 'epoch': 1.37}


 46%|████▌     | 3440/7500 [1:44:35<2:01:38,  1.80s/it]

{'loss': 0.8388, 'grad_norm': 0.6008451581001282, 'learning_rate': 2.7248322147651007e-05, 'epoch': 1.38}


 46%|████▌     | 3450/7500 [1:44:53<2:02:07,  1.81s/it]

{'loss': 0.7655, 'grad_norm': 0.6683800220489502, 'learning_rate': 2.7181208053691276e-05, 'epoch': 1.38}


 46%|████▌     | 3460/7500 [1:45:11<2:02:04,  1.81s/it]

{'loss': 0.8139, 'grad_norm': 0.6989672780036926, 'learning_rate': 2.7114093959731544e-05, 'epoch': 1.38}


 46%|████▋     | 3470/7500 [1:45:29<2:02:33,  1.82s/it]

{'loss': 0.7152, 'grad_norm': 0.5059955716133118, 'learning_rate': 2.7046979865771816e-05, 'epoch': 1.39}


 46%|████▋     | 3480/7500 [1:45:47<2:01:55,  1.82s/it]

{'loss': 0.7834, 'grad_norm': 0.5039818286895752, 'learning_rate': 2.6979865771812078e-05, 'epoch': 1.39}


 47%|████▋     | 3490/7500 [1:46:05<2:00:16,  1.80s/it]

{'loss': 0.824, 'grad_norm': 9.128307342529297, 'learning_rate': 2.691275167785235e-05, 'epoch': 1.4}


 47%|████▋     | 3500/7500 [1:46:24<2:00:34,  1.81s/it]

{'loss': 0.8377, 'grad_norm': 0.6480047106742859, 'learning_rate': 2.6845637583892618e-05, 'epoch': 1.4}


 47%|████▋     | 3510/7500 [1:46:43<2:03:38,  1.86s/it]

{'loss': 0.7315, 'grad_norm': 0.45462507009506226, 'learning_rate': 2.6778523489932887e-05, 'epoch': 1.4}


 47%|████▋     | 3520/7500 [1:47:02<2:01:11,  1.83s/it]

{'loss': 0.7647, 'grad_norm': 0.5806149244308472, 'learning_rate': 2.671140939597316e-05, 'epoch': 1.41}


 47%|████▋     | 3530/7500 [1:47:20<2:01:03,  1.83s/it]

{'loss': 0.8016, 'grad_norm': 0.43792489171028137, 'learning_rate': 2.6644295302013424e-05, 'epoch': 1.41}


 47%|████▋     | 3540/7500 [1:47:38<2:02:30,  1.86s/it]

{'loss': 0.799, 'grad_norm': 0.48497459292411804, 'learning_rate': 2.6577181208053692e-05, 'epoch': 1.42}


 47%|████▋     | 3550/7500 [1:47:57<1:59:49,  1.82s/it]

{'loss': 0.7616, 'grad_norm': 0.5760969519615173, 'learning_rate': 2.651006711409396e-05, 'epoch': 1.42}


 47%|████▋     | 3560/7500 [1:48:15<2:00:37,  1.84s/it]

{'loss': 0.8007, 'grad_norm': 0.48525309562683105, 'learning_rate': 2.6442953020134233e-05, 'epoch': 1.42}


 48%|████▊     | 3570/7500 [1:48:33<1:58:51,  1.81s/it]

{'loss': 0.7709, 'grad_norm': 0.4777243435382843, 'learning_rate': 2.6375838926174495e-05, 'epoch': 1.43}


 48%|████▊     | 3580/7500 [1:48:51<1:58:41,  1.82s/it]

{'loss': 0.7718, 'grad_norm': 0.5871350169181824, 'learning_rate': 2.6308724832214767e-05, 'epoch': 1.43}


 48%|████▊     | 3590/7500 [1:49:10<1:58:51,  1.82s/it]

{'loss': 0.7805, 'grad_norm': 0.7655724287033081, 'learning_rate': 2.6241610738255035e-05, 'epoch': 1.44}


 48%|████▊     | 3600/7500 [1:49:28<1:57:56,  1.81s/it]

{'loss': 0.8142, 'grad_norm': 0.6093347668647766, 'learning_rate': 2.6174496644295304e-05, 'epoch': 1.44}


 48%|████▊     | 3610/7500 [1:49:46<1:57:57,  1.82s/it]

{'loss': 0.7524, 'grad_norm': 0.4896851181983948, 'learning_rate': 2.6107382550335576e-05, 'epoch': 1.44}


 48%|████▊     | 3620/7500 [1:50:06<2:11:04,  2.03s/it]

{'loss': 0.7489, 'grad_norm': 0.6201667189598083, 'learning_rate': 2.6040268456375837e-05, 'epoch': 1.45}


 48%|████▊     | 3630/7500 [1:51:12<2:15:02,  2.09s/it] 

{'loss': 0.7724, 'grad_norm': 0.5787298083305359, 'learning_rate': 2.597315436241611e-05, 'epoch': 1.45}


 49%|████▊     | 3640/7500 [1:51:25<1:26:20,  1.34s/it]

{'loss': 0.8096, 'grad_norm': 0.5570557117462158, 'learning_rate': 2.5906040268456378e-05, 'epoch': 1.46}


 49%|████▊     | 3650/7500 [1:51:38<1:24:41,  1.32s/it]

{'loss': 0.7793, 'grad_norm': 0.511702835559845, 'learning_rate': 2.5838926174496646e-05, 'epoch': 1.46}


 49%|████▉     | 3660/7500 [1:52:48<4:04:43,  3.82s/it] 

{'loss': 0.7916, 'grad_norm': 0.4958331882953644, 'learning_rate': 2.5771812080536918e-05, 'epoch': 1.46}


 49%|████▉     | 3670/7500 [1:53:02<1:29:00,  1.39s/it]

{'loss': 0.7879, 'grad_norm': 0.7840672731399536, 'learning_rate': 2.570469798657718e-05, 'epoch': 1.47}


 49%|████▉     | 3680/7500 [1:53:15<1:23:58,  1.32s/it]

{'loss': 0.7951, 'grad_norm': 0.5743019580841064, 'learning_rate': 2.5637583892617452e-05, 'epoch': 1.47}


 49%|████▉     | 3690/7500 [1:54:16<8:53:15,  8.40s/it] 

{'loss': 0.7718, 'grad_norm': 0.9124742746353149, 'learning_rate': 2.557046979865772e-05, 'epoch': 1.48}


 49%|████▉     | 3700/7500 [1:54:31<1:40:31,  1.59s/it]

{'loss': 0.7691, 'grad_norm': 0.5519729256629944, 'learning_rate': 2.550335570469799e-05, 'epoch': 1.48}


 49%|████▉     | 3710/7500 [1:54:45<1:24:16,  1.33s/it]

{'loss': 0.7985, 'grad_norm': 0.49303847551345825, 'learning_rate': 2.5436241610738254e-05, 'epoch': 1.48}


 50%|████▉     | 3720/7500 [1:54:58<1:23:16,  1.32s/it]

{'loss': 0.7558, 'grad_norm': 0.617606520652771, 'learning_rate': 2.5369127516778523e-05, 'epoch': 1.49}


 50%|████▉     | 3730/7500 [1:55:59<2:30:05,  2.39s/it] 

{'loss': 0.7964, 'grad_norm': 0.654484212398529, 'learning_rate': 2.5302013422818795e-05, 'epoch': 1.49}


 50%|████▉     | 3740/7500 [1:56:13<1:24:24,  1.35s/it]

{'loss': 0.761, 'grad_norm': 0.490685373544693, 'learning_rate': 2.5234899328859063e-05, 'epoch': 1.5}


 50%|█████     | 3750/7500 [1:56:26<1:22:25,  1.32s/it]

{'loss': 0.8711, 'grad_norm': 0.49225345253944397, 'learning_rate': 2.516778523489933e-05, 'epoch': 1.5}


 50%|█████     | 3760/7500 [1:57:28<5:03:56,  4.88s/it] 

{'loss': 0.7611, 'grad_norm': 0.49934762716293335, 'learning_rate': 2.5100671140939597e-05, 'epoch': 1.5}


 50%|█████     | 3770/7500 [1:57:43<1:30:25,  1.45s/it]

{'loss': 0.8299, 'grad_norm': 0.5199798941612244, 'learning_rate': 2.5033557046979865e-05, 'epoch': 1.51}


 50%|█████     | 3780/7500 [1:57:56<1:22:08,  1.32s/it]

{'loss': 0.7783, 'grad_norm': 0.536929190158844, 'learning_rate': 2.4966442953020137e-05, 'epoch': 1.51}


 51%|█████     | 3790/7500 [1:59:04<18:03:25, 17.52s/it]

{'loss': 0.7735, 'grad_norm': 0.5832937955856323, 'learning_rate': 2.4899328859060402e-05, 'epoch': 1.52}


 51%|█████     | 3800/7500 [1:59:19<1:56:57,  1.90s/it] 

{'loss': 0.7662, 'grad_norm': 0.4505569636821747, 'learning_rate': 2.4832214765100674e-05, 'epoch': 1.52}


 51%|█████     | 3810/7500 [1:59:32<1:22:06,  1.33s/it]

{'loss': 0.7919, 'grad_norm': 0.5131313800811768, 'learning_rate': 2.476510067114094e-05, 'epoch': 1.52}


 51%|█████     | 3820/7500 [1:59:45<1:20:55,  1.32s/it]

{'loss': 0.7741, 'grad_norm': 0.6953157186508179, 'learning_rate': 2.4697986577181208e-05, 'epoch': 1.53}


 51%|█████     | 3830/7500 [2:00:52<3:46:43,  3.71s/it] 

{'loss': 0.8528, 'grad_norm': 0.5517287850379944, 'learning_rate': 2.463087248322148e-05, 'epoch': 1.53}


 51%|█████     | 3840/7500 [2:01:07<1:26:06,  1.41s/it]

{'loss': 0.7765, 'grad_norm': 0.615082323551178, 'learning_rate': 2.4563758389261745e-05, 'epoch': 1.54}


 51%|█████▏    | 3850/7500 [2:01:20<1:20:40,  1.33s/it]

{'loss': 0.8302, 'grad_norm': 0.6728899478912354, 'learning_rate': 2.4496644295302017e-05, 'epoch': 1.54}


 51%|█████▏    | 3860/7500 [2:01:41<2:08:38,  2.12s/it]

{'loss': 0.7993, 'grad_norm': 0.6030591726303101, 'learning_rate': 2.4429530201342282e-05, 'epoch': 1.54}


 52%|█████▏    | 3870/7500 [2:01:54<1:23:15,  1.38s/it]

{'loss': 0.781, 'grad_norm': 0.664415180683136, 'learning_rate': 2.436241610738255e-05, 'epoch': 1.55}


 52%|█████▏    | 3880/7500 [2:02:08<1:22:47,  1.37s/it]

{'loss': 0.7622, 'grad_norm': 0.7355032563209534, 'learning_rate': 2.429530201342282e-05, 'epoch': 1.55}


 52%|█████▏    | 3890/7500 [2:02:24<1:48:52,  1.81s/it]

{'loss': 0.7343, 'grad_norm': 0.5302519798278809, 'learning_rate': 2.4228187919463088e-05, 'epoch': 1.56}


 52%|█████▏    | 3900/7500 [2:02:51<2:37:04,  2.62s/it]

{'loss': 0.7773, 'grad_norm': 0.6568731069564819, 'learning_rate': 2.416107382550336e-05, 'epoch': 1.56}


 52%|█████▏    | 3910/7500 [2:03:14<2:11:58,  2.21s/it]

{'loss': 0.7818, 'grad_norm': 0.49306485056877136, 'learning_rate': 2.4093959731543625e-05, 'epoch': 1.56}


 52%|█████▏    | 3920/7500 [2:03:34<1:57:11,  1.96s/it]

{'loss': 0.7994, 'grad_norm': 0.48287439346313477, 'learning_rate': 2.4026845637583893e-05, 'epoch': 1.57}


 52%|█████▏    | 3930/7500 [2:03:53<1:48:04,  1.82s/it]

{'loss': 0.7715, 'grad_norm': 0.5940903425216675, 'learning_rate': 2.3959731543624162e-05, 'epoch': 1.57}


 53%|█████▎    | 3940/7500 [2:04:11<1:45:06,  1.77s/it]

{'loss': 0.7847, 'grad_norm': 0.5730188488960266, 'learning_rate': 2.389261744966443e-05, 'epoch': 1.58}


 53%|█████▎    | 3950/7500 [2:04:28<1:45:32,  1.78s/it]

{'loss': 0.7554, 'grad_norm': 0.5714937448501587, 'learning_rate': 2.38255033557047e-05, 'epoch': 1.58}


 53%|█████▎    | 3960/7500 [2:04:46<1:46:31,  1.81s/it]

{'loss': 0.7827, 'grad_norm': 0.6096695065498352, 'learning_rate': 2.3758389261744967e-05, 'epoch': 1.58}


 53%|█████▎    | 3970/7500 [2:05:05<1:51:26,  1.89s/it]

{'loss': 0.788, 'grad_norm': 0.7298617362976074, 'learning_rate': 2.3691275167785236e-05, 'epoch': 1.59}


 53%|█████▎    | 3980/7500 [2:05:24<1:53:32,  1.94s/it]

{'loss': 0.7923, 'grad_norm': 0.9438443779945374, 'learning_rate': 2.3624161073825504e-05, 'epoch': 1.59}


 53%|█████▎    | 3990/7500 [2:05:42<1:44:06,  1.78s/it]

{'loss': 0.8119, 'grad_norm': 0.4612027406692505, 'learning_rate': 2.3557046979865773e-05, 'epoch': 1.6}


 53%|█████▎    | 4000/7500 [2:05:59<1:43:49,  1.78s/it]

{'loss': 0.7461, 'grad_norm': 0.49901872873306274, 'learning_rate': 2.348993288590604e-05, 'epoch': 1.6}


 53%|█████▎    | 4010/7500 [2:06:19<1:44:37,  1.80s/it]

{'loss': 0.7453, 'grad_norm': 0.5094708204269409, 'learning_rate': 2.342281879194631e-05, 'epoch': 1.6}


 54%|█████▎    | 4020/7500 [2:06:39<1:59:45,  2.06s/it]

{'loss': 0.7837, 'grad_norm': 0.6371473670005798, 'learning_rate': 2.335570469798658e-05, 'epoch': 1.61}


 54%|█████▎    | 4030/7500 [2:07:00<1:58:40,  2.05s/it]

{'loss': 0.7771, 'grad_norm': 0.4657331705093384, 'learning_rate': 2.3288590604026847e-05, 'epoch': 1.61}


 54%|█████▍    | 4040/7500 [2:07:20<1:59:38,  2.07s/it]

{'loss': 0.7408, 'grad_norm': 0.695652425289154, 'learning_rate': 2.3221476510067116e-05, 'epoch': 1.62}


 54%|█████▍    | 4050/7500 [2:07:41<2:02:37,  2.13s/it]

{'loss': 0.7897, 'grad_norm': 0.47802475094795227, 'learning_rate': 2.3154362416107384e-05, 'epoch': 1.62}


 54%|█████▍    | 4060/7500 [2:08:01<1:51:20,  1.94s/it]

{'loss': 0.7748, 'grad_norm': 0.5500807166099548, 'learning_rate': 2.3087248322147653e-05, 'epoch': 1.62}


 54%|█████▍    | 4070/7500 [2:08:19<1:44:58,  1.84s/it]

{'loss': 0.8133, 'grad_norm': 0.49098166823387146, 'learning_rate': 2.302013422818792e-05, 'epoch': 1.63}


 54%|█████▍    | 4080/7500 [2:08:40<1:52:22,  1.97s/it]

{'loss': 0.7429, 'grad_norm': 0.7440332174301147, 'learning_rate': 2.295302013422819e-05, 'epoch': 1.63}


 55%|█████▍    | 4090/7500 [2:09:01<1:52:12,  1.97s/it]

{'loss': 0.8127, 'grad_norm': 0.6338997483253479, 'learning_rate': 2.2885906040268458e-05, 'epoch': 1.64}


 55%|█████▍    | 4100/7500 [2:09:20<1:44:44,  1.85s/it]

{'loss': 0.7597, 'grad_norm': 0.502383291721344, 'learning_rate': 2.2818791946308727e-05, 'epoch': 1.64}


 55%|█████▍    | 4110/7500 [2:09:39<1:44:28,  1.85s/it]

{'loss': 0.7637, 'grad_norm': 0.4850587546825409, 'learning_rate': 2.2751677852348992e-05, 'epoch': 1.64}


 55%|█████▍    | 4120/7500 [2:09:57<1:45:01,  1.86s/it]

{'loss': 0.7899, 'grad_norm': 0.671695351600647, 'learning_rate': 2.2684563758389264e-05, 'epoch': 1.65}


 55%|█████▌    | 4130/7500 [2:10:17<1:50:37,  1.97s/it]

{'loss': 0.7789, 'grad_norm': 0.4721072018146515, 'learning_rate': 2.2617449664429532e-05, 'epoch': 1.65}


 55%|█████▌    | 4140/7500 [2:10:36<1:42:20,  1.83s/it]

{'loss': 0.7435, 'grad_norm': 0.8191743493080139, 'learning_rate': 2.25503355704698e-05, 'epoch': 1.66}


 55%|█████▌    | 4150/7500 [2:10:54<1:39:26,  1.78s/it]

{'loss': 0.7802, 'grad_norm': 0.5047791004180908, 'learning_rate': 2.248322147651007e-05, 'epoch': 1.66}


 55%|█████▌    | 4160/7500 [2:11:12<1:41:02,  1.82s/it]

{'loss': 0.8041, 'grad_norm': 0.49518120288848877, 'learning_rate': 2.2416107382550335e-05, 'epoch': 1.66}


 56%|█████▌    | 4170/7500 [2:11:30<1:39:04,  1.79s/it]

{'loss': 0.8144, 'grad_norm': 0.4629901945590973, 'learning_rate': 2.2348993288590606e-05, 'epoch': 1.67}


 56%|█████▌    | 4180/7500 [2:11:48<1:48:16,  1.96s/it]

{'loss': 0.7877, 'grad_norm': 0.5184993743896484, 'learning_rate': 2.228187919463087e-05, 'epoch': 1.67}


 56%|█████▌    | 4190/7500 [2:12:08<1:42:32,  1.86s/it]

{'loss': 0.7612, 'grad_norm': 0.5542566180229187, 'learning_rate': 2.2214765100671144e-05, 'epoch': 1.68}


 56%|█████▌    | 4200/7500 [2:12:26<1:42:26,  1.86s/it]

{'loss': 0.745, 'grad_norm': 0.565523087978363, 'learning_rate': 2.2147651006711412e-05, 'epoch': 1.68}


 56%|█████▌    | 4210/7500 [2:12:45<1:39:52,  1.82s/it]

{'loss': 0.8038, 'grad_norm': 0.5594495534896851, 'learning_rate': 2.2080536912751677e-05, 'epoch': 1.68}


 56%|█████▋    | 4220/7500 [2:13:04<1:43:23,  1.89s/it]

{'loss': 0.7691, 'grad_norm': 0.9492373466491699, 'learning_rate': 2.201342281879195e-05, 'epoch': 1.69}


 56%|█████▋    | 4230/7500 [2:13:23<1:40:44,  1.85s/it]

{'loss': 0.7634, 'grad_norm': 0.7029899954795837, 'learning_rate': 2.1946308724832214e-05, 'epoch': 1.69}


 57%|█████▋    | 4240/7500 [2:13:54<4:05:35,  4.52s/it]

{'loss': 0.7586, 'grad_norm': 0.6399865746498108, 'learning_rate': 2.1879194630872486e-05, 'epoch': 1.7}


 57%|█████▋    | 4250/7500 [2:14:08<1:18:24,  1.45s/it]

{'loss': 0.7708, 'grad_norm': 0.5616192817687988, 'learning_rate': 2.181208053691275e-05, 'epoch': 1.7}


 57%|█████▋    | 4260/7500 [2:14:21<1:12:24,  1.34s/it]

{'loss': 0.7627, 'grad_norm': 0.49952125549316406, 'learning_rate': 2.174496644295302e-05, 'epoch': 1.7}


 57%|█████▋    | 4270/7500 [2:14:35<1:12:27,  1.35s/it]

{'loss': 0.8049, 'grad_norm': 0.5723720788955688, 'learning_rate': 2.167785234899329e-05, 'epoch': 1.71}


 57%|█████▋    | 4280/7500 [2:14:49<1:19:25,  1.48s/it]

{'loss': 0.7787, 'grad_norm': 0.5023280382156372, 'learning_rate': 2.1610738255033557e-05, 'epoch': 1.71}


 57%|█████▋    | 4290/7500 [2:15:07<1:43:56,  1.94s/it]

{'loss': 0.7581, 'grad_norm': 0.5418107509613037, 'learning_rate': 2.154362416107383e-05, 'epoch': 1.72}


 57%|█████▋    | 4300/7500 [2:15:26<1:46:11,  1.99s/it]

{'loss': 0.7757, 'grad_norm': 0.550758421421051, 'learning_rate': 2.1476510067114094e-05, 'epoch': 1.72}


 57%|█████▋    | 4310/7500 [2:15:47<1:57:50,  2.22s/it]

{'loss': 0.758, 'grad_norm': 0.6229334473609924, 'learning_rate': 2.1409395973154362e-05, 'epoch': 1.72}


 58%|█████▊    | 4320/7500 [2:16:07<1:41:43,  1.92s/it]

{'loss': 0.7766, 'grad_norm': 0.7560971975326538, 'learning_rate': 2.134228187919463e-05, 'epoch': 1.73}


 58%|█████▊    | 4330/7500 [2:16:26<1:42:13,  1.93s/it]

{'loss': 0.7726, 'grad_norm': 0.5502182245254517, 'learning_rate': 2.12751677852349e-05, 'epoch': 1.73}


 58%|█████▊    | 4340/7500 [2:16:43<1:32:18,  1.75s/it]

{'loss': 0.7686, 'grad_norm': 0.5100824236869812, 'learning_rate': 2.1208053691275168e-05, 'epoch': 1.74}


 58%|█████▊    | 4350/7500 [2:17:00<1:28:59,  1.69s/it]

{'loss': 0.7737, 'grad_norm': 0.6846480965614319, 'learning_rate': 2.1140939597315437e-05, 'epoch': 1.74}


 58%|█████▊    | 4360/7500 [2:17:18<1:30:40,  1.73s/it]

{'loss': 0.7869, 'grad_norm': 0.958928108215332, 'learning_rate': 2.107382550335571e-05, 'epoch': 1.74}


 58%|█████▊    | 4370/7500 [2:17:38<1:47:32,  2.06s/it]

{'loss': 0.779, 'grad_norm': 0.5189948678016663, 'learning_rate': 2.1006711409395974e-05, 'epoch': 1.75}


 58%|█████▊    | 4380/7500 [2:17:59<1:55:05,  2.21s/it]

{'loss': 0.7954, 'grad_norm': 0.6499903798103333, 'learning_rate': 2.0939597315436242e-05, 'epoch': 1.75}


 59%|█████▊    | 4390/7500 [2:18:18<1:40:52,  1.95s/it]

{'loss': 0.783, 'grad_norm': 0.513389527797699, 'learning_rate': 2.087248322147651e-05, 'epoch': 1.76}


 59%|█████▊    | 4400/7500 [2:18:38<1:42:24,  1.98s/it]

{'loss': 0.7534, 'grad_norm': 0.6541904211044312, 'learning_rate': 2.080536912751678e-05, 'epoch': 1.76}


 59%|█████▉    | 4410/7500 [2:18:56<1:31:23,  1.77s/it]

{'loss': 0.7513, 'grad_norm': 0.7138945460319519, 'learning_rate': 2.0738255033557048e-05, 'epoch': 1.76}


 59%|█████▉    | 4420/7500 [2:19:13<1:29:31,  1.74s/it]

{'loss': 0.8128, 'grad_norm': 0.4934995174407959, 'learning_rate': 2.0671140939597316e-05, 'epoch': 1.77}


 59%|█████▉    | 4430/7500 [2:19:31<1:28:17,  1.73s/it]

{'loss': 0.7628, 'grad_norm': 0.5203095078468323, 'learning_rate': 2.0604026845637585e-05, 'epoch': 1.77}


 59%|█████▉    | 4440/7500 [2:19:49<1:34:33,  1.85s/it]

{'loss': 0.7442, 'grad_norm': 0.5292111039161682, 'learning_rate': 2.0536912751677853e-05, 'epoch': 1.78}


 59%|█████▉    | 4450/7500 [2:20:07<1:33:23,  1.84s/it]

{'loss': 0.8093, 'grad_norm': 0.6504424810409546, 'learning_rate': 2.0469798657718122e-05, 'epoch': 1.78}


 59%|█████▉    | 4460/7500 [2:20:25<1:29:50,  1.77s/it]

{'loss': 0.8042, 'grad_norm': 0.49870508909225464, 'learning_rate': 2.040268456375839e-05, 'epoch': 1.78}


 60%|█████▉    | 4470/7500 [2:20:42<1:28:57,  1.76s/it]

{'loss': 0.8114, 'grad_norm': 0.6156183481216431, 'learning_rate': 2.033557046979866e-05, 'epoch': 1.79}


 60%|█████▉    | 4480/7500 [2:21:00<1:27:07,  1.73s/it]

{'loss': 0.776, 'grad_norm': 0.6228813529014587, 'learning_rate': 2.0268456375838928e-05, 'epoch': 1.79}


 60%|█████▉    | 4490/7500 [2:21:17<1:27:24,  1.74s/it]

{'loss': 0.8017, 'grad_norm': 0.7455339431762695, 'learning_rate': 2.0201342281879196e-05, 'epoch': 1.8}


 60%|██████    | 4500/7500 [2:21:35<1:26:33,  1.73s/it]

{'loss': 0.8072, 'grad_norm': 0.7493080496788025, 'learning_rate': 2.013422818791946e-05, 'epoch': 1.8}


 60%|██████    | 4510/7500 [2:21:54<1:30:21,  1.81s/it]

{'loss': 0.7907, 'grad_norm': 0.5388664603233337, 'learning_rate': 2.0067114093959733e-05, 'epoch': 1.8}


 60%|██████    | 4520/7500 [2:22:12<1:29:03,  1.79s/it]

{'loss': 0.7769, 'grad_norm': 0.5445564389228821, 'learning_rate': 2e-05, 'epoch': 1.81}


 60%|██████    | 4530/7500 [2:22:30<1:28:59,  1.80s/it]

{'loss': 0.8052, 'grad_norm': 0.5510087609291077, 'learning_rate': 1.993288590604027e-05, 'epoch': 1.81}


 61%|██████    | 4540/7500 [2:22:47<1:28:34,  1.80s/it]

{'loss': 0.7353, 'grad_norm': 0.5915560126304626, 'learning_rate': 1.986577181208054e-05, 'epoch': 1.82}


 61%|██████    | 4550/7500 [2:23:05<1:27:28,  1.78s/it]

{'loss': 0.7963, 'grad_norm': 0.5912167429924011, 'learning_rate': 1.9798657718120804e-05, 'epoch': 1.82}


 61%|██████    | 4560/7500 [2:23:23<1:26:49,  1.77s/it]

{'loss': 0.7962, 'grad_norm': 0.8102577328681946, 'learning_rate': 1.9731543624161076e-05, 'epoch': 1.82}


 61%|██████    | 4570/7500 [2:23:41<1:26:27,  1.77s/it]

{'loss': 0.7697, 'grad_norm': 0.5163044929504395, 'learning_rate': 1.966442953020134e-05, 'epoch': 1.83}


 61%|██████    | 4580/7500 [2:23:58<1:27:16,  1.79s/it]

{'loss': 0.7779, 'grad_norm': 0.5300886034965515, 'learning_rate': 1.9597315436241613e-05, 'epoch': 1.83}


 61%|██████    | 4590/7500 [2:24:17<1:28:42,  1.83s/it]

{'loss': 0.7547, 'grad_norm': 0.8540322184562683, 'learning_rate': 1.953020134228188e-05, 'epoch': 1.84}


 61%|██████▏   | 4600/7500 [2:24:35<1:26:43,  1.79s/it]

{'loss': 0.7811, 'grad_norm': 0.6087718605995178, 'learning_rate': 1.946308724832215e-05, 'epoch': 1.84}


 61%|██████▏   | 4610/7500 [2:24:53<1:24:32,  1.76s/it]

{'loss': 0.7879, 'grad_norm': 0.7160661816596985, 'learning_rate': 1.939597315436242e-05, 'epoch': 1.84}


 62%|██████▏   | 4620/7500 [2:25:10<1:23:38,  1.74s/it]

{'loss': 0.7749, 'grad_norm': 0.5380996465682983, 'learning_rate': 1.9328859060402684e-05, 'epoch': 1.85}


 62%|██████▏   | 4630/7500 [2:25:27<1:23:36,  1.75s/it]

{'loss': 0.7845, 'grad_norm': 0.808830201625824, 'learning_rate': 1.9261744966442955e-05, 'epoch': 1.85}


 62%|██████▏   | 4640/7500 [2:25:45<1:25:37,  1.80s/it]

{'loss': 0.8021, 'grad_norm': 0.7724510431289673, 'learning_rate': 1.919463087248322e-05, 'epoch': 1.86}


 62%|██████▏   | 4650/7500 [2:26:04<1:24:51,  1.79s/it]

{'loss': 0.7773, 'grad_norm': 0.8435102701187134, 'learning_rate': 1.9127516778523493e-05, 'epoch': 1.86}


 62%|██████▏   | 4660/7500 [2:26:22<1:27:15,  1.84s/it]

{'loss': 0.731, 'grad_norm': 0.425363689661026, 'learning_rate': 1.906040268456376e-05, 'epoch': 1.86}


 62%|██████▏   | 4670/7500 [2:26:40<1:22:59,  1.76s/it]

{'loss': 0.7845, 'grad_norm': 0.478768527507782, 'learning_rate': 1.8993288590604026e-05, 'epoch': 1.87}


 62%|██████▏   | 4680/7500 [2:26:58<1:21:47,  1.74s/it]

{'loss': 0.7765, 'grad_norm': 0.668074369430542, 'learning_rate': 1.8926174496644298e-05, 'epoch': 1.87}


 63%|██████▎   | 4690/7500 [2:27:15<1:21:14,  1.73s/it]

{'loss': 0.7598, 'grad_norm': 0.5718802213668823, 'learning_rate': 1.8859060402684563e-05, 'epoch': 1.88}


 63%|██████▎   | 4700/7500 [2:27:32<1:20:25,  1.72s/it]

{'loss': 0.7986, 'grad_norm': 0.4949658513069153, 'learning_rate': 1.8791946308724835e-05, 'epoch': 1.88}


 63%|██████▎   | 4710/7500 [2:27:51<1:26:59,  1.87s/it]

{'loss': 0.7588, 'grad_norm': 0.5253276228904724, 'learning_rate': 1.87248322147651e-05, 'epoch': 1.88}


 63%|██████▎   | 4720/7500 [2:28:10<1:26:15,  1.86s/it]

{'loss': 0.7658, 'grad_norm': 0.5417819023132324, 'learning_rate': 1.865771812080537e-05, 'epoch': 1.89}


 63%|██████▎   | 4730/7500 [2:28:28<1:25:05,  1.84s/it]

{'loss': 0.753, 'grad_norm': 1.2314648628234863, 'learning_rate': 1.8590604026845637e-05, 'epoch': 1.89}


 63%|██████▎   | 4740/7500 [2:28:46<1:22:16,  1.79s/it]

{'loss': 0.7719, 'grad_norm': 0.6582053899765015, 'learning_rate': 1.8523489932885906e-05, 'epoch': 1.9}


 63%|██████▎   | 4750/7500 [2:29:05<1:29:20,  1.95s/it]

{'loss': 0.7638, 'grad_norm': 0.5641537308692932, 'learning_rate': 1.8456375838926178e-05, 'epoch': 1.9}


 63%|██████▎   | 4760/7500 [2:29:24<1:26:09,  1.89s/it]

{'loss': 0.8208, 'grad_norm': 0.7088724970817566, 'learning_rate': 1.8389261744966443e-05, 'epoch': 1.9}


 64%|██████▎   | 4770/7500 [2:29:42<1:23:06,  1.83s/it]

{'loss': 0.7738, 'grad_norm': 0.5474995374679565, 'learning_rate': 1.832214765100671e-05, 'epoch': 1.91}


 64%|██████▎   | 4780/7500 [2:30:01<1:27:42,  1.93s/it]

{'loss': 0.7558, 'grad_norm': 0.41280755400657654, 'learning_rate': 1.825503355704698e-05, 'epoch': 1.91}


 64%|██████▍   | 4790/7500 [2:30:20<1:27:24,  1.94s/it]

{'loss': 0.7451, 'grad_norm': 0.5475167036056519, 'learning_rate': 1.818791946308725e-05, 'epoch': 1.92}


 64%|██████▍   | 4800/7500 [2:30:39<1:22:11,  1.83s/it]

{'loss': 0.7996, 'grad_norm': 0.614711344242096, 'learning_rate': 1.8120805369127517e-05, 'epoch': 1.92}


 64%|██████▍   | 4810/7500 [2:30:58<1:23:42,  1.87s/it]

{'loss': 0.7929, 'grad_norm': 0.5446210503578186, 'learning_rate': 1.8053691275167786e-05, 'epoch': 1.92}


 64%|██████▍   | 4820/7500 [2:31:16<1:25:47,  1.92s/it]

{'loss': 0.7631, 'grad_norm': 0.5665425658226013, 'learning_rate': 1.7986577181208054e-05, 'epoch': 1.93}


 64%|██████▍   | 4830/7500 [2:31:35<1:22:41,  1.86s/it]

{'loss': 0.7745, 'grad_norm': 0.5283769369125366, 'learning_rate': 1.7919463087248323e-05, 'epoch': 1.93}


 65%|██████▍   | 4840/7500 [2:31:53<1:19:28,  1.79s/it]

{'loss': 0.7541, 'grad_norm': 0.6666338443756104, 'learning_rate': 1.785234899328859e-05, 'epoch': 1.94}


 65%|██████▍   | 4850/7500 [2:32:11<1:18:43,  1.78s/it]

{'loss': 0.7373, 'grad_norm': 0.553178608417511, 'learning_rate': 1.778523489932886e-05, 'epoch': 1.94}


 65%|██████▍   | 4860/7500 [2:32:28<1:18:20,  1.78s/it]

{'loss': 0.7623, 'grad_norm': 0.7234200239181519, 'learning_rate': 1.7718120805369128e-05, 'epoch': 1.94}


 65%|██████▍   | 4870/7500 [2:32:46<1:15:59,  1.73s/it]

{'loss': 0.75, 'grad_norm': 0.6711499691009521, 'learning_rate': 1.7651006711409397e-05, 'epoch': 1.95}


 65%|██████▌   | 4880/7500 [2:33:04<1:20:42,  1.85s/it]

{'loss': 0.8124, 'grad_norm': 0.6689451336860657, 'learning_rate': 1.7583892617449665e-05, 'epoch': 1.95}


 65%|██████▌   | 4890/7500 [2:33:24<1:27:17,  2.01s/it]

{'loss': 0.7754, 'grad_norm': 0.4860284924507141, 'learning_rate': 1.7516778523489934e-05, 'epoch': 1.96}


 65%|██████▌   | 4900/7500 [2:33:43<1:19:37,  1.84s/it]

{'loss': 0.7841, 'grad_norm': 0.48934781551361084, 'learning_rate': 1.7449664429530202e-05, 'epoch': 1.96}


 65%|██████▌   | 4910/7500 [2:34:01<1:18:20,  1.82s/it]

{'loss': 0.7388, 'grad_norm': 0.8673892021179199, 'learning_rate': 1.738255033557047e-05, 'epoch': 1.96}


 66%|██████▌   | 4920/7500 [2:34:19<1:16:14,  1.77s/it]

{'loss': 0.8041, 'grad_norm': 0.5943146347999573, 'learning_rate': 1.731543624161074e-05, 'epoch': 1.97}


 66%|██████▌   | 4930/7500 [2:34:37<1:17:10,  1.80s/it]

{'loss': 0.7681, 'grad_norm': 0.4956080913543701, 'learning_rate': 1.7248322147651008e-05, 'epoch': 1.97}


 66%|██████▌   | 4940/7500 [2:34:54<1:14:54,  1.76s/it]

{'loss': 0.7454, 'grad_norm': 0.5481093525886536, 'learning_rate': 1.7181208053691277e-05, 'epoch': 1.98}


 66%|██████▌   | 4950/7500 [2:35:13<1:17:08,  1.82s/it]

{'loss': 0.7485, 'grad_norm': 0.45609888434410095, 'learning_rate': 1.7114093959731545e-05, 'epoch': 1.98}


 66%|██████▌   | 4960/7500 [2:35:30<1:16:07,  1.80s/it]

{'loss': 0.7496, 'grad_norm': 0.7263022661209106, 'learning_rate': 1.704697986577181e-05, 'epoch': 1.98}


 66%|██████▋   | 4970/7500 [2:35:48<1:15:48,  1.80s/it]

{'loss': 0.7788, 'grad_norm': 0.4741921126842499, 'learning_rate': 1.6979865771812082e-05, 'epoch': 1.99}


 66%|██████▋   | 4980/7500 [2:36:06<1:14:33,  1.78s/it]

{'loss': 0.8196, 'grad_norm': 0.6266677379608154, 'learning_rate': 1.691275167785235e-05, 'epoch': 1.99}


 67%|██████▋   | 4990/7500 [2:36:24<1:16:18,  1.82s/it]

{'loss': 0.7689, 'grad_norm': 0.452332466840744, 'learning_rate': 1.684563758389262e-05, 'epoch': 2.0}


 67%|██████▋   | 5000/7500 [2:36:43<1:15:35,  1.81s/it]

{'loss': 0.7415, 'grad_norm': 0.5930585265159607, 'learning_rate': 1.6778523489932888e-05, 'epoch': 2.0}


                                                       
 67%|██████▋   | 5000/7500 [2:37:52<1:15:35,  1.81s/it]

{'eval_loss': 0.6526898741722107, 'eval_runtime': 67.6521, 'eval_samples_per_second': 7.391, 'eval_steps_per_second': 1.848, 'epoch': 2.0}


 67%|██████▋   | 5010/7500 [2:38:12<1:56:36,  2.81s/it] 

{'loss': 0.7179, 'grad_norm': 0.45652255415916443, 'learning_rate': 1.6711409395973153e-05, 'epoch': 2.0}


 67%|██████▋   | 5020/7500 [2:38:33<1:31:45,  2.22s/it]

{'loss': 0.7489, 'grad_norm': 0.6945042610168457, 'learning_rate': 1.6644295302013425e-05, 'epoch': 2.01}


 67%|██████▋   | 5030/7500 [2:38:52<1:15:57,  1.85s/it]

{'loss': 0.7808, 'grad_norm': 0.6037280559539795, 'learning_rate': 1.657718120805369e-05, 'epoch': 2.01}


 67%|██████▋   | 5040/7500 [2:39:10<1:25:02,  2.07s/it]

{'loss': 0.7898, 'grad_norm': 0.506242573261261, 'learning_rate': 1.6510067114093962e-05, 'epoch': 2.02}


 67%|██████▋   | 5050/7500 [2:39:29<1:17:07,  1.89s/it]

{'loss': 0.7419, 'grad_norm': 0.7033630609512329, 'learning_rate': 1.644295302013423e-05, 'epoch': 2.02}


 67%|██████▋   | 5060/7500 [2:39:48<1:17:34,  1.91s/it]

{'loss': 0.7578, 'grad_norm': 0.7118988037109375, 'learning_rate': 1.6375838926174496e-05, 'epoch': 2.02}


 68%|██████▊   | 5070/7500 [2:40:07<1:15:00,  1.85s/it]

{'loss': 0.783, 'grad_norm': 0.6090036630630493, 'learning_rate': 1.6308724832214767e-05, 'epoch': 2.03}


 68%|██████▊   | 5080/7500 [2:40:24<1:11:37,  1.78s/it]

{'loss': 0.7721, 'grad_norm': 0.5199494957923889, 'learning_rate': 1.6241610738255033e-05, 'epoch': 2.03}


 68%|██████▊   | 5090/7500 [2:40:41<1:06:22,  1.65s/it]

{'loss': 0.7632, 'grad_norm': 0.5556172132492065, 'learning_rate': 1.6174496644295304e-05, 'epoch': 2.04}


 68%|██████▊   | 5100/7500 [2:40:58<1:06:03,  1.65s/it]

{'loss': 0.7259, 'grad_norm': 0.5260768532752991, 'learning_rate': 1.610738255033557e-05, 'epoch': 2.04}


 68%|██████▊   | 5110/7500 [2:41:14<1:06:10,  1.66s/it]

{'loss': 0.7587, 'grad_norm': 0.7825997471809387, 'learning_rate': 1.604026845637584e-05, 'epoch': 2.04}


 68%|██████▊   | 5120/7500 [2:41:31<1:06:18,  1.67s/it]

{'loss': 0.7492, 'grad_norm': 0.6048825979232788, 'learning_rate': 1.597315436241611e-05, 'epoch': 2.05}


 68%|██████▊   | 5130/7500 [2:41:48<1:05:40,  1.66s/it]

{'loss': 0.728, 'grad_norm': 0.5040994882583618, 'learning_rate': 1.5906040268456375e-05, 'epoch': 2.05}


 69%|██████▊   | 5140/7500 [2:42:05<1:06:07,  1.68s/it]

{'loss': 0.7323, 'grad_norm': 0.5645473599433899, 'learning_rate': 1.5838926174496647e-05, 'epoch': 2.06}


 69%|██████▊   | 5150/7500 [2:42:24<1:15:09,  1.92s/it]

{'loss': 0.7868, 'grad_norm': 0.5048760771751404, 'learning_rate': 1.5771812080536912e-05, 'epoch': 2.06}


 69%|██████▉   | 5160/7500 [2:42:41<1:06:01,  1.69s/it]

{'loss': 0.7845, 'grad_norm': 0.5517919659614563, 'learning_rate': 1.5704697986577184e-05, 'epoch': 2.06}


 69%|██████▉   | 5170/7500 [2:42:58<1:05:52,  1.70s/it]

{'loss': 0.7611, 'grad_norm': 0.9011064171791077, 'learning_rate': 1.563758389261745e-05, 'epoch': 2.07}


 69%|██████▉   | 5180/7500 [2:43:17<1:08:40,  1.78s/it]

{'loss': 0.7597, 'grad_norm': 0.9071077704429626, 'learning_rate': 1.5570469798657718e-05, 'epoch': 2.07}


 69%|██████▉   | 5190/7500 [2:43:34<1:04:52,  1.69s/it]

{'loss': 0.7518, 'grad_norm': 0.5032564997673035, 'learning_rate': 1.5503355704697986e-05, 'epoch': 2.08}


 69%|██████▉   | 5200/7500 [2:43:50<1:03:34,  1.66s/it]

{'loss': 0.7532, 'grad_norm': 0.5597071647644043, 'learning_rate': 1.5436241610738255e-05, 'epoch': 2.08}


 69%|██████▉   | 5210/7500 [2:44:07<1:03:21,  1.66s/it]

{'loss': 0.8009, 'grad_norm': 0.6357955932617188, 'learning_rate': 1.5369127516778527e-05, 'epoch': 2.08}


 70%|██████▉   | 5220/7500 [2:44:26<1:08:57,  1.81s/it]

{'loss': 0.7773, 'grad_norm': 0.49548497796058655, 'learning_rate': 1.5302013422818792e-05, 'epoch': 2.09}


 70%|██████▉   | 5230/7500 [2:44:42<1:01:37,  1.63s/it]

{'loss': 0.7147, 'grad_norm': 0.66475510597229, 'learning_rate': 1.5234899328859062e-05, 'epoch': 2.09}


 70%|██████▉   | 5240/7500 [2:44:58<1:00:37,  1.61s/it]

{'loss': 0.7304, 'grad_norm': 0.936793863773346, 'learning_rate': 1.5167785234899329e-05, 'epoch': 2.1}


 70%|███████   | 5250/7500 [2:45:14<1:00:28,  1.61s/it]

{'loss': 0.7568, 'grad_norm': 0.6279051303863525, 'learning_rate': 1.51006711409396e-05, 'epoch': 2.1}


 70%|███████   | 5260/7500 [2:45:58<2:46:24,  4.46s/it]

{'loss': 0.7708, 'grad_norm': 0.8006362915039062, 'learning_rate': 1.5033557046979866e-05, 'epoch': 2.1}


 70%|███████   | 5270/7500 [2:46:13<58:39,  1.58s/it]  

{'loss': 0.7482, 'grad_norm': 0.6298482418060303, 'learning_rate': 1.4966442953020135e-05, 'epoch': 2.11}


 70%|███████   | 5280/7500 [2:46:27<50:50,  1.37s/it]

{'loss': 0.7524, 'grad_norm': 0.47729673981666565, 'learning_rate': 1.4899328859060405e-05, 'epoch': 2.11}


 71%|███████   | 5290/7500 [2:46:41<54:50,  1.49s/it]

{'loss': 0.7402, 'grad_norm': 0.6557919383049011, 'learning_rate': 1.4832214765100672e-05, 'epoch': 2.12}


 71%|███████   | 5300/7500 [2:47:11<2:25:15,  3.96s/it]

{'loss': 0.7364, 'grad_norm': 0.6254293918609619, 'learning_rate': 1.4765100671140942e-05, 'epoch': 2.12}


 71%|███████   | 5310/7500 [2:47:43<1:48:43,  2.98s/it]

{'loss': 0.7328, 'grad_norm': 0.6939457654953003, 'learning_rate': 1.4697986577181209e-05, 'epoch': 2.12}


 71%|███████   | 5320/7500 [2:48:06<1:18:52,  2.17s/it]

{'loss': 0.7745, 'grad_norm': 0.5371350049972534, 'learning_rate': 1.4630872483221477e-05, 'epoch': 2.13}


 71%|███████   | 5330/7500 [2:48:25<1:08:36,  1.90s/it]

{'loss': 0.7753, 'grad_norm': 0.6466635465621948, 'learning_rate': 1.4563758389261744e-05, 'epoch': 2.13}


 71%|███████   | 5340/7500 [2:48:45<1:11:11,  1.98s/it]

{'loss': 0.7042, 'grad_norm': 0.5386261940002441, 'learning_rate': 1.4496644295302014e-05, 'epoch': 2.14}


 71%|███████▏  | 5350/7500 [2:49:04<1:07:40,  1.89s/it]

{'loss': 0.742, 'grad_norm': 0.5823283791542053, 'learning_rate': 1.4429530201342285e-05, 'epoch': 2.14}


 71%|███████▏  | 5360/7500 [2:49:24<1:09:23,  1.95s/it]

{'loss': 0.7718, 'grad_norm': 0.5851000547409058, 'learning_rate': 1.4362416107382551e-05, 'epoch': 2.14}


 72%|███████▏  | 5370/7500 [2:49:44<1:09:10,  1.95s/it]

{'loss': 0.7262, 'grad_norm': 0.751868486404419, 'learning_rate': 1.429530201342282e-05, 'epoch': 2.15}


 72%|███████▏  | 5380/7500 [2:50:03<1:09:58,  1.98s/it]

{'loss': 0.7137, 'grad_norm': 0.764678418636322, 'learning_rate': 1.4228187919463087e-05, 'epoch': 2.15}


 72%|███████▏  | 5390/7500 [2:50:24<1:09:02,  1.96s/it]

{'loss': 0.7375, 'grad_norm': 0.6117401123046875, 'learning_rate': 1.4161073825503357e-05, 'epoch': 2.16}


 72%|███████▏  | 5400/7500 [2:50:43<1:07:46,  1.94s/it]

{'loss': 0.7765, 'grad_norm': 0.559266984462738, 'learning_rate': 1.4093959731543624e-05, 'epoch': 2.16}


 72%|███████▏  | 5410/7500 [2:51:04<1:16:03,  2.18s/it]

{'loss': 0.7306, 'grad_norm': 0.7390576004981995, 'learning_rate': 1.4026845637583894e-05, 'epoch': 2.16}


 72%|███████▏  | 5420/7500 [2:51:26<1:09:37,  2.01s/it]

{'loss': 0.787, 'grad_norm': 0.7908015847206116, 'learning_rate': 1.3959731543624161e-05, 'epoch': 2.17}


 72%|███████▏  | 5430/7500 [2:51:47<1:11:03,  2.06s/it]

{'loss': 0.7672, 'grad_norm': 0.7273011803627014, 'learning_rate': 1.389261744966443e-05, 'epoch': 2.17}


 73%|███████▎  | 5440/7500 [2:52:06<1:05:48,  1.92s/it]

{'loss': 0.7548, 'grad_norm': 0.5446162223815918, 'learning_rate': 1.38255033557047e-05, 'epoch': 2.18}


 73%|███████▎  | 5450/7500 [2:52:25<1:05:34,  1.92s/it]

{'loss': 0.7296, 'grad_norm': 0.6443489789962769, 'learning_rate': 1.3758389261744966e-05, 'epoch': 2.18}


 73%|███████▎  | 5460/7500 [2:52:44<1:04:57,  1.91s/it]

{'loss': 0.7701, 'grad_norm': 0.8946313858032227, 'learning_rate': 1.3691275167785237e-05, 'epoch': 2.18}


 73%|███████▎  | 5470/7500 [2:53:03<1:05:48,  1.94s/it]

{'loss': 0.7838, 'grad_norm': 0.7073444724082947, 'learning_rate': 1.3624161073825504e-05, 'epoch': 2.19}


 73%|███████▎  | 5480/7500 [2:53:23<1:06:19,  1.97s/it]

{'loss': 0.7143, 'grad_norm': 0.6059813499450684, 'learning_rate': 1.3557046979865772e-05, 'epoch': 2.19}


 73%|███████▎  | 5490/7500 [2:53:42<1:04:34,  1.93s/it]

{'loss': 0.7554, 'grad_norm': 0.5090625286102295, 'learning_rate': 1.3489932885906039e-05, 'epoch': 2.2}


 73%|███████▎  | 5500/7500 [2:54:01<1:02:43,  1.88s/it]

{'loss': 0.7333, 'grad_norm': 0.5593028664588928, 'learning_rate': 1.3422818791946309e-05, 'epoch': 2.2}


 73%|███████▎  | 5510/7500 [2:54:23<1:08:43,  2.07s/it]

{'loss': 0.7475, 'grad_norm': 0.5603170990943909, 'learning_rate': 1.335570469798658e-05, 'epoch': 2.2}


 74%|███████▎  | 5520/7500 [2:54:42<1:05:03,  1.97s/it]

{'loss': 0.776, 'grad_norm': 0.6202621459960938, 'learning_rate': 1.3288590604026846e-05, 'epoch': 2.21}


 74%|███████▎  | 5530/7500 [2:55:03<1:10:55,  2.16s/it]

{'loss': 0.7475, 'grad_norm': 0.5379713177680969, 'learning_rate': 1.3221476510067116e-05, 'epoch': 2.21}


 74%|███████▍  | 5540/7500 [2:55:22<1:00:34,  1.85s/it]

{'loss': 0.7687, 'grad_norm': 0.6892325282096863, 'learning_rate': 1.3154362416107383e-05, 'epoch': 2.22}


 74%|███████▍  | 5550/7500 [2:55:41<1:00:19,  1.86s/it]

{'loss': 0.741, 'grad_norm': 0.556932806968689, 'learning_rate': 1.3087248322147652e-05, 'epoch': 2.22}


 74%|███████▍  | 5560/7500 [2:55:59<1:00:25,  1.87s/it]

{'loss': 0.7485, 'grad_norm': 0.6029717326164246, 'learning_rate': 1.3020134228187919e-05, 'epoch': 2.22}


 74%|███████▍  | 5570/7500 [2:56:18<1:00:12,  1.87s/it]

{'loss': 0.7667, 'grad_norm': 0.5010673403739929, 'learning_rate': 1.2953020134228189e-05, 'epoch': 2.23}


 74%|███████▍  | 5580/7500 [2:56:37<1:00:21,  1.89s/it]

{'loss': 0.7342, 'grad_norm': 0.6529967784881592, 'learning_rate': 1.2885906040268459e-05, 'epoch': 2.23}


 75%|███████▍  | 5590/7500 [2:56:56<1:01:23,  1.93s/it]

{'loss': 0.783, 'grad_norm': 0.7907880544662476, 'learning_rate': 1.2818791946308726e-05, 'epoch': 2.24}


 75%|███████▍  | 5600/7500 [2:57:16<1:03:13,  2.00s/it]

{'loss': 0.7428, 'grad_norm': 0.657203197479248, 'learning_rate': 1.2751677852348994e-05, 'epoch': 2.24}


 75%|███████▍  | 5610/7500 [2:57:41<1:26:40,  2.75s/it]

{'loss': 0.7743, 'grad_norm': 0.702331006526947, 'learning_rate': 1.2684563758389261e-05, 'epoch': 2.24}


 75%|███████▍  | 5620/7500 [2:58:01<1:02:05,  1.98s/it]

{'loss': 0.7443, 'grad_norm': 0.67232346534729, 'learning_rate': 1.2617449664429532e-05, 'epoch': 2.25}


 75%|███████▌  | 5630/7500 [2:58:21<1:00:14,  1.93s/it]

{'loss': 0.7618, 'grad_norm': 0.5667345523834229, 'learning_rate': 1.2550335570469798e-05, 'epoch': 2.25}


 75%|███████▌  | 5640/7500 [2:58:40<1:00:32,  1.95s/it]

{'loss': 0.7647, 'grad_norm': 0.6318372488021851, 'learning_rate': 1.2483221476510069e-05, 'epoch': 2.26}


 75%|███████▌  | 5650/7500 [2:58:59<58:57,  1.91s/it]  

{'loss': 0.7785, 'grad_norm': 0.6489272117614746, 'learning_rate': 1.2416107382550337e-05, 'epoch': 2.26}


 75%|███████▌  | 5660/7500 [2:59:19<59:30,  1.94s/it]

{'loss': 0.7006, 'grad_norm': 0.5338150858879089, 'learning_rate': 1.2348993288590604e-05, 'epoch': 2.26}


 76%|███████▌  | 5670/7500 [2:59:38<58:26,  1.92s/it]

{'loss': 0.778, 'grad_norm': 0.6332360506057739, 'learning_rate': 1.2281879194630872e-05, 'epoch': 2.27}


 76%|███████▌  | 5680/7500 [2:59:57<57:57,  1.91s/it]

{'loss': 0.7878, 'grad_norm': 0.7234639525413513, 'learning_rate': 1.2214765100671141e-05, 'epoch': 2.27}


 76%|███████▌  | 5690/7500 [3:00:16<58:20,  1.93s/it]

{'loss': 0.7023, 'grad_norm': 1.0718004703521729, 'learning_rate': 1.214765100671141e-05, 'epoch': 2.28}


 76%|███████▌  | 5700/7500 [3:00:36<57:45,  1.93s/it]

{'loss': 0.7852, 'grad_norm': 0.5004761815071106, 'learning_rate': 1.208053691275168e-05, 'epoch': 2.28}


 76%|███████▌  | 5710/7500 [3:00:55<57:22,  1.92s/it]

{'loss': 0.7394, 'grad_norm': 0.632374107837677, 'learning_rate': 1.2013422818791947e-05, 'epoch': 2.28}


 76%|███████▋  | 5720/7500 [3:01:14<56:48,  1.91s/it]

{'loss': 0.7251, 'grad_norm': 0.49418070912361145, 'learning_rate': 1.1946308724832215e-05, 'epoch': 2.29}


 76%|███████▋  | 5730/7500 [3:01:33<55:56,  1.90s/it]

{'loss': 0.7225, 'grad_norm': 0.5701956152915955, 'learning_rate': 1.1879194630872484e-05, 'epoch': 2.29}


 77%|███████▋  | 5740/7500 [3:01:52<56:21,  1.92s/it]

{'loss': 0.7418, 'grad_norm': 0.5810706615447998, 'learning_rate': 1.1812080536912752e-05, 'epoch': 2.3}


 77%|███████▋  | 5750/7500 [3:02:11<55:14,  1.89s/it]

{'loss': 0.7044, 'grad_norm': 0.8182178735733032, 'learning_rate': 1.174496644295302e-05, 'epoch': 2.3}


 77%|███████▋  | 5760/7500 [3:02:30<55:37,  1.92s/it]

{'loss': 0.7317, 'grad_norm': 0.5948638916015625, 'learning_rate': 1.167785234899329e-05, 'epoch': 2.3}


 77%|███████▋  | 5770/7500 [3:02:51<1:02:20,  2.16s/it]

{'loss': 0.7206, 'grad_norm': 0.538685142993927, 'learning_rate': 1.1610738255033558e-05, 'epoch': 2.31}


 77%|███████▋  | 5780/7500 [3:03:11<54:25,  1.90s/it]  

{'loss': 0.7757, 'grad_norm': 0.7373612523078918, 'learning_rate': 1.1543624161073826e-05, 'epoch': 2.31}


 77%|███████▋  | 5790/7500 [3:03:29<52:41,  1.85s/it]

{'loss': 0.7263, 'grad_norm': 0.7284355163574219, 'learning_rate': 1.1476510067114095e-05, 'epoch': 2.32}


 77%|███████▋  | 5800/7500 [3:03:48<53:03,  1.87s/it]

{'loss': 0.7912, 'grad_norm': 0.7563712000846863, 'learning_rate': 1.1409395973154363e-05, 'epoch': 2.32}


 77%|███████▋  | 5810/7500 [3:04:07<52:30,  1.86s/it]

{'loss': 0.7658, 'grad_norm': 0.5845866203308105, 'learning_rate': 1.1342281879194632e-05, 'epoch': 2.32}


 78%|███████▊  | 5820/7500 [3:04:26<53:34,  1.91s/it]

{'loss': 0.7666, 'grad_norm': 0.539501965045929, 'learning_rate': 1.12751677852349e-05, 'epoch': 2.33}


 78%|███████▊  | 5830/7500 [3:04:45<53:51,  1.94s/it]

{'loss': 0.7212, 'grad_norm': 0.5222535729408264, 'learning_rate': 1.1208053691275167e-05, 'epoch': 2.33}


 78%|███████▊  | 5840/7500 [3:05:05<54:21,  1.96s/it]

{'loss': 0.7481, 'grad_norm': 0.9611390233039856, 'learning_rate': 1.1140939597315436e-05, 'epoch': 2.34}


 78%|███████▊  | 5850/7500 [3:05:24<51:49,  1.88s/it]

{'loss': 0.7565, 'grad_norm': 0.6121340990066528, 'learning_rate': 1.1073825503355706e-05, 'epoch': 2.34}


 78%|███████▊  | 5860/7500 [3:05:45<57:13,  2.09s/it]

{'loss': 0.7158, 'grad_norm': 0.6421166062355042, 'learning_rate': 1.1006711409395975e-05, 'epoch': 2.34}


 78%|███████▊  | 5870/7500 [3:06:06<55:00,  2.03s/it]  

{'loss': 0.7324, 'grad_norm': 0.470849871635437, 'learning_rate': 1.0939597315436243e-05, 'epoch': 2.35}


 78%|███████▊  | 5880/7500 [3:06:27<53:53,  2.00s/it]  

{'loss': 0.7197, 'grad_norm': 0.5796228647232056, 'learning_rate': 1.087248322147651e-05, 'epoch': 2.35}


 79%|███████▊  | 5890/7500 [3:06:47<50:24,  1.88s/it]

{'loss': 0.7703, 'grad_norm': 0.7251536250114441, 'learning_rate': 1.0805369127516778e-05, 'epoch': 2.36}


 79%|███████▊  | 5900/7500 [3:07:05<48:00,  1.80s/it]

{'loss': 0.7413, 'grad_norm': 0.8480940461158752, 'learning_rate': 1.0738255033557047e-05, 'epoch': 2.36}


 79%|███████▉  | 5910/7500 [3:07:23<47:42,  1.80s/it]

{'loss': 0.7671, 'grad_norm': 0.750775158405304, 'learning_rate': 1.0671140939597316e-05, 'epoch': 2.36}


 79%|███████▉  | 5920/7500 [3:07:41<48:11,  1.83s/it]

{'loss': 0.7081, 'grad_norm': 0.5124070644378662, 'learning_rate': 1.0604026845637584e-05, 'epoch': 2.37}


 79%|███████▉  | 5930/7500 [3:08:00<48:44,  1.86s/it]

{'loss': 0.7569, 'grad_norm': 0.9078432321548462, 'learning_rate': 1.0536912751677854e-05, 'epoch': 2.37}


 79%|███████▉  | 5940/7500 [3:08:19<48:28,  1.86s/it]

{'loss': 0.7585, 'grad_norm': 0.5275930762290955, 'learning_rate': 1.0469798657718121e-05, 'epoch': 2.38}


 79%|███████▉  | 5950/7500 [3:08:38<49:09,  1.90s/it]

{'loss': 0.7651, 'grad_norm': 0.4931371510028839, 'learning_rate': 1.040268456375839e-05, 'epoch': 2.38}


 79%|███████▉  | 5960/7500 [3:08:57<49:10,  1.92s/it]

{'loss': 0.7392, 'grad_norm': 0.6596435904502869, 'learning_rate': 1.0335570469798658e-05, 'epoch': 2.38}


 80%|███████▉  | 5970/7500 [3:09:16<48:14,  1.89s/it]

{'loss': 0.7785, 'grad_norm': 0.733079731464386, 'learning_rate': 1.0268456375838927e-05, 'epoch': 2.39}


 80%|███████▉  | 5980/7500 [3:09:35<49:00,  1.93s/it]

{'loss': 0.7335, 'grad_norm': 0.7009112238883972, 'learning_rate': 1.0201342281879195e-05, 'epoch': 2.39}


 80%|███████▉  | 5990/7500 [3:09:54<47:06,  1.87s/it]

{'loss': 0.71, 'grad_norm': 0.7494003176689148, 'learning_rate': 1.0134228187919464e-05, 'epoch': 2.4}


 80%|████████  | 6000/7500 [3:10:13<47:02,  1.88s/it]

{'loss': 0.737, 'grad_norm': 0.7852288484573364, 'learning_rate': 1.006711409395973e-05, 'epoch': 2.4}


 80%|████████  | 6010/7500 [3:10:33<47:57,  1.93s/it]

{'loss': 0.7713, 'grad_norm': 0.5720508694648743, 'learning_rate': 1e-05, 'epoch': 2.4}


 80%|████████  | 6020/7500 [3:10:53<47:43,  1.94s/it]

{'loss': 0.7787, 'grad_norm': 0.9032867550849915, 'learning_rate': 9.93288590604027e-06, 'epoch': 2.41}


 80%|████████  | 6030/7500 [3:11:12<46:43,  1.91s/it]

{'loss': 0.7828, 'grad_norm': 0.7011318802833557, 'learning_rate': 9.865771812080538e-06, 'epoch': 2.41}


 81%|████████  | 6040/7500 [3:11:31<46:25,  1.91s/it]

{'loss': 0.7324, 'grad_norm': 0.6505710482597351, 'learning_rate': 9.798657718120806e-06, 'epoch': 2.42}


 81%|████████  | 6050/7500 [3:11:50<46:29,  1.92s/it]

{'loss': 0.7352, 'grad_norm': 0.6192042231559753, 'learning_rate': 9.731543624161075e-06, 'epoch': 2.42}


 81%|████████  | 6060/7500 [3:12:09<44:22,  1.85s/it]

{'loss': 0.7432, 'grad_norm': 0.5224722623825073, 'learning_rate': 9.664429530201342e-06, 'epoch': 2.42}


 81%|████████  | 6070/7500 [3:12:27<44:01,  1.85s/it]

{'loss': 0.7238, 'grad_norm': 0.6262465119361877, 'learning_rate': 9.59731543624161e-06, 'epoch': 2.43}


 81%|████████  | 6080/7500 [3:12:46<44:31,  1.88s/it]

{'loss': 0.7704, 'grad_norm': 0.5760208964347839, 'learning_rate': 9.53020134228188e-06, 'epoch': 2.43}


 81%|████████  | 6090/7500 [3:13:05<44:35,  1.90s/it]

{'loss': 0.7315, 'grad_norm': 0.7552335262298584, 'learning_rate': 9.463087248322149e-06, 'epoch': 2.44}


 81%|████████▏ | 6100/7500 [3:13:24<44:04,  1.89s/it]

{'loss': 0.7801, 'grad_norm': 0.8304266929626465, 'learning_rate': 9.395973154362418e-06, 'epoch': 2.44}


 81%|████████▏ | 6110/7500 [3:13:43<43:58,  1.90s/it]

{'loss': 0.7711, 'grad_norm': 0.8277061581611633, 'learning_rate': 9.328859060402684e-06, 'epoch': 2.44}


 82%|████████▏ | 6120/7500 [3:14:02<43:19,  1.88s/it]

{'loss': 0.7679, 'grad_norm': 0.7393027544021606, 'learning_rate': 9.261744966442953e-06, 'epoch': 2.45}


 82%|████████▏ | 6130/7500 [3:14:21<42:43,  1.87s/it]

{'loss': 0.7681, 'grad_norm': 0.5741291642189026, 'learning_rate': 9.194630872483221e-06, 'epoch': 2.45}


 82%|████████▏ | 6140/7500 [3:14:40<42:52,  1.89s/it]

{'loss': 0.7325, 'grad_norm': 0.6284734010696411, 'learning_rate': 9.12751677852349e-06, 'epoch': 2.46}


 82%|████████▏ | 6150/7500 [3:14:59<42:42,  1.90s/it]

{'loss': 0.7293, 'grad_norm': 0.5862001180648804, 'learning_rate': 9.060402684563759e-06, 'epoch': 2.46}


 82%|████████▏ | 6160/7500 [3:15:18<42:40,  1.91s/it]

{'loss': 0.7402, 'grad_norm': 0.5384498834609985, 'learning_rate': 8.993288590604027e-06, 'epoch': 2.46}


 82%|████████▏ | 6170/7500 [3:15:37<41:48,  1.89s/it]

{'loss': 0.7395, 'grad_norm': 1.1774113178253174, 'learning_rate': 8.926174496644296e-06, 'epoch': 2.47}


 82%|████████▏ | 6180/7500 [3:15:56<42:32,  1.93s/it]

{'loss': 0.7448, 'grad_norm': 0.5053119659423828, 'learning_rate': 8.859060402684564e-06, 'epoch': 2.47}


 83%|████████▎ | 6190/7500 [3:16:15<41:17,  1.89s/it]

{'loss': 0.7394, 'grad_norm': 0.49698606133461, 'learning_rate': 8.791946308724833e-06, 'epoch': 2.48}


 83%|████████▎ | 6200/7500 [3:16:34<40:52,  1.89s/it]

{'loss': 0.7423, 'grad_norm': 0.6174274682998657, 'learning_rate': 8.724832214765101e-06, 'epoch': 2.48}


 83%|████████▎ | 6210/7500 [3:16:53<40:55,  1.90s/it]

{'loss': 0.7307, 'grad_norm': 0.5749746561050415, 'learning_rate': 8.65771812080537e-06, 'epoch': 2.48}


 83%|████████▎ | 6220/7500 [3:17:12<40:31,  1.90s/it]

{'loss': 0.773, 'grad_norm': 0.6096768379211426, 'learning_rate': 8.590604026845638e-06, 'epoch': 2.49}


 83%|████████▎ | 6230/7500 [3:17:31<40:12,  1.90s/it]

{'loss': 0.7038, 'grad_norm': 0.6872166395187378, 'learning_rate': 8.523489932885905e-06, 'epoch': 2.49}


 83%|████████▎ | 6240/7500 [3:17:50<41:02,  1.95s/it]

{'loss': 0.7181, 'grad_norm': 0.5412789583206177, 'learning_rate': 8.456375838926175e-06, 'epoch': 2.5}


 83%|████████▎ | 6250/7500 [3:18:09<40:27,  1.94s/it]

{'loss': 0.7527, 'grad_norm': 0.6019031405448914, 'learning_rate': 8.389261744966444e-06, 'epoch': 2.5}


 83%|████████▎ | 6260/7500 [3:18:29<39:35,  1.92s/it]

{'loss': 0.7317, 'grad_norm': 0.6623294949531555, 'learning_rate': 8.322147651006712e-06, 'epoch': 2.5}


 84%|████████▎ | 6270/7500 [3:18:48<38:28,  1.88s/it]

{'loss': 0.7348, 'grad_norm': 0.603053867816925, 'learning_rate': 8.255033557046981e-06, 'epoch': 2.51}


 84%|████████▎ | 6280/7500 [3:19:06<38:17,  1.88s/it]

{'loss': 0.7359, 'grad_norm': 0.5828479528427124, 'learning_rate': 8.187919463087248e-06, 'epoch': 2.51}


 84%|████████▍ | 6290/7500 [3:19:25<37:50,  1.88s/it]

{'loss': 0.7602, 'grad_norm': 0.6116567850112915, 'learning_rate': 8.120805369127516e-06, 'epoch': 2.52}


 84%|████████▍ | 6300/7500 [3:19:44<38:31,  1.93s/it]

{'loss': 0.774, 'grad_norm': 0.9968334436416626, 'learning_rate': 8.053691275167785e-06, 'epoch': 2.52}


 84%|████████▍ | 6310/7500 [3:20:04<38:07,  1.92s/it]

{'loss': 0.7807, 'grad_norm': 0.5357561111450195, 'learning_rate': 7.986577181208055e-06, 'epoch': 2.52}


 84%|████████▍ | 6320/7500 [3:20:23<37:08,  1.89s/it]

{'loss': 0.781, 'grad_norm': 1.1708024740219116, 'learning_rate': 7.919463087248324e-06, 'epoch': 2.53}


 84%|████████▍ | 6330/7500 [3:20:42<37:13,  1.91s/it]

{'loss': 0.7461, 'grad_norm': 0.5730124711990356, 'learning_rate': 7.852348993288592e-06, 'epoch': 2.53}


 85%|████████▍ | 6340/7500 [3:21:01<37:40,  1.95s/it]

{'loss': 0.7688, 'grad_norm': 0.8067437410354614, 'learning_rate': 7.785234899328859e-06, 'epoch': 2.54}


 85%|████████▍ | 6350/7500 [3:21:20<37:04,  1.93s/it]

{'loss': 0.7418, 'grad_norm': 0.7734897136688232, 'learning_rate': 7.718120805369127e-06, 'epoch': 2.54}


 85%|████████▍ | 6360/7500 [3:21:40<36:14,  1.91s/it]

{'loss': 0.7513, 'grad_norm': 0.6886509656906128, 'learning_rate': 7.651006711409396e-06, 'epoch': 2.54}


 85%|████████▍ | 6370/7500 [3:21:59<35:02,  1.86s/it]

{'loss': 0.7762, 'grad_norm': 0.6140671968460083, 'learning_rate': 7.5838926174496645e-06, 'epoch': 2.55}


 85%|████████▌ | 6380/7500 [3:22:17<34:54,  1.87s/it]

{'loss': 0.7316, 'grad_norm': 0.5285465717315674, 'learning_rate': 7.516778523489933e-06, 'epoch': 2.55}


 85%|████████▌ | 6390/7500 [3:22:36<35:12,  1.90s/it]

{'loss': 0.714, 'grad_norm': 0.5583608746528625, 'learning_rate': 7.4496644295302024e-06, 'epoch': 2.56}


 85%|████████▌ | 6400/7500 [3:22:55<35:10,  1.92s/it]

{'loss': 0.7334, 'grad_norm': 0.6676566004753113, 'learning_rate': 7.382550335570471e-06, 'epoch': 2.56}


 85%|████████▌ | 6410/7500 [3:23:14<34:26,  1.90s/it]

{'loss': 0.7719, 'grad_norm': 0.612461268901825, 'learning_rate': 7.315436241610739e-06, 'epoch': 2.56}


 86%|████████▌ | 6420/7500 [3:23:34<35:27,  1.97s/it]

{'loss': 0.7377, 'grad_norm': 0.644665002822876, 'learning_rate': 7.248322147651007e-06, 'epoch': 2.57}


 86%|████████▌ | 6430/7500 [3:23:54<35:21,  1.98s/it]

{'loss': 0.7645, 'grad_norm': 0.5460493564605713, 'learning_rate': 7.181208053691276e-06, 'epoch': 2.57}


 86%|████████▌ | 6440/7500 [3:24:14<34:34,  1.96s/it]

{'loss': 0.7236, 'grad_norm': 0.5830048322677612, 'learning_rate': 7.114093959731543e-06, 'epoch': 2.58}


 86%|████████▌ | 6450/7500 [3:24:33<33:23,  1.91s/it]

{'loss': 0.7396, 'grad_norm': 0.5352090001106262, 'learning_rate': 7.046979865771812e-06, 'epoch': 2.58}


 86%|████████▌ | 6460/7500 [3:24:52<33:53,  1.96s/it]

{'loss': 0.7497, 'grad_norm': 0.747038722038269, 'learning_rate': 6.9798657718120805e-06, 'epoch': 2.58}


 86%|████████▋ | 6470/7500 [3:25:13<33:42,  1.96s/it]

{'loss': 0.7475, 'grad_norm': 0.8247970938682556, 'learning_rate': 6.91275167785235e-06, 'epoch': 2.59}


 86%|████████▋ | 6480/7500 [3:25:32<32:36,  1.92s/it]

{'loss': 0.7222, 'grad_norm': 0.6004035472869873, 'learning_rate': 6.845637583892618e-06, 'epoch': 2.59}


 87%|████████▋ | 6490/7500 [3:25:52<32:14,  1.92s/it]

{'loss': 0.6929, 'grad_norm': 0.6920758485794067, 'learning_rate': 6.778523489932886e-06, 'epoch': 2.6}


 87%|████████▋ | 6500/7500 [3:26:13<39:29,  2.37s/it]

{'loss': 0.7555, 'grad_norm': 0.5624490976333618, 'learning_rate': 6.7114093959731546e-06, 'epoch': 2.6}


 87%|████████▋ | 6510/7500 [3:26:36<34:21,  2.08s/it]

{'loss': 0.7291, 'grad_norm': 0.6575191617012024, 'learning_rate': 6.644295302013423e-06, 'epoch': 2.6}


 87%|████████▋ | 6520/7500 [3:26:58<35:01,  2.14s/it]

{'loss': 0.7378, 'grad_norm': 0.8029407858848572, 'learning_rate': 6.577181208053692e-06, 'epoch': 2.61}


 87%|████████▋ | 6530/7500 [3:27:18<31:32,  1.95s/it]

{'loss': 0.73, 'grad_norm': 0.9930325150489807, 'learning_rate': 6.510067114093959e-06, 'epoch': 2.61}


 87%|████████▋ | 6540/7500 [3:27:37<30:42,  1.92s/it]

{'loss': 0.734, 'grad_norm': 0.6170931458473206, 'learning_rate': 6.4429530201342295e-06, 'epoch': 2.62}


 87%|████████▋ | 6550/7500 [3:27:56<29:58,  1.89s/it]

{'loss': 0.7722, 'grad_norm': 0.6431517004966736, 'learning_rate': 6.375838926174497e-06, 'epoch': 2.62}


 87%|████████▋ | 6560/7500 [3:28:15<29:47,  1.90s/it]

{'loss': 0.7581, 'grad_norm': 0.7131050229072571, 'learning_rate': 6.308724832214766e-06, 'epoch': 2.62}


 88%|████████▊ | 6570/7500 [3:28:34<29:11,  1.88s/it]

{'loss': 0.7668, 'grad_norm': 0.6430018544197083, 'learning_rate': 6.241610738255034e-06, 'epoch': 2.63}


 88%|████████▊ | 6580/7500 [3:28:53<29:01,  1.89s/it]

{'loss': 0.7252, 'grad_norm': 0.6778733134269714, 'learning_rate': 6.174496644295302e-06, 'epoch': 2.63}


 88%|████████▊ | 6590/7500 [3:29:12<28:38,  1.89s/it]

{'loss': 0.7435, 'grad_norm': 0.5675327181816101, 'learning_rate': 6.1073825503355705e-06, 'epoch': 2.64}


 88%|████████▊ | 6600/7500 [3:29:31<28:33,  1.90s/it]

{'loss': 0.7584, 'grad_norm': 0.5739737749099731, 'learning_rate': 6.04026845637584e-06, 'epoch': 2.64}


 88%|████████▊ | 6610/7500 [3:29:50<28:12,  1.90s/it]

{'loss': 0.7131, 'grad_norm': 0.6737279295921326, 'learning_rate': 5.9731543624161076e-06, 'epoch': 2.64}


 88%|████████▊ | 6620/7500 [3:30:09<27:50,  1.90s/it]

{'loss': 0.7357, 'grad_norm': 0.5429763793945312, 'learning_rate': 5.906040268456376e-06, 'epoch': 2.65}


 88%|████████▊ | 6630/7500 [3:30:28<27:29,  1.90s/it]

{'loss': 0.7624, 'grad_norm': 0.5870277285575867, 'learning_rate': 5.838926174496645e-06, 'epoch': 2.65}


 89%|████████▊ | 6640/7500 [3:30:47<27:35,  1.93s/it]

{'loss': 0.732, 'grad_norm': 0.6956804394721985, 'learning_rate': 5.771812080536913e-06, 'epoch': 2.66}


 89%|████████▊ | 6650/7500 [3:31:06<26:49,  1.89s/it]

{'loss': 0.7799, 'grad_norm': 0.8118778467178345, 'learning_rate': 5.704697986577182e-06, 'epoch': 2.66}


 89%|████████▉ | 6660/7500 [3:31:25<26:33,  1.90s/it]

{'loss': 0.7365, 'grad_norm': 0.7455150485038757, 'learning_rate': 5.63758389261745e-06, 'epoch': 2.66}


 89%|████████▉ | 6670/7500 [3:31:44<26:48,  1.94s/it]

{'loss': 0.7371, 'grad_norm': 0.8058657646179199, 'learning_rate': 5.570469798657718e-06, 'epoch': 2.67}


 89%|████████▉ | 6680/7500 [3:32:03<26:04,  1.91s/it]

{'loss': 0.7537, 'grad_norm': 0.6406611204147339, 'learning_rate': 5.503355704697987e-06, 'epoch': 2.67}


 89%|████████▉ | 6690/7500 [3:32:23<27:12,  2.02s/it]

{'loss': 0.748, 'grad_norm': 0.7519178986549377, 'learning_rate': 5.436241610738255e-06, 'epoch': 2.68}


 89%|████████▉ | 6700/7500 [3:32:44<26:46,  2.01s/it]

{'loss': 0.7353, 'grad_norm': 0.5807346105575562, 'learning_rate': 5.3691275167785235e-06, 'epoch': 2.68}


 89%|████████▉ | 6710/7500 [3:33:03<25:10,  1.91s/it]

{'loss': 0.7351, 'grad_norm': 0.6842594146728516, 'learning_rate': 5.302013422818792e-06, 'epoch': 2.68}


 90%|████████▉ | 6720/7500 [3:33:22<25:34,  1.97s/it]

{'loss': 0.7623, 'grad_norm': 0.8320870399475098, 'learning_rate': 5.2348993288590606e-06, 'epoch': 2.69}


 90%|████████▉ | 6730/7500 [3:33:41<24:05,  1.88s/it]

{'loss': 0.7148, 'grad_norm': 0.6592739224433899, 'learning_rate': 5.167785234899329e-06, 'epoch': 2.69}


 90%|████████▉ | 6740/7500 [3:34:00<23:51,  1.88s/it]

{'loss': 0.7313, 'grad_norm': 0.6277884840965271, 'learning_rate': 5.100671140939598e-06, 'epoch': 2.7}


 90%|█████████ | 6750/7500 [3:34:18<23:33,  1.88s/it]

{'loss': 0.7785, 'grad_norm': 1.2398518323898315, 'learning_rate': 5.033557046979865e-06, 'epoch': 2.7}


 90%|█████████ | 6760/7500 [3:34:37<23:21,  1.89s/it]

{'loss': 0.7251, 'grad_norm': 0.9593914747238159, 'learning_rate': 4.966442953020135e-06, 'epoch': 2.7}


 90%|█████████ | 6770/7500 [3:34:57<23:47,  1.96s/it]

{'loss': 0.7159, 'grad_norm': 0.7975189685821533, 'learning_rate': 4.899328859060403e-06, 'epoch': 2.71}


 90%|█████████ | 6780/7500 [3:35:16<22:53,  1.91s/it]

{'loss': 0.7552, 'grad_norm': 0.6124092936515808, 'learning_rate': 4.832214765100671e-06, 'epoch': 2.71}


 91%|█████████ | 6790/7500 [3:35:35<22:32,  1.90s/it]

{'loss': 0.7678, 'grad_norm': 0.6255057454109192, 'learning_rate': 4.76510067114094e-06, 'epoch': 2.72}


 91%|█████████ | 6800/7500 [3:35:54<22:24,  1.92s/it]

{'loss': 0.7343, 'grad_norm': 0.6560561060905457, 'learning_rate': 4.697986577181209e-06, 'epoch': 2.72}


 91%|█████████ | 6810/7500 [3:36:13<22:03,  1.92s/it]

{'loss': 0.7371, 'grad_norm': 0.5954013466835022, 'learning_rate': 4.6308724832214765e-06, 'epoch': 2.72}


 91%|█████████ | 6820/7500 [3:36:32<21:52,  1.93s/it]

{'loss': 0.7353, 'grad_norm': 0.48820462822914124, 'learning_rate': 4.563758389261745e-06, 'epoch': 2.73}


 91%|█████████ | 6830/7500 [3:36:51<21:25,  1.92s/it]

{'loss': 0.7885, 'grad_norm': 0.6895062923431396, 'learning_rate': 4.4966442953020135e-06, 'epoch': 2.73}


 91%|█████████ | 6840/7500 [3:37:10<21:10,  1.92s/it]

{'loss': 0.7624, 'grad_norm': 0.5939380526542664, 'learning_rate': 4.429530201342282e-06, 'epoch': 2.74}


 91%|█████████▏| 6850/7500 [3:37:30<20:42,  1.91s/it]

{'loss': 0.7247, 'grad_norm': 0.6925826072692871, 'learning_rate': 4.362416107382551e-06, 'epoch': 2.74}


 91%|█████████▏| 6860/7500 [3:37:49<20:21,  1.91s/it]

{'loss': 0.7371, 'grad_norm': 0.5463498830795288, 'learning_rate': 4.295302013422819e-06, 'epoch': 2.74}


 92%|█████████▏| 6870/7500 [3:38:08<20:22,  1.94s/it]

{'loss': 0.7492, 'grad_norm': 0.7106818556785583, 'learning_rate': 4.228187919463088e-06, 'epoch': 2.75}


 92%|█████████▏| 6880/7500 [3:38:27<19:32,  1.89s/it]

{'loss': 0.7429, 'grad_norm': 0.6799713373184204, 'learning_rate': 4.161073825503356e-06, 'epoch': 2.75}


 92%|█████████▏| 6890/7500 [3:38:46<19:27,  1.91s/it]

{'loss': 0.7297, 'grad_norm': 0.6922893524169922, 'learning_rate': 4.093959731543624e-06, 'epoch': 2.76}


 92%|█████████▏| 6900/7500 [3:39:05<18:19,  1.83s/it]

{'loss': 0.7383, 'grad_norm': 0.5985195636749268, 'learning_rate': 4.026845637583892e-06, 'epoch': 2.76}


 92%|█████████▏| 6910/7500 [3:39:23<17:52,  1.82s/it]

{'loss': 0.7287, 'grad_norm': 1.0380605459213257, 'learning_rate': 3.959731543624162e-06, 'epoch': 2.76}


 92%|█████████▏| 6920/7500 [3:39:41<17:48,  1.84s/it]

{'loss': 0.7125, 'grad_norm': 0.9105805158615112, 'learning_rate': 3.8926174496644295e-06, 'epoch': 2.77}


 92%|█████████▏| 6930/7500 [3:40:00<17:37,  1.86s/it]

{'loss': 0.7791, 'grad_norm': 0.727307140827179, 'learning_rate': 3.825503355704698e-06, 'epoch': 2.77}


 93%|█████████▎| 6940/7500 [3:40:19<17:17,  1.85s/it]

{'loss': 0.7622, 'grad_norm': 0.6329295039176941, 'learning_rate': 3.7583892617449665e-06, 'epoch': 2.78}


 93%|█████████▎| 6950/7500 [3:40:38<17:25,  1.90s/it]

{'loss': 0.7585, 'grad_norm': 0.7856727242469788, 'learning_rate': 3.6912751677852355e-06, 'epoch': 2.78}


 93%|█████████▎| 6960/7500 [3:40:57<17:09,  1.91s/it]

{'loss': 0.7448, 'grad_norm': 0.6570311188697815, 'learning_rate': 3.6241610738255036e-06, 'epoch': 2.78}


 93%|█████████▎| 6970/7500 [3:41:16<16:47,  1.90s/it]

{'loss': 0.8079, 'grad_norm': 0.6383084058761597, 'learning_rate': 3.5570469798657717e-06, 'epoch': 2.79}


 93%|█████████▎| 6980/7500 [3:41:35<16:23,  1.89s/it]

{'loss': 0.7617, 'grad_norm': 0.7628717422485352, 'learning_rate': 3.4899328859060402e-06, 'epoch': 2.79}


 93%|█████████▎| 6990/7500 [3:41:54<16:02,  1.89s/it]

{'loss': 0.7528, 'grad_norm': 0.7520030736923218, 'learning_rate': 3.422818791946309e-06, 'epoch': 2.8}


 93%|█████████▎| 7000/7500 [3:42:12<15:41,  1.88s/it]

{'loss': 0.7627, 'grad_norm': 0.6683176755905151, 'learning_rate': 3.3557046979865773e-06, 'epoch': 2.8}


 93%|█████████▎| 7010/7500 [3:42:36<18:42,  2.29s/it]

{'loss': 0.7177, 'grad_norm': 0.7656648755073547, 'learning_rate': 3.288590604026846e-06, 'epoch': 2.8}


 94%|█████████▎| 7020/7500 [3:42:56<16:02,  2.01s/it]

{'loss': 0.7313, 'grad_norm': 0.5415195822715759, 'learning_rate': 3.2214765100671148e-06, 'epoch': 2.81}


 94%|█████████▎| 7030/7500 [3:43:15<14:49,  1.89s/it]

{'loss': 0.7403, 'grad_norm': 0.880996584892273, 'learning_rate': 3.154362416107383e-06, 'epoch': 2.81}


 94%|█████████▍| 7040/7500 [3:43:33<14:12,  1.85s/it]

{'loss': 0.7466, 'grad_norm': 0.6931533813476562, 'learning_rate': 3.087248322147651e-06, 'epoch': 2.82}


 94%|█████████▍| 7050/7500 [3:43:52<14:12,  1.89s/it]

{'loss': 0.7466, 'grad_norm': 0.7813503742218018, 'learning_rate': 3.02013422818792e-06, 'epoch': 2.82}


 94%|█████████▍| 7060/7500 [3:44:11<13:29,  1.84s/it]

{'loss': 0.7551, 'grad_norm': 0.5305557250976562, 'learning_rate': 2.953020134228188e-06, 'epoch': 2.82}


 94%|█████████▍| 7070/7500 [3:44:29<13:13,  1.85s/it]

{'loss': 0.7286, 'grad_norm': 1.068291425704956, 'learning_rate': 2.8859060402684566e-06, 'epoch': 2.83}


 94%|█████████▍| 7080/7500 [3:44:48<13:09,  1.88s/it]

{'loss': 0.7613, 'grad_norm': 0.8526117205619812, 'learning_rate': 2.818791946308725e-06, 'epoch': 2.83}


 95%|█████████▍| 7090/7500 [3:45:06<12:42,  1.86s/it]

{'loss': 0.7947, 'grad_norm': 0.7114691734313965, 'learning_rate': 2.7516778523489936e-06, 'epoch': 2.84}


 95%|█████████▍| 7100/7500 [3:45:25<12:21,  1.85s/it]

{'loss': 0.751, 'grad_norm': 0.6299334168434143, 'learning_rate': 2.6845637583892617e-06, 'epoch': 2.84}


 95%|█████████▍| 7110/7500 [3:45:43<12:13,  1.88s/it]

{'loss': 0.7406, 'grad_norm': 0.8355052471160889, 'learning_rate': 2.6174496644295303e-06, 'epoch': 2.84}


 95%|█████████▍| 7120/7500 [3:46:02<11:56,  1.89s/it]

{'loss': 0.7276, 'grad_norm': 0.6178103089332581, 'learning_rate': 2.550335570469799e-06, 'epoch': 2.85}


 95%|█████████▌| 7130/7500 [3:46:22<11:37,  1.89s/it]

{'loss': 0.7152, 'grad_norm': 0.7865511178970337, 'learning_rate': 2.4832214765100673e-06, 'epoch': 2.85}


 95%|█████████▌| 7140/7500 [3:46:41<11:52,  1.98s/it]

{'loss': 0.7064, 'grad_norm': 0.9824076294898987, 'learning_rate': 2.4161073825503354e-06, 'epoch': 2.86}


 95%|█████████▌| 7150/7500 [3:47:00<10:51,  1.86s/it]

{'loss': 0.7375, 'grad_norm': 0.6387530565261841, 'learning_rate': 2.3489932885906044e-06, 'epoch': 2.86}


 95%|█████████▌| 7160/7500 [3:47:19<10:55,  1.93s/it]

{'loss': 0.7479, 'grad_norm': 1.1326504945755005, 'learning_rate': 2.2818791946308725e-06, 'epoch': 2.86}


 96%|█████████▌| 7170/7500 [3:47:38<10:24,  1.89s/it]

{'loss': 0.7723, 'grad_norm': 0.7775034308433533, 'learning_rate': 2.214765100671141e-06, 'epoch': 2.87}


 96%|█████████▌| 7180/7500 [3:47:58<10:59,  2.06s/it]

{'loss': 0.7375, 'grad_norm': 0.8097350001335144, 'learning_rate': 2.1476510067114096e-06, 'epoch': 2.87}


 96%|█████████▌| 7190/7500 [3:48:18<10:39,  2.06s/it]

{'loss': 0.7077, 'grad_norm': 0.5183387398719788, 'learning_rate': 2.080536912751678e-06, 'epoch': 2.88}


 96%|█████████▌| 7200/7500 [3:48:38<09:57,  1.99s/it]

{'loss': 0.7704, 'grad_norm': 0.7320764064788818, 'learning_rate': 2.013422818791946e-06, 'epoch': 2.88}


 96%|█████████▌| 7210/7500 [3:48:58<09:19,  1.93s/it]

{'loss': 0.7652, 'grad_norm': 0.7358409762382507, 'learning_rate': 1.9463087248322147e-06, 'epoch': 2.88}


 96%|█████████▋| 7220/7500 [3:49:18<08:55,  1.91s/it]

{'loss': 0.7635, 'grad_norm': 0.6145297288894653, 'learning_rate': 1.8791946308724833e-06, 'epoch': 2.89}


 96%|█████████▋| 7230/7500 [3:49:37<08:32,  1.90s/it]

{'loss': 0.7228, 'grad_norm': 0.5500670671463013, 'learning_rate': 1.8120805369127518e-06, 'epoch': 2.89}


 97%|█████████▋| 7240/7500 [3:49:56<08:58,  2.07s/it]

{'loss': 0.7551, 'grad_norm': 0.765666127204895, 'learning_rate': 1.7449664429530201e-06, 'epoch': 2.9}


 97%|█████████▋| 7250/7500 [3:50:15<07:48,  1.87s/it]

{'loss': 0.7654, 'grad_norm': 0.5396461486816406, 'learning_rate': 1.6778523489932886e-06, 'epoch': 2.9}


 97%|█████████▋| 7260/7500 [3:50:34<07:31,  1.88s/it]

{'loss': 0.751, 'grad_norm': 0.7449578046798706, 'learning_rate': 1.6107382550335574e-06, 'epoch': 2.9}


 97%|█████████▋| 7270/7500 [3:50:53<07:11,  1.87s/it]

{'loss': 0.7097, 'grad_norm': 0.9473363757133484, 'learning_rate': 1.5436241610738255e-06, 'epoch': 2.91}


 97%|█████████▋| 7280/7500 [3:51:12<07:03,  1.93s/it]

{'loss': 0.7695, 'grad_norm': 0.5813345313072205, 'learning_rate': 1.476510067114094e-06, 'epoch': 2.91}


 97%|█████████▋| 7290/7500 [3:51:31<06:40,  1.91s/it]

{'loss': 0.7465, 'grad_norm': 0.6052381992340088, 'learning_rate': 1.4093959731543626e-06, 'epoch': 2.92}


 97%|█████████▋| 7300/7500 [3:51:50<06:22,  1.91s/it]

{'loss': 0.7629, 'grad_norm': 0.6087234616279602, 'learning_rate': 1.3422818791946309e-06, 'epoch': 2.92}


 97%|█████████▋| 7310/7500 [3:52:09<06:16,  1.98s/it]

{'loss': 0.7579, 'grad_norm': 0.93767911195755, 'learning_rate': 1.2751677852348994e-06, 'epoch': 2.92}


 98%|█████████▊| 7320/7500 [3:52:28<05:33,  1.85s/it]

{'loss': 0.727, 'grad_norm': 0.572593629360199, 'learning_rate': 1.2080536912751677e-06, 'epoch': 2.93}


 98%|█████████▊| 7330/7500 [3:52:46<05:17,  1.87s/it]

{'loss': 0.7319, 'grad_norm': 0.5605331659317017, 'learning_rate': 1.1409395973154363e-06, 'epoch': 2.93}


 98%|█████████▊| 7340/7500 [3:53:05<05:06,  1.91s/it]

{'loss': 0.7308, 'grad_norm': 0.6926281452178955, 'learning_rate': 1.0738255033557048e-06, 'epoch': 2.94}


 98%|█████████▊| 7350/7500 [3:53:25<04:47,  1.92s/it]

{'loss': 0.7279, 'grad_norm': 0.5214190483093262, 'learning_rate': 1.006711409395973e-06, 'epoch': 2.94}


 98%|█████████▊| 7360/7500 [3:53:44<04:27,  1.91s/it]

{'loss': 0.7797, 'grad_norm': 0.5574145317077637, 'learning_rate': 9.395973154362416e-07, 'epoch': 2.94}


 98%|█████████▊| 7370/7500 [3:54:03<04:07,  1.91s/it]

{'loss': 0.7632, 'grad_norm': 0.6399810910224915, 'learning_rate': 8.724832214765101e-07, 'epoch': 2.95}


 98%|█████████▊| 7380/7500 [3:54:22<03:43,  1.87s/it]

{'loss': 0.7071, 'grad_norm': 0.5150043368339539, 'learning_rate': 8.053691275167787e-07, 'epoch': 2.95}


 99%|█████████▊| 7390/7500 [3:54:40<03:20,  1.83s/it]

{'loss': 0.7599, 'grad_norm': 0.555298388004303, 'learning_rate': 7.38255033557047e-07, 'epoch': 2.96}


 99%|█████████▊| 7400/7500 [3:54:59<03:02,  1.82s/it]

{'loss': 0.7267, 'grad_norm': 0.6405226588249207, 'learning_rate': 6.711409395973154e-07, 'epoch': 2.96}


 99%|█████████▉| 7410/7500 [3:55:17<02:43,  1.81s/it]

{'loss': 0.7262, 'grad_norm': 0.647418200969696, 'learning_rate': 6.040268456375839e-07, 'epoch': 2.96}


 99%|█████████▉| 7420/7500 [3:55:35<02:29,  1.87s/it]

{'loss': 0.6803, 'grad_norm': 0.5397423505783081, 'learning_rate': 5.369127516778524e-07, 'epoch': 2.97}


 99%|█████████▉| 7430/7500 [3:55:54<02:11,  1.88s/it]

{'loss': 0.7673, 'grad_norm': 0.6982577443122864, 'learning_rate': 4.697986577181208e-07, 'epoch': 2.97}


 99%|█████████▉| 7440/7500 [3:56:13<01:54,  1.91s/it]

{'loss': 0.7361, 'grad_norm': 0.8353753685951233, 'learning_rate': 4.0268456375838935e-07, 'epoch': 2.98}


 99%|█████████▉| 7450/7500 [3:56:32<01:33,  1.86s/it]

{'loss': 0.7295, 'grad_norm': 0.6218042969703674, 'learning_rate': 3.355704697986577e-07, 'epoch': 2.98}


 99%|█████████▉| 7460/7500 [3:56:50<01:15,  1.88s/it]

{'loss': 0.7527, 'grad_norm': 0.6926268935203552, 'learning_rate': 2.684563758389262e-07, 'epoch': 2.98}


100%|█████████▉| 7470/7500 [3:57:09<00:56,  1.89s/it]

{'loss': 0.7499, 'grad_norm': 0.667701005935669, 'learning_rate': 2.0134228187919467e-07, 'epoch': 2.99}


100%|█████████▉| 7480/7500 [3:57:28<00:37,  1.89s/it]

{'loss': 0.7229, 'grad_norm': 0.6265125274658203, 'learning_rate': 1.342281879194631e-07, 'epoch': 2.99}


100%|█████████▉| 7490/7500 [3:57:46<00:18,  1.83s/it]

{'loss': 0.7828, 'grad_norm': 0.6714482307434082, 'learning_rate': 6.711409395973155e-08, 'epoch': 3.0}


100%|██████████| 7500/7500 [3:58:05<00:00,  1.86s/it]

{'loss': 0.7839, 'grad_norm': 0.7537410259246826, 'learning_rate': 0.0, 'epoch': 3.0}


                                                     
100%|██████████| 7500/7500 [4:01:31<00:00,  1.93s/it]

{'eval_loss': 0.6824756860733032, 'eval_runtime': 201.3372, 'eval_samples_per_second': 2.483, 'eval_steps_per_second': 0.621, 'epoch': 3.0}
{'train_runtime': 14491.7919, 'train_samples_per_second': 2.07, 'train_steps_per_second': 0.518, 'train_loss': 0.7961758496602376, 'epoch': 3.0}





TrainOutput(global_step=7500, training_loss=0.7961758496602376, metrics={'train_runtime': 14491.7919, 'train_samples_per_second': 2.07, 'train_steps_per_second': 0.518, 'total_flos': 7838619402240000.0, 'train_loss': 0.7961758496602376, 'epoch': 3.0})

In [17]:
# Save the fine-tuned model
model.save_pretrained('./fine-tuned-gpt1-summarization')
tokenizer.save_pretrained('./fine-tuned-gpt1-summarization')

('./fine-tuned-gpt1-summarization/tokenizer_config.json',
 './fine-tuned-gpt1-summarization/special_tokens_map.json',
 './fine-tuned-gpt1-summarization/vocab.json',
 './fine-tuned-gpt1-summarization/merges.txt',
 './fine-tuned-gpt1-summarization/added_tokens.json')

### 4. Test: Summarization

In [25]:
# Load the fine-tuned model
fine_tuned_model = OpenAIGPTLMHeadModel.from_pretrained('./fine-tuned-gpt1-summarization')
fine_tuned_tokenizer = OpenAIGPTTokenizer.from_pretrained('./fine-tuned-gpt1-summarization')

# Generate summary
def generate_summary(article):
    inputs = fine_tuned_tokenizer.encode(article, return_tensors='pt', max_length=128, truncation=True)
    # print(f"Length of input_ids after encoding: {inputs.shape}")
    # print(f"Encoded input_ids: {inputs}")
    outputs = fine_tuned_model.generate(inputs, max_new_tokens=128, num_beams=5, early_stopping=True)
    summary = fine_tuned_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summary

# Test the summarization
sample_article = dataset['test'][0]['article']
summary = generate_summary(sample_article)
print("Generated Summary:")
print(summary)

ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.


Generated Summary:
( cnn ) the palestinian authority officially became the 123rd member of the international criminal court on wednesday , a step that gives the court jurisdiction over alleged crimes in palestinian territories . the formal accession was marked with a ceremony at the hague , in the netherlands , where the court is based . the palestinians signed the icc ' s founding rome statute in january , when they also accepted its jurisdiction over alleged crimes committed " in the occupied palestinian territory , including east jerusalem , since june 13 , 2014 . " later that month , the icc opened a preliminary examination into the situation in palestinian


In [19]:
# Check the model configuration for input length limits
print(f"Model max position embeddings: {fine_tuned_model.config.n_positions}")

Model max position embeddings: 512


In [21]:
from datasets import load_metric

# Load ROUGE metric
rouge = load_metric("rouge")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [26]:
# Prepare lists for predictions and references
predictions = []
references = []

# Evaluate the first 100 samples in the test set
for i in range(5):  # Adjust the number for a larger evaluation
    # Get the article and reference summary from the dataset
    article = dataset['test'][i]['article']
    reference_summary = dataset['test'][i]['highlights']

    # Generate a summary for the article
    generated_summary = generate_summary(article)
    
    # Append the generated summary and reference summary to lists
    predictions.append(generated_summary)
    references.append(reference_summary)
    print(f"Generated Summary {i+1}:", generated_summary)
    print(f"Reference Summary {i+1}:", reference_summary)
    print("predictions length", len(predictions), ", references length", len(references))

# Compute the ROUGE scores for the generated summaries
results = rouge.compute(predictions=predictions, references=references)

# Print the ROUGE results
print("ROUGE Scores:")
print(f"ROUGE-1: {results['rouge1'].mid.fmeasure:.4f}")
print(f"ROUGE-2: {results['rouge2'].mid.fmeasure:.4f}")
print(f"ROUGE-L: {results['rougeL'].mid.fmeasure:.4f}")

Generated Summary 1: ( cnn ) the palestinian authority officially became the 123rd member of the international criminal court on wednesday , a step that gives the court jurisdiction over alleged crimes in palestinian territories . the formal accession was marked with a ceremony at the hague , in the netherlands , where the court is based . the palestinians signed the icc ' s founding rome statute in january , when they also accepted its jurisdiction over alleged crimes committed " in the occupied palestinian territory , including east jerusalem , since june 13 , 2014 . " later that month , the icc opened a preliminary examination into the situation in palestinian
Reference Summary 1: Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June .
Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis .
predictions length 1 , references length 1
Generated Summary 2: ( cnn ) n