## Dataset Creation

In [1]:
import pandas as pd
from datasets import Dataset, DatasetDict
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def create_huggingface_dataset(train_data_folder, test_size=0.2, random_state=42):
    """Build a train/test ``DatasetDict`` from the CSV files in a folder.

    Each regular, non-hidden file in ``train_data_folder`` is read with
    ``pandas.read_csv``. The ``MatchID``, ``PeriodID`` and ``Timestamp``
    columns are dropped when present. Rows are then split separately inside
    every ``ID`` group, so each group contributes to both splits, and the
    ``EventType`` column is renamed to ``labels``.

    Args:
        train_data_folder: Directory containing the CSV files to load.
        test_size: Passed to ``train_test_split`` for every ``ID`` group
            (default ``0.2``).
        random_state: Seed passed to ``train_test_split`` (default ``42``).

    Returns:
        A ``DatasetDict`` with a ``"train"`` and a ``"test"`` ``Dataset``.

    Raises:
        ValueError: If the folder yields no rows for one of the two splits.
    """
    # os.listdir() gives entries in arbitrary order, so sort them to make the
    # row order reproducible. Hidden files (e.g. ``.DS_Store``) and
    # sub-directories are skipped because read_csv cannot parse them.
    file_names = sorted(
        name
        for name in os.listdir(train_data_folder)
        if not name.startswith(".")
        and os.path.isfile(os.path.join(train_data_folder, name))
    )

    train_parts = []
    test_parts = []

    for file_name in tqdm(file_names):
        df = pd.read_csv(os.path.join(train_data_folder, file_name))
        df = df.drop(columns=["MatchID", "PeriodID", "Timestamp"], errors="ignore")

        # Split inside every ID group so each one is represented in both splits.
        # NOTE(review): ID looks like "<match>_<period>" -- confirm. Rows of the
        # same ID then land in both train and test, which can make the test
        # score optimistic if the label is constant per ID.
        for _, group in df.groupby("ID"):
            if len(group) < 2:
                # train_test_split raises when a split would end up empty;
                # a single-row group is kept for training only.
                train_parts.append(group)
                continue
            train_group, test_group = train_test_split(
                group, test_size=test_size, random_state=random_state
            )
            train_parts.append(train_group)
            test_parts.append(test_group)

    if not train_parts or not test_parts:
        raise ValueError(
            f"No rows available to build both splits from {train_data_folder!r}"
        )

    column_renames = {"EventType": "labels"}
    train_df = pd.concat(train_parts).reset_index(drop=True).rename(columns=column_renames)
    test_df = pd.concat(test_parts).reset_index(drop=True).rename(columns=column_renames)

    # Wrap both frames as HuggingFace datasets under the conventional split names.
    return DatasetDict({
        "train": Dataset.from_pandas(train_df),
        "test": Dataset.from_pandas(test_df),
    })

In [3]:
# Build the train/test DatasetDict from the CSV files under cleaned_data/train_data.
dataset = create_huggingface_dataset("cleaned_data/train_data")

100%|██████████| 16/16 [00:03<00:00,  5.16it/s]


In [4]:
# Sanity check: (rows, columns) of each split.
dataset.shape

{'train': (2309732, 3), 'test': (578525, 3)}

In [5]:
# Peek at the first training example to confirm which fields are present.
dataset["train"][0]

{'ID': '2_0',
 'labels': 0,
 'Tweet': 'the socceroos are confident going up against esp! check this out user:'}

## Model Setup

In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, DistilBertModel
import evaluate
import numpy as np

import torch
# Use the MPS backend when torch reports it as available, otherwise the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: mps


In [7]:
# Load the bare DistilBERT encoder from the pretrained checkpoint.
# NOTE(review): `trial` is not referenced by any other cell visible here, so this
# looks like a leftover experiment that loads an extra copy of the weights --
# confirm it is still needed.
trial = DistilBertModel.from_pretrained("distilbert-base-uncased")

In [8]:
# Tokenizer for the same checkpoint that the classifier is loaded from.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [9]:
def preprocess_function(examples):
    """Tokenize the ``Tweet`` field of ``examples`` with truncation enabled.

    Relies on the notebook-level ``tokenizer``; the tokenizer's output is
    returned unchanged.
    """
    tweets = examples["Tweet"]
    encoded = tokenizer(tweets, truncation=True)
    return encoded

In [10]:
# Tokenize every split, passing examples to preprocess_function in batches.
tokenized_dataset = dataset.map(preprocess_function, batched=True)
# No padding was requested during tokenization; this collator, built from the
# same tokenizer, is what pads examples when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map: 100%|██████████| 2309732/2309732 [00:30<00:00, 76813.77 examples/s]
Map: 100%|██████████| 578525/578525 [00:07<00:00, 77429.61 examples/s]


In [11]:
# Accuracy metric object used by compute_metrics below.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max class predictions.

    ``eval_pred`` unpacks into ``(predictions, labels)``; the predicted class
    is the arg-max of the predictions along axis 1.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [12]:
def count_parameters(model):
    """Print and return the model's parameter counts.

    Parameters are tallied by their ``requires_grad`` flag. Returns the tuple
    ``(trainable, non_trainable, total)``.
    """
    trainable_parameters = 0
    non_trainable_parameters = 0
    # One pass over the parameters, bucketing each by its requires_grad flag.
    for tensor in model.parameters():
        if tensor.requires_grad:
            trainable_parameters += tensor.numel()
        else:
            non_trainable_parameters += tensor.numel()
    total_parameters = trainable_parameters + non_trainable_parameters

    print(f"Trainable parameters: {trainable_parameters}")
    print(f"Non-trainable parameters: {non_trainable_parameters}")
    print(f"Total parameters: {total_parameters}")
    return trainable_parameters, non_trainable_parameters, total_parameters

In [13]:
# Two-label sequence classifier on top of DistilBERT, moved to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
).to(device)

# Step 1: freeze every parameter of the base model.
for param in model.base_model.parameters():
    param.requires_grad = False

# Step 2: re-enable gradients for the last transformer layer and the two
# classification-head layers, so only these are updated during training.
modules_to_train = (
    model.distilbert.transformer.layer[-1],
    model.pre_classifier,
    model.classifier,
)
for module in modules_to_train:
    for param in module.parameters():
        param.requires_grad = True

count_parameters(model)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainable parameters: 7680002
Non-trainable parameters: 59275008
Total parameters: 66955010


(7680002, 59275008, 66955010)

In [14]:
# Training configuration: evaluate and checkpoint once per epoch, then reload the
# best checkpoint when training finishes.
# - `eval_strategy` replaces the deprecated `evaluation_strategy` argument.
# - `use_mps_device` is deprecated and therefore omitted: recent transformers
#   releases select the MPS device automatically when it is available.
# NOTE(review): assumes a transformers release that already accepts
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir="model_output",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)



In [15]:
# Wire the model, data, collator and metric into a Trainer.
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword of
# Trainer.__init__ is deprecated and triggers a FutureWarning on this call.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [16]:
# Start fine-tuning. This is a long-running cell: the progress bar below
# estimates several hours for the full run.
trainer.train()

  0%|          | 503/288718 [00:27<3:54:13, 20.51it/s]

{'loss': 0.6806, 'grad_norm': 1.5376558303833008, 'learning_rate': 1.9965364126933547e-05, 'epoch': 0.0}


  0%|          | 1003/288718 [00:53<4:25:25, 18.07it/s]

{'loss': 0.6698, 'grad_norm': 1.0807185173034668, 'learning_rate': 1.9930728253867096e-05, 'epoch': 0.01}


  1%|          | 1503/288718 [01:19<4:18:45, 18.50it/s]

{'loss': 0.6685, 'grad_norm': 2.6184403896331787, 'learning_rate': 1.9896092380800645e-05, 'epoch': 0.01}


  1%|          | 2003/288718 [01:47<4:33:28, 17.47it/s]

{'loss': 0.6669, 'grad_norm': 0.9938274621963501, 'learning_rate': 1.986145650773419e-05, 'epoch': 0.01}


  1%|          | 2503/288718 [02:16<4:11:18, 18.98it/s]

{'loss': 0.6599, 'grad_norm': 1.2484948635101318, 'learning_rate': 1.982682063466774e-05, 'epoch': 0.02}


  1%|          | 3003/288718 [02:44<4:39:04, 17.06it/s]

{'loss': 0.6581, 'grad_norm': 1.2180981636047363, 'learning_rate': 1.979218476160129e-05, 'epoch': 0.02}


  1%|          | 3502/288718 [03:16<6:05:14, 13.02it/s]

{'loss': 0.655, 'grad_norm': 1.8031209707260132, 'learning_rate': 1.9757548888534834e-05, 'epoch': 0.02}


  1%|▏         | 4002/288718 [03:54<5:26:42, 14.52it/s] 

{'loss': 0.6549, 'grad_norm': 1.1186436414718628, 'learning_rate': 1.9722913015468383e-05, 'epoch': 0.03}


  2%|▏         | 4502/288718 [04:30<5:45:22, 13.72it/s] 

{'loss': 0.6605, 'grad_norm': 1.045678734779358, 'learning_rate': 1.9688277142401932e-05, 'epoch': 0.03}


  2%|▏         | 5002/288718 [05:04<4:46:51, 16.48it/s]

{'loss': 0.6572, 'grad_norm': 1.01622474193573, 'learning_rate': 1.9653641269335477e-05, 'epoch': 0.03}


  2%|▏         | 5502/288718 [05:34<4:33:10, 17.28it/s]

{'loss': 0.652, 'grad_norm': 1.9814648628234863, 'learning_rate': 1.9619005396269026e-05, 'epoch': 0.04}


  2%|▏         | 6002/288718 [06:04<4:38:03, 16.95it/s]

{'loss': 0.6541, 'grad_norm': 1.0645039081573486, 'learning_rate': 1.9584369523202572e-05, 'epoch': 0.04}


  2%|▏         | 6501/288718 [06:34<5:19:43, 14.71it/s]

{'loss': 0.6496, 'grad_norm': 2.3067431449890137, 'learning_rate': 1.954973365013612e-05, 'epoch': 0.05}


  2%|▏         | 7003/288718 [07:04<4:39:28, 16.80it/s]

{'loss': 0.6592, 'grad_norm': 1.2937495708465576, 'learning_rate': 1.951509777706967e-05, 'epoch': 0.05}


  3%|▎         | 7503/288718 [07:34<4:40:14, 16.73it/s]

{'loss': 0.6531, 'grad_norm': 1.8614541292190552, 'learning_rate': 1.9480461904003215e-05, 'epoch': 0.05}


  3%|▎         | 8003/288718 [08:05<4:54:51, 15.87it/s]

{'loss': 0.6561, 'grad_norm': 2.096522569656372, 'learning_rate': 1.9445826030936764e-05, 'epoch': 0.06}


  3%|▎         | 8503/288718 [08:36<4:45:34, 16.35it/s] 

{'loss': 0.6518, 'grad_norm': 1.4431601762771606, 'learning_rate': 1.9411190157870313e-05, 'epoch': 0.06}


  3%|▎         | 9002/288718 [09:06<4:49:02, 16.13it/s]

{'loss': 0.6533, 'grad_norm': 1.1061663627624512, 'learning_rate': 1.937655428480386e-05, 'epoch': 0.06}


  3%|▎         | 9502/288718 [09:38<4:38:49, 16.69it/s]

{'loss': 0.6528, 'grad_norm': 1.4817863702774048, 'learning_rate': 1.9341918411737404e-05, 'epoch': 0.07}


  3%|▎         | 10002/288718 [10:08<5:05:15, 15.22it/s]

{'loss': 0.6525, 'grad_norm': 1.4279935359954834, 'learning_rate': 1.9307282538670953e-05, 'epoch': 0.07}


  4%|▎         | 10502/288718 [10:39<4:44:59, 16.27it/s]

{'loss': 0.6492, 'grad_norm': 1.034725546836853, 'learning_rate': 1.9272646665604502e-05, 'epoch': 0.07}


  4%|▍         | 11003/288718 [11:10<4:46:30, 16.15it/s]

{'loss': 0.6537, 'grad_norm': 1.0293636322021484, 'learning_rate': 1.9238010792538048e-05, 'epoch': 0.08}


  4%|▍         | 11501/288718 [11:41<4:51:11, 15.87it/s]

{'loss': 0.6511, 'grad_norm': 1.0823044776916504, 'learning_rate': 1.9203374919471597e-05, 'epoch': 0.08}


  4%|▍         | 12003/288718 [12:14<4:46:45, 16.08it/s]

{'loss': 0.647, 'grad_norm': 1.0592387914657593, 'learning_rate': 1.9168739046405146e-05, 'epoch': 0.08}


  4%|▍         | 12502/288718 [12:46<4:48:47, 15.94it/s]

{'loss': 0.6543, 'grad_norm': 1.2356585264205933, 'learning_rate': 1.913410317333869e-05, 'epoch': 0.09}


  5%|▍         | 13002/288718 [13:18<4:49:48, 15.86it/s]

{'loss': 0.6526, 'grad_norm': 1.088877558708191, 'learning_rate': 1.909946730027224e-05, 'epoch': 0.09}


  5%|▍         | 13502/288718 [13:50<5:16:02, 14.51it/s]

{'loss': 0.6489, 'grad_norm': 2.3934428691864014, 'learning_rate': 1.906483142720579e-05, 'epoch': 0.09}


  5%|▍         | 14002/288718 [14:21<4:52:52, 15.63it/s]

{'loss': 0.6435, 'grad_norm': 1.169222354888916, 'learning_rate': 1.9030195554139334e-05, 'epoch': 0.1}


  5%|▌         | 14502/288718 [14:54<4:56:50, 15.40it/s]

{'loss': 0.6546, 'grad_norm': 2.067847967147827, 'learning_rate': 1.899555968107288e-05, 'epoch': 0.1}


  5%|▌         | 15001/288718 [15:27<4:47:31, 15.87it/s]

{'loss': 0.6536, 'grad_norm': 2.0361993312835693, 'learning_rate': 1.896092380800643e-05, 'epoch': 0.1}


  5%|▌         | 15503/288718 [15:59<4:46:40, 15.88it/s]

{'loss': 0.649, 'grad_norm': 1.2752865552902222, 'learning_rate': 1.8926287934939978e-05, 'epoch': 0.11}


  6%|▌         | 16003/288718 [16:29<4:50:50, 15.63it/s]

{'loss': 0.6453, 'grad_norm': 1.2329806089401245, 'learning_rate': 1.8891652061873523e-05, 'epoch': 0.11}


  6%|▌         | 16503/288718 [17:00<4:19:49, 17.46it/s]

{'loss': 0.642, 'grad_norm': 1.0078771114349365, 'learning_rate': 1.8857016188807072e-05, 'epoch': 0.11}


  6%|▌         | 17003/288718 [17:31<4:37:29, 16.32it/s]

{'loss': 0.6419, 'grad_norm': 1.0270147323608398, 'learning_rate': 1.882238031574062e-05, 'epoch': 0.12}


  6%|▌         | 17502/288718 [18:02<4:20:58, 17.32it/s]

{'loss': 0.6534, 'grad_norm': 1.146621584892273, 'learning_rate': 1.8787744442674167e-05, 'epoch': 0.12}


  6%|▌         | 18002/288718 [18:31<4:33:48, 16.48it/s]

{'loss': 0.6457, 'grad_norm': 1.3222616910934448, 'learning_rate': 1.8753108569607716e-05, 'epoch': 0.12}


  6%|▋         | 18503/288718 [19:00<4:19:44, 17.34it/s]

{'loss': 0.644, 'grad_norm': 0.9043076634407043, 'learning_rate': 1.8718472696541265e-05, 'epoch': 0.13}


  7%|▋         | 19003/288718 [19:30<4:42:06, 15.93it/s]

{'loss': 0.6438, 'grad_norm': 1.1274535655975342, 'learning_rate': 1.868383682347481e-05, 'epoch': 0.13}


  7%|▋         | 19503/288718 [20:01<4:33:27, 16.41it/s]

{'loss': 0.6484, 'grad_norm': 1.217949390411377, 'learning_rate': 1.864920095040836e-05, 'epoch': 0.14}


  7%|▋         | 20003/288718 [20:31<4:21:31, 17.12it/s]

{'loss': 0.6466, 'grad_norm': 1.4067046642303467, 'learning_rate': 1.8614565077341905e-05, 'epoch': 0.14}


  7%|▋         | 20501/288718 [21:02<5:03:12, 14.74it/s]

{'loss': 0.6497, 'grad_norm': 1.4302254915237427, 'learning_rate': 1.8579929204275454e-05, 'epoch': 0.14}


  7%|▋         | 21003/288718 [21:31<4:39:04, 15.99it/s]

{'loss': 0.6507, 'grad_norm': 1.277392864227295, 'learning_rate': 1.8545293331209003e-05, 'epoch': 0.15}


  7%|▋         | 21502/288718 [22:01<4:14:16, 17.51it/s]

{'loss': 0.636, 'grad_norm': 1.5945096015930176, 'learning_rate': 1.8510657458142548e-05, 'epoch': 0.15}


  8%|▊         | 22003/288718 [22:32<4:42:01, 15.76it/s]

{'loss': 0.6475, 'grad_norm': 1.308931589126587, 'learning_rate': 1.8476021585076097e-05, 'epoch': 0.15}


  8%|▊         | 22503/288718 [23:03<4:25:05, 16.74it/s]

{'loss': 0.6429, 'grad_norm': 0.9820117950439453, 'learning_rate': 1.8441385712009646e-05, 'epoch': 0.16}


  8%|▊         | 23001/288718 [23:32<4:35:59, 16.05it/s]

{'loss': 0.6463, 'grad_norm': 2.0411605834960938, 'learning_rate': 1.840674983894319e-05, 'epoch': 0.16}


  8%|▊         | 23502/288718 [24:02<4:28:35, 16.46it/s]

{'loss': 0.6445, 'grad_norm': 1.0335551500320435, 'learning_rate': 1.837211396587674e-05, 'epoch': 0.16}


  8%|▊         | 24002/288718 [24:33<4:31:14, 16.27it/s]

{'loss': 0.6415, 'grad_norm': 1.5163408517837524, 'learning_rate': 1.833747809281029e-05, 'epoch': 0.17}


  8%|▊         | 24503/288718 [25:03<4:22:43, 16.76it/s]

{'loss': 0.6484, 'grad_norm': 1.4641010761260986, 'learning_rate': 1.8302842219743835e-05, 'epoch': 0.17}


  9%|▊         | 25002/288718 [25:34<4:24:46, 16.60it/s]

{'loss': 0.6537, 'grad_norm': 1.2274665832519531, 'learning_rate': 1.826820634667738e-05, 'epoch': 0.17}


  9%|▉         | 25502/288718 [26:04<4:11:43, 17.43it/s]

{'loss': 0.6433, 'grad_norm': 1.0704271793365479, 'learning_rate': 1.823357047361093e-05, 'epoch': 0.18}


  9%|▉         | 26002/288718 [26:35<4:43:37, 15.44it/s]

{'loss': 0.6467, 'grad_norm': 1.0468204021453857, 'learning_rate': 1.819893460054448e-05, 'epoch': 0.18}


  9%|▉         | 26503/288718 [27:06<4:22:27, 16.65it/s]

{'loss': 0.6455, 'grad_norm': 1.9309455156326294, 'learning_rate': 1.8164298727478024e-05, 'epoch': 0.18}


  9%|▉         | 27003/288718 [27:37<4:17:40, 16.93it/s]

{'loss': 0.6482, 'grad_norm': 1.1895127296447754, 'learning_rate': 1.8129662854411573e-05, 'epoch': 0.19}


 10%|▉         | 27502/288718 [28:07<4:15:07, 17.06it/s]

{'loss': 0.6479, 'grad_norm': 1.0911388397216797, 'learning_rate': 1.8095026981345122e-05, 'epoch': 0.19}


 10%|▉         | 28002/288718 [28:38<4:21:02, 16.65it/s]

{'loss': 0.6439, 'grad_norm': 1.5832018852233887, 'learning_rate': 1.8060391108278667e-05, 'epoch': 0.19}


 10%|▉         | 28503/288718 [29:09<4:22:02, 16.55it/s]

{'loss': 0.6468, 'grad_norm': 1.0639439821243286, 'learning_rate': 1.8025755235212213e-05, 'epoch': 0.2}


 10%|█         | 29002/288718 [29:40<4:30:31, 16.00it/s]

{'loss': 0.6418, 'grad_norm': 1.43589448928833, 'learning_rate': 1.7991119362145765e-05, 'epoch': 0.2}


 10%|█         | 29502/288718 [30:11<4:27:47, 16.13it/s]

{'loss': 0.6398, 'grad_norm': 1.3705828189849854, 'learning_rate': 1.795648348907931e-05, 'epoch': 0.2}


 10%|█         | 30002/288718 [30:43<4:22:52, 16.40it/s]

{'loss': 0.6433, 'grad_norm': 1.7379539012908936, 'learning_rate': 1.7921847616012856e-05, 'epoch': 0.21}


 11%|█         | 30502/288718 [31:14<4:26:23, 16.15it/s]

{'loss': 0.6415, 'grad_norm': 1.0892468690872192, 'learning_rate': 1.7887211742946405e-05, 'epoch': 0.21}


 11%|█         | 31002/288718 [31:46<4:31:57, 15.79it/s]

{'loss': 0.6432, 'grad_norm': 3.5342748165130615, 'learning_rate': 1.7852575869879954e-05, 'epoch': 0.21}


 11%|█         | 31502/288718 [32:18<4:15:47, 16.76it/s]

{'loss': 0.6475, 'grad_norm': 1.1405829191207886, 'learning_rate': 1.78179399968135e-05, 'epoch': 0.22}


 11%|█         | 32002/288718 [32:49<4:13:40, 16.87it/s]

{'loss': 0.6409, 'grad_norm': 1.249489426612854, 'learning_rate': 1.778330412374705e-05, 'epoch': 0.22}


 11%|█▏        | 32502/288718 [33:20<4:13:17, 16.86it/s]

{'loss': 0.6471, 'grad_norm': 1.2636315822601318, 'learning_rate': 1.7748668250680598e-05, 'epoch': 0.23}


 11%|█▏        | 33002/288718 [33:52<4:23:19, 16.18it/s]

{'loss': 0.6363, 'grad_norm': 1.2120118141174316, 'learning_rate': 1.7714032377614143e-05, 'epoch': 0.23}


 12%|█▏        | 33502/288718 [34:23<4:33:36, 15.55it/s]

{'loss': 0.6421, 'grad_norm': 1.4065885543823242, 'learning_rate': 1.7679396504547692e-05, 'epoch': 0.23}


 12%|█▏        | 34002/288718 [34:55<4:45:40, 14.86it/s]

{'loss': 0.6504, 'grad_norm': 1.293502688407898, 'learning_rate': 1.7644760631481238e-05, 'epoch': 0.24}


 12%|█▏        | 34502/288718 [35:26<4:58:50, 14.18it/s]

{'loss': 0.6386, 'grad_norm': 1.2643694877624512, 'learning_rate': 1.7610124758414787e-05, 'epoch': 0.24}


 12%|█▏        | 35003/288718 [36:00<3:55:47, 17.93it/s]

{'loss': 0.644, 'grad_norm': 1.2509486675262451, 'learning_rate': 1.7575488885348335e-05, 'epoch': 0.24}


 12%|█▏        | 35502/288718 [36:33<4:16:28, 16.45it/s]

{'loss': 0.6431, 'grad_norm': 1.329970359802246, 'learning_rate': 1.754085301228188e-05, 'epoch': 0.25}


 12%|█▏        | 36003/288718 [37:02<4:14:18, 16.56it/s]

{'loss': 0.6432, 'grad_norm': 1.4395846128463745, 'learning_rate': 1.750621713921543e-05, 'epoch': 0.25}


 13%|█▎        | 36503/288718 [37:32<4:09:35, 16.84it/s]

{'loss': 0.6499, 'grad_norm': 2.200521945953369, 'learning_rate': 1.747158126614898e-05, 'epoch': 0.25}


 13%|█▎        | 37003/288718 [38:03<4:04:06, 17.19it/s]

{'loss': 0.6417, 'grad_norm': 1.4431543350219727, 'learning_rate': 1.7436945393082524e-05, 'epoch': 0.26}


 13%|█▎        | 37503/288718 [38:33<3:59:41, 17.47it/s]

{'loss': 0.6384, 'grad_norm': 1.2145183086395264, 'learning_rate': 1.7402309520016073e-05, 'epoch': 0.26}


 13%|█▎        | 38001/288718 [39:03<4:39:03, 14.97it/s]

{'loss': 0.6419, 'grad_norm': 1.0521433353424072, 'learning_rate': 1.7367673646949622e-05, 'epoch': 0.26}


 13%|█▎        | 38503/288718 [39:38<4:44:30, 14.66it/s]

{'loss': 0.6372, 'grad_norm': 2.081413507461548, 'learning_rate': 1.7333037773883168e-05, 'epoch': 0.27}


 14%|█▎        | 39002/288718 [40:11<4:29:13, 15.46it/s]

{'loss': 0.6417, 'grad_norm': 1.4692659378051758, 'learning_rate': 1.7298401900816713e-05, 'epoch': 0.27}


 14%|█▎        | 39503/288718 [40:41<4:15:35, 16.25it/s]

{'loss': 0.6403, 'grad_norm': 1.360477089881897, 'learning_rate': 1.7263766027750262e-05, 'epoch': 0.27}


 14%|█▍        | 40003/288718 [41:11<4:07:25, 16.75it/s]

{'loss': 0.6457, 'grad_norm': 1.2126877307891846, 'learning_rate': 1.722913015468381e-05, 'epoch': 0.28}


 14%|█▍        | 40503/288718 [41:42<4:28:07, 15.43it/s]

{'loss': 0.639, 'grad_norm': 1.506161093711853, 'learning_rate': 1.7194494281617357e-05, 'epoch': 0.28}


 14%|█▍        | 41002/288718 [42:13<4:22:14, 15.74it/s]

{'loss': 0.643, 'grad_norm': 1.0776766538619995, 'learning_rate': 1.7159858408550906e-05, 'epoch': 0.28}


 14%|█▍        | 41502/288718 [42:44<4:22:10, 15.72it/s]

{'loss': 0.6457, 'grad_norm': 1.1392098665237427, 'learning_rate': 1.7125222535484455e-05, 'epoch': 0.29}


 15%|█▍        | 42002/288718 [43:14<3:58:54, 17.21it/s]

{'loss': 0.6403, 'grad_norm': 1.8885385990142822, 'learning_rate': 1.7090586662418e-05, 'epoch': 0.29}


 15%|█▍        | 42502/288718 [43:44<4:09:14, 16.46it/s]

{'loss': 0.6496, 'grad_norm': 0.9411448240280151, 'learning_rate': 1.705595078935155e-05, 'epoch': 0.29}


 15%|█▍        | 43001/288718 [44:14<4:27:49, 15.29it/s]

{'loss': 0.6447, 'grad_norm': 1.1453783512115479, 'learning_rate': 1.7021314916285098e-05, 'epoch': 0.3}


 15%|█▌        | 43502/288718 [44:46<4:30:30, 15.11it/s]

{'loss': 0.6434, 'grad_norm': 0.8580376505851746, 'learning_rate': 1.6986679043218644e-05, 'epoch': 0.3}


 15%|█▌        | 44001/288718 [45:16<4:13:28, 16.09it/s]

{'loss': 0.6392, 'grad_norm': 1.7307151556015015, 'learning_rate': 1.695204317015219e-05, 'epoch': 0.3}


 15%|█▌        | 44503/288718 [45:46<3:55:36, 17.28it/s]

{'loss': 0.6429, 'grad_norm': 2.8880107402801514, 'learning_rate': 1.6917407297085738e-05, 'epoch': 0.31}


 16%|█▌        | 45002/288718 [46:17<3:41:05, 18.37it/s]

{'loss': 0.6403, 'grad_norm': 1.2796909809112549, 'learning_rate': 1.6882771424019287e-05, 'epoch': 0.31}


 16%|█▌        | 45502/288718 [46:46<3:45:54, 17.94it/s]

{'loss': 0.6409, 'grad_norm': 1.2266491651535034, 'learning_rate': 1.6848135550952833e-05, 'epoch': 0.32}


 16%|█▌        | 46002/288718 [47:17<4:09:24, 16.22it/s]

{'loss': 0.6423, 'grad_norm': 1.446408748626709, 'learning_rate': 1.681349967788638e-05, 'epoch': 0.32}


 16%|█▌        | 46502/288718 [47:47<3:51:51, 17.41it/s]

{'loss': 0.6424, 'grad_norm': 0.9288383722305298, 'learning_rate': 1.677886380481993e-05, 'epoch': 0.32}


 16%|█▋        | 47003/288718 [48:17<4:00:00, 16.79it/s]

{'loss': 0.6444, 'grad_norm': 1.0560083389282227, 'learning_rate': 1.6744227931753476e-05, 'epoch': 0.33}


 16%|█▋        | 47502/288718 [48:47<3:59:28, 16.79it/s]

{'loss': 0.6385, 'grad_norm': 1.7421724796295166, 'learning_rate': 1.6709592058687025e-05, 'epoch': 0.33}


 17%|█▋        | 48002/288718 [49:19<4:16:58, 15.61it/s]

{'loss': 0.6373, 'grad_norm': 1.3636356592178345, 'learning_rate': 1.6674956185620574e-05, 'epoch': 0.33}


 17%|█▋        | 48502/288718 [49:49<3:55:35, 16.99it/s]

{'loss': 0.6353, 'grad_norm': 2.014503002166748, 'learning_rate': 1.664032031255412e-05, 'epoch': 0.34}


 17%|█▋        | 49003/288718 [50:20<3:55:53, 16.94it/s]

{'loss': 0.6446, 'grad_norm': 1.1159706115722656, 'learning_rate': 1.6605684439487668e-05, 'epoch': 0.34}


 17%|█▋        | 49503/288718 [50:53<4:55:58, 13.47it/s]

{'loss': 0.6379, 'grad_norm': 1.3195675611495972, 'learning_rate': 1.6571048566421214e-05, 'epoch': 0.34}


 17%|█▋        | 50002/288718 [51:24<4:04:40, 16.26it/s]

{'loss': 0.6394, 'grad_norm': 1.4746098518371582, 'learning_rate': 1.6536412693354763e-05, 'epoch': 0.35}


 17%|█▋        | 50502/288718 [51:55<4:03:23, 16.31it/s]

{'loss': 0.6389, 'grad_norm': 1.2746833562850952, 'learning_rate': 1.6501776820288312e-05, 'epoch': 0.35}


 18%|█▊        | 51003/288718 [52:26<3:59:18, 16.56it/s]

{'loss': 0.6402, 'grad_norm': 1.8543992042541504, 'learning_rate': 1.6467140947221857e-05, 'epoch': 0.35}


 18%|█▊        | 51503/288718 [52:58<3:54:48, 16.84it/s]

{'loss': 0.6362, 'grad_norm': 1.067962408065796, 'learning_rate': 1.6432505074155406e-05, 'epoch': 0.36}


 18%|█▊        | 52003/288718 [53:28<3:46:11, 17.44it/s]

{'loss': 0.6344, 'grad_norm': 1.876143217086792, 'learning_rate': 1.6397869201088955e-05, 'epoch': 0.36}


 18%|█▊        | 52502/288718 [53:59<3:52:38, 16.92it/s]

{'loss': 0.6424, 'grad_norm': 1.1783511638641357, 'learning_rate': 1.63632333280225e-05, 'epoch': 0.36}


 18%|█▊        | 53003/288718 [54:30<4:14:00, 15.47it/s]

{'loss': 0.6394, 'grad_norm': 2.6367077827453613, 'learning_rate': 1.6328597454956046e-05, 'epoch': 0.37}


 19%|█▊        | 53503/288718 [55:02<4:06:17, 15.92it/s]

{'loss': 0.6363, 'grad_norm': 1.5645116567611694, 'learning_rate': 1.62939615818896e-05, 'epoch': 0.37}


 19%|█▊        | 54002/288718 [55:33<4:02:54, 16.11it/s]

{'loss': 0.6403, 'grad_norm': 1.680255651473999, 'learning_rate': 1.6259325708823144e-05, 'epoch': 0.37}


 19%|█▉        | 54502/288718 [56:03<4:00:48, 16.21it/s]

{'loss': 0.6379, 'grad_norm': 1.5005509853363037, 'learning_rate': 1.622468983575669e-05, 'epoch': 0.38}


 19%|█▉        | 55002/288718 [56:34<3:59:56, 16.23it/s]

{'loss': 0.6297, 'grad_norm': 1.8965933322906494, 'learning_rate': 1.619005396269024e-05, 'epoch': 0.38}


 19%|█▉        | 55502/288718 [57:06<3:58:04, 16.33it/s]

{'loss': 0.6408, 'grad_norm': 1.2779699563980103, 'learning_rate': 1.6155418089623787e-05, 'epoch': 0.38}


 19%|█▉        | 56002/288718 [57:37<4:14:13, 15.26it/s]

{'loss': 0.6359, 'grad_norm': 1.5123342275619507, 'learning_rate': 1.6120782216557333e-05, 'epoch': 0.39}


 20%|█▉        | 56502/288718 [58:09<3:59:26, 16.16it/s]

{'loss': 0.6355, 'grad_norm': 1.520337462425232, 'learning_rate': 1.6086146343490882e-05, 'epoch': 0.39}


 20%|█▉        | 57002/288718 [58:41<4:10:39, 15.41it/s]

{'loss': 0.6374, 'grad_norm': 2.6059110164642334, 'learning_rate': 1.605151047042443e-05, 'epoch': 0.39}


 20%|█▉        | 57502/288718 [59:13<3:50:55, 16.69it/s]

{'loss': 0.6425, 'grad_norm': 2.1753616333007812, 'learning_rate': 1.6016874597357976e-05, 'epoch': 0.4}


 20%|██        | 58002/288718 [59:44<3:50:00, 16.72it/s]

{'loss': 0.6411, 'grad_norm': 1.0363812446594238, 'learning_rate': 1.5982238724291525e-05, 'epoch': 0.4}


 20%|██        | 58502/288718 [1:00:15<4:35:56, 13.91it/s]

{'loss': 0.6413, 'grad_norm': 1.0161864757537842, 'learning_rate': 1.594760285122507e-05, 'epoch': 0.41}


 20%|██        | 59002/288718 [1:00:49<4:12:21, 15.17it/s]

{'loss': 0.6358, 'grad_norm': 1.3476696014404297, 'learning_rate': 1.591296697815862e-05, 'epoch': 0.41}


 21%|██        | 59501/288718 [1:01:30<5:10:42, 12.30it/s]

{'loss': 0.6376, 'grad_norm': 1.4759869575500488, 'learning_rate': 1.587833110509217e-05, 'epoch': 0.41}


 21%|██        | 60002/288718 [1:02:12<4:04:59, 15.56it/s]

{'loss': 0.6367, 'grad_norm': 1.6404199600219727, 'learning_rate': 1.5843695232025714e-05, 'epoch': 0.42}


 21%|██        | 60501/288718 [1:02:45<4:27:00, 14.25it/s]

{'loss': 0.6452, 'grad_norm': 1.139853596687317, 'learning_rate': 1.5809059358959263e-05, 'epoch': 0.42}


 21%|██        | 61003/288718 [1:03:18<3:53:57, 16.22it/s]

{'loss': 0.6416, 'grad_norm': 1.1046255826950073, 'learning_rate': 1.577442348589281e-05, 'epoch': 0.42}


 21%|██▏       | 61501/288718 [1:03:52<4:21:45, 14.47it/s]

{'loss': 0.64, 'grad_norm': 1.1840859651565552, 'learning_rate': 1.5739787612826358e-05, 'epoch': 0.43}


 21%|██▏       | 62003/288718 [1:04:26<4:24:30, 14.29it/s]

{'loss': 0.6379, 'grad_norm': 1.2418450117111206, 'learning_rate': 1.5705151739759907e-05, 'epoch': 0.43}


 22%|██▏       | 62503/288718 [1:05:00<3:54:15, 16.09it/s]

{'loss': 0.6401, 'grad_norm': 0.9154801368713379, 'learning_rate': 1.5670515866693452e-05, 'epoch': 0.43}


 22%|██▏       | 63001/288718 [1:05:33<4:26:33, 14.11it/s]

{'loss': 0.6393, 'grad_norm': 1.0306719541549683, 'learning_rate': 1.5635879993627e-05, 'epoch': 0.44}


 22%|██▏       | 63501/288718 [1:06:08<4:38:40, 13.47it/s]

{'loss': 0.637, 'grad_norm': 1.6152194738388062, 'learning_rate': 1.5601244120560547e-05, 'epoch': 0.44}


 22%|██▏       | 64003/288718 [1:06:42<3:51:34, 16.17it/s]

{'loss': 0.6382, 'grad_norm': 1.4447630643844604, 'learning_rate': 1.5566608247494096e-05, 'epoch': 0.44}


 22%|██▏       | 64502/288718 [1:07:15<4:02:54, 15.38it/s]

{'loss': 0.6315, 'grad_norm': 1.19617760181427, 'learning_rate': 1.5531972374427645e-05, 'epoch': 0.45}


 23%|██▎       | 65002/288718 [1:07:46<3:39:42, 16.97it/s]

{'loss': 0.643, 'grad_norm': 0.9577217698097229, 'learning_rate': 1.549733650136119e-05, 'epoch': 0.45}


 23%|██▎       | 65502/288718 [1:08:19<4:00:21, 15.48it/s]

{'loss': 0.6399, 'grad_norm': 1.072265386581421, 'learning_rate': 1.546270062829474e-05, 'epoch': 0.45}


 23%|██▎       | 66002/288718 [1:08:52<4:13:26, 14.65it/s]

{'loss': 0.6366, 'grad_norm': 1.622859001159668, 'learning_rate': 1.5428064755228288e-05, 'epoch': 0.46}


 23%|██▎       | 66502/288718 [1:09:26<4:11:01, 14.75it/s]

{'loss': 0.6381, 'grad_norm': 1.7501575946807861, 'learning_rate': 1.5393428882161834e-05, 'epoch': 0.46}


 23%|██▎       | 67002/288718 [1:09:59<4:23:56, 14.00it/s]

{'loss': 0.6409, 'grad_norm': 1.982236385345459, 'learning_rate': 1.535879300909538e-05, 'epoch': 0.46}


 23%|██▎       | 67502/288718 [1:10:31<4:09:41, 14.77it/s]

{'loss': 0.6339, 'grad_norm': 1.1688748598098755, 'learning_rate': 1.532415713602893e-05, 'epoch': 0.47}


 24%|██▎       | 68002/288718 [1:11:05<4:00:48, 15.28it/s]

{'loss': 0.6307, 'grad_norm': 1.2857835292816162, 'learning_rate': 1.5289521262962477e-05, 'epoch': 0.47}


 24%|██▎       | 68502/288718 [1:11:40<4:08:30, 14.77it/s]

{'loss': 0.6347, 'grad_norm': 1.82576584815979, 'learning_rate': 1.5254885389896024e-05, 'epoch': 0.47}


 24%|██▍       | 69002/288718 [1:12:12<4:14:14, 14.40it/s]

{'loss': 0.6329, 'grad_norm': 1.4056252241134644, 'learning_rate': 1.5220249516829573e-05, 'epoch': 0.48}


 24%|██▍       | 69502/288718 [1:12:44<3:38:50, 16.70it/s]

{'loss': 0.6369, 'grad_norm': 1.147498607635498, 'learning_rate': 1.518561364376312e-05, 'epoch': 0.48}


 24%|██▍       | 70002/288718 [1:13:15<3:43:18, 16.32it/s]

{'loss': 0.6408, 'grad_norm': 1.6722489595413208, 'learning_rate': 1.5150977770696666e-05, 'epoch': 0.48}


 24%|██▍       | 70502/288718 [1:13:47<4:10:08, 14.54it/s]

{'loss': 0.6362, 'grad_norm': 1.1866730451583862, 'learning_rate': 1.5116341897630215e-05, 'epoch': 0.49}


 25%|██▍       | 71002/288718 [1:14:20<3:38:35, 16.60it/s]

{'loss': 0.6332, 'grad_norm': 1.3886042833328247, 'learning_rate': 1.5081706024563762e-05, 'epoch': 0.49}


 25%|██▍       | 71502/288718 [1:14:52<3:38:33, 16.56it/s]

{'loss': 0.6365, 'grad_norm': 1.0594940185546875, 'learning_rate': 1.504707015149731e-05, 'epoch': 0.5}


 25%|██▍       | 72002/288718 [1:15:25<4:20:57, 13.84it/s]

{'loss': 0.6377, 'grad_norm': 2.1798036098480225, 'learning_rate': 1.5012434278430858e-05, 'epoch': 0.5}


 25%|██▌       | 72502/288718 [1:15:57<3:43:34, 16.12it/s]

{'loss': 0.6381, 'grad_norm': 1.2075531482696533, 'learning_rate': 1.4977798405364405e-05, 'epoch': 0.5}


 25%|██▌       | 73002/288718 [1:16:30<3:58:40, 15.06it/s]

{'loss': 0.6361, 'grad_norm': 1.1964153051376343, 'learning_rate': 1.4943162532297953e-05, 'epoch': 0.51}


 25%|██▌       | 73502/288718 [1:17:02<4:07:53, 14.47it/s]

{'loss': 0.6416, 'grad_norm': 2.042043447494507, 'learning_rate': 1.4908526659231502e-05, 'epoch': 0.51}


 26%|██▌       | 74002/288718 [1:17:37<4:08:51, 14.38it/s]

{'loss': 0.6322, 'grad_norm': 1.7140116691589355, 'learning_rate': 1.4873890786165049e-05, 'epoch': 0.51}


 26%|██▌       | 74502/288718 [1:18:10<3:59:00, 14.94it/s]

{'loss': 0.6384, 'grad_norm': 1.8292443752288818, 'learning_rate': 1.4839254913098594e-05, 'epoch': 0.52}


 26%|██▌       | 75002/288718 [1:18:43<3:50:03, 15.48it/s]

{'loss': 0.6386, 'grad_norm': 1.847542643547058, 'learning_rate': 1.4804619040032145e-05, 'epoch': 0.52}


 26%|██▌       | 75502/288718 [1:19:15<3:37:57, 16.30it/s]

{'loss': 0.6317, 'grad_norm': 1.8551344871520996, 'learning_rate': 1.476998316696569e-05, 'epoch': 0.52}


 26%|██▋       | 76002/288718 [1:19:47<3:52:16, 15.26it/s]

{'loss': 0.6345, 'grad_norm': 1.4331083297729492, 'learning_rate': 1.4735347293899238e-05, 'epoch': 0.53}


 26%|██▋       | 76502/288718 [1:20:20<3:32:15, 16.66it/s]

{'loss': 0.6409, 'grad_norm': 2.0089900493621826, 'learning_rate': 1.4700711420832787e-05, 'epoch': 0.53}


 27%|██▋       | 77002/288718 [1:20:55<4:04:19, 14.44it/s]

{'loss': 0.6327, 'grad_norm': 2.253472089767456, 'learning_rate': 1.4666075547766334e-05, 'epoch': 0.53}


 27%|██▋       | 77500/288718 [1:21:27<4:29:28, 13.06it/s]

{'loss': 0.6325, 'grad_norm': 2.7053072452545166, 'learning_rate': 1.4631439674699881e-05, 'epoch': 0.54}


 27%|██▋       | 78002/288718 [1:22:01<4:00:53, 14.58it/s]

{'loss': 0.6362, 'grad_norm': 1.1851955652236938, 'learning_rate': 1.4596803801633428e-05, 'epoch': 0.54}


 27%|██▋       | 78502/288718 [1:22:35<4:02:06, 14.47it/s]

{'loss': 0.6339, 'grad_norm': 1.1692800521850586, 'learning_rate': 1.4562167928566977e-05, 'epoch': 0.54}


 27%|██▋       | 79002/288718 [1:23:09<4:23:33, 13.26it/s]

{'loss': 0.6336, 'grad_norm': 1.7787449359893799, 'learning_rate': 1.4527532055500525e-05, 'epoch': 0.55}


 28%|██▊       | 79502/288718 [1:23:44<3:47:42, 15.31it/s]

{'loss': 0.6383, 'grad_norm': 1.226496934890747, 'learning_rate': 1.449289618243407e-05, 'epoch': 0.55}


 28%|██▊       | 80002/288718 [1:24:18<3:38:07, 15.95it/s]

{'loss': 0.6342, 'grad_norm': 2.4795827865600586, 'learning_rate': 1.4458260309367619e-05, 'epoch': 0.55}


 28%|██▊       | 80502/288718 [1:24:52<3:55:26, 14.74it/s]

{'loss': 0.6349, 'grad_norm': 1.3133329153060913, 'learning_rate': 1.4423624436301166e-05, 'epoch': 0.56}


 28%|██▊       | 81002/288718 [1:25:26<3:47:52, 15.19it/s]

{'loss': 0.6319, 'grad_norm': 1.4345029592514038, 'learning_rate': 1.4388988563234714e-05, 'epoch': 0.56}


 28%|██▊       | 81502/288718 [1:26:00<3:54:15, 14.74it/s]

{'loss': 0.6334, 'grad_norm': 1.6250488758087158, 'learning_rate': 1.4354352690168263e-05, 'epoch': 0.56}


 28%|██▊       | 82002/288718 [1:26:35<4:26:49, 12.91it/s]

{'loss': 0.6372, 'grad_norm': 1.57686448097229, 'learning_rate': 1.431971681710181e-05, 'epoch': 0.57}


 29%|██▊       | 82502/288718 [1:27:09<3:45:45, 15.22it/s]

{'loss': 0.6329, 'grad_norm': 1.2148395776748657, 'learning_rate': 1.4285080944035357e-05, 'epoch': 0.57}


 29%|██▊       | 83002/288718 [1:27:44<4:26:05, 12.89it/s]

{'loss': 0.6295, 'grad_norm': 2.590306282043457, 'learning_rate': 1.4250445070968906e-05, 'epoch': 0.57}


 29%|██▉       | 83502/288718 [1:28:19<3:59:09, 14.30it/s]

{'loss': 0.6346, 'grad_norm': 1.2150555849075317, 'learning_rate': 1.4215809197902453e-05, 'epoch': 0.58}


 29%|██▉       | 84002/288718 [1:28:55<4:10:37, 13.61it/s]

{'loss': 0.6376, 'grad_norm': 1.134360909461975, 'learning_rate': 1.4181173324835999e-05, 'epoch': 0.58}


 29%|██▉       | 84502/288718 [1:29:29<3:49:55, 14.80it/s]

{'loss': 0.6357, 'grad_norm': 1.970165729522705, 'learning_rate': 1.414653745176955e-05, 'epoch': 0.59}


 29%|██▉       | 85002/288718 [1:30:04<3:54:23, 14.49it/s]

{'loss': 0.6291, 'grad_norm': 1.095201015472412, 'learning_rate': 1.4111901578703095e-05, 'epoch': 0.59}


 30%|██▉       | 85502/288718 [1:30:38<3:43:54, 15.13it/s]

{'loss': 0.6412, 'grad_norm': 1.5469951629638672, 'learning_rate': 1.4077265705636642e-05, 'epoch': 0.59}


 30%|██▉       | 86002/288718 [1:31:14<4:01:15, 14.00it/s]

{'loss': 0.6388, 'grad_norm': 1.2787573337554932, 'learning_rate': 1.4042629832570191e-05, 'epoch': 0.6}


 30%|██▉       | 86502/288718 [1:31:49<4:05:32, 13.73it/s]

{'loss': 0.6328, 'grad_norm': 1.344594955444336, 'learning_rate': 1.4007993959503738e-05, 'epoch': 0.6}


 30%|███       | 87002/288718 [1:32:23<4:11:47, 13.35it/s]

{'loss': 0.6377, 'grad_norm': 1.2539266347885132, 'learning_rate': 1.3973358086437286e-05, 'epoch': 0.6}


 30%|███       | 87502/288718 [1:32:58<4:11:31, 13.33it/s]

{'loss': 0.638, 'grad_norm': 1.6640286445617676, 'learning_rate': 1.3938722213370834e-05, 'epoch': 0.61}


 30%|███       | 88002/288718 [1:33:34<3:44:55, 14.87it/s]

{'loss': 0.633, 'grad_norm': 2.3243629932403564, 'learning_rate': 1.3904086340304382e-05, 'epoch': 0.61}


 31%|███       | 88502/288718 [1:34:10<4:07:42, 13.47it/s]

{'loss': 0.6398, 'grad_norm': 1.3746211528778076, 'learning_rate': 1.3869450467237929e-05, 'epoch': 0.61}


 31%|███       | 89002/288718 [1:34:46<4:03:26, 13.67it/s]

{'loss': 0.6306, 'grad_norm': 1.5351203680038452, 'learning_rate': 1.3834814594171478e-05, 'epoch': 0.62}


 31%|███       | 89500/288718 [1:35:22<3:58:42, 13.91it/s]

{'loss': 0.6413, 'grad_norm': 1.822982668876648, 'learning_rate': 1.3800178721105023e-05, 'epoch': 0.62}


 31%|███       | 90002/288718 [1:35:57<3:57:21, 13.95it/s]

{'loss': 0.6351, 'grad_norm': 1.762683629989624, 'learning_rate': 1.376554284803857e-05, 'epoch': 0.62}


 31%|███▏      | 90502/288718 [1:36:34<4:04:34, 13.51it/s]

{'loss': 0.6369, 'grad_norm': 1.611903429031372, 'learning_rate': 1.373090697497212e-05, 'epoch': 0.63}


 32%|███▏      | 91002/288718 [1:37:09<4:03:00, 13.56it/s]

{'loss': 0.6414, 'grad_norm': 1.217590570449829, 'learning_rate': 1.3696271101905667e-05, 'epoch': 0.63}


 32%|███▏      | 91502/288718 [1:37:46<3:53:11, 14.10it/s]

{'loss': 0.6328, 'grad_norm': 1.6526086330413818, 'learning_rate': 1.3661635228839214e-05, 'epoch': 0.63}


 32%|███▏      | 92002/288718 [1:38:23<3:38:45, 14.99it/s]

{'loss': 0.6406, 'grad_norm': 0.9944590330123901, 'learning_rate': 1.3626999355772763e-05, 'epoch': 0.64}


 32%|███▏      | 92502/288718 [1:38:59<4:05:26, 13.32it/s]

{'loss': 0.6393, 'grad_norm': 1.6981265544891357, 'learning_rate': 1.359236348270631e-05, 'epoch': 0.64}


 32%|███▏      | 93002/288718 [1:39:35<3:41:06, 14.75it/s]

{'loss': 0.6359, 'grad_norm': 1.8452520370483398, 'learning_rate': 1.3557727609639857e-05, 'epoch': 0.64}


 32%|███▏      | 93502/288718 [1:40:11<3:45:34, 14.42it/s]

{'loss': 0.6291, 'grad_norm': 1.406334400177002, 'learning_rate': 1.3523091736573406e-05, 'epoch': 0.65}


 33%|███▎      | 94002/288718 [1:40:48<4:15:54, 12.68it/s]

{'loss': 0.633, 'grad_norm': 1.2509442567825317, 'learning_rate': 1.3488455863506954e-05, 'epoch': 0.65}


 33%|███▎      | 94502/288718 [1:41:24<4:08:02, 13.05it/s]

{'loss': 0.6395, 'grad_norm': 1.411275863647461, 'learning_rate': 1.34538199904405e-05, 'epoch': 0.65}


 33%|███▎      | 95002/288718 [1:42:01<3:41:51, 14.55it/s]

{'loss': 0.6324, 'grad_norm': 1.3570746183395386, 'learning_rate': 1.3419184117374046e-05, 'epoch': 0.66}


 33%|███▎      | 95502/288718 [1:42:37<3:41:00, 14.57it/s]

{'loss': 0.6387, 'grad_norm': 1.40647292137146, 'learning_rate': 1.3384548244307595e-05, 'epoch': 0.66}


 33%|███▎      | 96002/288718 [1:43:13<4:11:37, 12.76it/s]

{'loss': 0.6376, 'grad_norm': 2.5849475860595703, 'learning_rate': 1.3349912371241143e-05, 'epoch': 0.67}


 33%|███▎      | 96502/288718 [1:43:50<4:01:05, 13.29it/s]

{'loss': 0.638, 'grad_norm': 1.5861111879348755, 'learning_rate': 1.331527649817469e-05, 'epoch': 0.67}


 34%|███▎      | 97002/288718 [1:44:27<4:08:28, 12.86it/s]

{'loss': 0.6341, 'grad_norm': 1.3757213354110718, 'learning_rate': 1.3280640625108239e-05, 'epoch': 0.67}


 34%|███▍      | 97502/288718 [1:45:03<3:54:36, 13.58it/s]

{'loss': 0.6322, 'grad_norm': 2.101212501525879, 'learning_rate': 1.3246004752041786e-05, 'epoch': 0.68}


 34%|███▍      | 98002/288718 [1:45:40<3:48:35, 13.91it/s]

{'loss': 0.636, 'grad_norm': 1.7227330207824707, 'learning_rate': 1.3211368878975333e-05, 'epoch': 0.68}


 34%|███▍      | 98502/288718 [1:46:17<3:56:51, 13.38it/s]

{'loss': 0.638, 'grad_norm': 1.0431995391845703, 'learning_rate': 1.3176733005908882e-05, 'epoch': 0.68}


 34%|███▍      | 99002/288718 [1:46:54<3:29:19, 15.11it/s]

{'loss': 0.6359, 'grad_norm': 1.161728858947754, 'learning_rate': 1.3142097132842428e-05, 'epoch': 0.69}


 34%|███▍      | 99502/288718 [1:47:31<3:48:11, 13.82it/s]

{'loss': 0.6316, 'grad_norm': 1.8913991451263428, 'learning_rate': 1.3107461259775975e-05, 'epoch': 0.69}


 35%|███▍      | 100002/288718 [1:48:07<3:37:31, 14.46it/s]

{'loss': 0.6379, 'grad_norm': 1.042717695236206, 'learning_rate': 1.3072825386709524e-05, 'epoch': 0.69}


 35%|███▍      | 100502/288718 [1:48:44<3:37:22, 14.43it/s]

{'loss': 0.6374, 'grad_norm': 1.7742019891738892, 'learning_rate': 1.3038189513643071e-05, 'epoch': 0.7}


 35%|███▍      | 101002/288718 [1:49:22<3:46:34, 13.81it/s]

{'loss': 0.6363, 'grad_norm': 1.3375078439712524, 'learning_rate': 1.3003553640576618e-05, 'epoch': 0.7}


 35%|███▌      | 101502/288718 [1:50:00<3:36:47, 14.39it/s]

{'loss': 0.6363, 'grad_norm': 1.374144196510315, 'learning_rate': 1.2968917767510167e-05, 'epoch': 0.7}


 35%|███▌      | 102002/288718 [1:50:37<3:42:56, 13.96it/s]

{'loss': 0.6274, 'grad_norm': 1.5824635028839111, 'learning_rate': 1.2934281894443715e-05, 'epoch': 0.71}


 36%|███▌      | 102502/288718 [1:51:14<3:52:26, 13.35it/s]

{'loss': 0.6329, 'grad_norm': 1.7050018310546875, 'learning_rate': 1.2899646021377262e-05, 'epoch': 0.71}


 36%|███▌      | 103002/288718 [1:51:52<3:55:00, 13.17it/s]

{'loss': 0.6408, 'grad_norm': 1.5400158166885376, 'learning_rate': 1.286501014831081e-05, 'epoch': 0.71}


 36%|███▌      | 103502/288718 [1:52:29<3:42:41, 13.86it/s]

{'loss': 0.6324, 'grad_norm': 0.9834383726119995, 'learning_rate': 1.2830374275244358e-05, 'epoch': 0.72}


 36%|███▌      | 104002/288718 [1:53:07<3:52:44, 13.23it/s]

{'loss': 0.6277, 'grad_norm': 1.3065807819366455, 'learning_rate': 1.2795738402177904e-05, 'epoch': 0.72}


 36%|███▌      | 104502/288718 [1:53:44<3:43:35, 13.73it/s]

{'loss': 0.6357, 'grad_norm': 1.7108042240142822, 'learning_rate': 1.2761102529111452e-05, 'epoch': 0.72}


 36%|███▋      | 105001/288718 [1:54:25<4:07:10, 12.39it/s]

{'loss': 0.6323, 'grad_norm': 1.561110258102417, 'learning_rate': 1.2726466656045e-05, 'epoch': 0.73}


 37%|███▋      | 105503/288718 [1:55:04<3:35:58, 14.14it/s]

{'loss': 0.6313, 'grad_norm': 1.428123116493225, 'learning_rate': 1.2691830782978547e-05, 'epoch': 0.73}


 37%|███▋      | 106001/288718 [1:55:42<3:26:13, 14.77it/s]

{'loss': 0.6433, 'grad_norm': 1.1342989206314087, 'learning_rate': 1.2657194909912096e-05, 'epoch': 0.73}


 37%|███▋      | 106503/288718 [1:56:19<3:29:39, 14.49it/s]

{'loss': 0.6409, 'grad_norm': 1.2966407537460327, 'learning_rate': 1.2622559036845643e-05, 'epoch': 0.74}


 37%|███▋      | 107001/288718 [1:56:56<3:43:02, 13.58it/s]

{'loss': 0.6325, 'grad_norm': 1.718260645866394, 'learning_rate': 1.258792316377919e-05, 'epoch': 0.74}


 37%|███▋      | 107501/288718 [1:57:33<3:39:49, 13.74it/s]

{'loss': 0.6349, 'grad_norm': 1.4736484289169312, 'learning_rate': 1.255328729071274e-05, 'epoch': 0.74}


 37%|███▋      | 108001/288718 [1:58:10<3:47:53, 13.22it/s]

{'loss': 0.6316, 'grad_norm': 1.8465338945388794, 'learning_rate': 1.2518651417646287e-05, 'epoch': 0.75}


 38%|███▊      | 108501/288718 [1:58:48<3:39:52, 13.66it/s]

{'loss': 0.6349, 'grad_norm': 1.589013934135437, 'learning_rate': 1.2484015544579832e-05, 'epoch': 0.75}


 38%|███▊      | 109003/288718 [1:59:27<3:34:03, 13.99it/s]

{'loss': 0.636, 'grad_norm': 1.3965545892715454, 'learning_rate': 1.2449379671513383e-05, 'epoch': 0.76}


 38%|███▊      | 109501/288718 [2:00:04<3:42:50, 13.40it/s]

{'loss': 0.6348, 'grad_norm': 1.9364622831344604, 'learning_rate': 1.2414743798446928e-05, 'epoch': 0.76}


 38%|███▊      | 110001/288718 [2:00:41<3:41:38, 13.44it/s]

{'loss': 0.633, 'grad_norm': 1.320421814918518, 'learning_rate': 1.2380107925380475e-05, 'epoch': 0.76}


 38%|███▊      | 110501/288718 [2:01:20<3:49:09, 12.96it/s]

{'loss': 0.633, 'grad_norm': 1.1498677730560303, 'learning_rate': 1.2345472052314024e-05, 'epoch': 0.77}


 38%|███▊      | 111003/288718 [2:01:58<3:27:54, 14.25it/s]

{'loss': 0.6354, 'grad_norm': 1.0682791471481323, 'learning_rate': 1.2310836179247572e-05, 'epoch': 0.77}


 39%|███▊      | 111501/288718 [2:02:36<3:33:09, 13.86it/s]

{'loss': 0.6331, 'grad_norm': 1.4113414287567139, 'learning_rate': 1.2276200306181119e-05, 'epoch': 0.77}


 39%|███▉      | 112001/288718 [2:03:15<3:45:02, 13.09it/s]

{'loss': 0.6363, 'grad_norm': 2.5407590866088867, 'learning_rate': 1.2241564433114666e-05, 'epoch': 0.78}


 39%|███▉      | 112501/288718 [2:03:52<4:04:03, 12.03it/s]

{'loss': 0.6352, 'grad_norm': 1.5219162702560425, 'learning_rate': 1.2206928560048215e-05, 'epoch': 0.78}


 39%|███▉      | 113001/288718 [2:04:30<4:10:28, 11.69it/s]

{'loss': 0.6329, 'grad_norm': 1.65774667263031, 'learning_rate': 1.2172292686981762e-05, 'epoch': 0.78}


 39%|███▉      | 113501/288718 [2:05:09<3:44:04, 13.03it/s]

{'loss': 0.6344, 'grad_norm': 0.9380236864089966, 'learning_rate': 1.2137656813915308e-05, 'epoch': 0.79}


 39%|███▉      | 114001/288718 [2:05:48<3:50:21, 12.64it/s]

{'loss': 0.6393, 'grad_norm': 1.2459605932235718, 'learning_rate': 1.2103020940848857e-05, 'epoch': 0.79}


 40%|███▉      | 114503/288718 [2:06:26<3:24:01, 14.23it/s]

{'loss': 0.6307, 'grad_norm': 1.5856915712356567, 'learning_rate': 1.2068385067782404e-05, 'epoch': 0.79}


 40%|███▉      | 115001/288718 [2:07:03<3:29:11, 13.84it/s]

{'loss': 0.6351, 'grad_norm': 1.4710414409637451, 'learning_rate': 1.2033749194715951e-05, 'epoch': 0.8}


 40%|████      | 115501/288718 [2:07:41<3:36:37, 13.33it/s]

{'loss': 0.635, 'grad_norm': 1.7200433015823364, 'learning_rate': 1.19991133216495e-05, 'epoch': 0.8}


 40%|████      | 116001/288718 [2:08:19<3:44:59, 12.79it/s]

{'loss': 0.631, 'grad_norm': 1.2700024843215942, 'learning_rate': 1.1964477448583047e-05, 'epoch': 0.8}


 40%|████      | 116501/288718 [2:08:57<3:26:34, 13.89it/s]

{'loss': 0.6371, 'grad_norm': 1.7278189659118652, 'learning_rate': 1.1929841575516595e-05, 'epoch': 0.81}


 41%|████      | 117001/288718 [2:09:35<4:26:29, 10.74it/s]

{'loss': 0.6268, 'grad_norm': 2.6937780380249023, 'learning_rate': 1.1895205702450144e-05, 'epoch': 0.81}


 41%|████      | 117503/288718 [2:10:13<3:19:03, 14.34it/s]

{'loss': 0.6335, 'grad_norm': 1.1399250030517578, 'learning_rate': 1.186056982938369e-05, 'epoch': 0.81}


 41%|████      | 118001/288718 [2:10:52<3:41:41, 12.83it/s]

{'loss': 0.6364, 'grad_norm': 1.7024005651474, 'learning_rate': 1.1825933956317236e-05, 'epoch': 0.82}


 41%|████      | 118501/288718 [2:11:30<3:37:24, 13.05it/s]

{'loss': 0.6302, 'grad_norm': 1.3829138278961182, 'learning_rate': 1.1791298083250787e-05, 'epoch': 0.82}


 41%|████      | 119001/288718 [2:12:08<3:38:40, 12.94it/s]

{'loss': 0.6412, 'grad_norm': 2.493577241897583, 'learning_rate': 1.1756662210184333e-05, 'epoch': 0.82}


 41%|████▏     | 119503/288718 [2:12:46<3:25:47, 13.70it/s]

{'loss': 0.6355, 'grad_norm': 1.2293496131896973, 'learning_rate': 1.172202633711788e-05, 'epoch': 0.83}


 42%|████▏     | 120001/288718 [2:13:25<3:29:53, 13.40it/s]

{'loss': 0.6303, 'grad_norm': 1.6335886716842651, 'learning_rate': 1.1687390464051429e-05, 'epoch': 0.83}


 42%|████▏     | 120501/288718 [2:14:02<3:36:01, 12.98it/s]

{'loss': 0.6393, 'grad_norm': 1.2848948240280151, 'learning_rate': 1.1652754590984976e-05, 'epoch': 0.83}


 42%|████▏     | 121001/288718 [2:14:41<3:30:29, 13.28it/s]

{'loss': 0.6296, 'grad_norm': 2.0684077739715576, 'learning_rate': 1.1618118717918523e-05, 'epoch': 0.84}


 42%|████▏     | 121501/288718 [2:15:19<3:22:11, 13.78it/s]

{'loss': 0.6288, 'grad_norm': 1.952593445777893, 'learning_rate': 1.1583482844852072e-05, 'epoch': 0.84}


 42%|████▏     | 122001/288718 [2:15:56<3:33:45, 13.00it/s]

{'loss': 0.6289, 'grad_norm': 1.145564079284668, 'learning_rate': 1.154884697178562e-05, 'epoch': 0.85}


 42%|████▏     | 122501/288718 [2:16:36<4:01:17, 11.48it/s]

{'loss': 0.6374, 'grad_norm': 1.1498342752456665, 'learning_rate': 1.1514211098719167e-05, 'epoch': 0.85}


 43%|████▎     | 123001/288718 [2:17:21<3:53:43, 11.82it/s]

{'loss': 0.6369, 'grad_norm': 1.809607982635498, 'learning_rate': 1.1479575225652716e-05, 'epoch': 0.85}


 43%|████▎     | 123501/288718 [2:18:02<3:44:20, 12.27it/s]

{'loss': 0.6371, 'grad_norm': 1.644562005996704, 'learning_rate': 1.1444939352586261e-05, 'epoch': 0.86}


 43%|████▎     | 124001/288718 [2:18:40<3:26:08, 13.32it/s]

{'loss': 0.6356, 'grad_norm': 1.06939697265625, 'learning_rate': 1.1410303479519808e-05, 'epoch': 0.86}


 43%|████▎     | 124501/288718 [2:19:19<3:13:08, 14.17it/s]

{'loss': 0.6365, 'grad_norm': 1.0512239933013916, 'learning_rate': 1.1375667606453357e-05, 'epoch': 0.86}


 43%|████▎     | 125001/288718 [2:19:57<3:45:55, 12.08it/s]

{'loss': 0.632, 'grad_norm': 1.1220707893371582, 'learning_rate': 1.1341031733386904e-05, 'epoch': 0.87}


 43%|████▎     | 125501/288718 [2:20:36<3:24:29, 13.30it/s]

{'loss': 0.6359, 'grad_norm': 1.335787057876587, 'learning_rate': 1.1306395860320452e-05, 'epoch': 0.87}


 44%|████▎     | 126003/288718 [2:21:13<3:05:51, 14.59it/s]

{'loss': 0.6354, 'grad_norm': 1.8943042755126953, 'learning_rate': 1.1271759987254e-05, 'epoch': 0.87}


 44%|████▍     | 126503/288718 [2:21:51<3:08:37, 14.33it/s]

{'loss': 0.6291, 'grad_norm': 1.628963828086853, 'learning_rate': 1.1237124114187548e-05, 'epoch': 0.88}


 44%|████▍     | 127001/288718 [2:22:28<3:16:24, 13.72it/s]

{'loss': 0.6342, 'grad_norm': 1.4608887434005737, 'learning_rate': 1.1202488241121095e-05, 'epoch': 0.88}


 44%|████▍     | 127503/288718 [2:23:05<3:15:20, 13.76it/s]

{'loss': 0.6373, 'grad_norm': 1.412489652633667, 'learning_rate': 1.1167852368054644e-05, 'epoch': 0.88}


 44%|████▍     | 128002/288718 [2:23:46<4:08:31, 10.78it/s]

{'loss': 0.6321, 'grad_norm': 1.465978741645813, 'learning_rate': 1.1133216494988191e-05, 'epoch': 0.89}


 45%|████▍     | 128502/288718 [2:24:27<3:12:44, 13.85it/s]

{'loss': 0.6328, 'grad_norm': 1.1567858457565308, 'learning_rate': 1.1098580621921737e-05, 'epoch': 0.89}


 45%|████▍     | 129002/288718 [2:25:04<3:14:31, 13.68it/s]

{'loss': 0.6395, 'grad_norm': 1.4591772556304932, 'learning_rate': 1.1063944748855284e-05, 'epoch': 0.89}


 45%|████▍     | 129502/288718 [2:25:41<3:17:00, 13.47it/s]

{'loss': 0.6355, 'grad_norm': 1.5712677240371704, 'learning_rate': 1.1029308875788833e-05, 'epoch': 0.9}


 45%|████▌     | 130002/288718 [2:26:19<3:10:42, 13.87it/s]

{'loss': 0.6297, 'grad_norm': 1.7367825508117676, 'learning_rate': 1.099467300272238e-05, 'epoch': 0.9}


 45%|████▌     | 130502/288718 [2:26:57<3:13:41, 13.61it/s]

{'loss': 0.6296, 'grad_norm': 1.4606415033340454, 'learning_rate': 1.0960037129655927e-05, 'epoch': 0.9}


 45%|████▌     | 131002/288718 [2:27:36<3:30:03, 12.51it/s]

{'loss': 0.6344, 'grad_norm': 1.5179991722106934, 'learning_rate': 1.0925401256589476e-05, 'epoch': 0.91}


 46%|████▌     | 131500/288718 [2:28:14<3:04:57, 14.17it/s]

{'loss': 0.6338, 'grad_norm': 1.8706178665161133, 'learning_rate': 1.0890765383523024e-05, 'epoch': 0.91}


 46%|████▌     | 132002/288718 [2:28:52<3:08:12, 13.88it/s]

{'loss': 0.6309, 'grad_norm': 1.685889482498169, 'learning_rate': 1.085612951045657e-05, 'epoch': 0.91}


 46%|████▌     | 132501/288718 [2:29:36<3:21:41, 12.91it/s]

{'loss': 0.6289, 'grad_norm': 1.1457470655441284, 'learning_rate': 1.082149363739012e-05, 'epoch': 0.92}


 46%|████▌     | 133001/288718 [2:30:14<3:23:58, 12.72it/s]

{'loss': 0.6329, 'grad_norm': 1.740790605545044, 'learning_rate': 1.0786857764323665e-05, 'epoch': 0.92}


 46%|████▌     | 133501/288718 [2:30:51<3:19:10, 12.99it/s]

{'loss': 0.6252, 'grad_norm': 2.139277458190918, 'learning_rate': 1.0752221891257213e-05, 'epoch': 0.92}


 46%|████▋     | 134001/288718 [2:31:30<3:35:45, 11.95it/s]

{'loss': 0.629, 'grad_norm': 1.6710575819015503, 'learning_rate': 1.0717586018190762e-05, 'epoch': 0.93}


 47%|████▋     | 134501/288718 [2:32:08<3:27:05, 12.41it/s]

{'loss': 0.6304, 'grad_norm': 2.4962527751922607, 'learning_rate': 1.0682950145124309e-05, 'epoch': 0.93}


 47%|████▋     | 135001/288718 [2:32:46<3:14:07, 13.20it/s]

{'loss': 0.6315, 'grad_norm': 1.2017455101013184, 'learning_rate': 1.0648314272057856e-05, 'epoch': 0.94}


 47%|████▋     | 135501/288718 [2:33:26<3:01:53, 14.04it/s]

{'loss': 0.633, 'grad_norm': 1.3523764610290527, 'learning_rate': 1.0613678398991405e-05, 'epoch': 0.94}


 47%|████▋     | 136001/288718 [2:34:02<2:59:42, 14.16it/s]

{'loss': 0.6332, 'grad_norm': 1.7882250547409058, 'learning_rate': 1.0579042525924952e-05, 'epoch': 0.94}


 47%|████▋     | 136501/288718 [2:34:37<2:57:07, 14.32it/s]

{'loss': 0.6368, 'grad_norm': 1.6567400693893433, 'learning_rate': 1.05444066528585e-05, 'epoch': 0.95}


 47%|████▋     | 137003/288718 [2:35:11<2:44:27, 15.37it/s]

{'loss': 0.6323, 'grad_norm': 1.2378432750701904, 'learning_rate': 1.0509770779792048e-05, 'epoch': 0.95}


 48%|████▊     | 137501/288718 [2:35:46<2:51:27, 14.70it/s]

{'loss': 0.6295, 'grad_norm': 1.3227580785751343, 'learning_rate': 1.0475134906725594e-05, 'epoch': 0.95}


 48%|████▊     | 138001/288718 [2:36:21<2:47:44, 14.98it/s]

{'loss': 0.6404, 'grad_norm': 1.4938163757324219, 'learning_rate': 1.0440499033659141e-05, 'epoch': 0.96}


 48%|████▊     | 138503/288718 [2:36:56<2:45:33, 15.12it/s]

{'loss': 0.6313, 'grad_norm': 1.6151988506317139, 'learning_rate': 1.040586316059269e-05, 'epoch': 0.96}


 48%|████▊     | 139001/288718 [2:37:32<3:01:13, 13.77it/s]

{'loss': 0.6263, 'grad_norm': 1.6532645225524902, 'learning_rate': 1.0371227287526237e-05, 'epoch': 0.96}


 48%|████▊     | 139503/288718 [2:38:07<2:44:24, 15.13it/s]

{'loss': 0.6361, 'grad_norm': 1.1496176719665527, 'learning_rate': 1.0336591414459785e-05, 'epoch': 0.97}


 48%|████▊     | 140003/288718 [2:38:43<2:57:37, 13.95it/s]

{'loss': 0.6278, 'grad_norm': 2.2331788539886475, 'learning_rate': 1.0301955541393333e-05, 'epoch': 0.97}


 49%|████▊     | 140501/288718 [2:39:17<2:43:25, 15.12it/s]

{'loss': 0.6355, 'grad_norm': 2.5677008628845215, 'learning_rate': 1.026731966832688e-05, 'epoch': 0.97}


 49%|████▉     | 141003/288718 [2:39:52<2:43:46, 15.03it/s]

{'loss': 0.6343, 'grad_norm': 2.0814273357391357, 'learning_rate': 1.0232683795260428e-05, 'epoch': 0.98}


 49%|████▉     | 141503/288718 [2:40:26<2:38:16, 15.50it/s]

{'loss': 0.6307, 'grad_norm': 1.7716822624206543, 'learning_rate': 1.0198047922193977e-05, 'epoch': 0.98}


 49%|████▉     | 142001/288718 [2:41:02<2:45:17, 14.79it/s]

{'loss': 0.6351, 'grad_norm': 1.655408263206482, 'learning_rate': 1.0163412049127524e-05, 'epoch': 0.98}


 49%|████▉     | 142501/288718 [2:41:37<3:07:52, 12.97it/s]

{'loss': 0.63, 'grad_norm': 1.3135713338851929, 'learning_rate': 1.012877617606107e-05, 'epoch': 0.99}


 50%|████▉     | 143003/288718 [2:42:12<2:48:30, 14.41it/s]

{'loss': 0.636, 'grad_norm': 2.5263965129852295, 'learning_rate': 1.0094140302994619e-05, 'epoch': 0.99}


 50%|████▉     | 143501/288718 [2:42:49<2:50:24, 14.20it/s]

{'loss': 0.6296, 'grad_norm': 1.350814700126648, 'learning_rate': 1.0059504429928166e-05, 'epoch': 0.99}


 50%|████▉     | 144003/288718 [2:43:24<2:41:19, 14.95it/s]

{'loss': 0.6345, 'grad_norm': 1.4646937847137451, 'learning_rate': 1.0024868556861713e-05, 'epoch': 1.0}


                                                            
 50%|█████     | 144359/288718 [3:14:00<12:20:53,  3.25it/s]

{'eval_loss': 0.6313793659210205, 'eval_accuracy': 0.6279486625469946, 'eval_runtime': 1809.371, 'eval_samples_per_second': 319.738, 'eval_steps_per_second': 19.984, 'epoch': 1.0}


 50%|█████     | 144502/288718 [3:14:08<2:00:17, 19.98it/s]     

{'loss': 0.6311, 'grad_norm': 2.168114423751831, 'learning_rate': 9.990232683795262e-06, 'epoch': 1.0}


 50%|█████     | 145002/288718 [3:14:46<3:14:25, 12.32it/s]

{'loss': 0.6258, 'grad_norm': 1.573665976524353, 'learning_rate': 9.95559681072881e-06, 'epoch': 1.0}


 50%|█████     | 145502/288718 [3:15:24<2:41:24, 14.79it/s]

{'loss': 0.6292, 'grad_norm': 2.847656011581421, 'learning_rate': 9.920960937662357e-06, 'epoch': 1.01}


 51%|█████     | 146002/288718 [3:15:57<2:43:35, 14.54it/s]

{'loss': 0.6256, 'grad_norm': 1.6819252967834473, 'learning_rate': 9.886325064595904e-06, 'epoch': 1.01}


 51%|█████     | 146502/288718 [3:16:30<2:32:49, 15.51it/s]

{'loss': 0.633, 'grad_norm': 1.238070011138916, 'learning_rate': 9.851689191529453e-06, 'epoch': 1.01}


 51%|█████     | 147002/288718 [3:17:04<2:53:34, 13.61it/s]

{'loss': 0.6292, 'grad_norm': 1.8385041952133179, 'learning_rate': 9.817053318462998e-06, 'epoch': 1.02}


 51%|█████     | 147502/288718 [3:17:38<2:51:16, 13.74it/s]

{'loss': 0.6327, 'grad_norm': 2.1315624713897705, 'learning_rate': 9.782417445396547e-06, 'epoch': 1.02}


 51%|█████▏    | 148002/288718 [3:18:12<2:50:58, 13.72it/s]

{'loss': 0.6314, 'grad_norm': 1.5895507335662842, 'learning_rate': 9.747781572330094e-06, 'epoch': 1.03}


 51%|█████▏    | 148502/288718 [3:18:47<2:37:24, 14.85it/s]

{'loss': 0.6283, 'grad_norm': 1.3746018409729004, 'learning_rate': 9.713145699263642e-06, 'epoch': 1.03}


 52%|█████▏    | 149002/288718 [3:19:21<2:37:30, 14.78it/s]

{'loss': 0.6326, 'grad_norm': 1.3250705003738403, 'learning_rate': 9.67850982619719e-06, 'epoch': 1.03}


 52%|█████▏    | 149502/288718 [3:19:56<2:28:00, 15.68it/s]

{'loss': 0.6322, 'grad_norm': 2.433617353439331, 'learning_rate': 9.643873953130736e-06, 'epoch': 1.04}


 52%|█████▏    | 150002/288718 [3:20:30<2:35:11, 14.90it/s]

{'loss': 0.6278, 'grad_norm': 1.7350363731384277, 'learning_rate': 9.609238080064285e-06, 'epoch': 1.04}


 52%|█████▏    | 150502/288718 [3:21:05<2:32:47, 15.08it/s]

{'loss': 0.6344, 'grad_norm': 1.444618821144104, 'learning_rate': 9.574602206997832e-06, 'epoch': 1.04}


 52%|█████▏    | 151002/288718 [3:21:39<2:36:50, 14.63it/s]

{'loss': 0.6231, 'grad_norm': 1.808124303817749, 'learning_rate': 9.53996633393138e-06, 'epoch': 1.05}


 52%|█████▏    | 151502/288718 [3:22:14<2:42:08, 14.10it/s]

{'loss': 0.6248, 'grad_norm': 1.2878659963607788, 'learning_rate': 9.505330460864928e-06, 'epoch': 1.05}


 53%|█████▎    | 152001/288718 [3:22:50<4:21:29,  8.71it/s]

{'loss': 0.6315, 'grad_norm': 2.047348976135254, 'learning_rate': 9.470694587798476e-06, 'epoch': 1.05}


 53%|█████▎    | 152503/288718 [3:23:27<2:23:42, 15.80it/s]

{'loss': 0.6238, 'grad_norm': 1.410123586654663, 'learning_rate': 9.436058714732023e-06, 'epoch': 1.06}


 53%|█████▎    | 153001/288718 [3:23:59<2:43:51, 13.80it/s]

{'loss': 0.6246, 'grad_norm': 1.2089756727218628, 'learning_rate': 9.40142284166557e-06, 'epoch': 1.06}


 53%|█████▎    | 153503/288718 [3:24:30<2:12:20, 17.03it/s]

{'loss': 0.622, 'grad_norm': 1.4286543130874634, 'learning_rate': 9.366786968599119e-06, 'epoch': 1.06}


 53%|█████▎    | 154002/288718 [3:25:01<2:26:19, 15.34it/s]

{'loss': 0.6312, 'grad_norm': 1.961677074432373, 'learning_rate': 9.332151095532666e-06, 'epoch': 1.07}


 54%|█████▎    | 154502/288718 [3:25:36<2:25:43, 15.35it/s]

{'loss': 0.6264, 'grad_norm': 1.4472324848175049, 'learning_rate': 9.297515222466214e-06, 'epoch': 1.07}


 54%|█████▎    | 155002/288718 [3:26:05<2:05:54, 17.70it/s]

{'loss': 0.6295, 'grad_norm': 1.6348845958709717, 'learning_rate': 9.26287934939976e-06, 'epoch': 1.07}


 54%|█████▍    | 155503/288718 [3:26:36<2:22:39, 15.56it/s]

{'loss': 0.6257, 'grad_norm': 2.1919076442718506, 'learning_rate': 9.228243476333308e-06, 'epoch': 1.08}


 54%|█████▍    | 156003/288718 [3:27:06<2:18:01, 16.03it/s]

{'loss': 0.6348, 'grad_norm': 2.405304193496704, 'learning_rate': 9.193607603266857e-06, 'epoch': 1.08}


 54%|█████▍    | 156501/288718 [3:27:38<2:26:10, 15.07it/s]

{'loss': 0.6321, 'grad_norm': 1.5777822732925415, 'learning_rate': 9.158971730200404e-06, 'epoch': 1.08}


 54%|█████▍    | 157003/288718 [3:28:09<2:09:02, 17.01it/s]

{'loss': 0.6279, 'grad_norm': 1.1593668460845947, 'learning_rate': 9.124335857133951e-06, 'epoch': 1.09}


 55%|█████▍    | 157503/288718 [3:28:43<2:28:25, 14.73it/s]

{'loss': 0.6316, 'grad_norm': 1.7406089305877686, 'learning_rate': 9.089699984067499e-06, 'epoch': 1.09}


 55%|█████▍    | 158003/288718 [3:29:16<2:13:17, 16.35it/s]

{'loss': 0.621, 'grad_norm': 2.280904769897461, 'learning_rate': 9.055064111001046e-06, 'epoch': 1.09}


 55%|█████▍    | 158503/288718 [3:29:50<2:17:30, 15.78it/s]

{'loss': 0.625, 'grad_norm': 1.2191383838653564, 'learning_rate': 9.020428237934595e-06, 'epoch': 1.1}


 55%|█████▌    | 159003/288718 [3:30:23<2:28:47, 14.53it/s]

{'loss': 0.6359, 'grad_norm': 1.6356841325759888, 'learning_rate': 8.985792364868142e-06, 'epoch': 1.1}


 55%|█████▌    | 159501/288718 [3:30:56<2:24:15, 14.93it/s]

{'loss': 0.6266, 'grad_norm': 2.1260597705841064, 'learning_rate': 8.95115649180169e-06, 'epoch': 1.1}


 55%|█████▌    | 160003/288718 [3:31:29<2:22:55, 15.01it/s]

{'loss': 0.6246, 'grad_norm': 1.2144325971603394, 'learning_rate': 8.916520618735237e-06, 'epoch': 1.11}


 56%|█████▌    | 160503/288718 [3:32:03<2:07:23, 16.77it/s]

{'loss': 0.6307, 'grad_norm': 2.2297110557556152, 'learning_rate': 8.881884745668786e-06, 'epoch': 1.11}


 56%|█████▌    | 161003/288718 [3:32:37<2:40:32, 13.26it/s]

{'loss': 0.6256, 'grad_norm': 1.326584815979004, 'learning_rate': 8.847248872602333e-06, 'epoch': 1.12}


 56%|█████▌    | 161503/288718 [3:33:10<2:19:12, 15.23it/s]

{'loss': 0.6286, 'grad_norm': 1.2708683013916016, 'learning_rate': 8.81261299953588e-06, 'epoch': 1.12}


 56%|█████▌    | 162003/288718 [3:33:44<2:24:06, 14.65it/s]

{'loss': 0.6343, 'grad_norm': 2.21181583404541, 'learning_rate': 8.777977126469427e-06, 'epoch': 1.12}


 56%|█████▋    | 162503/288718 [3:34:16<2:02:59, 17.10it/s]

{'loss': 0.631, 'grad_norm': 1.7576450109481812, 'learning_rate': 8.743341253402974e-06, 'epoch': 1.13}


 56%|█████▋    | 163002/288718 [3:34:45<2:11:21, 15.95it/s]

{'loss': 0.6281, 'grad_norm': 1.4706192016601562, 'learning_rate': 8.708705380336523e-06, 'epoch': 1.13}


 57%|█████▋    | 163503/288718 [3:35:16<2:01:52, 17.12it/s]

{'loss': 0.6282, 'grad_norm': 1.7291781902313232, 'learning_rate': 8.67406950727007e-06, 'epoch': 1.13}


 57%|█████▋    | 164003/288718 [3:35:47<2:09:16, 16.08it/s]

{'loss': 0.6278, 'grad_norm': 1.2461059093475342, 'learning_rate': 8.639433634203618e-06, 'epoch': 1.14}


 57%|█████▋    | 164503/288718 [3:36:19<2:04:46, 16.59it/s]

{'loss': 0.6281, 'grad_norm': 3.071969747543335, 'learning_rate': 8.604797761137165e-06, 'epoch': 1.14}


 57%|█████▋    | 165001/288718 [3:36:49<2:11:07, 15.72it/s]

{'loss': 0.6333, 'grad_norm': 1.5651016235351562, 'learning_rate': 8.570161888070714e-06, 'epoch': 1.14}


 57%|█████▋    | 165501/288718 [3:37:21<2:18:39, 14.81it/s]

{'loss': 0.6318, 'grad_norm': 1.448851227760315, 'learning_rate': 8.535526015004261e-06, 'epoch': 1.15}


 57%|█████▋    | 166003/288718 [3:37:52<2:05:59, 16.23it/s]

{'loss': 0.6243, 'grad_norm': 1.6525648832321167, 'learning_rate': 8.500890141937809e-06, 'epoch': 1.15}


 58%|█████▊    | 166503/288718 [3:38:23<2:08:28, 15.85it/s]

{'loss': 0.6322, 'grad_norm': 2.1627626419067383, 'learning_rate': 8.466254268871356e-06, 'epoch': 1.15}


 58%|█████▊    | 167003/288718 [3:38:55<1:59:04, 17.04it/s]

{'loss': 0.6297, 'grad_norm': 2.0127642154693604, 'learning_rate': 8.431618395804903e-06, 'epoch': 1.16}


 58%|█████▊    | 167503/288718 [3:39:26<2:16:37, 14.79it/s]

{'loss': 0.6283, 'grad_norm': 1.1524415016174316, 'learning_rate': 8.396982522738452e-06, 'epoch': 1.16}


 58%|█████▊    | 168003/288718 [3:39:58<2:00:23, 16.71it/s]

{'loss': 0.6309, 'grad_norm': 1.2988054752349854, 'learning_rate': 8.362346649672e-06, 'epoch': 1.16}


 58%|█████▊    | 168503/288718 [3:40:29<2:05:39, 15.95it/s]

{'loss': 0.6333, 'grad_norm': 1.5151550769805908, 'learning_rate': 8.327710776605546e-06, 'epoch': 1.17}


 59%|█████▊    | 169002/288718 [3:40:59<1:58:35, 16.82it/s]

{'loss': 0.6259, 'grad_norm': 2.0326406955718994, 'learning_rate': 8.293074903539094e-06, 'epoch': 1.17}


 59%|█████▊    | 169502/288718 [3:41:28<1:56:51, 17.00it/s]

{'loss': 0.6282, 'grad_norm': 1.4891793727874756, 'learning_rate': 8.258439030472641e-06, 'epoch': 1.17}


 59%|█████▉    | 170002/288718 [3:41:57<1:49:20, 18.10it/s]

{'loss': 0.6244, 'grad_norm': 1.676003098487854, 'learning_rate': 8.22380315740619e-06, 'epoch': 1.18}


 59%|█████▉    | 170503/288718 [3:42:25<1:45:02, 18.76it/s]

{'loss': 0.6267, 'grad_norm': 1.2442654371261597, 'learning_rate': 8.189167284339737e-06, 'epoch': 1.18}


 59%|█████▉    | 171002/288718 [3:42:51<1:40:12, 19.58it/s]

{'loss': 0.6305, 'grad_norm': 1.5272197723388672, 'learning_rate': 8.154531411273284e-06, 'epoch': 1.18}


 59%|█████▉    | 171501/288718 [3:43:17<1:34:35, 20.65it/s]

{'loss': 0.6261, 'grad_norm': 1.0454580783843994, 'learning_rate': 8.119895538206832e-06, 'epoch': 1.19}


 60%|█████▉    | 172003/288718 [3:43:43<1:42:10, 19.04it/s]

{'loss': 0.6341, 'grad_norm': 1.4314465522766113, 'learning_rate': 8.08525966514038e-06, 'epoch': 1.19}


 60%|█████▉    | 172503/288718 [3:44:08<1:37:38, 19.84it/s]

{'loss': 0.632, 'grad_norm': 1.8688664436340332, 'learning_rate': 8.050623792073928e-06, 'epoch': 1.19}


 60%|█████▉    | 173002/288718 [3:44:34<1:40:02, 19.28it/s]

{'loss': 0.6208, 'grad_norm': 3.4463436603546143, 'learning_rate': 8.015987919007475e-06, 'epoch': 1.2}


 60%|██████    | 173501/288718 [3:45:00<1:44:55, 18.30it/s]

{'loss': 0.6263, 'grad_norm': 2.5635311603546143, 'learning_rate': 7.981352045941024e-06, 'epoch': 1.2}


 60%|██████    | 174002/288718 [3:45:26<1:34:53, 20.15it/s]

{'loss': 0.6282, 'grad_norm': 2.724353075027466, 'learning_rate': 7.94671617287457e-06, 'epoch': 1.21}


 60%|██████    | 174503/288718 [3:45:51<1:42:36, 18.55it/s]

{'loss': 0.6271, 'grad_norm': 1.4513862133026123, 'learning_rate': 7.912080299808118e-06, 'epoch': 1.21}


 61%|██████    | 175003/288718 [3:46:17<1:36:27, 19.65it/s]

{'loss': 0.6275, 'grad_norm': 1.1865595579147339, 'learning_rate': 7.877444426741666e-06, 'epoch': 1.21}


 61%|██████    | 175504/288718 [3:46:43<1:32:36, 20.38it/s]

{'loss': 0.626, 'grad_norm': 1.470212459564209, 'learning_rate': 7.842808553675213e-06, 'epoch': 1.22}


 61%|██████    | 176002/288718 [3:47:08<1:35:31, 19.67it/s]

{'loss': 0.6329, 'grad_norm': 1.6536824703216553, 'learning_rate': 7.808172680608762e-06, 'epoch': 1.22}


 61%|██████    | 176502/288718 [3:47:34<1:37:14, 19.23it/s]

{'loss': 0.6323, 'grad_norm': 2.06801700592041, 'learning_rate': 7.773536807542307e-06, 'epoch': 1.22}


 61%|██████▏   | 177002/288718 [3:47:59<1:32:31, 20.13it/s]

{'loss': 0.6305, 'grad_norm': 1.342719316482544, 'learning_rate': 7.738900934475856e-06, 'epoch': 1.23}


 61%|██████▏   | 177502/288718 [3:48:25<1:38:28, 18.82it/s]

{'loss': 0.631, 'grad_norm': 2.054497241973877, 'learning_rate': 7.704265061409403e-06, 'epoch': 1.23}


 62%|██████▏   | 178004/288718 [3:48:50<1:32:51, 19.87it/s]

{'loss': 0.6303, 'grad_norm': 1.724242091178894, 'learning_rate': 7.66962918834295e-06, 'epoch': 1.23}


 62%|██████▏   | 178503/288718 [3:49:15<1:31:15, 20.13it/s]

{'loss': 0.6239, 'grad_norm': 2.344092607498169, 'learning_rate': 7.634993315276498e-06, 'epoch': 1.24}


 62%|██████▏   | 179002/288718 [3:49:40<1:32:16, 19.82it/s]

{'loss': 0.6256, 'grad_norm': 1.4280363321304321, 'learning_rate': 7.600357442210047e-06, 'epoch': 1.24}


 62%|██████▏   | 179502/288718 [3:50:05<1:35:14, 19.11it/s]

{'loss': 0.6332, 'grad_norm': 1.6277780532836914, 'learning_rate': 7.565721569143593e-06, 'epoch': 1.24}


 62%|██████▏   | 180003/288718 [3:50:31<1:32:36, 19.57it/s]

{'loss': 0.6249, 'grad_norm': 1.5872018337249756, 'learning_rate': 7.531085696077141e-06, 'epoch': 1.25}


 63%|██████▎   | 180503/288718 [3:50:57<1:42:38, 17.57it/s]

{'loss': 0.6265, 'grad_norm': 2.031309127807617, 'learning_rate': 7.4964498230106895e-06, 'epoch': 1.25}


 63%|██████▎   | 181001/288718 [3:51:22<1:32:57, 19.31it/s]

{'loss': 0.6321, 'grad_norm': 1.6918714046478271, 'learning_rate': 7.461813949944237e-06, 'epoch': 1.25}


 63%|██████▎   | 181504/288718 [3:51:48<1:28:40, 20.15it/s]

{'loss': 0.6314, 'grad_norm': 1.541435956954956, 'learning_rate': 7.427178076877785e-06, 'epoch': 1.26}


 63%|██████▎   | 182003/288718 [3:52:15<1:37:31, 18.24it/s]

{'loss': 0.6289, 'grad_norm': 1.1801623106002808, 'learning_rate': 7.392542203811333e-06, 'epoch': 1.26}


 63%|██████▎   | 182503/288718 [3:52:41<1:32:18, 19.18it/s]

{'loss': 0.6261, 'grad_norm': 1.5478616952896118, 'learning_rate': 7.357906330744879e-06, 'epoch': 1.26}


 63%|██████▎   | 183002/288718 [3:53:08<1:32:25, 19.06it/s]

{'loss': 0.6276, 'grad_norm': 1.5809317827224731, 'learning_rate': 7.323270457678427e-06, 'epoch': 1.27}


 64%|██████▎   | 183502/288718 [3:53:37<1:40:22, 17.47it/s]

{'loss': 0.6291, 'grad_norm': 1.7783141136169434, 'learning_rate': 7.288634584611975e-06, 'epoch': 1.27}


 64%|██████▎   | 184002/288718 [3:54:06<1:37:43, 17.86it/s]

{'loss': 0.6311, 'grad_norm': 1.6314328908920288, 'learning_rate': 7.253998711545523e-06, 'epoch': 1.27}


 64%|██████▍   | 184502/288718 [3:54:35<1:39:09, 17.52it/s]

{'loss': 0.6281, 'grad_norm': 1.625914216041565, 'learning_rate': 7.219362838479071e-06, 'epoch': 1.28}


 64%|██████▍   | 185003/288718 [3:55:04<1:41:02, 17.11it/s]

{'loss': 0.6324, 'grad_norm': 2.1642987728118896, 'learning_rate': 7.184726965412617e-06, 'epoch': 1.28}


 64%|██████▍   | 185502/288718 [3:55:33<1:38:02, 17.55it/s]

{'loss': 0.6231, 'grad_norm': 1.6160160303115845, 'learning_rate': 7.150091092346165e-06, 'epoch': 1.28}


 64%|██████▍   | 186002/288718 [3:56:02<1:37:09, 17.62it/s]

{'loss': 0.6256, 'grad_norm': 1.7792423963546753, 'learning_rate': 7.115455219279713e-06, 'epoch': 1.29}


 65%|██████▍   | 186502/288718 [3:56:32<1:38:13, 17.34it/s]

{'loss': 0.6321, 'grad_norm': 2.504741668701172, 'learning_rate': 7.0808193462132606e-06, 'epoch': 1.29}


 65%|██████▍   | 187002/288718 [3:57:02<1:43:57, 16.31it/s]

{'loss': 0.6195, 'grad_norm': 1.6317814588546753, 'learning_rate': 7.046183473146808e-06, 'epoch': 1.3}


 65%|██████▍   | 187502/288718 [3:57:32<1:43:32, 16.29it/s]

{'loss': 0.6362, 'grad_norm': 1.7600523233413696, 'learning_rate': 7.011547600080356e-06, 'epoch': 1.3}


 65%|██████▌   | 188002/288718 [3:58:01<1:47:18, 15.64it/s]

{'loss': 0.6285, 'grad_norm': 2.0800065994262695, 'learning_rate': 6.976911727013903e-06, 'epoch': 1.3}


 65%|██████▌   | 188503/288718 [3:58:31<1:38:16, 17.00it/s]

{'loss': 0.6323, 'grad_norm': 1.736739993095398, 'learning_rate': 6.942275853947451e-06, 'epoch': 1.31}


 65%|██████▌   | 189003/288718 [3:59:01<1:44:36, 15.89it/s]

{'loss': 0.6234, 'grad_norm': 3.358613967895508, 'learning_rate': 6.907639980880999e-06, 'epoch': 1.31}


 66%|██████▌   | 189502/288718 [3:59:31<1:42:11, 16.18it/s]

{'loss': 0.6231, 'grad_norm': 2.25950288772583, 'learning_rate': 6.873004107814546e-06, 'epoch': 1.31}


 66%|██████▌   | 190002/288718 [4:00:01<1:42:26, 16.06it/s]

{'loss': 0.6276, 'grad_norm': 1.5377357006072998, 'learning_rate': 6.838368234748094e-06, 'epoch': 1.32}


 66%|██████▌   | 190502/288718 [4:00:31<1:44:49, 15.62it/s]

{'loss': 0.6322, 'grad_norm': 3.142416477203369, 'learning_rate': 6.803732361681642e-06, 'epoch': 1.32}


 66%|██████▌   | 191002/288718 [4:01:01<1:37:09, 16.76it/s]

{'loss': 0.6261, 'grad_norm': 1.4020066261291504, 'learning_rate': 6.769096488615189e-06, 'epoch': 1.32}


 66%|██████▋   | 191502/288718 [4:01:31<1:43:03, 15.72it/s]

{'loss': 0.6227, 'grad_norm': 2.060148000717163, 'learning_rate': 6.734460615548737e-06, 'epoch': 1.33}


 67%|██████▋   | 192002/288718 [4:02:02<1:32:14, 17.48it/s]

{'loss': 0.6248, 'grad_norm': 1.418489933013916, 'learning_rate': 6.699824742482284e-06, 'epoch': 1.33}


 67%|██████▋   | 192502/288718 [4:02:33<1:33:56, 17.07it/s]

{'loss': 0.6255, 'grad_norm': 1.3713504076004028, 'learning_rate': 6.665188869415832e-06, 'epoch': 1.33}


 67%|██████▋   | 193002/288718 [4:03:03<1:34:52, 16.81it/s]

{'loss': 0.6273, 'grad_norm': 2.050290107727051, 'learning_rate': 6.63055299634938e-06, 'epoch': 1.34}


 67%|██████▋   | 193503/288718 [4:03:34<1:36:20, 16.47it/s]

{'loss': 0.6274, 'grad_norm': 1.5764254331588745, 'learning_rate': 6.595917123282927e-06, 'epoch': 1.34}


 67%|██████▋   | 194003/288718 [4:04:05<1:37:09, 16.25it/s]

{'loss': 0.6261, 'grad_norm': 1.3162628412246704, 'learning_rate': 6.561281250216475e-06, 'epoch': 1.34}


 67%|██████▋   | 194503/288718 [4:04:36<1:39:50, 15.73it/s]

{'loss': 0.6311, 'grad_norm': 1.3447514772415161, 'learning_rate': 6.526645377150022e-06, 'epoch': 1.35}


 68%|██████▊   | 195003/288718 [4:05:07<1:30:33, 17.25it/s]

{'loss': 0.628, 'grad_norm': 2.176130771636963, 'learning_rate': 6.4920095040835696e-06, 'epoch': 1.35}


 68%|██████▊   | 195503/288718 [4:05:38<1:32:53, 16.73it/s]

{'loss': 0.6313, 'grad_norm': 1.81332528591156, 'learning_rate': 6.457373631017118e-06, 'epoch': 1.35}


 68%|██████▊   | 196001/288718 [4:06:10<1:59:06, 12.97it/s]

{'loss': 0.6347, 'grad_norm': 1.5878103971481323, 'learning_rate': 6.422737757950666e-06, 'epoch': 1.36}


 68%|██████▊   | 196501/288718 [4:06:47<2:16:48, 11.23it/s]

{'loss': 0.628, 'grad_norm': 1.9412182569503784, 'learning_rate': 6.388101884884212e-06, 'epoch': 1.36}


 68%|██████▊   | 197001/288718 [4:07:24<1:46:50, 14.31it/s]

{'loss': 0.6308, 'grad_norm': 2.01106333732605, 'learning_rate': 6.35346601181776e-06, 'epoch': 1.36}


 68%|██████▊   | 197501/288718 [4:08:01<1:54:13, 13.31it/s]

{'loss': 0.6277, 'grad_norm': 1.1824827194213867, 'learning_rate': 6.318830138751308e-06, 'epoch': 1.37}


 69%|██████▊   | 198001/288718 [4:08:39<2:06:34, 11.95it/s]

{'loss': 0.6329, 'grad_norm': 1.9185658693313599, 'learning_rate': 6.2841942656848555e-06, 'epoch': 1.37}


 69%|██████▉   | 198501/288718 [4:09:17<1:47:24, 14.00it/s]

{'loss': 0.6279, 'grad_norm': 2.8216073513031006, 'learning_rate': 6.249558392618404e-06, 'epoch': 1.38}


 69%|██████▉   | 199001/288718 [4:09:53<1:50:13, 13.56it/s]

{'loss': 0.6251, 'grad_norm': 1.7907006740570068, 'learning_rate': 6.214922519551952e-06, 'epoch': 1.38}


 69%|██████▉   | 199501/288718 [4:10:31<2:15:25, 10.98it/s]

{'loss': 0.6229, 'grad_norm': 2.190162181854248, 'learning_rate': 6.180286646485498e-06, 'epoch': 1.38}


 69%|██████▉   | 200001/288718 [4:11:07<1:41:52, 14.51it/s]

{'loss': 0.6289, 'grad_norm': 2.180387020111084, 'learning_rate': 6.145650773419046e-06, 'epoch': 1.39}


 69%|██████▉   | 200503/288718 [4:11:39<1:28:45, 16.56it/s]

{'loss': 0.6241, 'grad_norm': 1.942512035369873, 'learning_rate': 6.111014900352593e-06, 'epoch': 1.39}


 70%|██████▉   | 201003/288718 [4:12:11<1:32:28, 15.81it/s]

{'loss': 0.6298, 'grad_norm': 1.9858412742614746, 'learning_rate': 6.0763790272861415e-06, 'epoch': 1.39}


 70%|██████▉   | 201503/288718 [4:12:43<1:31:02, 15.97it/s]

{'loss': 0.6239, 'grad_norm': 1.7072027921676636, 'learning_rate': 6.041743154219689e-06, 'epoch': 1.4}


 70%|██████▉   | 202001/288718 [4:13:14<1:38:52, 14.62it/s]

{'loss': 0.625, 'grad_norm': 1.229917049407959, 'learning_rate': 6.007107281153236e-06, 'epoch': 1.4}


 70%|███████   | 202501/288718 [4:13:46<1:30:40, 15.85it/s]

{'loss': 0.6266, 'grad_norm': 1.603716492652893, 'learning_rate': 5.972471408086784e-06, 'epoch': 1.4}


 70%|███████   | 203003/288718 [4:14:18<1:27:41, 16.29it/s]

{'loss': 0.6261, 'grad_norm': 1.8714711666107178, 'learning_rate': 5.937835535020332e-06, 'epoch': 1.41}


 70%|███████   | 203503/288718 [4:14:50<1:28:31, 16.04it/s]

{'loss': 0.6202, 'grad_norm': 1.924717903137207, 'learning_rate': 5.9031996619538785e-06, 'epoch': 1.41}


 71%|███████   | 204003/288718 [4:15:22<1:29:00, 15.86it/s]

{'loss': 0.6311, 'grad_norm': 1.393250823020935, 'learning_rate': 5.868563788887427e-06, 'epoch': 1.41}


 71%|███████   | 204502/288718 [4:15:54<1:52:41, 12.46it/s]

{'loss': 0.6244, 'grad_norm': 1.2460638284683228, 'learning_rate': 5.833927915820975e-06, 'epoch': 1.42}


 71%|███████   | 205003/288718 [4:16:27<1:29:00, 15.67it/s]

{'loss': 0.6228, 'grad_norm': 1.8251463174819946, 'learning_rate': 5.799292042754522e-06, 'epoch': 1.42}


 71%|███████   | 205503/288718 [4:17:00<1:24:55, 16.33it/s]

{'loss': 0.6274, 'grad_norm': 1.4578346014022827, 'learning_rate': 5.76465616968807e-06, 'epoch': 1.42}


 71%|███████▏  | 206003/288718 [4:17:33<1:31:54, 15.00it/s]

{'loss': 0.6307, 'grad_norm': 1.4300071001052856, 'learning_rate': 5.730020296621618e-06, 'epoch': 1.43}


 72%|███████▏  | 206503/288718 [4:18:05<1:28:55, 15.41it/s]

{'loss': 0.6293, 'grad_norm': 1.5586740970611572, 'learning_rate': 5.6953844235551645e-06, 'epoch': 1.43}


 72%|███████▏  | 207003/288718 [4:18:37<1:27:14, 15.61it/s]

{'loss': 0.6347, 'grad_norm': 3.0962045192718506, 'learning_rate': 5.660748550488713e-06, 'epoch': 1.43}


 72%|███████▏  | 207503/288718 [4:19:09<1:24:53, 15.94it/s]

{'loss': 0.6269, 'grad_norm': 3.0887327194213867, 'learning_rate': 5.626112677422261e-06, 'epoch': 1.44}


 72%|███████▏  | 208003/288718 [4:19:41<1:26:55, 15.48it/s]

{'loss': 0.6249, 'grad_norm': 1.6412874460220337, 'learning_rate': 5.591476804355808e-06, 'epoch': 1.44}


 72%|███████▏  | 208503/288718 [4:20:13<1:24:21, 15.85it/s]

{'loss': 0.631, 'grad_norm': 2.068680763244629, 'learning_rate': 5.556840931289356e-06, 'epoch': 1.44}


 72%|███████▏  | 209003/288718 [4:20:46<1:28:33, 15.00it/s]

{'loss': 0.6304, 'grad_norm': 1.7927792072296143, 'learning_rate': 5.522205058222902e-06, 'epoch': 1.45}


 73%|███████▎  | 209503/288718 [4:21:18<1:25:02, 15.53it/s]

{'loss': 0.6297, 'grad_norm': 2.236037492752075, 'learning_rate': 5.4875691851564505e-06, 'epoch': 1.45}


 73%|███████▎  | 210003/288718 [4:21:50<1:26:15, 15.21it/s]

{'loss': 0.6296, 'grad_norm': 1.4414327144622803, 'learning_rate': 5.4529333120899986e-06, 'epoch': 1.45}


 73%|███████▎  | 210503/288718 [4:22:23<1:24:31, 15.42it/s]

{'loss': 0.6239, 'grad_norm': 1.8906359672546387, 'learning_rate': 5.418297439023546e-06, 'epoch': 1.46}


 73%|███████▎  | 211003/288718 [4:22:55<1:21:20, 15.92it/s]

{'loss': 0.6297, 'grad_norm': 1.8788155317306519, 'learning_rate': 5.383661565957093e-06, 'epoch': 1.46}


 73%|███████▎  | 211503/288718 [4:23:27<1:25:30, 15.05it/s]

{'loss': 0.6248, 'grad_norm': 1.2307300567626953, 'learning_rate': 5.349025692890641e-06, 'epoch': 1.47}


 73%|███████▎  | 212003/288718 [4:24:00<1:22:40, 15.47it/s]

{'loss': 0.6258, 'grad_norm': 1.8060139417648315, 'learning_rate': 5.314389819824188e-06, 'epoch': 1.47}


 74%|███████▎  | 212503/288718 [4:24:32<1:24:22, 15.06it/s]

{'loss': 0.6281, 'grad_norm': 2.4180963039398193, 'learning_rate': 5.2797539467577365e-06, 'epoch': 1.47}


 74%|███████▍  | 213003/288718 [4:25:05<1:22:09, 15.36it/s]

{'loss': 0.6216, 'grad_norm': 1.4422768354415894, 'learning_rate': 5.2451180736912845e-06, 'epoch': 1.48}


 74%|███████▍  | 213503/288718 [4:25:37<1:22:12, 15.25it/s]

{'loss': 0.6211, 'grad_norm': 3.076775074005127, 'learning_rate': 5.210482200624831e-06, 'epoch': 1.48}


 74%|███████▍  | 214003/288718 [4:26:10<1:20:18, 15.51it/s]

{'loss': 0.6202, 'grad_norm': 3.539928436279297, 'learning_rate': 5.175846327558379e-06, 'epoch': 1.48}


 74%|███████▍  | 214503/288718 [4:26:42<1:19:32, 15.55it/s]

{'loss': 0.6234, 'grad_norm': 2.1590850353240967, 'learning_rate': 5.141210454491927e-06, 'epoch': 1.49}


 74%|███████▍  | 215003/288718 [4:27:14<1:20:24, 15.28it/s]

{'loss': 0.6248, 'grad_norm': 2.902092933654785, 'learning_rate': 5.106574581425474e-06, 'epoch': 1.49}


 75%|███████▍  | 215503/288718 [4:27:46<1:12:47, 16.76it/s]

{'loss': 0.6286, 'grad_norm': 1.60019850730896, 'learning_rate': 5.0719387083590224e-06, 'epoch': 1.49}


 75%|███████▍  | 216003/288718 [4:28:19<1:20:43, 15.01it/s]

{'loss': 0.6286, 'grad_norm': 1.442568063735962, 'learning_rate': 5.0373028352925705e-06, 'epoch': 1.5}


 75%|███████▍  | 216501/288718 [4:28:52<1:21:22, 14.79it/s]

{'loss': 0.6325, 'grad_norm': 1.9688599109649658, 'learning_rate': 5.002666962226117e-06, 'epoch': 1.5}


 75%|███████▌  | 217003/288718 [4:29:26<1:16:50, 15.56it/s]

{'loss': 0.6202, 'grad_norm': 1.6368753910064697, 'learning_rate': 4.968031089159665e-06, 'epoch': 1.5}


 75%|███████▌  | 217503/288718 [4:29:59<1:15:25, 15.74it/s]

{'loss': 0.6245, 'grad_norm': 1.4630916118621826, 'learning_rate': 4.933395216093212e-06, 'epoch': 1.51}


 76%|███████▌  | 218003/288718 [4:30:32<1:17:35, 15.19it/s]

{'loss': 0.6269, 'grad_norm': 1.4918408393859863, 'learning_rate': 4.89875934302676e-06, 'epoch': 1.51}


 76%|███████▌  | 218502/288718 [4:31:06<1:14:37, 15.68it/s]

{'loss': 0.6191, 'grad_norm': 1.3222670555114746, 'learning_rate': 4.8641234699603076e-06, 'epoch': 1.51}


 76%|███████▌  | 219002/288718 [4:31:41<1:16:19, 15.22it/s]

{'loss': 0.626, 'grad_norm': 1.6069960594177246, 'learning_rate': 4.829487596893856e-06, 'epoch': 1.52}


 76%|███████▌  | 219502/288718 [4:32:15<1:23:22, 13.84it/s]

{'loss': 0.629, 'grad_norm': 2.5737619400024414, 'learning_rate': 4.794851723827403e-06, 'epoch': 1.52}


 76%|███████▌  | 220002/288718 [4:32:49<1:17:04, 14.86it/s]

{'loss': 0.6294, 'grad_norm': 1.5343905687332153, 'learning_rate': 4.76021585076095e-06, 'epoch': 1.52}


 76%|███████▋  | 220502/288718 [4:33:23<1:20:59, 14.04it/s]

{'loss': 0.6334, 'grad_norm': 2.2651896476745605, 'learning_rate': 4.725579977694498e-06, 'epoch': 1.53}


 77%|███████▋  | 221002/288718 [4:33:58<1:22:07, 13.74it/s]

{'loss': 0.6283, 'grad_norm': 3.4743430614471436, 'learning_rate': 4.6909441046280455e-06, 'epoch': 1.53}


 77%|███████▋  | 221502/288718 [4:34:33<1:22:38, 13.56it/s]

{'loss': 0.6184, 'grad_norm': 1.6267993450164795, 'learning_rate': 4.6563082315615935e-06, 'epoch': 1.53}


 77%|███████▋  | 222002/288718 [4:35:07<1:16:22, 14.56it/s]

{'loss': 0.619, 'grad_norm': 1.4891724586486816, 'learning_rate': 4.621672358495141e-06, 'epoch': 1.54}


 77%|███████▋  | 222502/288718 [4:35:42<1:20:33, 13.70it/s]

{'loss': 0.6185, 'grad_norm': 1.5875205993652344, 'learning_rate': 4.587036485428689e-06, 'epoch': 1.54}


 77%|███████▋  | 223002/288718 [4:36:17<1:15:36, 14.49it/s]

{'loss': 0.6256, 'grad_norm': 1.2910982370376587, 'learning_rate': 4.552400612362236e-06, 'epoch': 1.54}


 77%|███████▋  | 223502/288718 [4:36:52<1:11:18, 15.24it/s]

{'loss': 0.6223, 'grad_norm': 1.773170828819275, 'learning_rate': 4.517764739295784e-06, 'epoch': 1.55}


 78%|███████▊  | 224002/288718 [4:37:26<1:15:22, 14.31it/s]

{'loss': 0.6293, 'grad_norm': 1.589572548866272, 'learning_rate': 4.483128866229331e-06, 'epoch': 1.55}


 78%|███████▊  | 224502/288718 [4:38:01<1:13:36, 14.54it/s]

{'loss': 0.6293, 'grad_norm': 1.6133041381835938, 'learning_rate': 4.448492993162879e-06, 'epoch': 1.56}


 78%|███████▊  | 225002/288718 [4:38:36<1:18:20, 13.56it/s]

{'loss': 0.6257, 'grad_norm': 1.8672125339508057, 'learning_rate': 4.413857120096427e-06, 'epoch': 1.56}


 78%|███████▊  | 225502/288718 [4:39:11<1:18:50, 13.36it/s]

{'loss': 0.6241, 'grad_norm': 1.4455807209014893, 'learning_rate': 4.379221247029975e-06, 'epoch': 1.56}


 78%|███████▊  | 226002/288718 [4:39:45<1:09:33, 15.03it/s]

{'loss': 0.6231, 'grad_norm': 2.887416124343872, 'learning_rate': 4.344585373963522e-06, 'epoch': 1.57}


 78%|███████▊  | 226502/288718 [4:40:20<1:13:15, 14.15it/s]

{'loss': 0.6289, 'grad_norm': 1.9695781469345093, 'learning_rate': 4.309949500897069e-06, 'epoch': 1.57}


 79%|███████▊  | 227002/288718 [4:40:55<1:09:02, 14.90it/s]

{'loss': 0.6273, 'grad_norm': 2.703185558319092, 'learning_rate': 4.275313627830617e-06, 'epoch': 1.57}


 79%|███████▉  | 227502/288718 [4:41:30<1:15:34, 13.50it/s]

{'loss': 0.6268, 'grad_norm': 1.6537288427352905, 'learning_rate': 4.240677754764165e-06, 'epoch': 1.58}


 79%|███████▉  | 228002/288718 [4:42:05<1:04:35, 15.67it/s]

{'loss': 0.6205, 'grad_norm': 1.7178949117660522, 'learning_rate': 4.206041881697712e-06, 'epoch': 1.58}


 79%|███████▉  | 228502/288718 [4:42:39<1:05:21, 15.35it/s]

{'loss': 0.6302, 'grad_norm': 1.919158935546875, 'learning_rate': 4.17140600863126e-06, 'epoch': 1.58}


 79%|███████▉  | 229002/288718 [4:43:14<1:08:01, 14.63it/s]

{'loss': 0.6281, 'grad_norm': 2.1981630325317383, 'learning_rate': 4.136770135564808e-06, 'epoch': 1.59}


 79%|███████▉  | 229502/288718 [4:43:49<1:21:09, 12.16it/s]

{'loss': 0.624, 'grad_norm': 2.182508707046509, 'learning_rate': 4.102134262498355e-06, 'epoch': 1.59}


 80%|███████▉  | 230002/288718 [4:44:23<1:09:57, 13.99it/s]

{'loss': 0.6313, 'grad_norm': 1.3621222972869873, 'learning_rate': 4.0674983894319025e-06, 'epoch': 1.59}


 80%|███████▉  | 230502/288718 [4:44:58<1:03:45, 15.22it/s]

{'loss': 0.6218, 'grad_norm': 1.9423809051513672, 'learning_rate': 4.032862516365451e-06, 'epoch': 1.6}


 80%|████████  | 231002/288718 [4:45:32<1:09:42, 13.80it/s]

{'loss': 0.6284, 'grad_norm': 1.6241586208343506, 'learning_rate': 3.998226643298998e-06, 'epoch': 1.6}


 80%|████████  | 231502/288718 [4:46:07<1:04:44, 14.73it/s]

{'loss': 0.6262, 'grad_norm': 2.3224058151245117, 'learning_rate': 3.963590770232545e-06, 'epoch': 1.6}


 80%|████████  | 232002/288718 [4:46:42<1:03:46, 14.82it/s]

{'loss': 0.6268, 'grad_norm': 2.258345603942871, 'learning_rate': 3.928954897166093e-06, 'epoch': 1.61}


 81%|████████  | 232502/288718 [4:47:17<1:04:58, 14.42it/s]

{'loss': 0.6245, 'grad_norm': 1.4503159523010254, 'learning_rate': 3.894319024099641e-06, 'epoch': 1.61}


 81%|████████  | 233002/288718 [4:47:51<1:02:56, 14.75it/s]

{'loss': 0.631, 'grad_norm': 1.7066155672073364, 'learning_rate': 3.8596831510331885e-06, 'epoch': 1.61}


 81%|████████  | 233502/288718 [4:48:26<1:07:55, 13.55it/s]

{'loss': 0.6238, 'grad_norm': 2.4022467136383057, 'learning_rate': 3.825047277966736e-06, 'epoch': 1.62}


 81%|████████  | 234002/288718 [4:49:01<58:26, 15.60it/s]  

{'loss': 0.6288, 'grad_norm': 1.9734972715377808, 'learning_rate': 3.790411404900284e-06, 'epoch': 1.62}


 81%|████████  | 234502/288718 [4:49:36<58:54, 15.34it/s]  

{'loss': 0.6267, 'grad_norm': 1.849238634109497, 'learning_rate': 3.755775531833831e-06, 'epoch': 1.62}


 81%|████████▏ | 235002/288718 [4:50:11<1:02:30, 14.32it/s]

{'loss': 0.6308, 'grad_norm': 1.9719133377075195, 'learning_rate': 3.7211396587673787e-06, 'epoch': 1.63}


 82%|████████▏ | 235502/288718 [4:50:46<1:04:41, 13.71it/s]

{'loss': 0.6193, 'grad_norm': 3.411134719848633, 'learning_rate': 3.686503785700927e-06, 'epoch': 1.63}


 82%|████████▏ | 236002/288718 [4:51:20<1:03:44, 13.78it/s]

{'loss': 0.6231, 'grad_norm': 2.1972219944000244, 'learning_rate': 3.651867912634474e-06, 'epoch': 1.63}


 82%|████████▏ | 236502/288718 [4:51:55<59:57, 14.51it/s]  

{'loss': 0.6241, 'grad_norm': 1.293472409248352, 'learning_rate': 3.6172320395680217e-06, 'epoch': 1.64}


 82%|████████▏ | 237002/288718 [4:52:31<1:07:42, 12.73it/s]

{'loss': 0.6228, 'grad_norm': 1.906844139099121, 'learning_rate': 3.582596166501569e-06, 'epoch': 1.64}


 82%|████████▏ | 237502/288718 [4:53:05<56:53, 15.00it/s]  

{'loss': 0.6292, 'grad_norm': 1.4142382144927979, 'learning_rate': 3.547960293435117e-06, 'epoch': 1.65}


 82%|████████▏ | 238002/288718 [4:53:40<56:16, 15.02it/s]  

{'loss': 0.6312, 'grad_norm': 1.43417227268219, 'learning_rate': 3.5133244203686647e-06, 'epoch': 1.65}


 83%|████████▎ | 238502/288718 [4:54:15<58:47, 14.24it/s]  

{'loss': 0.623, 'grad_norm': 1.8054516315460205, 'learning_rate': 3.478688547302212e-06, 'epoch': 1.65}


 83%|████████▎ | 239002/288718 [4:54:50<1:09:53, 11.86it/s]

{'loss': 0.6197, 'grad_norm': 1.7469183206558228, 'learning_rate': 3.44405267423576e-06, 'epoch': 1.66}


 83%|████████▎ | 239502/288718 [4:55:25<54:10, 15.14it/s]  

{'loss': 0.6212, 'grad_norm': 2.6555237770080566, 'learning_rate': 3.4094168011693073e-06, 'epoch': 1.66}


 83%|████████▎ | 240002/288718 [4:56:00<54:29, 14.90it/s]  

{'loss': 0.6181, 'grad_norm': 2.465043306350708, 'learning_rate': 3.374780928102855e-06, 'epoch': 1.66}


 83%|████████▎ | 240502/288718 [4:56:34<56:02, 14.34it/s]  

{'loss': 0.6203, 'grad_norm': 2.0095648765563965, 'learning_rate': 3.340145055036403e-06, 'epoch': 1.67}


 83%|████████▎ | 241002/288718 [4:57:09<57:14, 13.89it/s]  

{'loss': 0.628, 'grad_norm': 1.6373918056488037, 'learning_rate': 3.3055091819699502e-06, 'epoch': 1.67}


 84%|████████▎ | 241502/288718 [4:57:43<51:48, 15.19it/s]  

{'loss': 0.6291, 'grad_norm': 1.7015125751495361, 'learning_rate': 3.270873308903498e-06, 'epoch': 1.67}


 84%|████████▍ | 242002/288718 [4:58:18<53:23, 14.58it/s]  

{'loss': 0.629, 'grad_norm': 2.0807764530181885, 'learning_rate': 3.236237435837045e-06, 'epoch': 1.68}


 84%|████████▍ | 242502/288718 [4:58:53<52:13, 14.75it/s]  

{'loss': 0.6372, 'grad_norm': 1.4300864934921265, 'learning_rate': 3.2016015627705932e-06, 'epoch': 1.68}


 84%|████████▍ | 243002/288718 [4:59:28<55:05, 13.83it/s]  

{'loss': 0.6224, 'grad_norm': 1.7822140455245972, 'learning_rate': 3.1669656897041405e-06, 'epoch': 1.68}


 84%|████████▍ | 243502/288718 [5:00:03<49:02, 15.37it/s]  

{'loss': 0.6301, 'grad_norm': 1.6874967813491821, 'learning_rate': 3.132329816637688e-06, 'epoch': 1.69}


 85%|████████▍ | 244002/288718 [5:00:37<50:07, 14.87it/s]  

{'loss': 0.6205, 'grad_norm': 1.6710213422775269, 'learning_rate': 3.0976939435712362e-06, 'epoch': 1.69}


 85%|████████▍ | 244502/288718 [5:01:12<57:44, 12.76it/s]  

{'loss': 0.6238, 'grad_norm': 1.6499351263046265, 'learning_rate': 3.0630580705047835e-06, 'epoch': 1.69}


 85%|████████▍ | 245002/288718 [5:01:48<53:43, 13.56it/s]  

{'loss': 0.6213, 'grad_norm': 1.5487112998962402, 'learning_rate': 3.028422197438331e-06, 'epoch': 1.7}


 85%|████████▌ | 245502/288718 [5:02:22<49:09, 14.65it/s]

{'loss': 0.6262, 'grad_norm': 2.044386625289917, 'learning_rate': 2.9937863243718784e-06, 'epoch': 1.7}


 85%|████████▌ | 246002/288718 [5:02:56<47:08, 15.10it/s]  

{'loss': 0.626, 'grad_norm': 2.2972335815429688, 'learning_rate': 2.9591504513054264e-06, 'epoch': 1.7}


 85%|████████▌ | 246502/288718 [5:03:30<49:40, 14.17it/s]  

{'loss': 0.6257, 'grad_norm': 3.827383041381836, 'learning_rate': 2.9245145782389737e-06, 'epoch': 1.71}


 86%|████████▌ | 247002/288718 [5:04:05<47:05, 14.77it/s]  

{'loss': 0.632, 'grad_norm': 2.1887450218200684, 'learning_rate': 2.8898787051725213e-06, 'epoch': 1.71}


 86%|████████▌ | 247502/288718 [5:04:39<45:17, 15.17it/s]

{'loss': 0.622, 'grad_norm': 2.253567934036255, 'learning_rate': 2.8552428321060694e-06, 'epoch': 1.71}


 86%|████████▌ | 248002/288718 [5:05:14<45:57, 14.77it/s]

{'loss': 0.6324, 'grad_norm': 1.6016433238983154, 'learning_rate': 2.8206069590396167e-06, 'epoch': 1.72}


 86%|████████▌ | 248502/288718 [5:05:49<48:57, 13.69it/s]  

{'loss': 0.6205, 'grad_norm': 1.805755853652954, 'learning_rate': 2.7859710859731643e-06, 'epoch': 1.72}


 86%|████████▌ | 249002/288718 [5:06:23<44:15, 14.96it/s]

{'loss': 0.6243, 'grad_norm': 1.9889293909072876, 'learning_rate': 2.7513352129067124e-06, 'epoch': 1.72}


 86%|████████▋ | 249502/288718 [5:06:58<46:08, 14.16it/s]  

{'loss': 0.6287, 'grad_norm': 1.4607340097427368, 'learning_rate': 2.7166993398402597e-06, 'epoch': 1.73}


 87%|████████▋ | 250002/288718 [5:07:32<44:58, 14.35it/s]

{'loss': 0.6218, 'grad_norm': 1.8479773998260498, 'learning_rate': 2.6820634667738073e-06, 'epoch': 1.73}


 87%|████████▋ | 250502/288718 [5:08:07<43:13, 14.74it/s]

{'loss': 0.6263, 'grad_norm': 1.9752998352050781, 'learning_rate': 2.6474275937073546e-06, 'epoch': 1.74}


 87%|████████▋ | 251002/288718 [5:08:41<44:26, 14.15it/s]

{'loss': 0.6261, 'grad_norm': 1.2462513446807861, 'learning_rate': 2.6127917206409026e-06, 'epoch': 1.74}


 87%|████████▋ | 251502/288718 [5:09:16<47:54, 12.95it/s]  

{'loss': 0.6202, 'grad_norm': 1.5337270498275757, 'learning_rate': 2.57815584757445e-06, 'epoch': 1.74}


 87%|████████▋ | 252002/288718 [5:09:50<42:57, 14.24it/s]

{'loss': 0.623, 'grad_norm': 1.7908607721328735, 'learning_rate': 2.5435199745079975e-06, 'epoch': 1.75}


 87%|████████▋ | 252502/288718 [5:10:26<41:10, 14.66it/s]  

{'loss': 0.6323, 'grad_norm': 1.481771469116211, 'learning_rate': 2.5088841014415456e-06, 'epoch': 1.75}


 88%|████████▊ | 253002/288718 [5:11:01<43:35, 13.66it/s]

{'loss': 0.6255, 'grad_norm': 1.7584418058395386, 'learning_rate': 2.474248228375093e-06, 'epoch': 1.75}


 88%|████████▊ | 253501/288718 [5:12:43<1:28:34,  6.63it/s] 

{'loss': 0.6194, 'grad_norm': 2.4209728240966797, 'learning_rate': 2.4396123553086405e-06, 'epoch': 1.76}


 88%|████████▊ | 254001/288718 [5:13:38<1:44:17,  5.55it/s] 

{'loss': 0.6238, 'grad_norm': 2.4290103912353516, 'learning_rate': 2.404976482242188e-06, 'epoch': 1.76}


 88%|████████▊ | 254502/288718 [5:14:44<27:14, 20.94it/s]   

{'loss': 0.6298, 'grad_norm': 1.9765033721923828, 'learning_rate': 2.3703406091757354e-06, 'epoch': 1.76}


 88%|████████▊ | 255003/288718 [5:16:08<27:28, 20.45it/s]    

{'loss': 0.6198, 'grad_norm': 2.6582655906677246, 'learning_rate': 2.335704736109283e-06, 'epoch': 1.77}


 88%|████████▊ | 255502/288718 [5:16:40<38:46, 14.28it/s]  

{'loss': 0.6205, 'grad_norm': 1.7172223329544067, 'learning_rate': 2.301068863042831e-06, 'epoch': 1.77}


 89%|████████▊ | 256003/288718 [5:17:58<26:24, 20.64it/s]   

{'loss': 0.6251, 'grad_norm': 1.298054814338684, 'learning_rate': 2.2664329899763784e-06, 'epoch': 1.77}


 89%|████████▉ | 256503/288718 [5:19:15<26:15, 20.45it/s]   

{'loss': 0.6146, 'grad_norm': 1.7768542766571045, 'learning_rate': 2.231797116909926e-06, 'epoch': 1.78}


 89%|████████▉ | 257000/288718 [5:19:47<43:54, 12.04it/s]  

{'loss': 0.6278, 'grad_norm': 2.2054805755615234, 'learning_rate': 2.1971612438434737e-06, 'epoch': 1.78}


 89%|████████▉ | 257503/288718 [5:21:03<25:38, 20.29it/s]   

{'loss': 0.6281, 'grad_norm': 2.8724474906921387, 'learning_rate': 2.1625253707770214e-06, 'epoch': 1.78}


 89%|████████▉ | 258003/288718 [5:22:17<24:54, 20.55it/s]   

{'loss': 0.6245, 'grad_norm': 2.7378807067871094, 'learning_rate': 2.127889497710569e-06, 'epoch': 1.79}


 90%|████████▉ | 258501/288718 [5:22:50<37:47, 13.33it/s]  

{'loss': 0.6249, 'grad_norm': 2.264826536178589, 'learning_rate': 2.0932536246441167e-06, 'epoch': 1.79}


 90%|████████▉ | 259002/288718 [5:23:36<35:37, 13.90it/s]   

{'loss': 0.6208, 'grad_norm': 1.8433741331100464, 'learning_rate': 2.0586177515776644e-06, 'epoch': 1.79}


 90%|████████▉ | 259502/288718 [5:24:46<23:44, 20.52it/s]   

{'loss': 0.625, 'grad_norm': 3.1618313789367676, 'learning_rate': 2.0239818785112116e-06, 'epoch': 1.8}


 90%|█████████ | 260001/288718 [5:26:01<3:17:55,  2.42it/s] 

{'loss': 0.6242, 'grad_norm': 1.846222162246704, 'learning_rate': 1.9893460054447593e-06, 'epoch': 1.8}


 90%|█████████ | 260503/288718 [5:26:27<24:50, 18.93it/s]  

{'loss': 0.6187, 'grad_norm': 2.0745394229888916, 'learning_rate': 1.954710132378307e-06, 'epoch': 1.8}


 90%|█████████ | 261003/288718 [5:27:46<22:25, 20.60it/s]   

{'loss': 0.6256, 'grad_norm': 2.1505963802337646, 'learning_rate': 1.9200742593118546e-06, 'epoch': 1.81}


 91%|█████████ | 261504/288718 [5:29:06<23:47, 19.06it/s]   

{'loss': 0.6231, 'grad_norm': 1.7177791595458984, 'learning_rate': 1.8854383862454023e-06, 'epoch': 1.81}


 91%|█████████ | 262000/288718 [5:29:33<35:12, 12.65it/s]  

{'loss': 0.6222, 'grad_norm': 2.246185302734375, 'learning_rate': 1.8508025131789497e-06, 'epoch': 1.81}


 91%|█████████ | 262502/288718 [5:30:11<29:29, 14.82it/s]

{'loss': 0.6248, 'grad_norm': 2.174994468688965, 'learning_rate': 1.8161666401124974e-06, 'epoch': 1.82}


 91%|█████████ | 263003/288718 [5:31:31<22:58, 18.65it/s]   

{'loss': 0.6343, 'grad_norm': 1.9269839525222778, 'learning_rate': 1.7815307670460448e-06, 'epoch': 1.82}


 91%|█████████▏| 263502/288718 [5:32:08<31:03, 13.53it/s]  

{'loss': 0.6151, 'grad_norm': 1.5617530345916748, 'learning_rate': 1.7468948939795927e-06, 'epoch': 1.83}


 91%|█████████▏| 264002/288718 [5:32:42<33:57, 12.13it/s]

{'loss': 0.6227, 'grad_norm': 1.8200618028640747, 'learning_rate': 1.7122590209131404e-06, 'epoch': 1.83}


 92%|█████████▏| 264501/288718 [5:33:37<34:34, 11.67it/s]   

{'loss': 0.6266, 'grad_norm': 1.681347131729126, 'learning_rate': 1.6776231478466878e-06, 'epoch': 1.83}


 92%|█████████▏| 265002/288718 [5:35:05<19:36, 20.17it/s]   

{'loss': 0.6195, 'grad_norm': 2.2044060230255127, 'learning_rate': 1.6429872747802355e-06, 'epoch': 1.84}


 92%|█████████▏| 265501/288718 [5:36:21<14:38:46,  2.27s/it]

{'loss': 0.629, 'grad_norm': 1.413240909576416, 'learning_rate': 1.608351401713783e-06, 'epoch': 1.84}


 92%|█████████▏| 266002/288718 [5:36:48<30:53, 12.25it/s]   

{'loss': 0.6237, 'grad_norm': 1.4333010911941528, 'learning_rate': 1.5737155286473308e-06, 'epoch': 1.84}


 92%|█████████▏| 266502/288718 [5:38:06<18:22, 20.15it/s]   

{'loss': 0.6226, 'grad_norm': 2.3221161365509033, 'learning_rate': 1.5390796555808785e-06, 'epoch': 1.85}


 92%|█████████▏| 267001/288718 [5:38:37<27:55, 12.96it/s]  

{'loss': 0.634, 'grad_norm': 1.741042137145996, 'learning_rate': 1.504443782514426e-06, 'epoch': 1.85}


 93%|█████████▎| 267501/288718 [5:39:47<17:07, 20.64it/s]   

{'loss': 0.6335, 'grad_norm': 2.46323561668396, 'learning_rate': 1.4698079094479736e-06, 'epoch': 1.85}


 93%|█████████▎| 268003/288718 [5:41:07<16:31, 20.90it/s]   

{'loss': 0.6227, 'grad_norm': 1.9837294816970825, 'learning_rate': 1.435172036381521e-06, 'epoch': 1.86}


 93%|█████████▎| 268504/288718 [5:41:36<16:43, 20.14it/s]

{'loss': 0.6218, 'grad_norm': 2.9059789180755615, 'learning_rate': 1.400536163315069e-06, 'epoch': 1.86}


 93%|█████████▎| 269002/288718 [5:42:02<15:08, 21.69it/s]

{'loss': 0.6271, 'grad_norm': 1.4404706954956055, 'learning_rate': 1.3659002902486166e-06, 'epoch': 1.86}


 93%|█████████▎| 269503/288718 [5:42:28<17:53, 17.89it/s]

{'loss': 0.6239, 'grad_norm': 2.5837185382843018, 'learning_rate': 1.331264417182164e-06, 'epoch': 1.87}


 94%|█████████▎| 270003/288718 [5:42:55<16:32, 18.85it/s]

{'loss': 0.6221, 'grad_norm': 2.208918333053589, 'learning_rate': 1.2966285441157117e-06, 'epoch': 1.87}


 94%|█████████▎| 270501/288718 [5:43:26<29:04, 10.44it/s]

{'loss': 0.6335, 'grad_norm': 1.8498791456222534, 'learning_rate': 1.2619926710492591e-06, 'epoch': 1.87}


 94%|█████████▍| 271003/288718 [5:44:01<18:17, 16.15it/s]

{'loss': 0.6227, 'grad_norm': 2.2668066024780273, 'learning_rate': 1.2273567979828068e-06, 'epoch': 1.88}


 94%|█████████▍| 271503/288718 [5:44:33<17:00, 16.87it/s]

{'loss': 0.6192, 'grad_norm': 1.9353035688400269, 'learning_rate': 1.1927209249163545e-06, 'epoch': 1.88}


 94%|█████████▍| 272002/288718 [5:45:04<18:32, 15.03it/s]

{'loss': 0.6254, 'grad_norm': 1.352500319480896, 'learning_rate': 1.1580850518499021e-06, 'epoch': 1.88}


 94%|█████████▍| 272502/288718 [5:45:36<16:30, 16.37it/s]

{'loss': 0.6229, 'grad_norm': 2.24875807762146, 'learning_rate': 1.1234491787834496e-06, 'epoch': 1.89}


 95%|█████████▍| 273002/288718 [5:46:07<18:33, 14.11it/s]

{'loss': 0.6181, 'grad_norm': 2.0663890838623047, 'learning_rate': 1.0888133057169974e-06, 'epoch': 1.89}


 95%|█████████▍| 273502/288718 [5:46:39<14:54, 17.01it/s]

{'loss': 0.6246, 'grad_norm': 1.4553951025009155, 'learning_rate': 1.054177432650545e-06, 'epoch': 1.89}


 95%|█████████▍| 274002/288718 [5:47:11<15:09, 16.18it/s]

{'loss': 0.6307, 'grad_norm': 2.146832227706909, 'learning_rate': 1.0195415595840926e-06, 'epoch': 1.9}


 95%|█████████▌| 274502/288718 [5:47:43<17:33, 13.49it/s]

{'loss': 0.6264, 'grad_norm': 1.8657153844833374, 'learning_rate': 9.849056865176402e-07, 'epoch': 1.9}


 95%|█████████▌| 275002/288718 [5:48:16<14:20, 15.94it/s]

{'loss': 0.6236, 'grad_norm': 1.5495009422302246, 'learning_rate': 9.502698134511877e-07, 'epoch': 1.9}


 95%|█████████▌| 275502/288718 [5:48:48<13:08, 16.76it/s]

{'loss': 0.6225, 'grad_norm': 1.4914100170135498, 'learning_rate': 9.156339403847353e-07, 'epoch': 1.91}


 96%|█████████▌| 276002/288718 [5:49:20<12:25, 17.06it/s]

{'loss': 0.6235, 'grad_norm': 1.531488299369812, 'learning_rate': 8.80998067318283e-07, 'epoch': 1.91}


 96%|█████████▌| 276502/288718 [5:49:52<13:40, 14.89it/s]

{'loss': 0.6273, 'grad_norm': 2.420369863510132, 'learning_rate': 8.463621942518307e-07, 'epoch': 1.92}


 96%|█████████▌| 277002/288718 [5:50:25<12:17, 15.90it/s]

{'loss': 0.6244, 'grad_norm': 1.735358476638794, 'learning_rate': 8.117263211853782e-07, 'epoch': 1.92}


 96%|█████████▌| 277502/288718 [5:50:58<13:03, 14.31it/s]

{'loss': 0.6258, 'grad_norm': 1.7514723539352417, 'learning_rate': 7.770904481189258e-07, 'epoch': 1.92}


 96%|█████████▋| 278002/288718 [5:51:31<11:26, 15.60it/s]

{'loss': 0.6246, 'grad_norm': 1.788776159286499, 'learning_rate': 7.424545750524733e-07, 'epoch': 1.93}


 96%|█████████▋| 278503/288718 [5:52:04<10:47, 15.77it/s]

{'loss': 0.6262, 'grad_norm': 1.2016559839248657, 'learning_rate': 7.07818701986021e-07, 'epoch': 1.93}


 97%|█████████▋| 279003/288718 [5:52:36<10:33, 15.35it/s]

{'loss': 0.6164, 'grad_norm': 2.02087664604187, 'learning_rate': 6.731828289195687e-07, 'epoch': 1.93}


 97%|█████████▋| 279503/288718 [5:53:08<09:32, 16.10it/s]

{'loss': 0.6267, 'grad_norm': 2.1771323680877686, 'learning_rate': 6.385469558531163e-07, 'epoch': 1.94}


 97%|█████████▋| 280001/288718 [5:53:41<09:41, 15.00it/s]

{'loss': 0.6254, 'grad_norm': 2.2277307510375977, 'learning_rate': 6.039110827866639e-07, 'epoch': 1.94}


 97%|█████████▋| 280503/288718 [5:54:14<08:38, 15.85it/s]

{'loss': 0.6219, 'grad_norm': 1.6762011051177979, 'learning_rate': 5.692752097202114e-07, 'epoch': 1.94}


 97%|█████████▋| 281003/288718 [5:54:46<08:15, 15.58it/s]

{'loss': 0.6226, 'grad_norm': 2.382305860519409, 'learning_rate': 5.346393366537591e-07, 'epoch': 1.95}


 98%|█████████▊| 281503/288718 [5:55:19<10:55, 11.00it/s]

{'loss': 0.6214, 'grad_norm': 1.6720304489135742, 'learning_rate': 5.000034635873066e-07, 'epoch': 1.95}


 98%|█████████▊| 282001/288718 [5:55:52<07:18, 15.33it/s]

{'loss': 0.6212, 'grad_norm': 1.5062971115112305, 'learning_rate': 4.6536759052085425e-07, 'epoch': 1.95}


 98%|█████████▊| 282503/288718 [5:56:24<06:30, 15.92it/s]

{'loss': 0.6243, 'grad_norm': 2.262923240661621, 'learning_rate': 4.307317174544019e-07, 'epoch': 1.96}


 98%|█████████▊| 283003/288718 [5:56:57<06:18, 15.12it/s]

{'loss': 0.6278, 'grad_norm': 1.4209555387496948, 'learning_rate': 3.960958443879495e-07, 'epoch': 1.96}


 98%|█████████▊| 283503/288718 [5:57:30<05:46, 15.06it/s]

{'loss': 0.6174, 'grad_norm': 1.6681700944900513, 'learning_rate': 3.614599713214971e-07, 'epoch': 1.96}


 98%|█████████▊| 284003/288718 [5:58:03<05:04, 15.46it/s]

{'loss': 0.6291, 'grad_norm': 2.0384202003479004, 'learning_rate': 3.2682409825504474e-07, 'epoch': 1.97}


 99%|█████████▊| 284501/288718 [5:58:36<05:13, 13.46it/s]

{'loss': 0.6269, 'grad_norm': 1.9272228479385376, 'learning_rate': 2.9218822518859235e-07, 'epoch': 1.97}


 99%|█████████▊| 285001/288718 [5:59:09<04:04, 15.22it/s]

{'loss': 0.6249, 'grad_norm': 2.4072229862213135, 'learning_rate': 2.5755235212213996e-07, 'epoch': 1.97}


 99%|█████████▉| 285503/288718 [5:59:41<03:17, 16.30it/s]

{'loss': 0.6281, 'grad_norm': 4.693045616149902, 'learning_rate': 2.2291647905568757e-07, 'epoch': 1.98}


 99%|█████████▉| 286003/288718 [6:00:14<03:02, 14.87it/s]

{'loss': 0.6255, 'grad_norm': 2.242921829223633, 'learning_rate': 1.8828060598923518e-07, 'epoch': 1.98}


 99%|█████████▉| 286503/288718 [6:00:48<02:20, 15.76it/s]

{'loss': 0.6234, 'grad_norm': 1.861802339553833, 'learning_rate': 1.536447329227828e-07, 'epoch': 1.98}


 99%|█████████▉| 287003/288718 [6:01:21<01:55, 14.85it/s]

{'loss': 0.6252, 'grad_norm': 2.032949686050415, 'learning_rate': 1.1900885985633041e-07, 'epoch': 1.99}


100%|█████████▉| 287503/288718 [6:01:53<01:15, 16.12it/s]

{'loss': 0.6209, 'grad_norm': 1.804974913597107, 'learning_rate': 8.437298678987802e-08, 'epoch': 1.99}


100%|█████████▉| 288003/288718 [6:02:27<00:47, 15.21it/s]

{'loss': 0.6273, 'grad_norm': 1.3401600122451782, 'learning_rate': 4.973711372342563e-08, 'epoch': 2.0}


100%|█████████▉| 288503/288718 [6:03:00<00:14, 15.19it/s]

{'loss': 0.6237, 'grad_norm': 1.4872955083847046, 'learning_rate': 1.510124065697324e-08, 'epoch': 2.0}


                                                         
100%|██████████| 288718/288718 [6:33:17<00:00, 12.39it/s]

{'eval_loss': 0.6259598135948181, 'eval_accuracy': 0.6321109718681128, 'eval_runtime': 1802.2599, 'eval_samples_per_second': 321.0, 'eval_steps_per_second': 20.063, 'epoch': 2.0}


100%|██████████| 288718/288718 [6:33:18<00:00, 12.23it/s]

{'train_runtime': 23598.5489, 'train_samples_per_second': 195.752, 'train_steps_per_second': 12.235, 'train_loss': 0.6329476171114634, 'epoch': 2.0}





TrainOutput(global_step=288718, training_loss=0.6329476171114634, metrics={'train_runtime': 23598.5489, 'train_samples_per_second': 195.752, 'train_steps_per_second': 12.235, 'total_flos': 4.302666124454362e+16, 'train_loss': 0.6329476171114634, 'epoch': 2.0})