In [1]:
from google.colab import drive

drive.mount('/content/drive', force_remount=True)
repository = 'evaluating_factuality_word_definitions'

%cd /content/drive/My Drive/{repository}

Mounted at /content/drive
/content/drive/My Drive/evaluating_factuality_word_definitions


In [2]:
!pip install datasets
!pip install peft

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.22.2-py3-none-any.

In [3]:
import torch
import gc
import numpy as np
from datasets import Dataset
from peft import LoraConfig, TaskType, get_peft_model
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader
from torch import optim
from tqdm import tqdm
import transformers
from transformers import AutoTokenizer, BigBirdModel
from matplotlib import pyplot as plt
from torch.cuda.amp import GradScaler, autocast

from config import DB_URL
from dataset.def_dataset import DefinitionDataset
from models.evidence_selection_model import EvidenceSelectionModel
from losses.supcon import SupConLoss
import torch.nn.functional as F

# Evaluation

In [4]:
def convert_to_labels(similarities, labels, k=2):
    top_indices = torch.topk(similarities, k=min(k, similarities.size(1)))[1]
    predicted = torch.zeros_like(similarities)
    predicted.scatter_(1, top_indices, 1)

    top_k_hits = labels[torch.arange(labels.size(0)).unsqueeze(1), top_indices]
    top_k_hits = torch.any(top_k_hits == 1, dim=1).float()

    mask = (labels != -1).flatten()
    return predicted.flatten()[mask], labels.flatten()[mask], top_k_hits

In [5]:
def evaluate(ev_model, dataloader, loss_function):
    gt_labels = []
    pr_labels = []
    all_top_k_hits = []
    all_loss = []
    for batch in tqdm(dataloader):
        ev_model.eval()
        with torch.no_grad():
            model_input = batch["model_input"]
            claim_embedding = ev_model(input_ids=model_input['claim_input_ids'],
                                       attention_mask=model_input['claim_attention_mask'])
            sentence_embeddings = ev_model(input_ids=model_input['input_ids'],
                                           attention_mask=model_input['attention_mask'],
                                           sentence_mask=model_input['sentence_mask'])

            loss = loss_function(claim_embedding, sentence_embeddings, labels=batch['labels'])
            claim_similarities = F.cosine_similarity(claim_embedding, sentence_embeddings, dim=2)
            claim_similarities = claim_similarities.nan_to_num(nan=float('-inf'))
        predicted, true_labels, top_k_hits = convert_to_labels(claim_similarities, batch['labels'],
                                                               k=3)
        gt_labels.extend(true_labels.tolist())
        pr_labels.extend(predicted.tolist())
        all_top_k_hits.extend(top_k_hits.tolist())
        all_loss.append(loss)
    loss = sum(all_loss) / len(all_loss)
    top_k_acc = sum(all_top_k_hits) / len(all_top_k_hits)

    return loss.item(), top_k_acc, classification_report(gt_labels, pr_labels)

# Training

In [12]:
torch.cuda.empty_cache()
gc.collect()

99

In [11]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model_name = 'google/bigbird-roberta-large'
model = BigBirdModel.from_pretrained(model_name)

# Add all lora compatible modules
target_modules = []
for name, module in model.named_modules():
    if isinstance(module, (torch.nn.Linear, torch.nn.Embedding, torch.nn.Conv2d, transformers.pytorch_utils.Conv1D)):
        target_modules.append(name)

peft_config = LoraConfig(task_type=TaskType.FEATURE_EXTRACTION, inference_mode=False, r=64, lora_alpha=32, lora_dropout=0.1, target_modules=target_modules)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

selection_model = EvidenceSelectionModel(model).to(device)

tokenizer = AutoTokenizer.from_pretrained(model_name)

trainable params: 32,124,416 || all params: 391,248,384 || trainable%: 8.21074726790437


In [10]:
dataset_query = """
select dd.id, dd.claim, dd.label, docs.document_id, docs.text,
       docs.lines, group_concat(dd.evidence_sentence_id) as evidence_lines
from def_dataset dd
    join documents docs on docs.document_id = dd.evidence_wiki_url
where set_type='{set_type}' -- and length(claim) < 50 and length(docs.text) < 400
group by dd.id, evidence_annotation_id, evidence_wiki_url
limit 2000
"""

train_dataset_raw = Dataset.from_sql(dataset_query.format(set_type='train'), con=DB_URL)
dev_dataset_raw = Dataset.from_sql(dataset_query.format(set_type='dev'), con=DB_URL)

train_dataset = DefinitionDataset(train_dataset_raw, tokenizer, mode='train', model='evidence_selection')
train_dataloader = DataLoader(train_dataset, shuffle=True,
                              collate_fn=train_dataset.collate_fn,
                              batch_size=1)
dev_dataset = DefinitionDataset(dev_dataset_raw, tokenizer, mode='train', model='evidence_selection')
dev_dataloader = DataLoader(train_dataset, shuffle=True,
                              collate_fn=dev_dataset.collate_fn,
                              batch_size=1)

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/2000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1513 [00:00<?, ? examples/s]

In [13]:
#warmup_steps = 0
#t_total = int(len(train_dataloader) * args.num_epochs / args.gradient_accumulation_steps)

optimizer = optim.AdamW(selection_model.parameters(), lr=1e-5)
#scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)
criterion = SupConLoss(temperature=0.17)

## Gradient Accumulation

Accumulates the gradient, which allows finetuning in a batch manner of large models. Works well for Bert

In [None]:
timestamp = datetime.now().strftime("%m-%d_%H-%M")

num_epochs = 5
patience = 4
gradient_accumulation = 16  # 2048
trace_train = []
trace_val = []

selection_model.zero_grad()
use_amp = True
scaler = GradScaler(enabled=use_amp, init_scale=1)

best_loss = np.inf
for epoch in range(num_epochs):
    bar_desc = "Epoch %d of %d | Iteration" % (epoch + 1, num_epochs)
    train_iterator = tqdm(train_dataloader, desc=bar_desc)

    train_loss = 0
    print('Train ...')
    for step, batch in enumerate(train_iterator):
        selection_model.train()
        model_input = batch["model_input"]

        with autocast():
            claim_embedding = selection_model(input_ids=model_input['claim_input_ids'],
                                              attention_mask=model_input['claim_attention_mask'])
            sentence_embeddings = selection_model(input_ids=model_input['input_ids'],
                                                  attention_mask=model_input['attention_mask'],
                                                  sentence_mask=model_input['sentence_mask'])

            loss = criterion(claim_embedding, sentence_embeddings, labels=batch['labels'])
            train_loss += loss.detach().item()
            loss = (loss / gradient_accumulation)

        scaler.scale(loss).backward()

        total_norm = 0
        for name, param in selection_model.named_parameters():
            if param.grad is not None:
                param_norm = param.grad.data.norm(2)
                total_norm += param_norm.item() ** 2
        total_norm = total_norm ** (1 / 2)

        # Print gradient norm
        print(f'Gradient norm: {total_norm}')

        if (step + 1) % gradient_accumulation == 0:
            scaler.unscale_(optimizer)
            count = 0
            scaler.step(optimizer)
            scaler.update()
            optimizer.step()
            # scheduler.step()
            optimizer.zero_grad()

    trace_train.append(train_loss / len(train_dataloader))
    # validation
    with torch.no_grad():
        val_loss, val_top_k_acc, report = evaluate(selection_model, dev_dataloader, criterion)
        trace_val.append(val_loss)
        print(
            f'Epoch {epoch + 1}/{num_epochs}, Training Loss: {(train_loss / len(train_dataloader)):.4f}, Validation Loss: {val_loss:.4f}')
        print(f'Validation top k acc: {val_top_k_acc:.4f}')
        print(report)

        if val_loss < best_loss:
            best_loss = val_loss
            best_epoch = epoch
            best_state = {key: value.cpu() for key, value in
                          selection_model.state_dict().items()}
            selection_model.save(f'selection_model_intermediate_{timestamp}')
        else:
            if epoch >= best_epoch + patience:
                break

selection_model.load_state_dict(best_state)
selection_model.save(f'selection_model_{timestamp}')

plt.plot(trace_train, label='train')
plt.plot(trace_val, label='validation')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)

Epoch 1 of 5 | Iteration:   0%|          | 0/1212 [00:00<?, ?it/s]Attention type 'block_sparse' is not possible if sequence_length: 23 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


Train ...


Epoch 1 of 5 | Iteration:   0%|          | 1/1212 [00:01<27:30,  1.36s/it]

Gradient norm: 23.391392473183185


Epoch 1 of 5 | Iteration:   0%|          | 2/1212 [00:01<18:00,  1.12it/s]

Gradient norm: 23.40951146091544


Epoch 1 of 5 | Iteration:   0%|          | 3/1212 [00:02<15:34,  1.29it/s]

Gradient norm: 23.365501603206187


Epoch 1 of 5 | Iteration:   0%|          | 4/1212 [00:03<17:11,  1.17it/s]

Gradient norm: 23.49919420597834


Epoch 1 of 5 | Iteration:   0%|          | 5/1212 [00:04<16:00,  1.26it/s]

Gradient norm: 26.455217479946764


Epoch 1 of 5 | Iteration:   0%|          | 6/1212 [00:05<16:06,  1.25it/s]

Gradient norm: 133.98587634279718


Epoch 1 of 5 | Iteration:   1%|          | 7/1212 [00:05<16:27,  1.22it/s]

Gradient norm: 134.5672959236897


Epoch 1 of 5 | Iteration:   1%|          | 8/1212 [00:06<16:02,  1.25it/s]

Gradient norm: 134.4652828029562


Epoch 1 of 5 | Iteration:   1%|          | 9/1212 [00:07<16:42,  1.20it/s]

Gradient norm: 135.5588547173688


Epoch 1 of 5 | Iteration:   1%|          | 10/1212 [00:08<18:52,  1.06it/s]

Gradient norm: 150.82846910790275


Epoch 1 of 5 | Iteration:   1%|          | 11/1212 [00:09<20:21,  1.02s/it]

Gradient norm: 150.59420561296386


Epoch 1 of 5 | Iteration:   1%|          | 12/1212 [00:10<18:59,  1.05it/s]

Gradient norm: 151.26337198263388


Epoch 1 of 5 | Iteration:   1%|          | 13/1212 [00:11<18:02,  1.11it/s]

Gradient norm: 154.3960220403757


Epoch 1 of 5 | Iteration:   1%|          | 14/1212 [00:12<18:01,  1.11it/s]

Gradient norm: 154.5208454528094


Epoch 1 of 5 | Iteration:   1%|          | 15/1212 [00:13<16:48,  1.19it/s]

Gradient norm: 154.58776305027877


Epoch 1 of 5 | Iteration:   1%|▏         | 16/1212 [00:14<17:32,  1.14it/s]

Gradient norm: 157.88786401225414


Epoch 1 of 5 | Iteration:   1%|▏         | 17/1212 [00:15<19:18,  1.03it/s]

Gradient norm: 0.1736152193812545


Epoch 1 of 5 | Iteration:   1%|▏         | 18/1212 [00:16<21:48,  1.10s/it]

Gradient norm: 3.7276444836655513


Epoch 1 of 5 | Iteration:   2%|▏         | 19/1212 [00:17<22:34,  1.14s/it]

Gradient norm: 3.9488977203422144


Epoch 1 of 5 | Iteration:   2%|▏         | 20/1212 [00:18<20:23,  1.03s/it]

Gradient norm: 4.506320394578816


Epoch 1 of 5 | Iteration:   2%|▏         | 21/1212 [00:19<19:37,  1.01it/s]

Gradient norm: 4.672785103131052


Epoch 1 of 5 | Iteration:   2%|▏         | 22/1212 [00:20<16:20,  1.21it/s]

Gradient norm: 5.133735060792915


Epoch 1 of 5 | Iteration:   2%|▏         | 23/1212 [00:20<13:28,  1.47it/s]

Gradient norm: 5.3424144407018375


Epoch 1 of 5 | Iteration:   2%|▏         | 24/1212 [00:20<12:54,  1.53it/s]

Gradient norm: 5.474402896688282


Epoch 1 of 5 | Iteration:   2%|▏         | 25/1212 [00:21<11:42,  1.69it/s]

Gradient norm: 19.031105058605526


Epoch 1 of 5 | Iteration:   2%|▏         | 26/1212 [00:21<10:17,  1.92it/s]

Gradient norm: 19.042997582953852


Epoch 1 of 5 | Iteration:   2%|▏         | 27/1212 [00:22<09:24,  2.10it/s]

Gradient norm: 19.736676453590334


Epoch 1 of 5 | Iteration:   2%|▏         | 28/1212 [00:22<08:33,  2.31it/s]

Gradient norm: 48.82787386489653


Epoch 1 of 5 | Iteration:   2%|▏         | 29/1212 [00:22<08:23,  2.35it/s]

Gradient norm: 48.7150438074626


Epoch 1 of 5 | Iteration:   2%|▏         | 30/1212 [00:23<08:00,  2.46it/s]

Gradient norm: 56.613897504166296


Epoch 1 of 5 | Iteration:   3%|▎         | 31/1212 [00:23<07:29,  2.62it/s]

Gradient norm: 56.707890474503074


Epoch 1 of 5 | Iteration:   3%|▎         | 32/1212 [00:24<08:02,  2.45it/s]

Gradient norm: 54.7006312682836


Epoch 1 of 5 | Iteration:   3%|▎         | 33/1212 [00:24<08:19,  2.36it/s]

Gradient norm: 10.185593822834202


Epoch 1 of 5 | Iteration:   3%|▎         | 34/1212 [00:24<07:50,  2.51it/s]

Gradient norm: 10.6831592876509


Epoch 1 of 5 | Iteration:   3%|▎         | 35/1212 [00:25<07:38,  2.57it/s]

Gradient norm: 10.777067866101172


Epoch 1 of 5 | Iteration:   3%|▎         | 36/1212 [00:25<07:19,  2.68it/s]

Gradient norm: 10.795451317118026


Epoch 1 of 5 | Iteration:   3%|▎         | 37/1212 [00:25<07:04,  2.77it/s]

Gradient norm: 10.913870921150057


Epoch 1 of 5 | Iteration:   3%|▎         | 38/1212 [00:26<08:26,  2.32it/s]

Gradient norm: 11.300739535052982


Epoch 1 of 5 | Iteration:   3%|▎         | 39/1212 [00:26<07:57,  2.45it/s]

Gradient norm: 11.416354227495795


Epoch 1 of 5 | Iteration:   3%|▎         | 40/1212 [00:27<09:58,  1.96it/s]

Gradient norm: 12.582372315500063


Epoch 1 of 5 | Iteration:   3%|▎         | 41/1212 [00:28<10:24,  1.87it/s]

Gradient norm: 13.495229057941911


Epoch 1 of 5 | Iteration:   3%|▎         | 42/1212 [00:28<09:58,  1.95it/s]

Gradient norm: 13.548708537978902


Epoch 1 of 5 | Iteration:   4%|▎         | 43/1212 [00:29<09:57,  1.96it/s]

Gradient norm: 147.44654453816298


Epoch 1 of 5 | Iteration:   4%|▎         | 44/1212 [00:29<09:35,  2.03it/s]

Gradient norm: 147.84948029640506


Epoch 1 of 5 | Iteration:   4%|▎         | 45/1212 [00:30<10:20,  1.88it/s]

Gradient norm: 148.01666255014308


Epoch 1 of 5 | Iteration:   4%|▍         | 46/1212 [00:30<10:12,  1.90it/s]

Gradient norm: 147.9749118464437


Epoch 1 of 5 | Iteration:   4%|▍         | 47/1212 [00:31<10:13,  1.90it/s]

Gradient norm: 151.14692056165202


Epoch 1 of 5 | Iteration:   4%|▍         | 48/1212 [00:31<09:45,  1.99it/s]

Gradient norm: 157.63867405926055


Epoch 1 of 5 | Iteration:   4%|▍         | 49/1212 [00:32<08:47,  2.20it/s]

Gradient norm: 12.061772331232039


Epoch 1 of 5 | Iteration:   4%|▍         | 50/1212 [00:32<08:43,  2.22it/s]

Gradient norm: 19.071811970117817


Epoch 1 of 5 | Iteration:   4%|▍         | 51/1212 [00:32<08:09,  2.37it/s]

Gradient norm: 19.386382259570162


Epoch 1 of 5 | Iteration:   4%|▍         | 52/1212 [00:33<08:28,  2.28it/s]

Gradient norm: 21.36837499153537


Epoch 1 of 5 | Iteration:   4%|▍         | 53/1212 [00:33<09:05,  2.12it/s]

Gradient norm: 40.70190070858158


Epoch 1 of 5 | Iteration:   4%|▍         | 54/1212 [00:34<08:29,  2.27it/s]

Gradient norm: 40.140591904097036


Epoch 1 of 5 | Iteration:   5%|▍         | 55/1212 [00:34<09:37,  2.00it/s]

Gradient norm: 39.83183393270335


Epoch 1 of 5 | Iteration:   5%|▍         | 56/1212 [00:35<09:06,  2.11it/s]

Gradient norm: 39.905163133954865


Epoch 1 of 5 | Iteration:   5%|▍         | 57/1212 [00:35<08:38,  2.23it/s]

Gradient norm: 40.18260624391158


Epoch 1 of 5 | Iteration:   5%|▍         | 58/1212 [00:36<09:38,  1.99it/s]

Gradient norm: 40.03802382546877


Epoch 1 of 5 | Iteration:   5%|▍         | 59/1212 [00:36<09:03,  2.12it/s]

Gradient norm: 124.78893566456588


Epoch 1 of 5 | Iteration:   5%|▍         | 60/1212 [00:36<08:15,  2.32it/s]

Gradient norm: 124.78893566456588


Epoch 1 of 5 | Iteration:   5%|▌         | 61/1212 [00:37<08:24,  2.28it/s]

Gradient norm: 124.9186867431385


Epoch 1 of 5 | Iteration:   5%|▌         | 62/1212 [00:37<07:59,  2.40it/s]

Gradient norm: 124.18697739040657


Epoch 1 of 5 | Iteration:   5%|▌         | 63/1212 [00:38<07:30,  2.55it/s]

Gradient norm: 124.31849052372927


Epoch 1 of 5 | Iteration:   5%|▌         | 64/1212 [00:38<07:58,  2.40it/s]

Gradient norm: 160.74182729625954


Epoch 1 of 5 | Iteration:   5%|▌         | 65/1212 [00:38<07:30,  2.55it/s]

Gradient norm: 66.52248316049456


Epoch 1 of 5 | Iteration:   5%|▌         | 66/1212 [00:39<07:14,  2.64it/s]

Gradient norm: 68.16766383605638


Epoch 1 of 5 | Iteration:   6%|▌         | 67/1212 [00:39<07:04,  2.69it/s]

Gradient norm: 80.80943869085827


Epoch 1 of 5 | Iteration:   6%|▌         | 68/1212 [00:40<08:24,  2.27it/s]

Gradient norm: 82.19759315324143


Epoch 1 of 5 | Iteration:   6%|▌         | 69/1212 [00:40<08:47,  2.17it/s]

Gradient norm: 241.68947365224608


Epoch 1 of 5 | Iteration:   6%|▌         | 70/1212 [00:41<08:47,  2.17it/s]

Gradient norm: 241.65163416447297


Epoch 1 of 5 | Iteration:   6%|▌         | 71/1212 [00:41<09:25,  2.02it/s]

Gradient norm: 241.5063133325691


Epoch 1 of 5 | Iteration:   6%|▌         | 72/1212 [00:42<09:34,  1.98it/s]

Gradient norm: 241.5276222756191


Epoch 1 of 5 | Iteration:   6%|▌         | 73/1212 [00:42<09:59,  1.90it/s]

Gradient norm: 567.7516820069936


Epoch 1 of 5 | Iteration:   6%|▌         | 74/1212 [00:43<09:46,  1.94it/s]

Gradient norm: 566.8035940675161


Epoch 1 of 5 | Iteration:   6%|▌         | 75/1212 [00:43<09:40,  1.96it/s]

Gradient norm: 566.8008379976457


Epoch 1 of 5 | Iteration:   6%|▋         | 76/1212 [00:44<09:37,  1.97it/s]

Gradient norm: 567.8126644606504


Epoch 1 of 5 | Iteration:   6%|▋         | 77/1212 [00:44<09:21,  2.02it/s]

Gradient norm: 567.3527445126218


Epoch 1 of 5 | Iteration:   6%|▋         | 78/1212 [00:45<09:31,  1.99it/s]

Gradient norm: 567.8049195586781


Epoch 1 of 5 | Iteration:   7%|▋         | 79/1212 [00:45<08:36,  2.19it/s]

Gradient norm: 567.915829018373


Epoch 1 of 5 | Iteration:   7%|▋         | 80/1212 [00:46<08:10,  2.31it/s]

Gradient norm: 567.6554178502181


Epoch 1 of 5 | Iteration:   7%|▋         | 81/1212 [00:46<07:40,  2.45it/s]

Gradient norm: 2.3310944420791535


Epoch 1 of 5 | Iteration:   7%|▋         | 82/1212 [00:46<07:42,  2.44it/s]

Gradient norm: 80.67153229664245


Epoch 1 of 5 | Iteration:   7%|▋         | 83/1212 [00:47<07:20,  2.56it/s]

Gradient norm: 82.4103530484459


Epoch 1 of 5 | Iteration:   7%|▋         | 84/1212 [00:47<07:14,  2.60it/s]

Gradient norm: 82.41340828963229


Epoch 1 of 5 | Iteration:   7%|▋         | 85/1212 [00:47<07:16,  2.58it/s]

Gradient norm: 82.52854364407419


Epoch 1 of 5 | Iteration:   7%|▋         | 86/1212 [00:48<07:01,  2.67it/s]

Gradient norm: 82.53984127819513


Epoch 1 of 5 | Iteration:   7%|▋         | 87/1212 [00:48<07:57,  2.36it/s]

Gradient norm: 82.561731267078


Epoch 1 of 5 | Iteration:   7%|▋         | 88/1212 [00:49<07:30,  2.49it/s]

Gradient norm: 82.59661207524148


Epoch 1 of 5 | Iteration:   7%|▋         | 89/1212 [00:49<07:54,  2.37it/s]

Gradient norm: 82.81085412505563


Epoch 1 of 5 | Iteration:   7%|▋         | 90/1212 [00:50<07:26,  2.51it/s]

Gradient norm: 82.53822738771726


Epoch 1 of 5 | Iteration:   8%|▊         | 91/1212 [00:50<07:06,  2.63it/s]

Gradient norm: 82.5483673940694


Epoch 1 of 5 | Iteration:   8%|▊         | 92/1212 [00:50<07:08,  2.62it/s]

Gradient norm: 82.53013688193715


Epoch 1 of 5 | Iteration:   8%|▊         | 93/1212 [00:51<07:17,  2.56it/s]

Gradient norm: 82.52948413210214


Epoch 1 of 5 | Iteration:   8%|▊         | 94/1212 [00:51<07:51,  2.37it/s]

Gradient norm: 82.53673646175736


Epoch 1 of 5 | Iteration:   8%|▊         | 95/1212 [00:52<08:03,  2.31it/s]

Gradient norm: 82.4043327801063


Epoch 1 of 5 | Iteration:   8%|▊         | 96/1212 [00:52<08:35,  2.16it/s]

Gradient norm: 82.4410610949061


Epoch 1 of 5 | Iteration:   8%|▊         | 97/1212 [00:53<08:11,  2.27it/s]

Gradient norm: 0.30034657466547077


Epoch 1 of 5 | Iteration:   8%|▊         | 98/1212 [00:53<08:29,  2.19it/s]

Gradient norm: 0.40221141596296583


Epoch 1 of 5 | Iteration:   8%|▊         | 99/1212 [00:54<08:58,  2.07it/s]

Gradient norm: 1.1061170187872642


Epoch 1 of 5 | Iteration:   8%|▊         | 100/1212 [00:54<09:34,  1.94it/s]

Gradient norm: 2.751040562329486


Epoch 1 of 5 | Iteration:   8%|▊         | 101/1212 [00:55<09:28,  1.95it/s]

Gradient norm: 2.97631878116911


Epoch 1 of 5 | Iteration:   8%|▊         | 102/1212 [00:55<09:16,  1.99it/s]

Gradient norm: 4.649680003824515


Epoch 1 of 5 | Iteration:   8%|▊         | 103/1212 [00:56<09:47,  1.89it/s]

Gradient norm: 10.917373681206092


Epoch 1 of 5 | Iteration:   9%|▊         | 104/1212 [00:56<09:21,  1.97it/s]

Gradient norm: 12.929760109393138


Epoch 1 of 5 | Iteration:   9%|▊         | 105/1212 [00:57<10:31,  1.75it/s]

Gradient norm: 13.386621527749998


Epoch 1 of 5 | Iteration:   9%|▊         | 106/1212 [00:57<10:07,  1.82it/s]

Gradient norm: 149.77044585155076


Epoch 1 of 5 | Iteration:   9%|▉         | 107/1212 [00:58<09:15,  1.99it/s]

Gradient norm: 149.69119519492224


Epoch 1 of 5 | Iteration:   9%|▉         | 108/1212 [00:58<08:39,  2.13it/s]

Gradient norm: 149.79169894592596


Epoch 1 of 5 | Iteration:   9%|▉         | 109/1212 [00:59<09:15,  1.98it/s]

Gradient norm: 149.87348714670358


Epoch 1 of 5 | Iteration:   9%|▉         | 110/1212 [00:59<08:20,  2.20it/s]

Gradient norm: 149.8303433109408


Epoch 1 of 5 | Iteration:   9%|▉         | 111/1212 [01:00<09:03,  2.02it/s]

Gradient norm: 149.78062165126136


Epoch 1 of 5 | Iteration:   9%|▉         | 112/1212 [01:00<08:27,  2.17it/s]

Gradient norm: 149.79886130368374


Epoch 1 of 5 | Iteration:   9%|▉         | 113/1212 [01:00<07:49,  2.34it/s]

Gradient norm: 1.0490326230696507


Epoch 1 of 5 | Iteration:   9%|▉         | 114/1212 [01:01<07:30,  2.44it/s]

Gradient norm: 1.6968282902623593


Epoch 1 of 5 | Iteration:   9%|▉         | 115/1212 [01:01<08:28,  2.16it/s]

Gradient norm: 2.1861051084746


Epoch 1 of 5 | Iteration:  10%|▉         | 116/1212 [01:02<08:19,  2.19it/s]

Gradient norm: 8.66137622947013


Epoch 1 of 5 | Iteration:  10%|▉         | 117/1212 [01:02<08:07,  2.24it/s]

Gradient norm: 8.82989566866487


Epoch 1 of 5 | Iteration:  10%|▉         | 118/1212 [01:03<07:35,  2.40it/s]

Gradient norm: 8.475391114633544


Epoch 1 of 5 | Iteration:  10%|▉         | 119/1212 [01:03<07:37,  2.39it/s]

Gradient norm: 8.548751524623318


Epoch 1 of 5 | Iteration:  10%|▉         | 120/1212 [01:04<08:15,  2.20it/s]

Gradient norm: 8.54714189666574


Epoch 1 of 5 | Iteration:  10%|▉         | 121/1212 [01:04<08:12,  2.21it/s]

Gradient norm: 9.85334795363933


Epoch 1 of 5 | Iteration:  10%|█         | 122/1212 [01:04<07:43,  2.35it/s]

Gradient norm: 9.803814913318183


Epoch 1 of 5 | Iteration:  10%|█         | 123/1212 [01:05<08:19,  2.18it/s]

Gradient norm: 10.470126310040865


Epoch 1 of 5 | Iteration:  10%|█         | 124/1212 [01:05<07:40,  2.36it/s]

Gradient norm: 34.29601228850423


Epoch 1 of 5 | Iteration:  10%|█         | 125/1212 [01:06<07:32,  2.40it/s]

Gradient norm: 55.56438095921495


Epoch 1 of 5 | Iteration:  10%|█         | 126/1212 [01:06<07:54,  2.29it/s]

Gradient norm: 55.53778346142653


Epoch 1 of 5 | Iteration:  10%|█         | 127/1212 [01:07<08:01,  2.25it/s]

Gradient norm: 55.55250192624043


Epoch 1 of 5 | Iteration:  11%|█         | 128/1212 [01:07<07:28,  2.42it/s]

Gradient norm: 56.87987780733545


Epoch 1 of 5 | Iteration:  11%|█         | 129/1212 [01:07<07:54,  2.28it/s]

Gradient norm: 1.6906196167295102


Epoch 1 of 5 | Iteration:  11%|█         | 130/1212 [01:08<08:09,  2.21it/s]

Gradient norm: 5.741355789716777


Epoch 1 of 5 | Iteration:  11%|█         | 131/1212 [01:08<08:20,  2.16it/s]

Gradient norm: 35.13235257345431


Epoch 1 of 5 | Iteration:  11%|█         | 132/1212 [01:09<08:20,  2.16it/s]

Gradient norm: 35.13921413654202


Epoch 1 of 5 | Iteration:  11%|█         | 133/1212 [01:09<08:49,  2.04it/s]

Gradient norm: 35.90046833060682


Epoch 1 of 5 | Iteration:  11%|█         | 134/1212 [01:10<09:03,  1.98it/s]

Gradient norm: 35.976592054947325


Epoch 1 of 5 | Iteration:  11%|█         | 135/1212 [01:10<09:03,  1.98it/s]

Gradient norm: 55.03945267043797


Epoch 1 of 5 | Iteration:  11%|█         | 136/1212 [01:11<09:32,  1.88it/s]

Gradient norm: 54.04144173418138


Epoch 1 of 5 | Iteration:  11%|█▏        | 137/1212 [01:12<09:08,  1.96it/s]

Gradient norm: 54.191988554831944


Epoch 1 of 5 | Iteration:  11%|█▏        | 138/1212 [01:12<08:18,  2.16it/s]

Gradient norm: 54.685017928244356


Epoch 1 of 5 | Iteration:  11%|█▏        | 139/1212 [01:12<07:45,  2.31it/s]

Gradient norm: 54.595465856766516


Epoch 1 of 5 | Iteration:  12%|█▏        | 140/1212 [01:13<07:16,  2.45it/s]

Gradient norm: 69.46478493816194


Epoch 1 of 5 | Iteration:  12%|█▏        | 141/1212 [01:13<07:07,  2.51it/s]

Gradient norm: 69.56178440479644


Epoch 1 of 5 | Iteration:  12%|█▏        | 142/1212 [01:13<06:49,  2.61it/s]

Gradient norm: 69.52588560788502


Epoch 1 of 5 | Iteration:  12%|█▏        | 143/1212 [01:14<08:05,  2.20it/s]

Gradient norm: 69.72181233015344


Epoch 1 of 5 | Iteration:  12%|█▏        | 144/1212 [01:14<07:37,  2.33it/s]

Gradient norm: 69.42031388305186


Epoch 1 of 5 | Iteration:  12%|█▏        | 145/1212 [01:15<08:17,  2.14it/s]

Gradient norm: 5.206711957169527


Epoch 1 of 5 | Iteration:  12%|█▏        | 146/1212 [01:15<07:36,  2.34it/s]

Gradient norm: 31.850355265049373


Epoch 1 of 5 | Iteration:  12%|█▏        | 147/1212 [01:16<07:01,  2.53it/s]

Gradient norm: 32.07229917461871


Epoch 1 of 5 | Iteration:  12%|█▏        | 148/1212 [01:16<07:17,  2.43it/s]

Gradient norm: 33.608400433526235


Epoch 1 of 5 | Iteration:  12%|█▏        | 149/1212 [01:16<07:10,  2.47it/s]

Gradient norm: 35.98928435530082


Epoch 1 of 5 | Iteration:  12%|█▏        | 150/1212 [01:17<07:09,  2.47it/s]

Gradient norm: 34.630798717625574


Epoch 1 of 5 | Iteration:  12%|█▏        | 151/1212 [01:17<06:53,  2.56it/s]

Gradient norm: 34.70495113778048


Epoch 1 of 5 | Iteration:  13%|█▎        | 152/1212 [01:18<06:47,  2.60it/s]

Gradient norm: 34.741734400850284


Epoch 1 of 5 | Iteration:  13%|█▎        | 153/1212 [01:18<07:27,  2.37it/s]

Gradient norm: 2036.245693212111


Epoch 1 of 5 | Iteration:  13%|█▎        | 154/1212 [01:19<08:33,  2.06it/s]

Gradient norm: 2036.6820067256945


Epoch 1 of 5 | Iteration:  13%|█▎        | 155/1212 [01:19<08:47,  2.01it/s]

Gradient norm: 2035.1114083699363


Epoch 1 of 5 | Iteration:  13%|█▎        | 156/1212 [01:20<08:29,  2.07it/s]

Gradient norm: 2034.4074164610608


Epoch 1 of 5 | Iteration:  13%|█▎        | 157/1212 [01:20<08:04,  2.18it/s]

Gradient norm: 2034.348923767546


Epoch 1 of 5 | Iteration:  13%|█▎        | 158/1212 [01:21<08:39,  2.03it/s]

Gradient norm: 2034.4334409404578


Epoch 1 of 5 | Iteration:  13%|█▎        | 159/1212 [01:21<09:20,  1.88it/s]

Gradient norm: 2034.3056959955932


Epoch 1 of 5 | Iteration:  13%|█▎        | 160/1212 [01:22<09:20,  1.88it/s]

Gradient norm: 2033.7166181093075


Epoch 1 of 5 | Iteration:  13%|█▎        | 161/1212 [01:22<09:44,  1.80it/s]

Gradient norm: 0.1883144687332576


Epoch 1 of 5 | Iteration:  13%|█▎        | 162/1212 [01:23<09:18,  1.88it/s]

Gradient norm: 0.891441107929902


Epoch 1 of 5 | Iteration:  13%|█▎        | 163/1212 [01:24<10:17,  1.70it/s]

Gradient norm: 7.204547880596923


Epoch 1 of 5 | Iteration:  14%|█▎        | 164/1212 [01:24<09:29,  1.84it/s]

Gradient norm: 7.287969251531332


Epoch 1 of 5 | Iteration:  14%|█▎        | 165/1212 [01:24<08:47,  1.99it/s]

Gradient norm: 7.4357461827353415


Epoch 1 of 5 | Iteration:  14%|█▎        | 166/1212 [01:25<08:09,  2.14it/s]

Gradient norm: 8.174841040027658


Epoch 1 of 5 | Iteration:  14%|█▍        | 167/1212 [01:25<07:44,  2.25it/s]

Gradient norm: 8.294635125610972


Epoch 1 of 5 | Iteration:  14%|█▍        | 168/1212 [01:26<07:40,  2.26it/s]

Gradient norm: 25.50322945150005


Epoch 1 of 5 | Iteration:  14%|█▍        | 169/1212 [01:26<08:37,  2.01it/s]

Gradient norm: 25.82423794300512


Epoch 1 of 5 | Iteration:  14%|█▍        | 170/1212 [01:27<08:21,  2.08it/s]

Gradient norm: 42.85408662670297


Epoch 1 of 5 | Iteration:  14%|█▍        | 171/1212 [01:27<07:49,  2.22it/s]

Gradient norm: 42.181111777931484


Epoch 1 of 5 | Iteration:  14%|█▍        | 172/1212 [01:27<07:40,  2.26it/s]

Gradient norm: 42.31957404548515


Epoch 1 of 5 | Iteration:  14%|█▍        | 173/1212 [01:28<07:14,  2.39it/s]

Gradient norm: 42.22275660297583


Epoch 1 of 5 | Iteration:  14%|█▍        | 174/1212 [01:28<06:58,  2.48it/s]

Gradient norm: 43.38669208896718


Epoch 1 of 5 | Iteration:  14%|█▍        | 175/1212 [01:29<07:18,  2.37it/s]

Gradient norm: 43.371238541524654


Epoch 1 of 5 | Iteration:  15%|█▍        | 176/1212 [01:29<07:30,  2.30it/s]

Gradient norm: 43.44048767699304


Epoch 1 of 5 | Iteration:  15%|█▍        | 177/1212 [01:30<07:06,  2.43it/s]

Gradient norm: 1.2405629288525701


Epoch 1 of 5 | Iteration:  15%|█▍        | 178/1212 [01:30<06:46,  2.54it/s]

Gradient norm: 1.8009574644912907


Epoch 1 of 5 | Iteration:  15%|█▍        | 179/1212 [01:30<07:08,  2.41it/s]

Gradient norm: 2.400698240628449


Epoch 1 of 5 | Iteration:  15%|█▍        | 180/1212 [01:31<06:46,  2.54it/s]

Gradient norm: 2.4010156536717093


Epoch 1 of 5 | Iteration:  15%|█▍        | 181/1212 [01:31<06:37,  2.60it/s]

Gradient norm: 2.4989506349103383


Epoch 1 of 5 | Iteration:  15%|█▌        | 182/1212 [01:31<06:32,  2.63it/s]

Gradient norm: 7.507837480051459


Epoch 1 of 5 | Iteration:  15%|█▌        | 183/1212 [01:32<06:39,  2.58it/s]

Gradient norm: 116.8698714815897


Epoch 1 of 5 | Iteration:  15%|█▌        | 184/1212 [01:32<06:27,  2.66it/s]

Gradient norm: 116.88472639207254


Epoch 1 of 5 | Iteration:  15%|█▌        | 185/1212 [01:33<07:13,  2.37it/s]

Gradient norm: 116.80863209413155


Epoch 1 of 5 | Iteration:  15%|█▌        | 186/1212 [01:33<06:59,  2.45it/s]

Gradient norm: 114.47937170407194


Epoch 1 of 5 | Iteration:  15%|█▌        | 187/1212 [01:33<06:37,  2.58it/s]

Gradient norm: 114.6778713750932


Epoch 1 of 5 | Iteration:  16%|█▌        | 188/1212 [01:34<07:05,  2.41it/s]

Gradient norm: 115.22578423409675


Epoch 1 of 5 | Iteration:  16%|█▌        | 189/1212 [01:34<07:19,  2.33it/s]

Gradient norm: 115.2282180455089


Epoch 1 of 5 | Iteration:  16%|█▌        | 190/1212 [01:35<07:59,  2.13it/s]

Gradient norm: 115.31515636817772


Epoch 1 of 5 | Iteration:  16%|█▌        | 191/1212 [01:35<08:05,  2.10it/s]

Gradient norm: 115.15097425805651


Epoch 1 of 5 | Iteration:  16%|█▌        | 192/1212 [01:36<08:17,  2.05it/s]

Gradient norm: 118.13163006336026


Epoch 1 of 5 | Iteration:  16%|█▌        | 193/1212 [01:37<08:45,  1.94it/s]

Gradient norm: 1.3534266412022289


Epoch 1 of 5 | Iteration:  16%|█▌        | 194/1212 [01:37<08:27,  2.01it/s]

Gradient norm: 6.983908286951438


Epoch 1 of 5 | Iteration:  16%|█▌        | 195/1212 [01:37<07:54,  2.14it/s]

Gradient norm: 8.149270550092679


Epoch 1 of 5 | Iteration:  16%|█▌        | 196/1212 [01:38<07:27,  2.27it/s]

Gradient norm: 8.21859397871981


Epoch 1 of 5 | Iteration:  16%|█▋        | 197/1212 [01:38<07:00,  2.41it/s]

Gradient norm: 8.200932788581813


Epoch 1 of 5 | Iteration:  16%|█▋        | 198/1212 [01:39<07:04,  2.39it/s]

Gradient norm: 8.641402888692829


Epoch 1 of 5 | Iteration:  16%|█▋        | 199/1212 [01:39<07:29,  2.25it/s]

Gradient norm: 11.422013185949409


Epoch 1 of 5 | Iteration:  17%|█▋        | 200/1212 [01:40<07:53,  2.14it/s]

Gradient norm: 12.529180682018536


Epoch 1 of 5 | Iteration:  17%|█▋        | 201/1212 [01:40<07:44,  2.18it/s]

Gradient norm: 12.46632561373565


Epoch 1 of 5 | Iteration:  17%|█▋        | 202/1212 [01:40<07:13,  2.33it/s]

Gradient norm: 12.703186792814867


Epoch 1 of 5 | Iteration:  17%|█▋        | 203/1212 [01:41<07:35,  2.22it/s]

Gradient norm: 12.748178883385345


Epoch 1 of 5 | Iteration:  17%|█▋        | 204/1212 [01:41<07:54,  2.13it/s]

Gradient norm: 35.90183216362166


Epoch 1 of 5 | Iteration:  17%|█▋        | 205/1212 [01:42<09:17,  1.81it/s]

Gradient norm: 35.80636725766221


Epoch 1 of 5 | Iteration:  17%|█▋        | 206/1212 [01:43<09:38,  1.74it/s]

Gradient norm: 36.05357258687665


Epoch 1 of 5 | Iteration:  17%|█▋        | 207/1212 [01:43<09:24,  1.78it/s]

Gradient norm: 36.04319433977332


Epoch 1 of 5 | Iteration:  17%|█▋        | 208/1212 [01:44<08:50,  1.89it/s]

Gradient norm: 36.110992828442996


Epoch 1 of 5 | Iteration:  17%|█▋        | 209/1212 [01:44<08:37,  1.94it/s]

Gradient norm: 15.708123627910183


Epoch 1 of 5 | Iteration:  17%|█▋        | 210/1212 [01:45<08:40,  1.93it/s]

Gradient norm: 15.867252020149799


Epoch 1 of 5 | Iteration:  17%|█▋        | 211/1212 [01:45<07:52,  2.12it/s]

Gradient norm: 16.526333710002437


Epoch 1 of 5 | Iteration:  17%|█▋        | 212/1212 [01:46<07:41,  2.17it/s]

Gradient norm: 16.637365309375667


Epoch 1 of 5 | Iteration:  18%|█▊        | 213/1212 [01:46<08:32,  1.95it/s]

Gradient norm: 32.03741763756385


Epoch 1 of 5 | Iteration:  18%|█▊        | 214/1212 [01:47<09:08,  1.82it/s]

Gradient norm: 31.941319374671203


Epoch 1 of 5 | Iteration:  18%|█▊        | 215/1212 [01:47<08:51,  1.88it/s]

Gradient norm: 32.686068754107666


Epoch 1 of 5 | Iteration:  18%|█▊        | 216/1212 [01:48<09:03,  1.83it/s]

Gradient norm: 32.59504592214134


Epoch 1 of 5 | Iteration:  18%|█▊        | 217/1212 [01:48<08:52,  1.87it/s]

Gradient norm: 32.57391443667001


Epoch 1 of 5 | Iteration:  18%|█▊        | 218/1212 [01:49<08:54,  1.86it/s]

Gradient norm: 32.76713169831785


Epoch 1 of 5 | Iteration:  18%|█▊        | 219/1212 [01:49<08:34,  1.93it/s]

Gradient norm: 176.95206108023476


Epoch 1 of 5 | Iteration:  18%|█▊        | 220/1212 [01:50<09:06,  1.81it/s]

Gradient norm: 177.52710518736941


Epoch 1 of 5 | Iteration:  18%|█▊        | 221/1212 [01:50<08:17,  1.99it/s]

Gradient norm: 177.72464314084405


Epoch 1 of 5 | Iteration:  18%|█▊        | 222/1212 [01:51<08:33,  1.93it/s]

Gradient norm: 177.82385691464066


Epoch 1 of 5 | Iteration:  18%|█▊        | 223/1212 [01:51<07:57,  2.07it/s]

Gradient norm: 177.9506263273443


Epoch 1 of 5 | Iteration:  18%|█▊        | 224/1212 [01:52<08:16,  1.99it/s]

Gradient norm: 178.20917520869165


Epoch 1 of 5 | Iteration:  19%|█▊        | 225/1212 [01:52<07:29,  2.19it/s]

Gradient norm: 3.109462524540055


Epoch 1 of 5 | Iteration:  19%|█▊        | 226/1212 [01:53<07:55,  2.07it/s]

Gradient norm: 3.4914045098001205


Epoch 1 of 5 | Iteration:  19%|█▊        | 227/1212 [01:53<07:42,  2.13it/s]

Gradient norm: 5.343097770167325


Epoch 1 of 5 | Iteration:  19%|█▉        | 228/1212 [01:54<07:42,  2.13it/s]

Gradient norm: 5.809343074249483


Epoch 1 of 5 | Iteration:  19%|█▉        | 229/1212 [01:54<07:44,  2.12it/s]

Gradient norm: 5.9018269025401455


Epoch 1 of 5 | Iteration:  19%|█▉        | 230/1212 [01:55<07:33,  2.16it/s]

Gradient norm: 5.856178827648132


Epoch 1 of 5 | Iteration:  19%|█▉        | 231/1212 [01:55<08:22,  1.95it/s]

Gradient norm: 7.573477232162266


Epoch 1 of 5 | Iteration:  19%|█▉        | 232/1212 [01:56<08:20,  1.96it/s]

Gradient norm: 9.809793342956876


Epoch 1 of 5 | Iteration:  19%|█▉        | 233/1212 [01:56<08:24,  1.94it/s]

Gradient norm: 19.473395522107218


Epoch 1 of 5 | Iteration:  19%|█▉        | 234/1212 [01:57<08:16,  1.97it/s]

Gradient norm: 19.437231105528532


Epoch 1 of 5 | Iteration:  19%|█▉        | 235/1212 [01:57<07:40,  2.12it/s]

Gradient norm: 22.953144335494287


Epoch 1 of 5 | Iteration:  19%|█▉        | 236/1212 [01:58<07:03,  2.31it/s]

Gradient norm: 41.75715378034097


Epoch 1 of 5 | Iteration:  20%|█▉        | 237/1212 [01:58<06:52,  2.36it/s]

Gradient norm: 41.90224906826517


Epoch 1 of 5 | Iteration:  20%|█▉        | 238/1212 [01:58<06:35,  2.46it/s]

Gradient norm: 41.956389889044374


Epoch 1 of 5 | Iteration:  20%|█▉        | 239/1212 [01:59<07:15,  2.23it/s]

Gradient norm: 41.92187868137224


Epoch 1 of 5 | Iteration:  20%|█▉        | 240/1212 [01:59<06:52,  2.36it/s]

Gradient norm: 41.86639629025486


Epoch 1 of 5 | Iteration:  20%|█▉        | 241/1212 [02:00<06:27,  2.51it/s]

Gradient norm: 13.230287977689967


Epoch 1 of 5 | Iteration:  20%|█▉        | 242/1212 [02:00<06:07,  2.64it/s]

Gradient norm: 58.88717715630406


Epoch 1 of 5 | Iteration:  20%|██        | 243/1212 [02:00<06:46,  2.38it/s]

Gradient norm: 59.60317323405486


Epoch 1 of 5 | Iteration:  20%|██        | 244/1212 [02:01<07:12,  2.24it/s]

Gradient norm: 108.35043178170764


Epoch 1 of 5 | Iteration:  20%|██        | 245/1212 [02:01<07:30,  2.15it/s]

Gradient norm: 108.92340540921525


Epoch 1 of 5 | Iteration:  20%|██        | 246/1212 [02:02<07:27,  2.16it/s]

Gradient norm: 108.86082296608353


Epoch 1 of 5 | Iteration:  20%|██        | 247/1212 [02:02<08:04,  1.99it/s]

Gradient norm: 108.86408337578784


Epoch 1 of 5 | Iteration:  20%|██        | 248/1212 [02:03<08:10,  1.97it/s]

Gradient norm: 107.70505056446459


Epoch 1 of 5 | Iteration:  21%|██        | 249/1212 [02:04<08:47,  1.82it/s]

Gradient norm: 104.55039538040667


Epoch 1 of 5 | Iteration:  21%|██        | 250/1212 [02:04<08:15,  1.94it/s]

Gradient norm: 104.34947073523695


Epoch 1 of 5 | Iteration:  21%|██        | 251/1212 [02:04<07:54,  2.02it/s]

Gradient norm: 104.19465509604059


Epoch 1 of 5 | Iteration:  21%|██        | 252/1212 [02:05<07:19,  2.18it/s]

Gradient norm: 206.71672531812186


Epoch 1 of 5 | Iteration:  21%|██        | 253/1212 [02:05<06:48,  2.35it/s]

Gradient norm: 206.16195894937565


Epoch 1 of 5 | Iteration:  21%|██        | 254/1212 [02:06<07:51,  2.03it/s]

Gradient norm: 206.6820309461237


Epoch 1 of 5 | Iteration:  21%|██        | 255/1212 [02:06<08:13,  1.94it/s]

Gradient norm: 206.49441961835802


Epoch 1 of 5 | Iteration:  21%|██        | 256/1212 [02:07<08:00,  1.99it/s]

Gradient norm: 206.52151610767456


Epoch 1 of 5 | Iteration:  21%|██        | 257/1212 [02:07<07:21,  2.16it/s]

Gradient norm: 116.6279012156307


Epoch 1 of 5 | Iteration:  21%|██▏       | 258/1212 [02:08<07:28,  2.13it/s]

Gradient norm: 116.3724670127168


Epoch 1 of 5 | Iteration:  21%|██▏       | 259/1212 [02:08<07:06,  2.24it/s]

Gradient norm: 116.45808528801577


Epoch 1 of 5 | Iteration:  21%|██▏       | 260/1212 [02:09<06:50,  2.32it/s]

Gradient norm: 116.83231853309975


Epoch 1 of 5 | Iteration:  22%|██▏       | 261/1212 [02:09<06:50,  2.32it/s]

Gradient norm: 116.83339859559757


Epoch 1 of 5 | Iteration:  22%|██▏       | 262/1212 [02:10<07:49,  2.02it/s]

Gradient norm: 116.74336690679176


Epoch 1 of 5 | Iteration:  22%|██▏       | 263/1212 [02:10<07:03,  2.24it/s]

Gradient norm: 116.69462287722784


Epoch 1 of 5 | Iteration:  22%|██▏       | 264/1212 [02:10<06:35,  2.40it/s]

Gradient norm: 117.35127324142083


Epoch 1 of 5 | Iteration:  22%|██▏       | 265/1212 [02:11<06:41,  2.36it/s]

Gradient norm: 116.33999708184372


Epoch 1 of 5 | Iteration:  22%|██▏       | 266/1212 [02:11<07:11,  2.19it/s]

Gradient norm: 121.83979815362872


Epoch 1 of 5 | Iteration:  22%|██▏       | 267/1212 [02:12<06:48,  2.32it/s]

Gradient norm: 121.83790693615713


Epoch 1 of 5 | Iteration:  22%|██▏       | 268/1212 [02:12<06:20,  2.48it/s]

Gradient norm: 121.80434637922028


Epoch 1 of 5 | Iteration:  22%|██▏       | 269/1212 [02:12<06:47,  2.31it/s]

Gradient norm: 1588.713357032718


Epoch 1 of 5 | Iteration:  22%|██▏       | 270/1212 [02:13<06:34,  2.39it/s]

Gradient norm: 1588.9837706492515


Epoch 1 of 5 | Iteration:  22%|██▏       | 271/1212 [02:13<06:10,  2.54it/s]

Gradient norm: 1588.793508837823


Epoch 1 of 5 | Iteration:  22%|██▏       | 272/1212 [02:14<06:48,  2.30it/s]

Gradient norm: 1588.8685202989047


Epoch 1 of 5 | Iteration:  23%|██▎       | 273/1212 [02:14<07:13,  2.17it/s]

Gradient norm: 1.1035177504606442


Epoch 1 of 5 | Iteration:  23%|██▎       | 274/1212 [02:15<07:32,  2.07it/s]

Gradient norm: 6.243685617192017


Epoch 1 of 5 | Iteration:  23%|██▎       | 275/1212 [02:15<07:25,  2.10it/s]

Gradient norm: 6.159287396886997


Epoch 1 of 5 | Iteration:  23%|██▎       | 276/1212 [02:16<07:48,  2.00it/s]

Gradient norm: 8.70828730183618


Epoch 1 of 5 | Iteration:  23%|██▎       | 277/1212 [02:16<08:00,  1.95it/s]

Gradient norm: 14.977352288754338


Epoch 1 of 5 | Iteration:  23%|██▎       | 278/1212 [02:17<07:30,  2.07it/s]

Gradient norm: 52.387187762952315


Epoch 1 of 5 | Iteration:  23%|██▎       | 279/1212 [02:17<06:53,  2.25it/s]

Gradient norm: 52.05492259526807


Epoch 1 of 5 | Iteration:  23%|██▎       | 280/1212 [02:17<06:24,  2.43it/s]

Gradient norm: 54.04009160977055


Epoch 1 of 5 | Iteration:  23%|██▎       | 281/1212 [02:18<06:33,  2.37it/s]

Gradient norm: 55.640937286208725


Epoch 1 of 5 | Iteration:  23%|██▎       | 282/1212 [02:18<06:16,  2.47it/s]

Gradient norm: 55.65965902873356


Epoch 1 of 5 | Iteration:  23%|██▎       | 283/1212 [02:19<07:21,  2.11it/s]

Gradient norm: 63.55790829789465


Epoch 1 of 5 | Iteration:  23%|██▎       | 284/1212 [02:19<06:46,  2.28it/s]

Gradient norm: 62.05868174104054


Epoch 1 of 5 | Iteration:  24%|██▎       | 285/1212 [02:20<06:26,  2.40it/s]

Gradient norm: 59.396235638478785


Epoch 1 of 5 | Iteration:  24%|██▎       | 286/1212 [02:20<06:12,  2.49it/s]

Gradient norm: 113.9765710266854


Epoch 1 of 5 | Iteration:  24%|██▎       | 287/1212 [02:20<05:55,  2.60it/s]

Gradient norm: 143.1613911961086


Epoch 1 of 5 | Iteration:  24%|██▍       | 288/1212 [02:21<06:07,  2.51it/s]

Gradient norm: 143.34027269379817


Epoch 1 of 5 | Iteration:  24%|██▍       | 289/1212 [02:21<06:20,  2.43it/s]

Gradient norm: 11.594834164497255


Epoch 1 of 5 | Iteration:  24%|██▍       | 290/1212 [02:22<06:21,  2.42it/s]

Gradient norm: 11.697333193583047


Epoch 1 of 5 | Iteration:  24%|██▍       | 291/1212 [02:22<07:00,  2.19it/s]

Gradient norm: 12.304233231305712


Epoch 1 of 5 | Iteration:  24%|██▍       | 292/1212 [02:23<07:51,  1.95it/s]

Gradient norm: 82.08898390754106


Epoch 1 of 5 | Iteration:  24%|██▍       | 293/1212 [02:23<07:10,  2.14it/s]

Gradient norm: 82.72558422306712


Epoch 1 of 5 | Iteration:  24%|██▍       | 294/1212 [02:24<06:41,  2.29it/s]

Gradient norm: 82.8648825006852


Epoch 1 of 5 | Iteration:  24%|██▍       | 295/1212 [02:24<06:18,  2.42it/s]

Gradient norm: 88.24284532497147


Epoch 1 of 5 | Iteration:  24%|██▍       | 296/1212 [02:24<06:03,  2.52it/s]

Gradient norm: 88.38811566208166


Epoch 1 of 5 | Iteration:  25%|██▍       | 297/1212 [02:25<06:16,  2.43it/s]

Gradient norm: 88.26198545721564


Epoch 1 of 5 | Iteration:  25%|██▍       | 298/1212 [02:25<06:31,  2.34it/s]

Gradient norm: 88.38694085844078


Epoch 1 of 5 | Iteration:  25%|██▍       | 299/1212 [02:26<06:22,  2.38it/s]

Gradient norm: 88.87464561339758


Epoch 1 of 5 | Iteration:  25%|██▍       | 300/1212 [02:26<06:53,  2.20it/s]

Gradient norm: 88.99658272008247


Epoch 1 of 5 | Iteration:  25%|██▍       | 301/1212 [02:27<06:40,  2.27it/s]

Gradient norm: 88.58792113527204


Epoch 1 of 5 | Iteration:  25%|██▍       | 302/1212 [02:27<07:14,  2.09it/s]

Gradient norm: 88.54965290856664


Epoch 1 of 5 | Iteration:  25%|██▌       | 303/1212 [02:28<07:04,  2.14it/s]

Gradient norm: 88.13357900028659


Epoch 1 of 5 | Iteration:  25%|██▌       | 304/1212 [02:28<07:32,  2.01it/s]

Gradient norm: 88.09309956960111


Epoch 1 of 5 | Iteration:  25%|██▌       | 305/1212 [02:29<08:01,  1.88it/s]

Gradient norm: 0.32526061934929074


Epoch 1 of 5 | Iteration:  25%|██▌       | 306/1212 [02:29<07:54,  1.91it/s]

Gradient norm: 17.821045596895022


Epoch 1 of 5 | Iteration:  25%|██▌       | 307/1212 [02:30<09:05,  1.66it/s]

Gradient norm: 17.962632766172497


Epoch 1 of 5 | Iteration:  25%|██▌       | 308/1212 [02:31<09:14,  1.63it/s]

Gradient norm: 18.02170160051582


Epoch 1 of 5 | Iteration:  25%|██▌       | 309/1212 [02:31<08:02,  1.87it/s]

Gradient norm: 18.205237631412313


Epoch 1 of 5 | Iteration:  26%|██▌       | 310/1212 [02:31<07:51,  1.91it/s]

Gradient norm: 18.060906705537764


Epoch 1 of 5 | Iteration:  26%|██▌       | 311/1212 [02:32<07:04,  2.12it/s]

Gradient norm: 26.050063566105425


Epoch 1 of 5 | Iteration:  26%|██▌       | 312/1212 [02:32<06:43,  2.23it/s]

Gradient norm: 26.340695755420676


Epoch 1 of 5 | Iteration:  26%|██▌       | 313/1212 [02:33<06:40,  2.24it/s]

Gradient norm: 26.216727001136213


Epoch 1 of 5 | Iteration:  26%|██▌       | 314/1212 [02:33<06:09,  2.43it/s]

Gradient norm: 26.222572868087507


Epoch 1 of 5 | Iteration:  26%|██▌       | 315/1212 [02:33<06:05,  2.45it/s]

Gradient norm: 41.99393511389235


Epoch 1 of 5 | Iteration:  26%|██▌       | 316/1212 [02:34<05:46,  2.59it/s]

Gradient norm: 42.02256252000653


Epoch 1 of 5 | Iteration:  26%|██▌       | 317/1212 [02:34<06:11,  2.41it/s]

Gradient norm: 42.56812469934393


Epoch 1 of 5 | Iteration:  26%|██▌       | 318/1212 [02:35<05:54,  2.52it/s]

Gradient norm: 42.60312407299491


Epoch 1 of 5 | Iteration:  26%|██▋       | 319/1212 [02:35<05:40,  2.63it/s]

Gradient norm: 42.55212610031198


Epoch 1 of 5 | Iteration:  26%|██▋       | 320/1212 [02:35<05:37,  2.64it/s]

Gradient norm: 41.834470667471244


Epoch 1 of 5 | Iteration:  26%|██▋       | 321/1212 [02:36<06:37,  2.24it/s]

Gradient norm: 0.4681751160105529


Epoch 1 of 5 | Iteration:  27%|██▋       | 322/1212 [02:36<06:46,  2.19it/s]

Gradient norm: 2.836698410760358


Epoch 1 of 5 | Iteration:  27%|██▋       | 323/1212 [02:37<06:25,  2.30it/s]

Gradient norm: 42.02358957112904


Epoch 1 of 5 | Iteration:  27%|██▋       | 324/1212 [02:37<06:03,  2.44it/s]

Gradient norm: 49.73403750461111


Epoch 1 of 5 | Iteration:  27%|██▋       | 325/1212 [02:38<06:31,  2.27it/s]

Gradient norm: 47.987980696032665


Epoch 1 of 5 | Iteration:  27%|██▋       | 326/1212 [02:38<06:03,  2.43it/s]

Gradient norm: 49.452335368647404


Epoch 1 of 5 | Iteration:  27%|██▋       | 327/1212 [02:39<06:39,  2.21it/s]

Gradient norm: 49.469851128729445


Epoch 1 of 5 | Iteration:  27%|██▋       | 328/1212 [02:39<06:26,  2.29it/s]

Gradient norm: 49.64503060837271


Epoch 1 of 5 | Iteration:  27%|██▋       | 329/1212 [02:39<06:07,  2.40it/s]

Gradient norm: 49.6356634426225


Epoch 1 of 5 | Iteration:  27%|██▋       | 330/1212 [02:40<05:48,  2.53it/s]

Gradient norm: 49.88093892363048


Epoch 1 of 5 | Iteration:  27%|██▋       | 331/1212 [02:40<06:14,  2.35it/s]

Gradient norm: 49.835197657511124


Epoch 1 of 5 | Iteration:  27%|██▋       | 332/1212 [02:41<07:04,  2.07it/s]

Gradient norm: 59.987573405453986


Epoch 1 of 5 | Iteration:  27%|██▋       | 333/1212 [02:41<07:19,  2.00it/s]

Gradient norm: 60.996538810209586


Epoch 1 of 5 | Iteration:  28%|██▊       | 334/1212 [02:42<07:18,  2.00it/s]

Gradient norm: 61.45814414825176


Epoch 1 of 5 | Iteration:  28%|██▊       | 335/1212 [02:42<07:34,  1.93it/s]

Gradient norm: 61.66411303449898


Epoch 1 of 5 | Iteration:  28%|██▊       | 336/1212 [02:43<07:47,  1.88it/s]

Gradient norm: 75.64143064139787


Epoch 1 of 5 | Iteration:  28%|██▊       | 337/1212 [02:43<07:50,  1.86it/s]

Gradient norm: 0.968969158392488


Epoch 1 of 5 | Iteration:  28%|██▊       | 338/1212 [02:44<07:56,  1.84it/s]

Gradient norm: 1.2348583109922548


Epoch 1 of 5 | Iteration:  28%|██▊       | 339/1212 [02:44<07:26,  1.96it/s]

Gradient norm: 2.2242245234620346


Epoch 1 of 5 | Iteration:  28%|██▊       | 340/1212 [02:45<08:01,  1.81it/s]

Gradient norm: 2.164512954121457


Epoch 1 of 5 | Iteration:  28%|██▊       | 341/1212 [02:46<07:19,  1.98it/s]

Gradient norm: 5.561562272119175


Epoch 1 of 5 | Iteration:  28%|██▊       | 342/1212 [02:46<06:37,  2.19it/s]

Gradient norm: 5.679600295934467


Epoch 1 of 5 | Iteration:  28%|██▊       | 343/1212 [02:46<06:13,  2.33it/s]

Gradient norm: 38.80939905646408


Epoch 1 of 5 | Iteration:  28%|██▊       | 344/1212 [02:47<06:14,  2.32it/s]

Gradient norm: 39.46977890912614


Epoch 1 of 5 | Iteration:  28%|██▊       | 345/1212 [02:47<06:39,  2.17it/s]

Gradient norm: 39.43022216828182


Epoch 1 of 5 | Iteration:  29%|██▊       | 346/1212 [02:48<06:27,  2.24it/s]

Gradient norm: 39.49829114747949


Epoch 1 of 5 | Iteration:  29%|██▊       | 347/1212 [02:48<07:11,  2.00it/s]

Gradient norm: 39.23577819535162


Epoch 1 of 5 | Iteration:  29%|██▊       | 348/1212 [02:49<06:49,  2.11it/s]

Gradient norm: 40.74144048804068


Epoch 1 of 5 | Iteration:  29%|██▉       | 349/1212 [02:49<07:24,  1.94it/s]

Gradient norm: 40.91958994199282


Epoch 1 of 5 | Iteration:  29%|██▉       | 350/1212 [02:50<07:27,  1.93it/s]

Gradient norm: 572.3801596495679


Epoch 1 of 5 | Iteration:  29%|██▉       | 351/1212 [02:50<07:30,  1.91it/s]

Gradient norm: 572.4928131388194


Epoch 1 of 5 | Iteration:  29%|██▉       | 352/1212 [02:51<07:38,  1.88it/s]

Gradient norm: 572.5006642932907


Epoch 1 of 5 | Iteration:  29%|██▉       | 353/1212 [02:51<06:48,  2.10it/s]

Gradient norm: 9.673302479684837


Epoch 1 of 5 | Iteration:  29%|██▉       | 354/1212 [02:52<06:14,  2.29it/s]

Gradient norm: 30.1246483726998


Epoch 1 of 5 | Iteration:  29%|██▉       | 355/1212 [02:52<05:57,  2.40it/s]

Gradient norm: 30.022776031007673


Epoch 1 of 5 | Iteration:  29%|██▉       | 356/1212 [02:52<06:01,  2.37it/s]

Gradient norm: 29.863693163766776


Epoch 1 of 5 | Iteration:  29%|██▉       | 357/1212 [02:53<06:49,  2.09it/s]

Gradient norm: 29.854276551264018


Epoch 1 of 5 | Iteration:  30%|██▉       | 358/1212 [02:53<06:41,  2.13it/s]

Gradient norm: 30.303370629121403


Epoch 1 of 5 | Iteration:  30%|██▉       | 359/1212 [02:54<06:43,  2.11it/s]

Gradient norm: 112.00720095766283


Epoch 1 of 5 | Iteration:  30%|██▉       | 360/1212 [02:54<06:45,  2.10it/s]

Gradient norm: 112.00377423548902


Epoch 1 of 5 | Iteration:  30%|██▉       | 361/1212 [02:55<06:53,  2.06it/s]

Gradient norm: 111.81536274760803


Epoch 1 of 5 | Iteration:  30%|██▉       | 362/1212 [02:55<06:49,  2.07it/s]

Gradient norm: 111.78904830676169


Epoch 1 of 5 | Iteration:  30%|██▉       | 363/1212 [02:56<07:46,  1.82it/s]

Gradient norm: 111.02082002344169


Epoch 1 of 5 | Iteration:  30%|███       | 364/1212 [02:57<07:39,  1.85it/s]

Gradient norm: 110.81071022118267


Epoch 1 of 5 | Iteration:  30%|███       | 365/1212 [02:57<06:51,  2.06it/s]

Gradient norm: 110.83607422011185


Epoch 1 of 5 | Iteration:  30%|███       | 366/1212 [02:57<06:29,  2.17it/s]

Gradient norm: 110.82375502371123


Epoch 1 of 5 | Iteration:  30%|███       | 367/1212 [02:58<06:01,  2.34it/s]

Gradient norm: 105.76522853403587


Epoch 1 of 5 | Iteration:  30%|███       | 368/1212 [02:58<06:15,  2.24it/s]

Gradient norm: 105.66299429698547


Epoch 1 of 5 | Iteration:  30%|███       | 369/1212 [02:59<05:51,  2.40it/s]

Gradient norm: 1.3982345638711473


Epoch 1 of 5 | Iteration:  31%|███       | 370/1212 [02:59<05:36,  2.50it/s]

Gradient norm: 7.560055190545919


Epoch 1 of 5 | Iteration:  31%|███       | 371/1212 [02:59<05:23,  2.60it/s]

Gradient norm: 24.35434418411192


Epoch 1 of 5 | Iteration:  31%|███       | 372/1212 [03:00<05:15,  2.66it/s]

Gradient norm: 24.363961302162867


Epoch 1 of 5 | Iteration:  31%|███       | 373/1212 [03:00<05:06,  2.74it/s]

Gradient norm: 26.19844197397162


Epoch 1 of 5 | Iteration:  31%|███       | 374/1212 [03:00<05:46,  2.42it/s]

Gradient norm: 25.901022184098238


Epoch 1 of 5 | Iteration:  31%|███       | 375/1212 [03:01<06:01,  2.32it/s]

Gradient norm: 25.840570494158627


Epoch 1 of 5 | Iteration:  31%|███       | 376/1212 [03:01<05:49,  2.39it/s]

Gradient norm: 25.757303790821503


Epoch 1 of 5 | Iteration:  31%|███       | 377/1212 [03:02<05:30,  2.53it/s]

Gradient norm: 38.95453782627093


Epoch 1 of 5 | Iteration:  31%|███       | 378/1212 [03:02<05:59,  2.32it/s]

Gradient norm: 39.26060016507317


Epoch 1 of 5 | Iteration:  31%|███▏      | 379/1212 [03:03<05:37,  2.47it/s]

Gradient norm: 38.83329942082079


Epoch 1 of 5 | Iteration:  31%|███▏      | 380/1212 [03:03<05:47,  2.40it/s]

Gradient norm: 46.966231677992454


Epoch 1 of 5 | Iteration:  31%|███▏      | 381/1212 [03:03<05:35,  2.48it/s]

Gradient norm: 46.92492573305317


Epoch 1 of 5 | Iteration:  32%|███▏      | 382/1212 [03:04<05:32,  2.50it/s]

Gradient norm: 46.30321726801374


Epoch 1 of 5 | Iteration:  32%|███▏      | 383/1212 [03:04<06:16,  2.20it/s]

Gradient norm: 46.05011374147717


Epoch 1 of 5 | Iteration:  32%|███▏      | 384/1212 [03:05<06:14,  2.21it/s]

Gradient norm: 54.894469334111456


Epoch 1 of 5 | Iteration:  32%|███▏      | 385/1212 [03:05<05:54,  2.33it/s]

Gradient norm: 0.7875592522364213


Epoch 1 of 5 | Iteration:  32%|███▏      | 386/1212 [03:06<06:12,  2.22it/s]

Gradient norm: 1.714473522825746


Epoch 1 of 5 | Iteration:  32%|███▏      | 387/1212 [03:06<07:00,  1.96it/s]

Gradient norm: 2.329577290541204


Epoch 1 of 5 | Iteration:  32%|███▏      | 388/1212 [03:07<06:44,  2.03it/s]

Gradient norm: 3.1451491307967494


Epoch 1 of 5 | Iteration:  32%|███▏      | 389/1212 [03:07<06:38,  2.06it/s]

Gradient norm: 8.277526424534473


Epoch 1 of 5 | Iteration:  32%|███▏      | 390/1212 [03:08<06:34,  2.08it/s]

Gradient norm: 8.528001181710737


Epoch 1 of 5 | Iteration:  32%|███▏      | 391/1212 [03:08<06:35,  2.07it/s]

Gradient norm: 10.414242383947542


Epoch 1 of 5 | Iteration:  32%|███▏      | 392/1212 [03:09<06:31,  2.09it/s]

Gradient norm: 10.407916561539466


Epoch 1 of 5 | Iteration:  32%|███▏      | 393/1212 [03:09<06:28,  2.11it/s]

Gradient norm: 10.401548027873185


Epoch 1 of 5 | Iteration:  33%|███▎      | 394/1212 [03:10<06:56,  1.96it/s]

Gradient norm: 10.60034359763033


Epoch 1 of 5 | Iteration:  33%|███▎      | 395/1212 [03:10<06:38,  2.05it/s]

Gradient norm: 11.031803531277443


Epoch 1 of 5 | Iteration:  33%|███▎      | 396/1212 [03:11<06:21,  2.14it/s]

Gradient norm: 11.07469641873068


Epoch 1 of 5 | Iteration:  33%|███▎      | 397/1212 [03:11<06:18,  2.15it/s]

Gradient norm: 11.2868191533536


Epoch 1 of 5 | Iteration:  33%|███▎      | 398/1212 [03:11<05:49,  2.33it/s]

Gradient norm: 11.759737531935945


Epoch 1 of 5 | Iteration:  33%|███▎      | 399/1212 [03:12<05:42,  2.38it/s]

Gradient norm: 12.204925525467015


Epoch 1 of 5 | Iteration:  33%|███▎      | 400/1212 [03:12<06:10,  2.19it/s]

Gradient norm: 12.08647319855226


Epoch 1 of 5 | Iteration:  33%|███▎      | 401/1212 [03:13<05:41,  2.38it/s]

Gradient norm: 19.13343803114672


Epoch 1 of 5 | Iteration:  33%|███▎      | 402/1212 [03:13<05:24,  2.49it/s]

Gradient norm: 19.138257284623688


Epoch 1 of 5 | Iteration:  33%|███▎      | 403/1212 [03:13<05:08,  2.62it/s]

Gradient norm: 21.09284964233726


Epoch 1 of 5 | Iteration:  33%|███▎      | 404/1212 [03:14<04:58,  2.71it/s]

Gradient norm: 21.395726513567325


Epoch 1 of 5 | Iteration:  33%|███▎      | 405/1212 [03:14<05:24,  2.49it/s]

Gradient norm: 21.50330087087417


Epoch 1 of 5 | Iteration:  33%|███▎      | 406/1212 [03:15<05:15,  2.56it/s]

Gradient norm: 29.153039146398832


Epoch 1 of 5 | Iteration:  34%|███▎      | 407/1212 [03:15<05:08,  2.61it/s]

Gradient norm: 29.1360024636692


Epoch 1 of 5 | Iteration:  34%|███▎      | 408/1212 [03:15<06:00,  2.23it/s]

Gradient norm: 29.421858176741875


Epoch 1 of 5 | Iteration:  34%|███▎      | 409/1212 [03:16<05:37,  2.38it/s]

Gradient norm: 29.500908999091532


Epoch 1 of 5 | Iteration:  34%|███▍      | 410/1212 [03:16<05:21,  2.50it/s]

Gradient norm: 29.050131886953277


Epoch 1 of 5 | Iteration:  34%|███▍      | 411/1212 [03:17<05:07,  2.60it/s]

Gradient norm: 27.800443124164214


Epoch 1 of 5 | Iteration:  34%|███▍      | 412/1212 [03:17<05:04,  2.63it/s]

Gradient norm: 27.78354996910417


Epoch 1 of 5 | Iteration:  34%|███▍      | 413/1212 [03:17<05:25,  2.45it/s]

Gradient norm: 29.114385993725833


Epoch 1 of 5 | Iteration:  34%|███▍      | 414/1212 [03:18<05:56,  2.24it/s]

Gradient norm: 29.01619443671068


Epoch 1 of 5 | Iteration:  34%|███▍      | 415/1212 [03:18<05:33,  2.39it/s]

Gradient norm: 28.889142750815935


Epoch 1 of 5 | Iteration:  34%|███▍      | 416/1212 [03:19<05:35,  2.37it/s]

Gradient norm: 29.154009024076554


Epoch 1 of 5 | Iteration:  34%|███▍      | 417/1212 [03:19<05:35,  2.37it/s]

Gradient norm: 0.7452225449548721


Epoch 1 of 5 | Iteration:  34%|███▍      | 418/1212 [03:20<06:33,  2.02it/s]

Gradient norm: 3.2396286006339623


Epoch 1 of 5 | Iteration:  35%|███▍      | 419/1212 [03:20<06:32,  2.02it/s]

Gradient norm: 4.420273551635545


Epoch 1 of 5 | Iteration:  35%|███▍      | 420/1212 [03:21<07:22,  1.79it/s]

Gradient norm: 5.4780822412729275


Epoch 1 of 5 | Iteration:  35%|███▍      | 421/1212 [03:22<07:17,  1.81it/s]

Gradient norm: 29.856303818903186


Epoch 1 of 5 | Iteration:  35%|███▍      | 422/1212 [03:22<06:57,  1.89it/s]

Gradient norm: 28.40007607423307


Epoch 1 of 5 | Iteration:  35%|███▍      | 423/1212 [03:23<07:00,  1.88it/s]

Gradient norm: 33.48871329706715


Epoch 1 of 5 | Iteration:  35%|███▍      | 424/1212 [03:23<06:57,  1.89it/s]

Gradient norm: 33.36662781543768


Epoch 1 of 5 | Iteration:  35%|███▌      | 425/1212 [03:24<07:08,  1.84it/s]

Gradient norm: 33.39940569608526


Epoch 1 of 5 | Iteration:  35%|███▌      | 426/1212 [03:24<06:17,  2.08it/s]

Gradient norm: 35.099684794878044


Epoch 1 of 5 | Iteration:  35%|███▌      | 427/1212 [03:24<06:28,  2.02it/s]

Gradient norm: 36.1104250492752


Epoch 1 of 5 | Iteration:  35%|███▌      | 428/1212 [03:25<05:51,  2.23it/s]

Gradient norm: 36.126217384874025


Epoch 1 of 5 | Iteration:  35%|███▌      | 429/1212 [03:25<06:28,  2.02it/s]

Gradient norm: 36.18974135782173


Epoch 1 of 5 | Iteration:  35%|███▌      | 430/1212 [03:26<06:38,  1.96it/s]

Gradient norm: 37.97101997837311


Epoch 1 of 5 | Iteration:  36%|███▌      | 431/1212 [03:26<06:00,  2.17it/s]

Gradient norm: 37.96230119644152


Epoch 1 of 5 | Iteration:  36%|███▌      | 432/1212 [03:27<06:15,  2.08it/s]

Gradient norm: 37.592336749383804


Epoch 1 of 5 | Iteration:  36%|███▌      | 433/1212 [03:27<05:47,  2.24it/s]

Gradient norm: 1.0328942545603013


Epoch 1 of 5 | Iteration:  36%|███▌      | 434/1212 [03:28<05:25,  2.39it/s]

Gradient norm: 2.937891987281621


Epoch 1 of 5 | Iteration:  36%|███▌      | 435/1212 [03:28<05:21,  2.42it/s]

Gradient norm: 33.032908825675634


Epoch 1 of 5 | Iteration:  36%|███▌      | 436/1212 [03:28<05:09,  2.51it/s]

Gradient norm: 73.0535378462072


Epoch 1 of 5 | Iteration:  36%|███▌      | 437/1212 [03:29<05:13,  2.47it/s]

Gradient norm: 72.2459905615344


Epoch 1 of 5 | Iteration:  36%|███▌      | 438/1212 [03:29<05:03,  2.55it/s]

Gradient norm: 72.32381963573857


Epoch 1 of 5 | Iteration:  36%|███▌      | 439/1212 [03:30<05:19,  2.42it/s]

Gradient norm: 72.36965996974052


Epoch 1 of 5 | Iteration:  36%|███▋      | 440/1212 [03:30<05:17,  2.43it/s]

Gradient norm: 72.46301701204435


Epoch 1 of 5 | Iteration:  36%|███▋      | 441/1212 [03:30<05:14,  2.45it/s]

Gradient norm: 72.5573457239957


Epoch 1 of 5 | Iteration:  36%|███▋      | 442/1212 [03:31<05:25,  2.36it/s]

Gradient norm: 72.26953217890123


Epoch 1 of 5 | Iteration:  37%|███▋      | 443/1212 [03:31<05:23,  2.37it/s]

Gradient norm: 72.48676001036637


Epoch 1 of 5 | Iteration:  37%|███▋      | 444/1212 [03:32<05:48,  2.20it/s]

Gradient norm: 72.57747952966128


Epoch 1 of 5 | Iteration:  37%|███▋      | 445/1212 [03:32<06:00,  2.13it/s]

Gradient norm: 72.45430929427786


Epoch 1 of 5 | Iteration:  37%|███▋      | 446/1212 [03:33<05:32,  2.30it/s]

Gradient norm: 74.60689527130448


Epoch 1 of 5 | Iteration:  37%|███▋      | 447/1212 [03:33<05:17,  2.41it/s]

Gradient norm: 74.42603815817247


Epoch 1 of 5 | Iteration:  37%|███▋      | 448/1212 [03:34<05:45,  2.21it/s]

Gradient norm: 74.9394229241966


Epoch 1 of 5 | Iteration:  37%|███▋      | 449/1212 [03:34<05:55,  2.15it/s]

Gradient norm: 1.8974708719870668


Epoch 1 of 5 | Iteration:  37%|███▋      | 450/1212 [03:35<06:13,  2.04it/s]

Gradient norm: 3.5126281754878264


Epoch 1 of 5 | Iteration:  37%|███▋      | 451/1212 [03:35<06:17,  2.02it/s]

Gradient norm: 6.903458468037208


Epoch 1 of 5 | Iteration:  37%|███▋      | 452/1212 [03:36<06:30,  1.95it/s]

Gradient norm: 7.052079851020925


Epoch 1 of 5 | Iteration:  37%|███▋      | 453/1212 [03:36<06:42,  1.89it/s]

Gradient norm: 25.867516006887406


Epoch 1 of 5 | Iteration:  37%|███▋      | 454/1212 [03:37<06:13,  2.03it/s]

Gradient norm: 26.130884937619616


Epoch 1 of 5 | Iteration:  38%|███▊      | 455/1212 [03:37<05:41,  2.22it/s]

Gradient norm: 26.27957795912766


Epoch 1 of 5 | Iteration:  38%|███▊      | 456/1212 [03:37<05:21,  2.35it/s]

Gradient norm: 26.38866743893518


Epoch 1 of 5 | Iteration:  38%|███▊      | 457/1212 [03:38<05:13,  2.41it/s]

Gradient norm: 249.22876752689626


Epoch 1 of 5 | Iteration:  38%|███▊      | 458/1212 [03:38<04:59,  2.51it/s]

Gradient norm: 249.140480915175


Epoch 1 of 5 | Iteration:  38%|███▊      | 459/1212 [03:39<04:58,  2.52it/s]

Gradient norm: 249.16206505814287


Epoch 1 of 5 | Iteration:  38%|███▊      | 460/1212 [03:39<04:51,  2.58it/s]

Gradient norm: 249.03685002828894


Epoch 1 of 5 | Iteration:  38%|███▊      | 461/1212 [03:39<04:54,  2.55it/s]

Gradient norm: 249.00040075034906


Epoch 1 of 5 | Iteration:  38%|███▊      | 462/1212 [03:40<04:49,  2.59it/s]

Gradient norm: 249.2634001382864


Epoch 1 of 5 | Iteration:  38%|███▊      | 463/1212 [03:40<05:17,  2.36it/s]

Gradient norm: 247.2090988190881


Epoch 1 of 5 | Iteration:  38%|███▊      | 464/1212 [03:41<05:38,  2.21it/s]

Gradient norm: 247.13752412377568


Epoch 1 of 5 | Iteration:  38%|███▊      | 465/1212 [03:41<05:32,  2.24it/s]

Gradient norm: 3.4721919871656954


Epoch 1 of 5 | Iteration:  38%|███▊      | 466/1212 [03:42<06:05,  2.04it/s]

Gradient norm: 3.6766990333978216


Epoch 1 of 5 | Iteration:  39%|███▊      | 467/1212 [03:42<05:53,  2.11it/s]

Gradient norm: 3.6802373479864903


Epoch 1 of 5 | Iteration:  39%|███▊      | 468/1212 [03:43<05:26,  2.28it/s]

Gradient norm: 3.735450856588212


Epoch 1 of 5 | Iteration:  39%|███▊      | 469/1212 [03:43<05:12,  2.38it/s]

Gradient norm: 8.205302366201023


Epoch 1 of 5 | Iteration:  39%|███▉      | 470/1212 [03:44<05:58,  2.07it/s]

Gradient norm: 8.400317680574203


Epoch 1 of 5 | Iteration:  39%|███▉      | 471/1212 [03:44<05:28,  2.26it/s]

Gradient norm: 98.23944310327282


Epoch 1 of 5 | Iteration:  39%|███▉      | 472/1212 [03:44<05:07,  2.41it/s]

Gradient norm: 98.53128567204048


Epoch 1 of 5 | Iteration:  39%|███▉      | 473/1212 [03:45<04:51,  2.54it/s]

Gradient norm: 98.39153806916012


Epoch 1 of 5 | Iteration:  39%|███▉      | 474/1212 [03:45<05:28,  2.25it/s]

Gradient norm: 99.26749087655193


Epoch 1 of 5 | Iteration:  39%|███▉      | 475/1212 [03:45<05:04,  2.42it/s]

Gradient norm: 104.2409201029718


Epoch 1 of 5 | Iteration:  39%|███▉      | 476/1212 [03:46<05:12,  2.36it/s]

Gradient norm: 101.45258312931057


Epoch 1 of 5 | Iteration:  39%|███▉      | 477/1212 [03:47<06:02,  2.03it/s]

Gradient norm: 101.05176003625965


Epoch 1 of 5 | Iteration:  39%|███▉      | 478/1212 [03:47<06:02,  2.03it/s]

Gradient norm: 101.38287147341819


Epoch 1 of 5 | Iteration:  40%|███▉      | 479/1212 [03:48<06:19,  1.93it/s]

Gradient norm: 100.55278786874905


Epoch 1 of 5 | Iteration:  40%|███▉      | 480/1212 [03:48<06:57,  1.75it/s]

Gradient norm: 100.86178353876636


Epoch 1 of 5 | Iteration:  40%|███▉      | 481/1212 [03:49<06:51,  1.78it/s]

Gradient norm: 0.16619999100294083


Epoch 1 of 5 | Iteration:  40%|███▉      | 482/1212 [03:49<07:03,  1.72it/s]

Gradient norm: 1.8875364894476112


Epoch 1 of 5 | Iteration:  40%|███▉      | 483/1212 [03:50<06:46,  1.79it/s]

Gradient norm: 5.588963220047553


Epoch 1 of 5 | Iteration:  40%|███▉      | 484/1212 [03:51<06:39,  1.82it/s]

Gradient norm: 5.580615852407488


Epoch 1 of 5 | Iteration:  40%|████      | 485/1212 [03:51<05:55,  2.04it/s]

Gradient norm: 7.234269293669503


Epoch 1 of 5 | Iteration:  40%|████      | 486/1212 [03:51<05:54,  2.05it/s]

Gradient norm: 9.298463812776768


Epoch 1 of 5 | Iteration:  40%|████      | 487/1212 [03:52<05:35,  2.16it/s]

Gradient norm: 9.697628492381538


Epoch 1 of 5 | Iteration:  40%|████      | 488/1212 [03:52<05:11,  2.32it/s]

Gradient norm: 9.362298993190219


Epoch 1 of 5 | Iteration:  40%|████      | 489/1212 [03:52<04:57,  2.43it/s]

Gradient norm: 9.88449798558198


Epoch 1 of 5 | Iteration:  40%|████      | 490/1212 [03:53<04:45,  2.53it/s]

Gradient norm: 10.03115907906487


Epoch 1 of 5 | Iteration:  41%|████      | 491/1212 [03:53<04:37,  2.60it/s]

Gradient norm: 10.632316059608927


Epoch 1 of 5 | Iteration:  41%|████      | 492/1212 [03:54<05:11,  2.31it/s]

Gradient norm: 11.004167735291654


Epoch 1 of 5 | Iteration:  41%|████      | 493/1212 [03:54<04:57,  2.42it/s]

Gradient norm: 10.473843512863352


Epoch 1 of 5 | Iteration:  41%|████      | 494/1212 [03:55<05:16,  2.27it/s]

Gradient norm: 10.469776552791146


Epoch 1 of 5 | Iteration:  41%|████      | 495/1212 [03:55<04:54,  2.43it/s]

Gradient norm: 10.477704416089317


Epoch 1 of 5 | Iteration:  41%|████      | 496/1212 [03:56<05:28,  2.18it/s]

Gradient norm: 12.46159914024453


Epoch 1 of 5 | Iteration:  41%|████      | 497/1212 [03:56<05:37,  2.12it/s]

Gradient norm: 34.66081069773412


Epoch 1 of 5 | Iteration:  41%|████      | 498/1212 [03:56<05:09,  2.31it/s]

Gradient norm: 34.43368757096605


Epoch 1 of 5 | Iteration:  41%|████      | 499/1212 [03:57<04:51,  2.45it/s]

Gradient norm: 36.5449760922053


Epoch 1 of 5 | Iteration:  41%|████▏     | 500/1212 [03:57<04:52,  2.43it/s]

Gradient norm: 47.6529465447019


Epoch 1 of 5 | Iteration:  41%|████▏     | 501/1212 [03:58<05:09,  2.30it/s]

Gradient norm: 46.27314993980474


Epoch 1 of 5 | Iteration:  41%|████▏     | 502/1212 [03:58<05:11,  2.28it/s]

Gradient norm: 46.540473417935104


Epoch 1 of 5 | Iteration:  42%|████▏     | 503/1212 [03:58<05:05,  2.32it/s]

Gradient norm: 46.40800609361859


Epoch 1 of 5 | Iteration:  42%|████▏     | 504/1212 [03:59<04:50,  2.44it/s]

Gradient norm: 50.91231324257295


Epoch 1 of 5 | Iteration:  42%|████▏     | 505/1212 [03:59<04:42,  2.51it/s]

Gradient norm: 47.55500900623254


Epoch 1 of 5 | Iteration:  42%|████▏     | 506/1212 [04:00<04:49,  2.44it/s]

Gradient norm: 47.59430241409421


Epoch 1 of 5 | Iteration:  42%|████▏     | 507/1212 [04:00<05:06,  2.30it/s]

Gradient norm: 47.16590452889894


Epoch 1 of 5 | Iteration:  42%|████▏     | 508/1212 [04:01<05:26,  2.16it/s]

Gradient norm: 47.01773597813177


Epoch 1 of 5 | Iteration:  42%|████▏     | 509/1212 [04:01<05:29,  2.14it/s]

Gradient norm: 46.83883619222933


Epoch 1 of 5 | Iteration:  42%|████▏     | 510/1212 [04:02<05:38,  2.07it/s]

Gradient norm: 46.76832003931799


Epoch 1 of 5 | Iteration:  42%|████▏     | 511/1212 [04:02<05:53,  1.99it/s]

Gradient norm: 46.65793471077143


Epoch 1 of 5 | Iteration:  42%|████▏     | 512/1212 [04:03<06:00,  1.94it/s]

Gradient norm: 52.912906157791895


Epoch 1 of 5 | Iteration:  42%|████▏     | 513/1212 [04:03<05:56,  1.96it/s]

Gradient norm: 3.2453629675744726


Epoch 1 of 5 | Iteration:  42%|████▏     | 514/1212 [04:04<05:34,  2.09it/s]

Gradient norm: 18.137058482048072


Epoch 1 of 5 | Iteration:  42%|████▏     | 515/1212 [04:04<05:44,  2.02it/s]

Gradient norm: 137.55132686564357


Epoch 1 of 5 | Iteration:  43%|████▎     | 516/1212 [04:05<05:17,  2.19it/s]

Gradient norm: 138.96056881140223


Epoch 1 of 5 | Iteration:  43%|████▎     | 517/1212 [04:05<04:57,  2.34it/s]

Gradient norm: 138.0352074715903


Epoch 1 of 5 | Iteration:  43%|████▎     | 518/1212 [04:06<05:36,  2.06it/s]

Gradient norm: 137.99121185330384


Epoch 1 of 5 | Iteration:  43%|████▎     | 519/1212 [04:06<05:36,  2.06it/s]

Gradient norm: 138.4947749030429


Epoch 1 of 5 | Iteration:  43%|████▎     | 520/1212 [04:07<05:39,  2.04it/s]

Gradient norm: 138.78739244752472


Epoch 1 of 5 | Iteration:  43%|████▎     | 521/1212 [04:07<05:23,  2.14it/s]

Gradient norm: 138.7084045684091


Epoch 1 of 5 | Iteration:  43%|████▎     | 522/1212 [04:07<05:02,  2.28it/s]

Gradient norm: 138.68193897588532


Epoch 1 of 5 | Iteration:  43%|████▎     | 523/1212 [04:08<04:53,  2.35it/s]

Gradient norm: 138.5724222771915


Epoch 1 of 5 | Iteration:  43%|████▎     | 524/1212 [04:08<04:39,  2.46it/s]

Gradient norm: 142.0477804614198


Epoch 1 of 5 | Iteration:  43%|████▎     | 525/1212 [04:09<05:04,  2.25it/s]

Gradient norm: 142.11310601356908


Epoch 1 of 5 | Iteration:  43%|████▎     | 526/1212 [04:09<04:48,  2.37it/s]

Gradient norm: 141.86248523166597


Epoch 1 of 5 | Iteration:  43%|████▎     | 527/1212 [04:09<04:35,  2.49it/s]

Gradient norm: 141.75689550385863


Epoch 1 of 5 | Iteration:  44%|████▎     | 528/1212 [04:10<05:15,  2.17it/s]

Gradient norm: 141.75125465941076


Epoch 1 of 5 | Iteration:  44%|████▎     | 529/1212 [04:10<04:52,  2.34it/s]

Gradient norm: 0.23110609571124605


Epoch 1 of 5 | Iteration:  44%|████▎     | 530/1212 [04:11<04:49,  2.36it/s]

Gradient norm: 1.1737736165166057


Epoch 1 of 5 | Iteration:  44%|████▍     | 531/1212 [04:11<04:28,  2.54it/s]

Gradient norm: 3.27818776335519


Epoch 1 of 5 | Iteration:  44%|████▍     | 532/1212 [04:12<05:02,  2.25it/s]

Gradient norm: 3.459792473304403


Epoch 1 of 5 | Iteration:  44%|████▍     | 533/1212 [04:12<04:38,  2.44it/s]

Gradient norm: 5.209833406837047


Epoch 1 of 5 | Iteration:  44%|████▍     | 534/1212 [04:12<04:26,  2.54it/s]

Gradient norm: 11.88388725896574


Epoch 1 of 5 | Iteration:  44%|████▍     | 535/1212 [04:13<04:22,  2.58it/s]

Gradient norm: 19.607092729045412


Epoch 1 of 5 | Iteration:  44%|████▍     | 536/1212 [04:13<04:33,  2.47it/s]

Gradient norm: 19.569969670972466


Epoch 1 of 5 | Iteration:  44%|████▍     | 537/1212 [04:14<04:50,  2.32it/s]

Gradient norm: 75.37414079483344


Epoch 1 of 5 | Iteration:  44%|████▍     | 538/1212 [04:14<05:05,  2.21it/s]

Gradient norm: 75.53576674409557


Epoch 1 of 5 | Iteration:  44%|████▍     | 539/1212 [04:15<05:26,  2.06it/s]

Gradient norm: 76.49902735561508


Epoch 1 of 5 | Iteration:  45%|████▍     | 540/1212 [04:15<05:23,  2.08it/s]

Gradient norm: 76.1479342814718


Epoch 1 of 5 | Iteration:  45%|████▍     | 541/1212 [04:16<05:14,  2.13it/s]

Gradient norm: 77.10635948312287


Epoch 1 of 5 | Iteration:  45%|████▍     | 542/1212 [04:16<05:45,  1.94it/s]

Gradient norm: 77.078979740834


Epoch 1 of 5 | Iteration:  45%|████▍     | 543/1212 [04:17<05:23,  2.07it/s]

Gradient norm: 76.9611308327208


Epoch 1 of 5 | Iteration:  45%|████▍     | 544/1212 [04:17<06:19,  1.76it/s]

Gradient norm: 76.99799001153637


Epoch 1 of 5 | Iteration:  45%|████▍     | 545/1212 [04:18<05:45,  1.93it/s]

Gradient norm: 1.1476532232394967


Epoch 1 of 5 | Iteration:  45%|████▌     | 546/1212 [04:18<05:43,  1.94it/s]

Gradient norm: 1.200633625031166


Epoch 1 of 5 | Iteration:  45%|████▌     | 547/1212 [04:19<05:10,  2.14it/s]

Gradient norm: 4.5293754196803935


Epoch 1 of 5 | Iteration:  45%|████▌     | 548/1212 [04:19<05:21,  2.07it/s]

Gradient norm: 4.553546134831177


Epoch 1 of 5 | Iteration:  45%|████▌     | 549/1212 [04:20<04:54,  2.25it/s]

Gradient norm: 4.867232221441744


Epoch 1 of 5 | Iteration:  45%|████▌     | 550/1212 [04:20<04:48,  2.30it/s]

Gradient norm: 4.895720694897363


Epoch 1 of 5 | Iteration:  45%|████▌     | 551/1212 [04:20<04:28,  2.46it/s]

Gradient norm: 5.977026224760828


Epoch 1 of 5 | Iteration:  46%|████▌     | 552/1212 [04:21<04:18,  2.56it/s]

Gradient norm: 13.793176018359867


Epoch 1 of 5 | Iteration:  46%|████▌     | 553/1212 [04:21<04:20,  2.53it/s]

Gradient norm: 15.512407617306117


Epoch 1 of 5 | Iteration:  46%|████▌     | 554/1212 [04:21<04:27,  2.46it/s]

Gradient norm: 17.305854611404776


Epoch 1 of 5 | Iteration:  46%|████▌     | 555/1212 [04:22<04:35,  2.38it/s]

Gradient norm: 18.528996059546692


Epoch 1 of 5 | Iteration:  46%|████▌     | 556/1212 [04:22<04:31,  2.41it/s]

Gradient norm: 37.00037041256171


Epoch 1 of 5 | Iteration:  46%|████▌     | 557/1212 [04:23<04:18,  2.53it/s]

Gradient norm: 37.33783608737269


Epoch 1 of 5 | Iteration:  46%|████▌     | 558/1212 [04:23<04:13,  2.58it/s]

Gradient norm: 37.20960445767698


Epoch 1 of 5 | Iteration:  46%|████▌     | 559/1212 [04:24<04:52,  2.23it/s]

Gradient norm: 38.38363524271609


Epoch 1 of 5 | Iteration:  46%|████▌     | 560/1212 [04:24<04:56,  2.20it/s]

Gradient norm: 38.91586355042237


Epoch 1 of 5 | Iteration:  46%|████▋     | 561/1212 [04:24<04:45,  2.28it/s]

Gradient norm: 0.6167173309859652


Epoch 1 of 5 | Iteration:  46%|████▋     | 562/1212 [04:25<04:50,  2.23it/s]

Gradient norm: 5.534081407522542


Epoch 1 of 5 | Iteration:  46%|████▋     | 563/1212 [04:25<04:29,  2.41it/s]

Gradient norm: 6.491520167660893


Epoch 1 of 5 | Iteration:  47%|████▋     | 564/1212 [04:26<04:16,  2.53it/s]

Gradient norm: 12.979108331325424


Epoch 1 of 5 | Iteration:  47%|████▋     | 565/1212 [04:26<04:03,  2.66it/s]

Gradient norm: 13.085505819762304


Epoch 1 of 5 | Iteration:  47%|████▋     | 566/1212 [04:27<04:41,  2.30it/s]

Gradient norm: 13.1061891766081


Epoch 1 of 5 | Iteration:  47%|████▋     | 567/1212 [04:27<04:44,  2.27it/s]

Gradient norm: 12.47202568238892


Epoch 1 of 5 | Iteration:  47%|████▋     | 568/1212 [04:28<05:04,  2.12it/s]

Gradient norm: 18.846928102123975


Epoch 1 of 5 | Iteration:  47%|████▋     | 569/1212 [04:28<05:07,  2.09it/s]

Gradient norm: 32.683883834764174


Epoch 1 of 5 | Iteration:  47%|████▋     | 570/1212 [04:29<05:04,  2.11it/s]

Gradient norm: 32.6103320664306


Epoch 1 of 5 | Iteration:  47%|████▋     | 571/1212 [04:29<05:06,  2.09it/s]

Gradient norm: 32.704378733828015


Epoch 1 of 5 | Iteration:  47%|████▋     | 572/1212 [04:30<05:22,  1.98it/s]

Gradient norm: 339.7742883772161


Epoch 1 of 5 | Iteration:  47%|████▋     | 573/1212 [04:30<05:20,  1.99it/s]

Gradient norm: 339.82123098867476


Epoch 1 of 5 | Iteration:  47%|████▋     | 574/1212 [04:30<04:53,  2.18it/s]

Gradient norm: 339.4537466356814


Epoch 1 of 5 | Iteration:  47%|████▋     | 575/1212 [04:31<05:09,  2.06it/s]

Gradient norm: 339.2216671131883


Epoch 1 of 5 | Iteration:  48%|████▊     | 576/1212 [04:32<05:21,  1.98it/s]

Gradient norm: 339.3963567693112


Epoch 1 of 5 | Iteration:  48%|████▊     | 577/1212 [04:32<04:49,  2.20it/s]

Gradient norm: 8.638571582558544


Epoch 1 of 5 | Iteration:  48%|████▊     | 578/1212 [04:32<05:11,  2.04it/s]

Gradient norm: 9.178214960265507


Epoch 1 of 5 | Iteration:  48%|████▊     | 579/1212 [04:33<05:13,  2.02it/s]

Gradient norm: 10.087361835634375


Epoch 1 of 5 | Iteration:  48%|████▊     | 580/1212 [04:33<05:09,  2.04it/s]

Gradient norm: 10.147040269063206


Epoch 1 of 5 | Iteration:  48%|████▊     | 581/1212 [04:34<05:00,  2.10it/s]

Gradient norm: 10.405782005202337


Epoch 1 of 5 | Iteration:  48%|████▊     | 582/1212 [04:34<05:28,  1.92it/s]

Gradient norm: 10.408793104236326


Epoch 1 of 5 | Iteration:  48%|████▊     | 583/1212 [04:35<05:49,  1.80it/s]

Gradient norm: 10.473996539379508


Epoch 1 of 5 | Iteration:  48%|████▊     | 584/1212 [04:36<05:44,  1.82it/s]

Gradient norm: 100.31385806110208


Epoch 1 of 5 | Iteration:  48%|████▊     | 585/1212 [04:36<05:34,  1.87it/s]

Gradient norm: 99.3194465708432


Epoch 1 of 5 | Iteration:  48%|████▊     | 586/1212 [04:36<04:58,  2.10it/s]

Gradient norm: 102.02726518394046


Epoch 1 of 5 | Iteration:  48%|████▊     | 587/1212 [04:37<04:51,  2.14it/s]

Gradient norm: 102.22359655711242


Epoch 1 of 5 | Iteration:  49%|████▊     | 588/1212 [04:37<04:30,  2.31it/s]

Gradient norm: 102.34519747921904


Epoch 1 of 5 | Iteration:  49%|████▊     | 589/1212 [04:38<04:16,  2.43it/s]

Gradient norm: 100.92028323833863


Epoch 1 of 5 | Iteration:  49%|████▊     | 590/1212 [04:38<04:23,  2.36it/s]

Gradient norm: 98.99732662186838


Epoch 1 of 5 | Iteration:  49%|████▉     | 591/1212 [04:38<04:08,  2.50it/s]

Gradient norm: 104.18964522381582


Epoch 1 of 5 | Iteration:  49%|████▉     | 592/1212 [04:39<04:01,  2.57it/s]

Gradient norm: 104.27353654458622


Epoch 1 of 5 | Iteration:  49%|████▉     | 593/1212 [04:39<03:55,  2.63it/s]

Gradient norm: 0.636645345284858


Epoch 1 of 5 | Iteration:  49%|████▉     | 594/1212 [04:40<03:52,  2.66it/s]

Gradient norm: 5.883336641188178


Epoch 1 of 5 | Iteration:  49%|████▉     | 595/1212 [04:40<04:15,  2.41it/s]

Gradient norm: 6.033600065825497


Epoch 1 of 5 | Iteration:  49%|████▉     | 596/1212 [04:41<04:25,  2.32it/s]

Gradient norm: 6.139693185133075


Epoch 1 of 5 | Iteration:  49%|████▉     | 597/1212 [04:41<04:36,  2.22it/s]

Gradient norm: 6.259719597861814


Epoch 1 of 5 | Iteration:  49%|████▉     | 598/1212 [04:42<05:33,  1.84it/s]

Gradient norm: 6.5917334166887755


Epoch 1 of 5 | Iteration:  49%|████▉     | 599/1212 [04:42<06:02,  1.69it/s]

Gradient norm: 7.037922240883648


Epoch 1 of 5 | Iteration:  50%|████▉     | 600/1212 [04:43<05:39,  1.80it/s]

Gradient norm: 128.85292196479213


Epoch 1 of 5 | Iteration:  50%|████▉     | 601/1212 [04:43<05:01,  2.03it/s]

Gradient norm: 128.40556706505012


Epoch 1 of 5 | Iteration:  50%|████▉     | 602/1212 [04:44<04:39,  2.18it/s]

Gradient norm: 129.24787390600352


Epoch 1 of 5 | Iteration:  50%|████▉     | 603/1212 [04:44<04:22,  2.32it/s]

Gradient norm: 128.8635374281191


Epoch 1 of 5 | Iteration:  50%|████▉     | 604/1212 [04:44<04:06,  2.47it/s]

Gradient norm: 128.80531702307377


Epoch 1 of 5 | Iteration:  50%|████▉     | 605/1212 [04:45<04:06,  2.47it/s]

Gradient norm: 128.94823053224354


Epoch 1 of 5 | Iteration:  50%|█████     | 606/1212 [04:45<03:56,  2.57it/s]

Gradient norm: 129.9517480766248


Epoch 1 of 5 | Iteration:  50%|█████     | 607/1212 [04:45<03:47,  2.66it/s]

Gradient norm: 129.55404591166274


Epoch 1 of 5 | Iteration:  50%|█████     | 608/1212 [04:46<03:51,  2.61it/s]

Gradient norm: 129.47472178221216


Epoch 1 of 5 | Iteration:  50%|█████     | 609/1212 [04:46<03:46,  2.66it/s]

Gradient norm: 5.747148304513197


Epoch 1 of 5 | Iteration:  50%|█████     | 610/1212 [04:47<03:42,  2.71it/s]

Gradient norm: 6.207117034208003


Epoch 1 of 5 | Iteration:  50%|█████     | 611/1212 [04:47<03:46,  2.65it/s]

Gradient norm: 6.279186455869577


Epoch 1 of 5 | Iteration:  50%|█████     | 612/1212 [04:47<03:40,  2.73it/s]

Gradient norm: 8.493781759727533


Epoch 1 of 5 | Iteration:  51%|█████     | 613/1212 [04:48<03:45,  2.66it/s]

Gradient norm: 8.465079985461307


Epoch 1 of 5 | Iteration:  51%|█████     | 614/1212 [04:48<04:34,  2.18it/s]

Gradient norm: 8.43330586897693


Epoch 1 of 5 | Iteration:  51%|█████     | 615/1212 [04:49<04:13,  2.35it/s]

Gradient norm: 9.60966767374016


Epoch 1 of 5 | Iteration:  51%|█████     | 616/1212 [04:49<04:00,  2.48it/s]

Gradient norm: 9.68185795527291


Epoch 1 of 5 | Iteration:  51%|█████     | 617/1212 [04:49<03:47,  2.62it/s]

Gradient norm: 9.681863947237487


Epoch 1 of 5 | Iteration:  51%|█████     | 618/1212 [04:50<03:41,  2.69it/s]

Gradient norm: 11.278687370826551


Epoch 1 of 5 | Iteration:  51%|█████     | 619/1212 [04:50<03:45,  2.64it/s]

Gradient norm: 11.408824961809092


Epoch 1 of 5 | Iteration:  51%|█████     | 620/1212 [04:50<03:37,  2.72it/s]

Gradient norm: 11.408875769175767


Epoch 1 of 5 | Iteration:  51%|█████     | 621/1212 [04:51<03:39,  2.69it/s]

Gradient norm: 11.213334764330757


Epoch 1 of 5 | Iteration:  51%|█████▏    | 622/1212 [04:51<03:34,  2.75it/s]

Gradient norm: 11.129975670733257


Epoch 1 of 5 | Iteration:  51%|█████▏    | 623/1212 [04:52<04:22,  2.25it/s]

Gradient norm: 11.524427428972414


Epoch 1 of 5 | Iteration:  51%|█████▏    | 624/1212 [04:52<04:19,  2.26it/s]

Gradient norm: 11.583760656416128


Epoch 1 of 5 | Iteration:  52%|█████▏    | 625/1212 [04:53<04:02,  2.42it/s]

Gradient norm: 0.8860444856159246


Epoch 1 of 5 | Iteration:  52%|█████▏    | 626/1212 [04:53<04:45,  2.05it/s]

Gradient norm: 2.0618626807401834


Epoch 1 of 5 | Iteration:  52%|█████▏    | 627/1212 [04:54<04:39,  2.10it/s]

Gradient norm: 2.623126149973035


Epoch 1 of 5 | Iteration:  52%|█████▏    | 628/1212 [04:54<04:51,  2.01it/s]

Gradient norm: 2.791229744795012


Epoch 1 of 5 | Iteration:  52%|█████▏    | 629/1212 [04:55<04:49,  2.02it/s]

Gradient norm: 4.514797659557832


Epoch 1 of 5 | Iteration:  52%|█████▏    | 630/1212 [04:55<04:47,  2.03it/s]

Gradient norm: 8.413662028919763


Epoch 1 of 5 | Iteration:  52%|█████▏    | 631/1212 [04:56<04:57,  1.95it/s]

Gradient norm: 21.68772850619752


Epoch 1 of 5 | Iteration:  52%|█████▏    | 632/1212 [04:56<04:40,  2.07it/s]

Gradient norm: 107.6953298043888


Epoch 1 of 5 | Iteration:  52%|█████▏    | 633/1212 [04:57<04:42,  2.05it/s]

Gradient norm: 106.63376425591113


Epoch 1 of 5 | Iteration:  52%|█████▏    | 634/1212 [04:57<04:18,  2.23it/s]

Gradient norm: 107.03694547736535


Epoch 1 of 5 | Iteration:  52%|█████▏    | 635/1212 [04:58<04:09,  2.31it/s]

Gradient norm: 108.43625192043211


Epoch 1 of 5 | Iteration:  52%|█████▏    | 636/1212 [04:58<03:52,  2.47it/s]

Gradient norm: 108.59553090991172


Epoch 1 of 5 | Iteration:  53%|█████▎    | 637/1212 [04:58<04:05,  2.34it/s]

Gradient norm: 108.43845953604841


Epoch 1 of 5 | Iteration:  53%|█████▎    | 638/1212 [04:59<03:50,  2.49it/s]

Gradient norm: 108.43989817056043


Epoch 1 of 5 | Iteration:  53%|█████▎    | 639/1212 [04:59<03:39,  2.61it/s]

Gradient norm: 108.46832300959441


Epoch 1 of 5 | Iteration:  53%|█████▎    | 640/1212 [04:59<03:36,  2.64it/s]

Gradient norm: 167.7819281562131


Epoch 1 of 5 | Iteration:  53%|█████▎    | 641/1212 [05:00<04:09,  2.29it/s]

Gradient norm: 1.3259976910542899


Epoch 1 of 5 | Iteration:  53%|█████▎    | 642/1212 [05:00<04:24,  2.15it/s]

Gradient norm: 1.927109798489996


Epoch 1 of 5 | Iteration:  53%|█████▎    | 643/1212 [05:01<04:06,  2.31it/s]

Gradient norm: 6.484535259945683


Epoch 1 of 5 | Iteration:  53%|█████▎    | 644/1212 [05:01<04:02,  2.34it/s]

Gradient norm: 30.03114932271974


Epoch 1 of 5 | Iteration:  53%|█████▎    | 645/1212 [05:02<04:25,  2.14it/s]

Gradient norm: 58.33884917982882


Epoch 1 of 5 | Iteration:  53%|█████▎    | 646/1212 [05:02<04:12,  2.25it/s]

Gradient norm: 57.76819229958986


Epoch 1 of 5 | Iteration:  53%|█████▎    | 647/1212 [05:03<03:53,  2.42it/s]

Gradient norm: 58.201702927868915


Epoch 1 of 5 | Iteration:  53%|█████▎    | 648/1212 [05:03<03:39,  2.57it/s]

Gradient norm: 58.967816494455285


Epoch 1 of 5 | Iteration:  54%|█████▎    | 649/1212 [05:03<04:02,  2.33it/s]

Gradient norm: 72.49571824397383


Epoch 1 of 5 | Iteration:  54%|█████▎    | 650/1212 [05:04<03:50,  2.43it/s]

Gradient norm: 73.0496225482872


Epoch 1 of 5 | Iteration:  54%|█████▎    | 651/1212 [05:04<04:33,  2.05it/s]

Gradient norm: 161.64754918148913


Epoch 1 of 5 | Iteration:  54%|█████▍    | 652/1212 [05:05<04:11,  2.23it/s]

Gradient norm: 160.487589775717


Epoch 1 of 5 | Iteration:  54%|█████▍    | 653/1212 [05:05<04:02,  2.30it/s]

Gradient norm: 160.38518404454686


Epoch 1 of 5 | Iteration:  54%|█████▍    | 654/1212 [05:06<04:08,  2.24it/s]

Gradient norm: 160.38859862713167


Epoch 1 of 5 | Iteration:  54%|█████▍    | 655/1212 [05:06<04:19,  2.15it/s]

Gradient norm: 160.831144234149


Epoch 1 of 5 | Iteration:  54%|█████▍    | 656/1212 [05:07<04:29,  2.06it/s]

Gradient norm: 233.18647172887268


Epoch 1 of 5 | Iteration:  54%|█████▍    | 657/1212 [05:07<04:36,  2.01it/s]

Gradient norm: 6.608030076359513


Epoch 1 of 5 | Iteration:  54%|█████▍    | 658/1212 [05:08<05:07,  1.80it/s]

Gradient norm: 7.054973661094642


Epoch 1 of 5 | Iteration:  54%|█████▍    | 659/1212 [05:08<05:10,  1.78it/s]

Gradient norm: 12.989624557024671


Epoch 1 of 5 | Iteration:  54%|█████▍    | 660/1212 [05:09<05:08,  1.79it/s]

Gradient norm: 27.88579796129339


Epoch 1 of 5 | Iteration:  55%|█████▍    | 661/1212 [05:09<04:48,  1.91it/s]

Gradient norm: 28.064326144860768


Epoch 1 of 5 | Iteration:  55%|█████▍    | 662/1212 [05:10<04:43,  1.94it/s]

Gradient norm: 28.372080396590313


Epoch 1 of 5 | Iteration:  55%|█████▍    | 663/1212 [05:10<04:40,  1.96it/s]

Gradient norm: 28.058142895444124


Epoch 1 of 5 | Iteration:  55%|█████▍    | 664/1212 [05:11<04:16,  2.14it/s]

Gradient norm: 28.07579384590835


Epoch 1 of 5 | Iteration:  55%|█████▍    | 665/1212 [05:11<03:53,  2.34it/s]

Gradient norm: 27.63064746974084


Epoch 1 of 5 | Iteration:  55%|█████▍    | 666/1212 [05:12<04:29,  2.02it/s]

Gradient norm: 27.334144270331446


Epoch 1 of 5 | Iteration:  55%|█████▌    | 667/1212 [05:12<04:08,  2.20it/s]

Gradient norm: 27.97737957921146


Epoch 1 of 5 | Iteration:  55%|█████▌    | 668/1212 [05:13<03:53,  2.33it/s]

Gradient norm: 40.3813978002728


Epoch 1 of 5 | Iteration:  55%|█████▌    | 669/1212 [05:13<03:40,  2.46it/s]

Gradient norm: 40.094072121048335


Epoch 1 of 5 | Iteration:  55%|█████▌    | 670/1212 [05:13<03:30,  2.58it/s]

Gradient norm: 40.03794559706723


Epoch 1 of 5 | Iteration:  55%|█████▌    | 671/1212 [05:14<03:26,  2.63it/s]

Gradient norm: 40.16731319527026


Epoch 1 of 5 | Iteration:  55%|█████▌    | 672/1212 [05:14<03:26,  2.62it/s]

Gradient norm: 40.08084428929835


Epoch 1 of 5 | Iteration:  56%|█████▌    | 673/1212 [05:14<03:25,  2.63it/s]

Gradient norm: 1.3003950711565453


Epoch 1 of 5 | Iteration:  56%|█████▌    | 674/1212 [05:15<03:35,  2.49it/s]

Gradient norm: 36.526913330677004


Epoch 1 of 5 | Iteration:  56%|█████▌    | 675/1212 [05:15<03:48,  2.35it/s]

Gradient norm: 36.56073687631692


Epoch 1 of 5 | Iteration:  56%|█████▌    | 676/1212 [05:16<03:57,  2.26it/s]

Gradient norm: 38.613569770923064


Epoch 1 of 5 | Iteration:  56%|█████▌    | 677/1212 [05:16<03:56,  2.27it/s]

Gradient norm: 38.71741520181506


Epoch 1 of 5 | Iteration:  56%|█████▌    | 678/1212 [05:17<03:45,  2.36it/s]

Gradient norm: 38.78971462045245


Epoch 1 of 5 | Iteration:  56%|█████▌    | 679/1212 [05:17<04:38,  1.91it/s]

Gradient norm: 38.769585698796455


Epoch 1 of 5 | Iteration:  56%|█████▌    | 680/1212 [05:18<04:11,  2.12it/s]

Gradient norm: 38.769973720298815


Epoch 1 of 5 | Iteration:  56%|█████▌    | 681/1212 [05:18<03:51,  2.30it/s]

Gradient norm: 39.68990200926764


Epoch 1 of 5 | Iteration:  56%|█████▋    | 682/1212 [05:18<03:38,  2.43it/s]

Gradient norm: 39.68311007080635


Epoch 1 of 5 | Iteration:  56%|█████▋    | 683/1212 [05:19<03:33,  2.48it/s]

Gradient norm: 41.817728098023935


Epoch 1 of 5 | Iteration:  56%|█████▋    | 684/1212 [05:19<03:49,  2.30it/s]

Gradient norm: 41.74964936561093


Epoch 1 of 5 | Iteration:  57%|█████▋    | 685/1212 [05:20<04:15,  2.07it/s]

Gradient norm: 41.81246452250078


Epoch 1 of 5 | Iteration:  57%|█████▋    | 686/1212 [05:21<04:58,  1.76it/s]

Gradient norm: 42.051521168574


Epoch 1 of 5 | Iteration:  57%|█████▋    | 687/1212 [05:21<04:54,  1.78it/s]

Gradient norm: 42.018370425641116


Epoch 1 of 5 | Iteration:  57%|█████▋    | 688/1212 [05:22<04:41,  1.86it/s]

Gradient norm: 42.10554455899756


Epoch 1 of 5 | Iteration:  57%|█████▋    | 689/1212 [05:22<04:44,  1.84it/s]

Gradient norm: 9.32753763806558


Epoch 1 of 5 | Iteration:  57%|█████▋    | 690/1212 [05:23<04:26,  1.96it/s]

Gradient norm: 9.307826529574896


Epoch 1 of 5 | Iteration:  57%|█████▋    | 691/1212 [05:23<04:15,  2.04it/s]

Gradient norm: 9.301201638229283


Epoch 1 of 5 | Iteration:  57%|█████▋    | 692/1212 [05:23<03:51,  2.25it/s]

Gradient norm: 9.28450349528158


Epoch 1 of 5 | Iteration:  57%|█████▋    | 693/1212 [05:24<03:49,  2.26it/s]

Gradient norm: 9.527799285278737


Epoch 1 of 5 | Iteration:  57%|█████▋    | 694/1212 [05:24<03:58,  2.17it/s]

Gradient norm: 9.67118270212438


Epoch 1 of 5 | Iteration:  57%|█████▋    | 695/1212 [05:25<03:41,  2.34it/s]

Gradient norm: 39.30276089687885


Epoch 1 of 5 | Iteration:  57%|█████▋    | 696/1212 [05:25<03:33,  2.41it/s]

Gradient norm: 51.61432541208167


Epoch 1 of 5 | Iteration:  58%|█████▊    | 697/1212 [05:26<03:30,  2.44it/s]

Gradient norm: 51.34419084555859


Epoch 1 of 5 | Iteration:  58%|█████▊    | 698/1212 [05:26<03:36,  2.37it/s]

Gradient norm: 83.45330690425922


Epoch 1 of 5 | Iteration:  58%|█████▊    | 699/1212 [05:26<03:24,  2.50it/s]

Gradient norm: 82.89179319754965


Epoch 1 of 5 | Iteration:  58%|█████▊    | 700/1212 [05:27<03:24,  2.50it/s]

Gradient norm: 83.0044765826477


Epoch 1 of 5 | Iteration:  58%|█████▊    | 701/1212 [05:27<03:16,  2.60it/s]

Gradient norm: 82.87192543396165


Epoch 1 of 5 | Iteration:  58%|█████▊    | 702/1212 [05:27<03:10,  2.67it/s]

Gradient norm: 82.81276153596252


Epoch 1 of 5 | Iteration:  58%|█████▊    | 703/1212 [05:28<03:23,  2.51it/s]

Gradient norm: 82.85203231553541


Epoch 1 of 5 | Iteration:  58%|█████▊    | 704/1212 [05:28<03:17,  2.57it/s]

Gradient norm: 83.48186410131915


Epoch 1 of 5 | Iteration:  58%|█████▊    | 705/1212 [05:29<03:10,  2.67it/s]

Gradient norm: 10.468231410918145


Epoch 1 of 5 | Iteration:  58%|█████▊    | 706/1212 [05:29<03:19,  2.54it/s]

Gradient norm: 10.484254440246383


Epoch 1 of 5 | Iteration:  58%|█████▊    | 707/1212 [05:29<03:11,  2.64it/s]

Gradient norm: 13.75061092950158


Epoch 1 of 5 | Iteration:  58%|█████▊    | 708/1212 [05:30<03:22,  2.48it/s]

Gradient norm: 246.01611859886364


Epoch 1 of 5 | Iteration:  58%|█████▊    | 709/1212 [05:30<03:42,  2.26it/s]

Gradient norm: 245.87739487662316


Epoch 1 of 5 | Iteration:  59%|█████▊    | 710/1212 [05:31<03:31,  2.37it/s]

Gradient norm: 245.85879173576413


Epoch 1 of 5 | Iteration:  59%|█████▊    | 711/1212 [05:31<03:21,  2.49it/s]

Gradient norm: 461.83185330788666


Epoch 1 of 5 | Iteration:  59%|█████▊    | 712/1212 [05:32<03:43,  2.24it/s]

Gradient norm: 462.9609492497851


Epoch 1 of 5 | Iteration:  59%|█████▉    | 713/1212 [05:32<04:10,  1.99it/s]

Gradient norm: 463.10548771899806


Epoch 1 of 5 | Iteration:  59%|█████▉    | 714/1212 [05:33<04:08,  2.00it/s]

Gradient norm: 462.221086506372


Epoch 1 of 5 | Iteration:  59%|█████▉    | 715/1212 [05:33<04:06,  2.02it/s]

Gradient norm: 462.53931979933424


Epoch 1 of 5 | Iteration:  59%|█████▉    | 716/1212 [05:34<04:06,  2.01it/s]

Gradient norm: 462.4193339485058


Epoch 1 of 5 | Iteration:  59%|█████▉    | 717/1212 [05:34<04:06,  2.01it/s]

Gradient norm: 462.3355855048698


Epoch 1 of 5 | Iteration:  59%|█████▉    | 718/1212 [05:35<04:54,  1.68it/s]

Gradient norm: 462.34218596911256


Epoch 1 of 5 | Iteration:  59%|█████▉    | 719/1212 [05:36<04:49,  1.70it/s]

Gradient norm: 462.67035165540136


Epoch 1 of 5 | Iteration:  59%|█████▉    | 720/1212 [05:36<04:34,  1.79it/s]

Gradient norm: 461.9442981873557


Epoch 1 of 5 | Iteration:  59%|█████▉    | 721/1212 [05:37<04:10,  1.96it/s]

Gradient norm: 1.5722806329675612


Epoch 1 of 5 | Iteration:  60%|█████▉    | 722/1212 [05:37<03:50,  2.13it/s]

Gradient norm: 1.8800753821726073


Epoch 1 of 5 | Iteration:  60%|█████▉    | 723/1212 [05:37<03:46,  2.16it/s]

Gradient norm: 1.9821877079197516


Epoch 1 of 5 | Iteration:  60%|█████▉    | 724/1212 [05:38<03:29,  2.32it/s]

Gradient norm: 1.9967087515295947


Epoch 1 of 5 | Iteration:  60%|█████▉    | 725/1212 [05:38<03:25,  2.37it/s]

Gradient norm: 6.189875248057828


Epoch 1 of 5 | Iteration:  60%|█████▉    | 726/1212 [05:39<03:18,  2.45it/s]

Gradient norm: 6.377468018572148


Epoch 1 of 5 | Iteration:  60%|█████▉    | 727/1212 [05:39<03:52,  2.09it/s]

Gradient norm: 6.370645986283272


Epoch 1 of 5 | Iteration:  60%|██████    | 728/1212 [05:40<03:33,  2.26it/s]

Gradient norm: 43.57018419217864


Epoch 1 of 5 | Iteration:  60%|██████    | 729/1212 [05:40<03:18,  2.44it/s]

Gradient norm: 43.57170717849272


Epoch 1 of 5 | Iteration:  60%|██████    | 730/1212 [05:40<03:06,  2.58it/s]

Gradient norm: 43.2957767131443


Epoch 1 of 5 | Iteration:  60%|██████    | 731/1212 [05:41<03:08,  2.55it/s]

Gradient norm: 44.04965316614542


Epoch 1 of 5 | Iteration:  60%|██████    | 732/1212 [05:41<03:42,  2.15it/s]

Gradient norm: 43.39766799805741


Epoch 1 of 5 | Iteration:  60%|██████    | 733/1212 [05:42<03:26,  2.32it/s]

Gradient norm: 44.734799965682825


Epoch 1 of 5 | Iteration:  61%|██████    | 734/1212 [05:42<03:40,  2.16it/s]

Gradient norm: 43.84212056794542


Epoch 1 of 5 | Iteration:  61%|██████    | 735/1212 [05:43<03:55,  2.03it/s]

Gradient norm: 43.535394663780046


Epoch 1 of 5 | Iteration:  61%|██████    | 736/1212 [05:43<03:36,  2.20it/s]

Gradient norm: 45.06874313343696


Epoch 1 of 5 | Iteration:  61%|██████    | 737/1212 [05:43<03:25,  2.31it/s]

Gradient norm: 4.829195231428178


Epoch 1 of 5 | Iteration:  61%|██████    | 738/1212 [05:44<03:26,  2.29it/s]

Gradient norm: 4.938972462453344


Epoch 1 of 5 | Iteration:  61%|██████    | 739/1212 [05:44<03:16,  2.41it/s]

Gradient norm: 5.933952047562041


Epoch 1 of 5 | Iteration:  61%|██████    | 740/1212 [05:45<03:22,  2.33it/s]

Gradient norm: 6.8739178686224935


Epoch 1 of 5 | Iteration:  61%|██████    | 741/1212 [05:45<03:19,  2.36it/s]

Gradient norm: 6.805134151622013


Epoch 1 of 5 | Iteration:  61%|██████    | 742/1212 [05:46<03:21,  2.34it/s]

Gradient norm: 8.743919008118558


Epoch 1 of 5 | Iteration:  61%|██████▏   | 743/1212 [05:46<03:35,  2.18it/s]

Gradient norm: 8.768214158816958


Epoch 1 of 5 | Iteration:  61%|██████▏   | 744/1212 [05:47<03:37,  2.16it/s]

Gradient norm: 9.516035112717175


Epoch 1 of 5 | Iteration:  61%|██████▏   | 745/1212 [05:47<03:41,  2.11it/s]

Gradient norm: 31.760788642839632


Epoch 1 of 5 | Iteration:  62%|██████▏   | 746/1212 [05:48<03:45,  2.07it/s]

Gradient norm: 32.15012053235279


Epoch 1 of 5 | Iteration:  62%|██████▏   | 747/1212 [05:48<04:18,  1.80it/s]

Gradient norm: 32.085821720350175


Epoch 1 of 5 | Iteration:  62%|██████▏   | 748/1212 [05:49<04:25,  1.75it/s]

Gradient norm: 65.10208616837399


Epoch 1 of 5 | Iteration:  62%|██████▏   | 749/1212 [05:49<04:03,  1.90it/s]

Gradient norm: 65.8154677013656


Epoch 1 of 5 | Iteration:  62%|██████▏   | 750/1212 [05:50<04:16,  1.80it/s]

Gradient norm: 66.05589371320669


Epoch 1 of 5 | Iteration:  62%|██████▏   | 751/1212 [05:50<04:13,  1.82it/s]

Gradient norm: 66.24602166696668


Epoch 1 of 5 | Iteration:  62%|██████▏   | 752/1212 [05:51<03:53,  1.97it/s]

Gradient norm: 66.30027649882858


Epoch 1 of 5 | Iteration:  62%|██████▏   | 753/1212 [05:51<03:27,  2.21it/s]

Gradient norm: 2.695607893634536


Epoch 1 of 5 | Iteration:  62%|██████▏   | 754/1212 [05:52<03:13,  2.37it/s]

Gradient norm: 3.3407555862491902


Epoch 1 of 5 | Iteration:  62%|██████▏   | 755/1212 [05:52<03:01,  2.52it/s]

Gradient norm: 3.3407554021912556


Epoch 1 of 5 | Iteration:  62%|██████▏   | 756/1212 [05:52<03:07,  2.43it/s]

Gradient norm: 3.6546973150564432


Epoch 1 of 5 | Iteration:  62%|██████▏   | 757/1212 [05:53<03:03,  2.48it/s]

Gradient norm: 3.842753508314983


Epoch 1 of 5 | Iteration:  63%|██████▎   | 758/1212 [05:53<02:58,  2.54it/s]

Gradient norm: 49.7639976475153


Epoch 1 of 5 | Iteration:  63%|██████▎   | 759/1212 [05:53<02:51,  2.65it/s]

Gradient norm: 49.75683842553136


Epoch 1 of 5 | Iteration:  63%|██████▎   | 760/1212 [05:54<02:53,  2.60it/s]

Gradient norm: 48.72423959357012


Epoch 1 of 5 | Iteration:  63%|██████▎   | 761/1212 [05:54<02:47,  2.69it/s]

Gradient norm: 49.59161163408815


Epoch 1 of 5 | Iteration:  63%|██████▎   | 762/1212 [05:55<03:07,  2.40it/s]

Gradient norm: 51.98421259319568


Epoch 1 of 5 | Iteration:  63%|██████▎   | 763/1212 [05:55<02:58,  2.51it/s]

Gradient norm: 58.751310873805565


Epoch 1 of 5 | Iteration:  63%|██████▎   | 764/1212 [05:55<02:50,  2.62it/s]

Gradient norm: 60.82521266819074


Epoch 1 of 5 | Iteration:  63%|██████▎   | 765/1212 [05:56<02:46,  2.68it/s]

Gradient norm: 61.564611919119606


Epoch 1 of 5 | Iteration:  63%|██████▎   | 766/1212 [05:56<02:44,  2.72it/s]

Gradient norm: 61.517649767171406


Epoch 1 of 5 | Iteration:  63%|██████▎   | 767/1212 [05:56<02:43,  2.72it/s]

Gradient norm: 62.151287050128424


Epoch 1 of 5 | Iteration:  63%|██████▎   | 768/1212 [05:57<02:51,  2.60it/s]

Gradient norm: 62.04026836220295


Epoch 1 of 5 | Iteration:  63%|██████▎   | 769/1212 [05:57<02:45,  2.68it/s]

Gradient norm: 1.5539011576749169


Epoch 1 of 5 | Iteration:  64%|██████▎   | 770/1212 [05:58<02:48,  2.62it/s]

Gradient norm: 2.0256039517546474


Epoch 1 of 5 | Iteration:  64%|██████▎   | 771/1212 [05:58<02:48,  2.62it/s]

Gradient norm: 2.591468318014896


Epoch 1 of 5 | Iteration:  64%|██████▎   | 772/1212 [05:58<02:44,  2.68it/s]

Gradient norm: 3.1449816146496725


Epoch 1 of 5 | Iteration:  64%|██████▍   | 773/1212 [05:59<02:42,  2.70it/s]

Gradient norm: 3.411072051536954


Epoch 1 of 5 | Iteration:  64%|██████▍   | 774/1212 [05:59<03:06,  2.35it/s]

Gradient norm: 290.3494579351449


Epoch 1 of 5 | Iteration:  64%|██████▍   | 775/1212 [06:00<03:11,  2.29it/s]

Gradient norm: 290.27003497437676


Epoch 1 of 5 | Iteration:  64%|██████▍   | 776/1212 [06:00<03:14,  2.24it/s]

Gradient norm: 290.42846047364327


Epoch 1 of 5 | Iteration:  64%|██████▍   | 777/1212 [06:01<03:20,  2.17it/s]

Gradient norm: 288.65570141594003


Epoch 1 of 5 | Iteration:  64%|██████▍   | 778/1212 [06:01<03:25,  2.12it/s]

Gradient norm: 288.425305607595


Epoch 1 of 5 | Iteration:  64%|██████▍   | 779/1212 [06:02<03:26,  2.10it/s]

Gradient norm: 288.42637439624656


Epoch 1 of 5 | Iteration:  64%|██████▍   | 780/1212 [06:02<03:41,  1.95it/s]

Gradient norm: 288.5607792352522


Epoch 1 of 5 | Iteration:  64%|██████▍   | 781/1212 [06:03<03:20,  2.15it/s]

Gradient norm: 289.0696870200803


Epoch 1 of 5 | Iteration:  65%|██████▍   | 782/1212 [06:03<03:07,  2.30it/s]

Gradient norm: 288.83591506870556


Epoch 1 of 5 | Iteration:  65%|██████▍   | 783/1212 [06:03<03:08,  2.28it/s]

Gradient norm: 288.8212587968929


Epoch 1 of 5 | Iteration:  65%|██████▍   | 784/1212 [06:04<02:58,  2.39it/s]

Gradient norm: 284.06191960810577


Epoch 1 of 5 | Iteration:  65%|██████▍   | 785/1212 [06:04<03:11,  2.23it/s]

Gradient norm: 20.875772213402925


Epoch 1 of 5 | Iteration:  65%|██████▍   | 786/1212 [06:05<03:13,  2.20it/s]

Gradient norm: 20.71741740487368


Epoch 1 of 5 | Iteration:  65%|██████▍   | 787/1212 [06:05<03:07,  2.26it/s]

Gradient norm: 19.79429020007519


Epoch 1 of 5 | Iteration:  65%|██████▌   | 788/1212 [06:06<02:55,  2.42it/s]

Gradient norm: 19.687742556868532


Epoch 1 of 5 | Iteration:  65%|██████▌   | 789/1212 [06:06<02:56,  2.40it/s]

Gradient norm: 18.934267314266986


Epoch 1 of 5 | Iteration:  65%|██████▌   | 790/1212 [06:06<02:52,  2.45it/s]

Gradient norm: 20.900178637250942


Epoch 1 of 5 | Iteration:  65%|██████▌   | 791/1212 [06:07<02:46,  2.53it/s]

Gradient norm: 27.6665986686514


Epoch 1 of 5 | Iteration:  65%|██████▌   | 792/1212 [06:07<03:00,  2.33it/s]

Gradient norm: 25.77379062247423


Epoch 1 of 5 | Iteration:  65%|██████▌   | 793/1212 [06:08<02:51,  2.45it/s]

Gradient norm: 25.793295281023422


Epoch 1 of 5 | Iteration:  66%|██████▌   | 794/1212 [06:08<02:42,  2.57it/s]

Gradient norm: 26.251040992339448


Epoch 1 of 5 | Iteration:  66%|██████▌   | 795/1212 [06:08<02:44,  2.53it/s]

Gradient norm: 26.153638981929934


Epoch 1 of 5 | Iteration:  66%|██████▌   | 796/1212 [06:09<03:01,  2.29it/s]

Gradient norm: 26.10143565513499


Epoch 1 of 5 | Iteration:  66%|██████▌   | 797/1212 [06:09<02:50,  2.44it/s]

Gradient norm: 25.130050125712035


Epoch 1 of 5 | Iteration:  66%|██████▌   | 798/1212 [06:10<03:03,  2.26it/s]

Gradient norm: 25.127836997522667


Epoch 1 of 5 | Iteration:  66%|██████▌   | 799/1212 [06:10<02:50,  2.42it/s]

Gradient norm: 26.061945184632165


Epoch 1 of 5 | Iteration:  66%|██████▌   | 800/1212 [06:11<02:45,  2.48it/s]

Gradient norm: 26.066059258721967


Epoch 1 of 5 | Iteration:  66%|██████▌   | 801/1212 [06:11<02:52,  2.39it/s]

Gradient norm: 1.6581836521314341


Epoch 1 of 5 | Iteration:  66%|██████▌   | 802/1212 [06:11<02:44,  2.49it/s]

Gradient norm: 3.9448353775039013


Epoch 1 of 5 | Iteration:  66%|██████▋   | 803/1212 [06:12<02:35,  2.63it/s]

Gradient norm: 4.032336552268215


Epoch 1 of 5 | Iteration:  66%|██████▋   | 804/1212 [06:12<02:48,  2.42it/s]

Gradient norm: 4.844247241241009


Epoch 1 of 5 | Iteration:  66%|██████▋   | 805/1212 [06:13<02:56,  2.31it/s]

Gradient norm: 5.56140730808442


Epoch 1 of 5 | Iteration:  67%|██████▋   | 806/1212 [06:13<03:03,  2.21it/s]

Gradient norm: 6.358627343714747


Epoch 1 of 5 | Iteration:  67%|██████▋   | 807/1212 [06:14<03:38,  1.86it/s]

Gradient norm: 7.92793786628416


Epoch 1 of 5 | Iteration:  67%|██████▋   | 808/1212 [06:14<03:31,  1.91it/s]

Gradient norm: 12.019043997227202


Epoch 1 of 5 | Iteration:  67%|██████▋   | 809/1212 [06:15<03:26,  1.95it/s]

Gradient norm: 99.10564054757961


Epoch 1 of 5 | Iteration:  67%|██████▋   | 810/1212 [06:15<03:36,  1.85it/s]

Gradient norm: 100.0308951655588


Epoch 1 of 5 | Iteration:  67%|██████▋   | 811/1212 [06:16<03:22,  1.98it/s]

Gradient norm: 100.60589978125019


Epoch 1 of 5 | Iteration:  67%|██████▋   | 812/1212 [06:16<03:04,  2.16it/s]

Gradient norm: 95.88665734641287


Epoch 1 of 5 | Iteration:  67%|██████▋   | 813/1212 [06:17<02:56,  2.26it/s]

Gradient norm: 95.71712441056927


Epoch 1 of 5 | Iteration:  67%|██████▋   | 814/1212 [06:17<03:24,  1.94it/s]

Gradient norm: 95.13640122966426


Epoch 1 of 5 | Iteration:  67%|██████▋   | 815/1212 [06:18<03:14,  2.05it/s]

Gradient norm: 95.0105696450787


Epoch 1 of 5 | Iteration:  67%|██████▋   | 816/1212 [06:18<02:58,  2.22it/s]

Gradient norm: 97.54057139563251


Epoch 1 of 5 | Iteration:  67%|██████▋   | 817/1212 [06:18<02:47,  2.36it/s]

Gradient norm: 84.97397228405929


Epoch 1 of 5 | Iteration:  67%|██████▋   | 818/1212 [06:19<02:56,  2.23it/s]

Gradient norm: 84.92437726388879


Epoch 1 of 5 | Iteration:  68%|██████▊   | 819/1212 [06:19<02:45,  2.38it/s]

Gradient norm: 84.92734818127413


Epoch 1 of 5 | Iteration:  68%|██████▊   | 820/1212 [06:20<02:39,  2.46it/s]

Gradient norm: 96.56984448516776


Epoch 1 of 5 | Iteration:  68%|██████▊   | 821/1212 [06:20<03:06,  2.10it/s]

Gradient norm: 94.7876715873886


Epoch 1 of 5 | Iteration:  68%|██████▊   | 822/1212 [06:21<03:12,  2.03it/s]

Gradient norm: 94.65572502271974


Epoch 1 of 5 | Iteration:  68%|██████▊   | 823/1212 [06:21<03:06,  2.08it/s]

Gradient norm: 94.94125629302988


Epoch 1 of 5 | Iteration:  68%|██████▊   | 824/1212 [06:22<02:53,  2.24it/s]

Gradient norm: 94.9894472688126


Epoch 1 of 5 | Iteration:  68%|██████▊   | 825/1212 [06:22<02:48,  2.30it/s]

Gradient norm: 94.3284769153711


Epoch 1 of 5 | Iteration:  68%|██████▊   | 826/1212 [06:22<02:38,  2.44it/s]

Gradient norm: 103.59494230164724


Epoch 1 of 5 | Iteration:  68%|██████▊   | 827/1212 [06:23<02:45,  2.33it/s]

Gradient norm: 103.72113460502312


Epoch 1 of 5 | Iteration:  68%|██████▊   | 828/1212 [06:23<02:46,  2.31it/s]

Gradient norm: 103.29949297679632


Epoch 1 of 5 | Iteration:  68%|██████▊   | 829/1212 [06:24<02:43,  2.35it/s]

Gradient norm: 103.30432983819433


Epoch 1 of 5 | Iteration:  68%|██████▊   | 830/1212 [06:24<02:39,  2.39it/s]

Gradient norm: 103.35292713686478


Epoch 1 of 5 | Iteration:  69%|██████▊   | 831/1212 [06:25<02:42,  2.35it/s]

Gradient norm: 103.2254150321797


Epoch 1 of 5 | Iteration:  69%|██████▊   | 832/1212 [06:25<02:46,  2.28it/s]

Gradient norm: 103.23084909563015


Epoch 1 of 5 | Iteration:  69%|██████▊   | 833/1212 [06:25<02:36,  2.42it/s]

Gradient norm: 0.5161411611166532


Epoch 1 of 5 | Iteration:  69%|██████▉   | 834/1212 [06:26<02:47,  2.26it/s]

Gradient norm: 1.0677604637511988


Epoch 1 of 5 | Iteration:  69%|██████▉   | 835/1212 [06:27<03:06,  2.02it/s]

Gradient norm: 47.23763349590938


Epoch 1 of 5 | Iteration:  69%|██████▉   | 836/1212 [06:27<03:11,  1.97it/s]

Gradient norm: 47.39428837223049


Epoch 1 of 5 | Iteration:  69%|██████▉   | 837/1212 [06:28<03:08,  1.98it/s]

Gradient norm: 58.85971496592405


Epoch 1 of 5 | Iteration:  69%|██████▉   | 838/1212 [06:28<03:10,  1.97it/s]

Gradient norm: 59.23246889008988


Epoch 1 of 5 | Iteration:  69%|██████▉   | 839/1212 [06:29<03:15,  1.91it/s]

Gradient norm: 59.336042463886166


Epoch 1 of 5 | Iteration:  69%|██████▉   | 840/1212 [06:29<02:56,  2.10it/s]

Gradient norm: 59.25620381139372


Epoch 1 of 5 | Iteration:  69%|██████▉   | 841/1212 [06:29<02:42,  2.29it/s]

Gradient norm: 60.250685542190915


Epoch 1 of 5 | Iteration:  69%|██████▉   | 842/1212 [06:30<02:49,  2.18it/s]

Gradient norm: 60.15580695988518


Epoch 1 of 5 | Iteration:  70%|██████▉   | 843/1212 [06:30<02:37,  2.34it/s]

Gradient norm: 58.81328661187428


Epoch 1 of 5 | Iteration:  70%|██████▉   | 844/1212 [06:31<02:28,  2.47it/s]

Gradient norm: 62.29931397951608


Epoch 1 of 5 | Iteration:  70%|██████▉   | 845/1212 [06:31<02:25,  2.52it/s]

Gradient norm: 62.532394119717374


Epoch 1 of 5 | Iteration:  70%|██████▉   | 846/1212 [06:31<02:28,  2.47it/s]

Gradient norm: 61.96408109093747


Epoch 1 of 5 | Iteration:  70%|██████▉   | 847/1212 [06:32<02:38,  2.30it/s]

Gradient norm: 62.343510855522844


Epoch 1 of 5 | Iteration:  70%|██████▉   | 848/1212 [06:32<02:31,  2.40it/s]

Gradient norm: 62.2516837969597


Epoch 1 of 5 | Iteration:  70%|███████   | 849/1212 [06:33<02:33,  2.37it/s]

Gradient norm: 2.115535059432061


Epoch 1 of 5 | Iteration:  70%|███████   | 850/1212 [06:33<02:30,  2.40it/s]

Gradient norm: 4.308260789655462


Epoch 1 of 5 | Iteration:  70%|███████   | 851/1212 [06:34<02:54,  2.06it/s]

Gradient norm: 7.6879137347811


Epoch 1 of 5 | Iteration:  70%|███████   | 852/1212 [06:34<02:48,  2.13it/s]

Gradient norm: 7.561903818551488


Epoch 1 of 5 | Iteration:  70%|███████   | 853/1212 [06:35<02:57,  2.03it/s]

Gradient norm: 75.5955023601888


Epoch 1 of 5 | Iteration:  70%|███████   | 854/1212 [06:35<02:43,  2.20it/s]

Gradient norm: 82.9203551517123


Epoch 1 of 5 | Iteration:  71%|███████   | 855/1212 [06:35<02:31,  2.35it/s]

Gradient norm: 107.50399929676736


Epoch 1 of 5 | Iteration:  71%|███████   | 856/1212 [06:36<02:22,  2.50it/s]

Gradient norm: 107.55659687120941


Epoch 1 of 5 | Iteration:  71%|███████   | 857/1212 [06:36<02:23,  2.48it/s]

Gradient norm: 107.25312114511327


Epoch 1 of 5 | Iteration:  71%|███████   | 858/1212 [06:37<02:24,  2.45it/s]

Gradient norm: 107.3217948654974


Epoch 1 of 5 | Iteration:  71%|███████   | 859/1212 [06:37<02:20,  2.51it/s]

Gradient norm: 107.58663983856198


Epoch 1 of 5 | Iteration:  71%|███████   | 860/1212 [06:37<02:26,  2.40it/s]

Gradient norm: 107.56593431966176


Epoch 1 of 5 | Iteration:  71%|███████   | 861/1212 [06:38<02:36,  2.24it/s]

Gradient norm: 106.27715992733769


Epoch 1 of 5 | Iteration:  71%|███████   | 862/1212 [06:38<02:36,  2.23it/s]

Gradient norm: 105.91790930603284


Epoch 1 of 5 | Iteration:  71%|███████   | 863/1212 [06:39<02:52,  2.03it/s]

Gradient norm: 105.89834933560002


Epoch 1 of 5 | Iteration:  71%|███████▏  | 864/1212 [06:40<02:54,  2.00it/s]

Gradient norm: 105.90636383016749


Epoch 1 of 5 | Iteration:  71%|███████▏  | 865/1212 [06:40<02:57,  1.95it/s]

Gradient norm: 5.235820694951528


Epoch 1 of 5 | Iteration:  71%|███████▏  | 866/1212 [06:41<02:54,  1.98it/s]

Gradient norm: 5.348531846300387


Epoch 1 of 5 | Iteration:  72%|███████▏  | 867/1212 [06:41<02:54,  1.97it/s]

Gradient norm: 5.86767071114868


Epoch 1 of 5 | Iteration:  72%|███████▏  | 868/1212 [06:42<02:59,  1.91it/s]

Gradient norm: 23.59132529122358


Epoch 1 of 5 | Iteration:  72%|███████▏  | 869/1212 [06:42<02:53,  1.98it/s]

Gradient norm: 23.580463437817517


Epoch 1 of 5 | Iteration:  72%|███████▏  | 870/1212 [06:43<02:55,  1.95it/s]

Gradient norm: 23.933736957330822


Epoch 1 of 5 | Iteration:  72%|███████▏  | 871/1212 [06:43<03:01,  1.88it/s]

Gradient norm: 23.960434051044984


Epoch 1 of 5 | Iteration:  72%|███████▏  | 872/1212 [06:44<02:44,  2.07it/s]

Gradient norm: 23.46728841094385


Epoch 1 of 5 | Iteration:  72%|███████▏  | 873/1212 [06:44<02:30,  2.25it/s]

Gradient norm: 23.467529106492893


Epoch 1 of 5 | Iteration:  72%|███████▏  | 874/1212 [06:45<02:43,  2.07it/s]

Gradient norm: 28.39367197217316


Epoch 1 of 5 | Iteration:  72%|███████▏  | 875/1212 [06:45<02:30,  2.25it/s]

Gradient norm: 28.673992797694215


Epoch 1 of 5 | Iteration:  72%|███████▏  | 876/1212 [06:45<02:23,  2.35it/s]

Gradient norm: 28.314378350387937


Epoch 1 of 5 | Iteration:  72%|███████▏  | 877/1212 [06:46<02:26,  2.29it/s]

Gradient norm: 28.805058492190675


Epoch 1 of 5 | Iteration:  72%|███████▏  | 878/1212 [06:46<02:25,  2.30it/s]

Gradient norm: 28.77959277585858


Epoch 1 of 5 | Iteration:  73%|███████▎  | 879/1212 [06:47<02:42,  2.05it/s]

Gradient norm: 28.769938466560575


Epoch 1 of 5 | Iteration:  73%|███████▎  | 880/1212 [06:47<02:38,  2.10it/s]

Gradient norm: 28.8274260636244


Epoch 1 of 5 | Iteration:  73%|███████▎  | 881/1212 [06:48<02:34,  2.14it/s]

Gradient norm: 1007.2846334513581


Epoch 1 of 5 | Iteration:  73%|███████▎  | 882/1212 [06:48<02:24,  2.29it/s]

Gradient norm: 1004.6907932827029


Epoch 1 of 5 | Iteration:  73%|███████▎  | 883/1212 [06:48<02:16,  2.41it/s]

Gradient norm: 1002.699122345756


Epoch 1 of 5 | Iteration:  73%|███████▎  | 884/1212 [06:49<02:26,  2.24it/s]

Gradient norm: 1002.713271915167


Epoch 1 of 5 | Iteration:  73%|███████▎  | 885/1212 [06:49<02:33,  2.13it/s]

Gradient norm: 1001.3171585067548


Epoch 1 of 5 | Iteration:  73%|███████▎  | 886/1212 [06:50<02:30,  2.16it/s]

Gradient norm: 1004.6946684623496


Epoch 1 of 5 | Iteration:  73%|███████▎  | 887/1212 [06:50<02:19,  2.32it/s]

Gradient norm: 1003.4653624429371


Epoch 1 of 5 | Iteration:  73%|███████▎  | 888/1212 [06:51<02:19,  2.32it/s]

Gradient norm: 1003.8690036360863


Epoch 1 of 5 | Iteration:  73%|███████▎  | 889/1212 [06:51<02:11,  2.46it/s]

Gradient norm: 1003.8293843857851


Epoch 1 of 5 | Iteration:  73%|███████▎  | 890/1212 [06:51<02:06,  2.55it/s]

Gradient norm: 1003.4916984921583


Epoch 1 of 5 | Iteration:  74%|███████▎  | 891/1212 [06:52<02:08,  2.51it/s]

Gradient norm: 1003.5498209302544


Epoch 1 of 5 | Iteration:  74%|███████▎  | 892/1212 [06:52<02:17,  2.33it/s]

Gradient norm: 1003.8474002307382


Epoch 1 of 5 | Iteration:  74%|███████▎  | 893/1212 [06:53<02:22,  2.23it/s]

Gradient norm: 1003.8977269098189


Epoch 1 of 5 | Iteration:  74%|███████▍  | 894/1212 [06:53<02:27,  2.16it/s]

Gradient norm: 1725.237134510444


Epoch 1 of 5 | Iteration:  74%|███████▍  | 895/1212 [06:54<02:44,  1.92it/s]

Gradient norm: 1725.9117417403675


Epoch 1 of 5 | Iteration:  74%|███████▍  | 896/1212 [06:54<02:43,  1.94it/s]

Gradient norm: 1725.9480604839493


Epoch 1 of 5 | Iteration:  74%|███████▍  | 897/1212 [06:55<02:48,  1.87it/s]

Gradient norm: 0.932702362544651


Epoch 1 of 5 | Iteration:  74%|███████▍  | 898/1212 [06:55<02:40,  1.96it/s]

Gradient norm: 23.080532206615135


Epoch 1 of 5 | Iteration:  74%|███████▍  | 899/1212 [06:56<02:27,  2.12it/s]

Gradient norm: 22.435239048854413


Epoch 1 of 5 | Iteration:  74%|███████▍  | 900/1212 [06:56<02:29,  2.09it/s]

Gradient norm: 21.982063297345608


Epoch 1 of 5 | Iteration:  74%|███████▍  | 901/1212 [06:57<02:26,  2.13it/s]

Gradient norm: 22.225772216724465


Epoch 1 of 5 | Iteration:  74%|███████▍  | 902/1212 [06:57<02:14,  2.31it/s]

Gradient norm: 27.650691976150355


Epoch 1 of 5 | Iteration:  75%|███████▍  | 903/1212 [06:58<02:05,  2.46it/s]

Gradient norm: 28.232111153477373


Epoch 1 of 5 | Iteration:  75%|███████▍  | 904/1212 [06:58<02:00,  2.57it/s]

Gradient norm: 257.05288322800766


Epoch 1 of 5 | Iteration:  75%|███████▍  | 905/1212 [06:58<01:57,  2.62it/s]

Gradient norm: 257.0267712169182


Epoch 1 of 5 | Iteration:  75%|███████▍  | 906/1212 [06:59<01:52,  2.71it/s]

Gradient norm: 257.14530892308153


Epoch 1 of 5 | Iteration:  75%|███████▍  | 907/1212 [06:59<01:54,  2.66it/s]

Gradient norm: 257.489876672312


Epoch 1 of 5 | Iteration:  75%|███████▍  | 908/1212 [06:59<01:56,  2.62it/s]

Gradient norm: 258.64797172551533


Epoch 1 of 5 | Iteration:  75%|███████▌  | 909/1212 [07:00<01:57,  2.57it/s]

Gradient norm: 259.21616945926957


Epoch 1 of 5 | Iteration:  75%|███████▌  | 910/1212 [07:00<01:54,  2.64it/s]

Gradient norm: 259.20641947090104


Epoch 1 of 5 | Iteration:  75%|███████▌  | 911/1212 [07:00<01:51,  2.71it/s]

Gradient norm: 259.23305613124603


Epoch 1 of 5 | Iteration:  75%|███████▌  | 912/1212 [07:01<02:02,  2.44it/s]

Gradient norm: 258.6546690129305


Epoch 1 of 5 | Iteration:  75%|███████▌  | 913/1212 [07:01<01:56,  2.57it/s]

Gradient norm: 3.587311010925036


Epoch 1 of 5 | Iteration:  75%|███████▌  | 914/1212 [07:02<01:59,  2.50it/s]

Gradient norm: 3.95848627917672


Epoch 1 of 5 | Iteration:  75%|███████▌  | 915/1212 [07:02<02:01,  2.44it/s]

Gradient norm: 4.583784065429832


Epoch 1 of 5 | Iteration:  76%|███████▌  | 916/1212 [07:03<02:08,  2.30it/s]

Gradient norm: 4.59907812401601


Epoch 1 of 5 | Iteration:  76%|███████▌  | 917/1212 [07:03<02:16,  2.16it/s]

Gradient norm: 142.3303797555754


Epoch 1 of 5 | Iteration:  76%|███████▌  | 918/1212 [07:04<02:11,  2.24it/s]

Gradient norm: 142.46186658927832


Epoch 1 of 5 | Iteration:  76%|███████▌  | 919/1212 [07:04<02:02,  2.39it/s]

Gradient norm: 141.8145823248565


Epoch 1 of 5 | Iteration:  76%|███████▌  | 920/1212 [07:05<02:18,  2.12it/s]

Gradient norm: 141.69609713961677


Epoch 1 of 5 | Iteration:  76%|███████▌  | 921/1212 [07:05<02:12,  2.19it/s]

Gradient norm: 141.6308688024346


Epoch 1 of 5 | Iteration:  76%|███████▌  | 922/1212 [07:05<02:07,  2.27it/s]

Gradient norm: 141.84258499558285


Epoch 1 of 5 | Iteration:  76%|███████▌  | 923/1212 [07:06<02:19,  2.07it/s]

Gradient norm: 141.96352543523298


Epoch 1 of 5 | Iteration:  76%|███████▌  | 924/1212 [07:06<02:22,  2.02it/s]

Gradient norm: 141.80806087358255


Epoch 1 of 5 | Iteration:  76%|███████▋  | 925/1212 [07:07<02:23,  2.00it/s]

Gradient norm: 141.88151042070737


Epoch 1 of 5 | Iteration:  76%|███████▋  | 926/1212 [07:07<02:21,  2.02it/s]

Gradient norm: 140.8465345439554


Epoch 1 of 5 | Iteration:  76%|███████▋  | 927/1212 [07:08<02:29,  1.91it/s]

Gradient norm: 140.53719108158984


Epoch 1 of 5 | Iteration:  77%|███████▋  | 928/1212 [07:09<02:29,  1.90it/s]

Gradient norm: 140.66436852635306


Epoch 1 of 5 | Iteration:  77%|███████▋  | 929/1212 [07:09<02:17,  2.05it/s]

Gradient norm: 6.563799812682919


Epoch 1 of 5 | Iteration:  77%|███████▋  | 930/1212 [07:09<02:05,  2.25it/s]

Gradient norm: 19.30923820811621


Epoch 1 of 5 | Iteration:  77%|███████▋  | 931/1212 [07:10<02:02,  2.30it/s]

Gradient norm: 19.668970101900598


Epoch 1 of 5 | Iteration:  77%|███████▋  | 932/1212 [07:10<02:14,  2.09it/s]

Gradient norm: 19.727696932044555


Epoch 1 of 5 | Iteration:  77%|███████▋  | 933/1212 [07:11<02:03,  2.26it/s]

Gradient norm: 19.91095855165459


Epoch 1 of 5 | Iteration:  77%|███████▋  | 934/1212 [07:11<02:03,  2.26it/s]

Gradient norm: 20.231261709737765


Epoch 1 of 5 | Iteration:  77%|███████▋  | 935/1212 [07:11<01:54,  2.43it/s]

Gradient norm: 20.55881094328702


Epoch 1 of 5 | Iteration:  77%|███████▋  | 936/1212 [07:12<01:49,  2.51it/s]

Gradient norm: 20.58056483157613


Epoch 1 of 5 | Iteration:  77%|███████▋  | 937/1212 [07:12<02:05,  2.19it/s]

Gradient norm: 20.734732491405104


Epoch 1 of 5 | Iteration:  77%|███████▋  | 938/1212 [07:13<01:56,  2.35it/s]

Gradient norm: 20.88434714781744


Epoch 1 of 5 | Iteration:  77%|███████▋  | 939/1212 [07:13<01:57,  2.32it/s]

Gradient norm: 21.390582662678902


Epoch 1 of 5 | Iteration:  78%|███████▊  | 940/1212 [07:14<01:54,  2.38it/s]

Gradient norm: 21.244572714210975


Epoch 1 of 5 | Iteration:  78%|███████▊  | 941/1212 [07:14<01:53,  2.40it/s]

Gradient norm: 24.698775218559224


Epoch 1 of 5 | Iteration:  78%|███████▊  | 942/1212 [07:15<02:10,  2.08it/s]

Gradient norm: 25.490264269325003


Epoch 1 of 5 | Iteration:  78%|███████▊  | 943/1212 [07:15<02:04,  2.17it/s]

Gradient norm: 25.47937421436117


Epoch 1 of 5 | Iteration:  78%|███████▊  | 944/1212 [07:16<02:17,  1.95it/s]

Gradient norm: 25.517269136135695


Epoch 1 of 5 | Iteration:  78%|███████▊  | 945/1212 [07:16<02:10,  2.05it/s]

Gradient norm: 2.3960020456273807


Epoch 1 of 5 | Iteration:  78%|███████▊  | 946/1212 [07:17<02:17,  1.93it/s]

Gradient norm: 2.531238391844985


Epoch 1 of 5 | Iteration:  78%|███████▊  | 947/1212 [07:17<02:16,  1.95it/s]

Gradient norm: 13.539064295234907


Epoch 1 of 5 | Iteration:  78%|███████▊  | 948/1212 [07:18<02:02,  2.16it/s]

Gradient norm: 43.68311064811793


Epoch 1 of 5 | Iteration:  78%|███████▊  | 949/1212 [07:18<02:16,  1.92it/s]

Gradient norm: 43.649355307462166


Epoch 1 of 5 | Iteration:  78%|███████▊  | 950/1212 [07:19<02:14,  1.95it/s]

Gradient norm: 43.491313854834935


Epoch 1 of 5 | Iteration:  78%|███████▊  | 951/1212 [07:19<02:10,  2.00it/s]

Gradient norm: 43.42758764768069


Epoch 1 of 5 | Iteration:  79%|███████▊  | 952/1212 [07:20<02:12,  1.96it/s]

Gradient norm: 44.23069057021138


Epoch 1 of 5 | Iteration:  79%|███████▊  | 953/1212 [07:20<02:09,  2.00it/s]

Gradient norm: 49.51547678549794


Epoch 1 of 5 | Iteration:  79%|███████▊  | 954/1212 [07:21<02:21,  1.83it/s]

Gradient norm: 51.09469781852448


Epoch 1 of 5 | Iteration:  79%|███████▉  | 955/1212 [07:22<02:37,  1.63it/s]

Gradient norm: 51.0300574655722


Epoch 1 of 5 | Iteration:  79%|███████▉  | 956/1212 [07:22<02:23,  1.79it/s]

Gradient norm: 50.86192167636562


Epoch 1 of 5 | Iteration:  79%|███████▉  | 957/1212 [07:22<02:06,  2.01it/s]

Gradient norm: 50.86192167636562


Epoch 1 of 5 | Iteration:  79%|███████▉  | 958/1212 [07:23<02:12,  1.91it/s]

Gradient norm: 50.865076378303606


Epoch 1 of 5 | Iteration:  79%|███████▉  | 959/1212 [07:23<01:59,  2.12it/s]

Gradient norm: 876.2865719892999


Epoch 1 of 5 | Iteration:  79%|███████▉  | 960/1212 [07:24<01:51,  2.26it/s]

Gradient norm: 875.4660311783473


Epoch 1 of 5 | Iteration:  79%|███████▉  | 961/1212 [07:24<01:44,  2.41it/s]

Gradient norm: 1.7679306749547632


Epoch 1 of 5 | Iteration:  79%|███████▉  | 962/1212 [07:25<01:55,  2.16it/s]

Gradient norm: 1.8135783293855434


Epoch 1 of 5 | Iteration:  79%|███████▉  | 963/1212 [07:25<01:55,  2.16it/s]

Gradient norm: 2.0054225406346955


Epoch 1 of 5 | Iteration:  80%|███████▉  | 964/1212 [07:26<01:49,  2.26it/s]

Gradient norm: 4.130812723601651


Epoch 1 of 5 | Iteration:  80%|███████▉  | 965/1212 [07:26<01:54,  2.16it/s]

Gradient norm: 4.438743215939561


Epoch 1 of 5 | Iteration:  80%|███████▉  | 966/1212 [07:26<01:46,  2.32it/s]

Gradient norm: 50.83809047925878


Epoch 1 of 5 | Iteration:  80%|███████▉  | 967/1212 [07:27<01:42,  2.39it/s]

Gradient norm: 53.043218694454254


Epoch 1 of 5 | Iteration:  80%|███████▉  | 968/1212 [07:27<01:57,  2.07it/s]

Gradient norm: 52.30435990269674


Epoch 1 of 5 | Iteration:  80%|███████▉  | 969/1212 [07:28<01:48,  2.23it/s]

Gradient norm: 53.58829527679128


Epoch 1 of 5 | Iteration:  80%|████████  | 970/1212 [07:28<01:54,  2.12it/s]

Gradient norm: 55.62820485897213


Epoch 1 of 5 | Iteration:  80%|████████  | 971/1212 [07:29<01:47,  2.25it/s]

Gradient norm: 54.9916016931462


Epoch 1 of 5 | Iteration:  80%|████████  | 972/1212 [07:29<01:40,  2.39it/s]

Gradient norm: 54.83004133256904


Epoch 1 of 5 | Iteration:  80%|████████  | 973/1212 [07:29<01:37,  2.45it/s]

Gradient norm: 55.96662304107707


Epoch 1 of 5 | Iteration:  80%|████████  | 974/1212 [07:30<01:33,  2.55it/s]

Gradient norm: 77.57517554818136


Epoch 1 of 5 | Iteration:  80%|████████  | 975/1212 [07:30<01:50,  2.15it/s]

Gradient norm: 80.25936709481913


Epoch 1 of 5 | Iteration:  81%|████████  | 976/1212 [07:31<01:43,  2.27it/s]

Gradient norm: 81.78079601624663


Epoch 1 of 5 | Iteration:  81%|████████  | 977/1212 [07:31<01:36,  2.43it/s]

Gradient norm: 20.948405341109066


Epoch 1 of 5 | Iteration:  81%|████████  | 978/1212 [07:31<01:32,  2.54it/s]

Gradient norm: 21.519704066686334


Epoch 1 of 5 | Iteration:  81%|████████  | 979/1212 [07:32<01:34,  2.48it/s]

Gradient norm: 25.40480271945901


Epoch 1 of 5 | Iteration:  81%|████████  | 980/1212 [07:32<01:39,  2.32it/s]

Gradient norm: 25.9673878336803


Epoch 1 of 5 | Iteration:  81%|████████  | 981/1212 [07:33<01:40,  2.29it/s]

Gradient norm: 25.92358283148698


Epoch 1 of 5 | Iteration:  81%|████████  | 982/1212 [07:33<01:43,  2.21it/s]

Gradient norm: 26.676240245687552


Epoch 1 of 5 | Iteration:  81%|████████  | 983/1212 [07:34<02:03,  1.86it/s]

Gradient norm: 27.147116636968736


Epoch 1 of 5 | Iteration:  81%|████████  | 984/1212 [07:35<02:02,  1.87it/s]

Gradient norm: 27.348483937701424


Epoch 1 of 5 | Iteration:  81%|████████▏ | 985/1212 [07:35<02:00,  1.89it/s]

Gradient norm: 27.658221500294857


Epoch 1 of 5 | Iteration:  81%|████████▏ | 986/1212 [07:36<02:06,  1.79it/s]

Gradient norm: 27.385541999809615


Epoch 1 of 5 | Iteration:  81%|████████▏ | 987/1212 [07:36<01:55,  1.94it/s]

Gradient norm: 77.42254261712057


Epoch 1 of 5 | Iteration:  82%|████████▏ | 988/1212 [07:37<01:47,  2.08it/s]

Gradient norm: 77.24240153154135


Epoch 1 of 5 | Iteration:  82%|████████▏ | 989/1212 [07:37<01:47,  2.07it/s]

Gradient norm: 78.78805065848458


Epoch 1 of 5 | Iteration:  82%|████████▏ | 990/1212 [07:38<01:48,  2.04it/s]

Gradient norm: 78.78411185689636


Epoch 1 of 5 | Iteration:  82%|████████▏ | 991/1212 [07:38<01:40,  2.20it/s]

Gradient norm: 1484.3985852575854


Epoch 1 of 5 | Iteration:  82%|████████▏ | 992/1212 [07:38<01:37,  2.25it/s]

Gradient norm: 1478.646494827704


Epoch 1 of 5 | Iteration:  82%|████████▏ | 993/1212 [07:39<01:48,  2.03it/s]

Gradient norm: 6.246763456793513


Epoch 1 of 5 | Iteration:  82%|████████▏ | 994/1212 [07:40<01:54,  1.90it/s]

Gradient norm: 6.844936748096948


Epoch 1 of 5 | Iteration:  82%|████████▏ | 995/1212 [07:40<01:42,  2.12it/s]

Gradient norm: 8.03584836597944


Epoch 1 of 5 | Iteration:  82%|████████▏ | 996/1212 [07:40<01:39,  2.17it/s]

Gradient norm: 9.097155190481004


Epoch 1 of 5 | Iteration:  82%|████████▏ | 997/1212 [07:41<01:31,  2.34it/s]

Gradient norm: 34.70562455390165


Epoch 1 of 5 | Iteration:  82%|████████▏ | 998/1212 [07:41<01:29,  2.38it/s]

Gradient norm: 41.15906542077552


Epoch 1 of 5 | Iteration:  82%|████████▏ | 999/1212 [07:41<01:24,  2.52it/s]

Gradient norm: 41.19191663363338


Epoch 1 of 5 | Iteration:  83%|████████▎ | 1000/1212 [07:42<01:21,  2.61it/s]

Gradient norm: 41.589790728331025


Epoch 1 of 5 | Iteration:  83%|████████▎ | 1001/1212 [07:42<01:18,  2.70it/s]

Gradient norm: 41.58690786001701


Epoch 1 of 5 | Iteration:  83%|████████▎ | 1002/1212 [07:43<01:17,  2.69it/s]

Gradient norm: 43.245494397802396


Epoch 1 of 5 | Iteration:  83%|████████▎ | 1003/1212 [07:43<01:28,  2.36it/s]

Gradient norm: 43.39168407371766


Epoch 1 of 5 | Iteration:  83%|████████▎ | 1004/1212 [07:43<01:24,  2.46it/s]

Gradient norm: 43.00345674332945


Epoch 1 of 5 | Iteration:  83%|████████▎ | 1005/1212 [07:44<01:20,  2.57it/s]

Gradient norm: 45.898190116471454


Epoch 1 of 5 | Iteration:  83%|████████▎ | 1006/1212 [07:44<01:19,  2.60it/s]

Gradient norm: 44.73115548067739


Epoch 1 of 5 | Iteration:  83%|████████▎ | 1007/1212 [07:44<01:17,  2.66it/s]

Gradient norm: 45.69789315542197


Epoch 1 of 5 | Iteration:  83%|████████▎ | 1008/1212 [07:45<01:17,  2.62it/s]

Gradient norm: 118.75133391259381


Epoch 1 of 5 | Iteration:  83%|████████▎ | 1009/1212 [07:45<01:22,  2.47it/s]

Gradient norm: 1.904383261624433


Epoch 1 of 5 | Iteration:  83%|████████▎ | 1010/1212 [07:46<01:27,  2.31it/s]

Gradient norm: 3.6428756174609256


Epoch 1 of 5 | Iteration:  83%|████████▎ | 1011/1212 [07:46<01:31,  2.21it/s]

Gradient norm: 4.227625601392414


Epoch 1 of 5 | Iteration:  83%|████████▎ | 1012/1212 [07:47<01:38,  2.02it/s]

Gradient norm: 5.716700390288388


Epoch 1 of 5 | Iteration:  84%|████████▎ | 1013/1212 [07:47<01:37,  2.03it/s]

Gradient norm: 5.912800287510489


Epoch 1 of 5 | Iteration:  84%|████████▎ | 1014/1212 [07:48<01:37,  2.02it/s]

Gradient norm: 9.073230128584488


Epoch 1 of 5 | Iteration:  84%|████████▎ | 1015/1212 [07:48<01:38,  2.00it/s]

Gradient norm: 11.829149939319592


Epoch 1 of 5 | Iteration:  84%|████████▍ | 1016/1212 [07:49<01:30,  2.17it/s]

Gradient norm: 11.608640028191394


Epoch 1 of 5 | Iteration:  84%|████████▍ | 1017/1212 [07:49<01:22,  2.35it/s]

Gradient norm: 13.79880525429203


Epoch 1 of 5 | Iteration:  84%|████████▍ | 1018/1212 [07:50<01:19,  2.45it/s]

Gradient norm: 14.347866283390866


Epoch 1 of 5 | Iteration:  84%|████████▍ | 1019/1212 [07:50<01:25,  2.25it/s]

Gradient norm: 14.323981331615952


Epoch 1 of 5 | Iteration:  84%|████████▍ | 1020/1212 [07:50<01:19,  2.41it/s]

Gradient norm: 19.505787171520623


Epoch 1 of 5 | Iteration:  84%|████████▍ | 1021/1212 [07:51<01:24,  2.27it/s]

Gradient norm: 19.88648957427791


Epoch 1 of 5 | Iteration:  84%|████████▍ | 1022/1212 [07:51<01:21,  2.34it/s]

Gradient norm: 20.28885626768504


Epoch 1 of 5 | Iteration:  84%|████████▍ | 1023/1212 [07:52<01:39,  1.90it/s]

Gradient norm: 23.991346240277494


Epoch 1 of 5 | Iteration:  84%|████████▍ | 1024/1212 [07:53<01:38,  1.90it/s]

Gradient norm: 23.9893886138779


Epoch 1 of 5 | Iteration:  85%|████████▍ | 1025/1212 [07:53<01:36,  1.94it/s]

Gradient norm: 1.7580991450925167


Epoch 1 of 5 | Iteration:  85%|████████▍ | 1026/1212 [07:53<01:25,  2.18it/s]

Gradient norm: 1.751428478542214


Epoch 1 of 5 | Iteration:  85%|████████▍ | 1027/1212 [07:54<01:18,  2.35it/s]

Gradient norm: 1.768040969224573


Epoch 1 of 5 | Iteration:  85%|████████▍ | 1028/1212 [07:54<01:13,  2.50it/s]

Gradient norm: 1.7737975057245872


Epoch 1 of 5 | Iteration:  85%|████████▍ | 1029/1212 [07:55<01:28,  2.07it/s]

Gradient norm: 2.023436404388171


Epoch 1 of 5 | Iteration:  85%|████████▍ | 1030/1212 [07:55<01:31,  2.00it/s]

Gradient norm: 35.221969104705316


Epoch 1 of 5 | Iteration:  85%|████████▌ | 1031/1212 [07:56<01:22,  2.19it/s]

Gradient norm: 39.960625818061594


Epoch 1 of 5 | Iteration:  85%|████████▌ | 1032/1212 [07:56<01:16,  2.35it/s]

Gradient norm: 40.143923476960374


Epoch 1 of 5 | Iteration:  85%|████████▌ | 1033/1212 [07:56<01:12,  2.46it/s]

Gradient norm: 40.27997072491386


Epoch 1 of 5 | Iteration:  85%|████████▌ | 1034/1212 [07:57<01:10,  2.54it/s]

Gradient norm: 88.7335537822971


Epoch 1 of 5 | Iteration:  85%|████████▌ | 1035/1212 [07:57<01:06,  2.65it/s]

Gradient norm: 88.61751532031383


Epoch 1 of 5 | Iteration:  85%|████████▌ | 1036/1212 [07:57<01:04,  2.71it/s]

Gradient norm: 88.67520587333436


Epoch 1 of 5 | Iteration:  86%|████████▌ | 1037/1212 [07:58<01:12,  2.41it/s]

Gradient norm: 105.91239134691152


Epoch 1 of 5 | Iteration:  86%|████████▌ | 1038/1212 [07:59<01:20,  2.15it/s]

Gradient norm: 107.46069239733794


Epoch 1 of 5 | Iteration:  86%|████████▌ | 1039/1212 [07:59<01:27,  1.97it/s]

Gradient norm: 107.5677208478559


Epoch 1 of 5 | Iteration:  86%|████████▌ | 1040/1212 [08:00<01:33,  1.84it/s]

Gradient norm: 111.71015960800081


Epoch 1 of 5 | Iteration:  86%|████████▌ | 1041/1212 [08:00<01:29,  1.90it/s]

Gradient norm: 0.912607921480793


Epoch 1 of 5 | Iteration:  86%|████████▌ | 1042/1212 [08:01<01:27,  1.94it/s]

Gradient norm: 3.6850968830093387


Epoch 1 of 5 | Iteration:  86%|████████▌ | 1043/1212 [08:01<01:27,  1.92it/s]

Gradient norm: 33.33634609368963


Epoch 1 of 5 | Iteration:  86%|████████▌ | 1044/1212 [08:02<01:24,  1.99it/s]

Gradient norm: 33.232357533879856


Epoch 1 of 5 | Iteration:  86%|████████▌ | 1045/1212 [08:02<01:17,  2.17it/s]

Gradient norm: 33.24768564247379


Epoch 1 of 5 | Iteration:  86%|████████▋ | 1046/1212 [08:02<01:11,  2.34it/s]

Gradient norm: 33.24764056434667


Epoch 1 of 5 | Iteration:  86%|████████▋ | 1047/1212 [08:03<01:06,  2.47it/s]

Gradient norm: 42.65938040630705


Epoch 1 of 5 | Iteration:  86%|████████▋ | 1048/1212 [08:03<01:04,  2.54it/s]

Gradient norm: 42.678685061779866


Epoch 1 of 5 | Iteration:  87%|████████▋ | 1049/1212 [08:03<01:01,  2.64it/s]

Gradient norm: 659.3663296444355


Epoch 1 of 5 | Iteration:  87%|████████▋ | 1050/1212 [08:04<01:00,  2.70it/s]

Gradient norm: 665.4178653242308


Epoch 1 of 5 | Iteration:  87%|████████▋ | 1051/1212 [08:04<01:06,  2.41it/s]

Gradient norm: 665.1449121770321


Epoch 1 of 5 | Iteration:  87%|████████▋ | 1052/1212 [08:05<01:05,  2.44it/s]

Gradient norm: 665.9845513955249


Epoch 1 of 5 | Iteration:  87%|████████▋ | 1053/1212 [08:05<01:03,  2.51it/s]

Gradient norm: 973.1090819265852


Epoch 1 of 5 | Iteration:  87%|████████▋ | 1054/1212 [08:05<01:00,  2.60it/s]

Gradient norm: 977.7416487564228


Epoch 1 of 5 | Iteration:  87%|████████▋ | 1055/1212 [08:06<00:59,  2.66it/s]

Gradient norm: 976.9918771985675


Epoch 1 of 5 | Iteration:  87%|████████▋ | 1056/1212 [08:06<00:58,  2.66it/s]

Gradient norm: 976.9477847548238


Epoch 1 of 5 | Iteration:  87%|████████▋ | 1057/1212 [08:07<00:57,  2.71it/s]

Gradient norm: 14.625180419171556


Epoch 1 of 5 | Iteration:  87%|████████▋ | 1058/1212 [08:07<00:56,  2.73it/s]

Gradient norm: 21.355387238079594


Epoch 1 of 5 | Iteration:  87%|████████▋ | 1059/1212 [08:07<01:04,  2.37it/s]

Gradient norm: 21.704750251057913


Epoch 1 of 5 | Iteration:  87%|████████▋ | 1060/1212 [08:08<01:00,  2.51it/s]

Gradient norm: 21.64906389733176


Epoch 1 of 5 | Iteration:  88%|████████▊ | 1061/1212 [08:08<00:58,  2.59it/s]

Gradient norm: 22.200848804859756


Epoch 1 of 5 | Iteration:  88%|████████▊ | 1062/1212 [08:09<00:58,  2.55it/s]

Gradient norm: 22.293557209201296


Epoch 1 of 5 | Iteration:  88%|████████▊ | 1063/1212 [08:09<01:08,  2.18it/s]

Gradient norm: 25.446834417471383


Epoch 1 of 5 | Iteration:  88%|████████▊ | 1064/1212 [08:10<01:09,  2.14it/s]

Gradient norm: 25.660952578863565


Epoch 1 of 5 | Iteration:  88%|████████▊ | 1065/1212 [08:10<01:06,  2.23it/s]

Gradient norm: 25.574475147930276


Epoch 1 of 5 | Iteration:  88%|████████▊ | 1066/1212 [08:10<01:02,  2.35it/s]

Gradient norm: 32.28712908460288


Epoch 1 of 5 | Iteration:  88%|████████▊ | 1067/1212 [08:11<00:58,  2.47it/s]

Gradient norm: 32.82068802148976


Epoch 1 of 5 | Iteration:  88%|████████▊ | 1068/1212 [08:11<00:56,  2.53it/s]

Gradient norm: 85.30093147958004


Epoch 1 of 5 | Iteration:  88%|████████▊ | 1069/1212 [08:12<01:06,  2.16it/s]

Gradient norm: 85.3269921628057


Epoch 1 of 5 | Iteration:  88%|████████▊ | 1070/1212 [08:12<01:07,  2.11it/s]

Gradient norm: 84.98778100216526


Epoch 1 of 5 | Iteration:  88%|████████▊ | 1071/1212 [08:13<01:07,  2.10it/s]

Gradient norm: 84.78320451485733


Epoch 1 of 5 | Iteration:  88%|████████▊ | 1072/1212 [08:13<01:08,  2.03it/s]

Gradient norm: 84.85211584300214


Epoch 1 of 5 | Iteration:  89%|████████▊ | 1073/1212 [08:14<01:10,  1.97it/s]

Gradient norm: 1.6222549806510502


Epoch 1 of 5 | Iteration:  89%|████████▊ | 1074/1212 [08:14<01:11,  1.93it/s]

Gradient norm: 5.604948861788315


Epoch 1 of 5 | Iteration:  89%|████████▊ | 1075/1212 [08:15<01:11,  1.91it/s]

Gradient norm: 77.31281699912132


Epoch 1 of 5 | Iteration:  89%|████████▉ | 1076/1212 [08:16<01:12,  1.88it/s]

Gradient norm: 80.99776132618811


Epoch 1 of 5 | Iteration:  89%|████████▉ | 1077/1212 [08:16<01:04,  2.08it/s]

Gradient norm: 82.9610013766363


Epoch 1 of 5 | Iteration:  89%|████████▉ | 1078/1212 [08:16<01:06,  2.03it/s]

Gradient norm: 82.28012648546489


Epoch 1 of 5 | Iteration:  89%|████████▉ | 1079/1212 [08:17<01:00,  2.20it/s]

Gradient norm: 82.28636466101945


Epoch 1 of 5 | Iteration:  89%|████████▉ | 1080/1212 [08:17<00:57,  2.30it/s]

Gradient norm: 81.60096305115545


Epoch 1 of 5 | Iteration:  89%|████████▉ | 1081/1212 [08:18<00:55,  2.34it/s]

Gradient norm: 82.5853724896325


Epoch 1 of 5 | Iteration:  89%|████████▉ | 1082/1212 [08:18<00:53,  2.42it/s]

Gradient norm: 81.3595757297661


Epoch 1 of 5 | Iteration:  89%|████████▉ | 1083/1212 [08:18<00:50,  2.54it/s]

Gradient norm: 78.56494297310806


Epoch 1 of 5 | Iteration:  89%|████████▉ | 1084/1212 [08:19<00:51,  2.47it/s]

Gradient norm: 78.6208777972706


Epoch 1 of 5 | Iteration:  90%|████████▉ | 1085/1212 [08:19<00:55,  2.31it/s]

Gradient norm: 78.62319678587838


Epoch 1 of 5 | Iteration:  90%|████████▉ | 1086/1212 [08:20<00:57,  2.20it/s]

Gradient norm: 79.72364363105139


Epoch 1 of 5 | Iteration:  90%|████████▉ | 1087/1212 [08:20<00:54,  2.28it/s]

Gradient norm: 86.51768622033218


Epoch 1 of 5 | Iteration:  90%|████████▉ | 1088/1212 [08:21<00:52,  2.36it/s]

Gradient norm: 86.54515604008519


Epoch 1 of 5 | Iteration:  90%|████████▉ | 1089/1212 [08:21<00:49,  2.50it/s]

Gradient norm: 2.849638718668568


Epoch 1 of 5 | Iteration:  90%|████████▉ | 1090/1212 [08:21<00:49,  2.46it/s]

Gradient norm: 2.9445119231266066


Epoch 1 of 5 | Iteration:  90%|█████████ | 1091/1212 [08:22<00:47,  2.55it/s]

Gradient norm: 5.428407344919137


Epoch 1 of 5 | Iteration:  90%|█████████ | 1092/1212 [08:22<00:51,  2.31it/s]

Gradient norm: 5.879635736978606


Epoch 1 of 5 | Iteration:  90%|█████████ | 1093/1212 [08:23<00:49,  2.41it/s]

Gradient norm: 15.522511494353692


Epoch 1 of 5 | Iteration:  90%|█████████ | 1094/1212 [08:23<00:46,  2.52it/s]

Gradient norm: 15.516858570983198


Epoch 1 of 5 | Iteration:  90%|█████████ | 1095/1212 [08:23<00:48,  2.42it/s]

Gradient norm: 15.846753066454879


Epoch 1 of 5 | Iteration:  90%|█████████ | 1096/1212 [08:24<00:47,  2.43it/s]

Gradient norm: 15.73529280781367


Epoch 1 of 5 | Iteration:  91%|█████████ | 1097/1212 [08:24<00:44,  2.56it/s]

Gradient norm: 15.770442229701413


Epoch 1 of 5 | Iteration:  91%|█████████ | 1098/1212 [08:24<00:44,  2.58it/s]

Gradient norm: 15.690183071290125


Epoch 1 of 5 | Iteration:  91%|█████████ | 1099/1212 [08:25<00:52,  2.14it/s]

Gradient norm: 15.786221060634137


Epoch 1 of 5 | Iteration:  91%|█████████ | 1100/1212 [08:26<00:53,  2.09it/s]

Gradient norm: 15.84286716113451


Epoch 1 of 5 | Iteration:  91%|█████████ | 1101/1212 [08:26<00:52,  2.11it/s]

Gradient norm: 18.026178390493133


Epoch 1 of 5 | Iteration:  91%|█████████ | 1102/1212 [08:27<00:53,  2.07it/s]

Gradient norm: 18.881483113141023


Epoch 1 of 5 | Iteration:  91%|█████████ | 1103/1212 [08:27<00:52,  2.08it/s]

Gradient norm: 85.1820481216242


Epoch 1 of 5 | Iteration:  91%|█████████ | 1104/1212 [08:28<00:52,  2.06it/s]

Gradient norm: 88.51214055501022


Epoch 1 of 5 | Iteration:  91%|█████████ | 1105/1212 [08:28<00:56,  1.89it/s]

Gradient norm: 1.5065606054203615


Epoch 1 of 5 | Iteration:  91%|█████████▏| 1106/1212 [08:29<00:51,  2.08it/s]

Gradient norm: 12.936451287303191


Epoch 1 of 5 | Iteration:  91%|█████████▏| 1107/1212 [08:29<00:46,  2.26it/s]

Gradient norm: 13.20444899142937


Epoch 1 of 5 | Iteration:  91%|█████████▏| 1108/1212 [08:29<00:43,  2.41it/s]

Gradient norm: 13.556125991345665


Epoch 1 of 5 | Iteration:  92%|█████████▏| 1109/1212 [08:30<00:40,  2.53it/s]

Gradient norm: 13.59270148416153


Epoch 1 of 5 | Iteration:  92%|█████████▏| 1110/1212 [08:30<00:38,  2.62it/s]

Gradient norm: 13.443853882027073


Epoch 1 of 5 | Iteration:  92%|█████████▏| 1111/1212 [08:30<00:39,  2.58it/s]

Gradient norm: 13.568461815835379


Epoch 1 of 5 | Iteration:  92%|█████████▏| 1112/1212 [08:31<00:37,  2.64it/s]

Gradient norm: 13.584226945665202


Epoch 1 of 5 | Iteration:  92%|█████████▏| 1113/1212 [08:31<00:36,  2.69it/s]

Gradient norm: 13.610461928102003


Epoch 1 of 5 | Iteration:  92%|█████████▏| 1114/1212 [08:32<00:38,  2.53it/s]

Gradient norm: 13.561703144945579


Epoch 1 of 5 | Iteration:  92%|█████████▏| 1115/1212 [08:32<00:41,  2.36it/s]

Gradient norm: 102.4691860906949


Epoch 1 of 5 | Iteration:  92%|█████████▏| 1116/1212 [08:32<00:41,  2.30it/s]

Gradient norm: 107.03419044619908


Epoch 1 of 5 | Iteration:  92%|█████████▏| 1117/1212 [08:33<00:39,  2.41it/s]

Gradient norm: 107.16428575794573


Epoch 1 of 5 | Iteration:  92%|█████████▏| 1118/1212 [08:33<00:37,  2.51it/s]

Gradient norm: 107.06904993954443


Epoch 1 of 5 | Iteration:  92%|█████████▏| 1119/1212 [08:34<00:35,  2.62it/s]

Gradient norm: 655.7781175871114


Epoch 1 of 5 | Iteration:  92%|█████████▏| 1120/1212 [08:34<00:39,  2.35it/s]

Gradient norm: 655.6833303281395


Epoch 1 of 5 | Iteration:  92%|█████████▏| 1121/1212 [08:34<00:37,  2.41it/s]

Gradient norm: 1.2873247222750184


Epoch 1 of 5 | Iteration:  93%|█████████▎| 1122/1212 [08:35<00:41,  2.14it/s]

Gradient norm: 1.6872749040698958


Epoch 1 of 5 | Iteration:  93%|█████████▎| 1123/1212 [08:36<00:41,  2.17it/s]

Gradient norm: 2.929760908118208


Epoch 1 of 5 | Iteration:  93%|█████████▎| 1124/1212 [08:36<00:37,  2.34it/s]

Gradient norm: 3.0015931792642814


Epoch 1 of 5 | Iteration:  93%|█████████▎| 1125/1212 [08:36<00:35,  2.46it/s]

Gradient norm: 3.9761168141873915


Epoch 1 of 5 | Iteration:  93%|█████████▎| 1126/1212 [08:37<00:38,  2.25it/s]

Gradient norm: 4.092723485633735


Epoch 1 of 5 | Iteration:  93%|█████████▎| 1127/1212 [08:37<00:35,  2.37it/s]

Gradient norm: 5.340464172357068


Epoch 1 of 5 | Iteration:  93%|█████████▎| 1128/1212 [08:37<00:33,  2.48it/s]

Gradient norm: 5.735031533111753


Epoch 1 of 5 | Iteration:  93%|█████████▎| 1129/1212 [08:38<00:32,  2.59it/s]

Gradient norm: 7.414527926695759


Epoch 1 of 5 | Iteration:  93%|█████████▎| 1130/1212 [08:38<00:31,  2.60it/s]

Gradient norm: 7.903817605572491


Epoch 1 of 5 | Iteration:  93%|█████████▎| 1131/1212 [08:39<00:34,  2.33it/s]

Gradient norm: 7.981007984468


Epoch 1 of 5 | Iteration:  93%|█████████▎| 1132/1212 [08:39<00:38,  2.08it/s]

Gradient norm: 8.284061140120006


Epoch 1 of 5 | Iteration:  93%|█████████▎| 1133/1212 [08:40<00:38,  2.04it/s]

Gradient norm: 8.210597150665052


Epoch 1 of 5 | Iteration:  94%|█████████▎| 1134/1212 [08:40<00:37,  2.06it/s]

Gradient norm: 8.293457564964779


Epoch 1 of 5 | Iteration:  94%|█████████▎| 1135/1212 [08:41<00:40,  1.92it/s]

Gradient norm: 9.095711022570654


Epoch 1 of 5 | Iteration:  94%|█████████▎| 1136/1212 [08:42<00:40,  1.86it/s]

Gradient norm: 9.16975968759788


Epoch 1 of 5 | Iteration:  94%|█████████▍| 1137/1212 [08:42<00:38,  1.96it/s]

Gradient norm: 2.08136393849475


Epoch 1 of 5 | Iteration:  94%|█████████▍| 1138/1212 [08:42<00:38,  1.93it/s]

Gradient norm: 67.2883946049331


Epoch 1 of 5 | Iteration:  94%|█████████▍| 1139/1212 [08:43<00:36,  2.00it/s]

Gradient norm: 67.14910304616929


Epoch 1 of 5 | Iteration:  94%|█████████▍| 1140/1212 [08:43<00:33,  2.16it/s]

Gradient norm: 67.15411649085462


Epoch 1 of 5 | Iteration:  94%|█████████▍| 1141/1212 [08:44<00:30,  2.32it/s]

Gradient norm: 67.36929423956428


Epoch 1 of 5 | Iteration:  94%|█████████▍| 1142/1212 [08:44<00:32,  2.16it/s]

Gradient norm: 68.4670681172382


Epoch 1 of 5 | Iteration:  94%|█████████▍| 1143/1212 [08:45<00:30,  2.25it/s]

Gradient norm: 68.50160589034594


Epoch 1 of 5 | Iteration:  94%|█████████▍| 1144/1212 [08:45<00:28,  2.40it/s]

Gradient norm: 68.38306705818925


Epoch 1 of 5 | Iteration:  94%|█████████▍| 1145/1212 [08:45<00:28,  2.35it/s]

Gradient norm: 68.53622137070293


Epoch 1 of 5 | Iteration:  95%|█████████▍| 1146/1212 [08:46<00:26,  2.47it/s]

Gradient norm: 68.51051011304062


Epoch 1 of 5 | Iteration:  95%|█████████▍| 1147/1212 [08:46<00:25,  2.50it/s]

Gradient norm: 68.80732236214261


Epoch 1 of 5 | Iteration:  95%|█████████▍| 1148/1212 [08:47<00:24,  2.58it/s]

Gradient norm: 69.25182571446132


Epoch 1 of 5 | Iteration:  95%|█████████▍| 1149/1212 [08:47<00:29,  2.16it/s]

Gradient norm: 74.9405008341572


Epoch 1 of 5 | Iteration:  95%|█████████▍| 1150/1212 [08:48<00:28,  2.16it/s]

Gradient norm: 77.07887106637827


Epoch 1 of 5 | Iteration:  95%|█████████▍| 1151/1212 [08:48<00:26,  2.34it/s]

Gradient norm: 74.18703428202005


Epoch 1 of 5 | Iteration:  95%|█████████▌| 1152/1212 [08:49<00:29,  2.00it/s]

Gradient norm: 74.2561396028687


Epoch 1 of 5 | Iteration:  95%|█████████▌| 1153/1212 [08:49<00:28,  2.04it/s]

Gradient norm: 2.412975194101046


Epoch 1 of 5 | Iteration:  95%|█████████▌| 1154/1212 [08:50<00:31,  1.82it/s]

Gradient norm: 2.653865154895388


Epoch 1 of 5 | Iteration:  95%|█████████▌| 1155/1212 [08:50<00:29,  1.93it/s]

Gradient norm: 2.829718967434111


Epoch 1 of 5 | Iteration:  95%|█████████▌| 1156/1212 [08:51<00:26,  2.10it/s]

Gradient norm: 5.9265721860327565


Epoch 1 of 5 | Iteration:  95%|█████████▌| 1157/1212 [08:51<00:27,  2.02it/s]

Gradient norm: 6.448711208152865


Epoch 1 of 5 | Iteration:  96%|█████████▌| 1158/1212 [08:52<00:24,  2.17it/s]

Gradient norm: 6.75789052009629


Epoch 1 of 5 | Iteration:  96%|█████████▌| 1159/1212 [08:52<00:25,  2.05it/s]

Gradient norm: 6.943436252288794


Epoch 1 of 5 | Iteration:  96%|█████████▌| 1160/1212 [08:53<00:25,  2.04it/s]

Gradient norm: 7.0266098181908125


Epoch 1 of 5 | Iteration:  96%|█████████▌| 1161/1212 [08:53<00:25,  1.98it/s]

Gradient norm: 7.162130606766432


Epoch 1 of 5 | Iteration:  96%|█████████▌| 1162/1212 [08:54<00:24,  2.00it/s]

Gradient norm: 10.557238506071977


Epoch 1 of 5 | Iteration:  96%|█████████▌| 1163/1212 [08:54<00:26,  1.85it/s]

Gradient norm: 13.06496583128706


Epoch 1 of 5 | Iteration:  96%|█████████▌| 1164/1212 [08:55<00:28,  1.70it/s]

Gradient norm: 13.114526671183464


Epoch 1 of 5 | Iteration:  96%|█████████▌| 1165/1212 [08:55<00:26,  1.75it/s]

Gradient norm: 13.232324927603985


Epoch 1 of 5 | Iteration:  96%|█████████▌| 1166/1212 [08:56<00:23,  1.96it/s]

Gradient norm: 13.718021205843502


Epoch 1 of 5 | Iteration:  96%|█████████▋| 1167/1212 [08:56<00:21,  2.14it/s]

Gradient norm: 14.08297136247033


Epoch 1 of 5 | Iteration:  96%|█████████▋| 1168/1212 [08:57<00:19,  2.21it/s]

Gradient norm: 14.110303824807572


Epoch 1 of 5 | Iteration:  96%|█████████▋| 1169/1212 [08:57<00:19,  2.20it/s]

Gradient norm: 3.916300593557033


Epoch 1 of 5 | Iteration:  97%|█████████▋| 1170/1212 [08:57<00:17,  2.38it/s]

Gradient norm: 5.729214851445751


Epoch 1 of 5 | Iteration:  97%|█████████▋| 1171/1212 [08:58<00:16,  2.48it/s]

Gradient norm: 6.558012769130206


Epoch 1 of 5 | Iteration:  97%|█████████▋| 1172/1212 [08:58<00:15,  2.56it/s]

Gradient norm: 6.851603536492628


Epoch 1 of 5 | Iteration:  97%|█████████▋| 1173/1212 [08:59<00:17,  2.25it/s]

Gradient norm: 6.86577148125391


Epoch 1 of 5 | Iteration:  97%|█████████▋| 1174/1212 [08:59<00:17,  2.16it/s]

Gradient norm: 6.830761433620897


Epoch 1 of 5 | Iteration:  97%|█████████▋| 1175/1212 [09:00<00:16,  2.31it/s]

Gradient norm: 9.317263902134398


Epoch 1 of 5 | Iteration:  97%|█████████▋| 1176/1212 [09:00<00:16,  2.17it/s]

Gradient norm: 26.1311877003146


Epoch 1 of 5 | Iteration:  97%|█████████▋| 1177/1212 [09:00<00:15,  2.31it/s]

Gradient norm: 26.088935072199693


Epoch 1 of 5 | Iteration:  97%|█████████▋| 1178/1212 [09:01<00:14,  2.27it/s]

Gradient norm: 42.66594491022537


Epoch 1 of 5 | Iteration:  97%|█████████▋| 1179/1212 [09:01<00:13,  2.41it/s]

Gradient norm: 43.7756664264182


Epoch 1 of 5 | Iteration:  97%|█████████▋| 1180/1212 [09:02<00:14,  2.17it/s]

Gradient norm: 45.181430599125555


Epoch 1 of 5 | Iteration:  97%|█████████▋| 1181/1212 [09:02<00:14,  2.19it/s]

Gradient norm: 45.041475514505606


Epoch 1 of 5 | Iteration:  98%|█████████▊| 1182/1212 [09:03<00:12,  2.31it/s]

Gradient norm: 45.75470620760963


Epoch 1 of 5 | Iteration:  98%|█████████▊| 1183/1212 [09:03<00:12,  2.40it/s]

Gradient norm: 45.887101286756376


Epoch 1 of 5 | Iteration:  98%|█████████▊| 1184/1212 [09:04<00:11,  2.36it/s]

Gradient norm: 46.12521496348286


Epoch 1 of 5 | Iteration:  98%|█████████▊| 1185/1212 [09:04<00:13,  2.07it/s]

Gradient norm: 2.86198933167427


Epoch 1 of 5 | Iteration:  98%|█████████▊| 1186/1212 [09:05<00:11,  2.18it/s]

Gradient norm: 109.32937393018054


Epoch 1 of 5 | Iteration:  98%|█████████▊| 1187/1212 [09:05<00:11,  2.13it/s]

Gradient norm: 109.29203495461417


Epoch 1 of 5 | Iteration:  98%|█████████▊| 1188/1212 [09:06<00:11,  2.10it/s]

Gradient norm: 109.29056590591985


Epoch 1 of 5 | Iteration:  98%|█████████▊| 1189/1212 [09:06<00:11,  2.08it/s]

Gradient norm: 109.29493678685395


Epoch 1 of 5 | Iteration:  98%|█████████▊| 1190/1212 [09:07<00:11,  2.00it/s]

Gradient norm: 109.18119380739172


Epoch 1 of 5 | Iteration:  98%|█████████▊| 1191/1212 [09:07<00:10,  1.99it/s]

Gradient norm: 109.10584326491251


Epoch 1 of 5 | Iteration:  98%|█████████▊| 1192/1212 [09:08<00:10,  1.90it/s]

Gradient norm: 109.76873885717853


Epoch 1 of 5 | Iteration:  98%|█████████▊| 1193/1212 [09:08<00:10,  1.80it/s]

Gradient norm: 109.406040997658


Epoch 1 of 5 | Iteration:  99%|█████████▊| 1194/1212 [09:09<00:09,  1.88it/s]

Gradient norm: 109.20993375692598


Epoch 1 of 5 | Iteration:  99%|█████████▊| 1195/1212 [09:09<00:08,  1.94it/s]

Gradient norm: 109.09564089692614


Epoch 1 of 5 | Iteration:  99%|█████████▊| 1196/1212 [09:10<00:07,  2.05it/s]

Gradient norm: 109.32371296972347


Epoch 1 of 5 | Iteration:  99%|█████████▉| 1197/1212 [09:10<00:06,  2.20it/s]

Gradient norm: 110.33845042850028


Epoch 1 of 5 | Iteration:  99%|█████████▉| 1198/1212 [09:11<00:07,  1.95it/s]

Gradient norm: 110.99759000769511


Epoch 1 of 5 | Iteration:  99%|█████████▉| 1199/1212 [09:11<00:06,  2.11it/s]

Gradient norm: 110.95881699480658


Epoch 1 of 5 | Iteration:  99%|█████████▉| 1200/1212 [09:12<00:05,  2.03it/s]

Gradient norm: 110.92690030160321


Epoch 1 of 5 | Iteration:  99%|█████████▉| 1201/1212 [09:12<00:05,  2.18it/s]

Gradient norm: 0.543640725671196


Epoch 1 of 5 | Iteration:  99%|█████████▉| 1202/1212 [09:13<00:04,  2.09it/s]

Gradient norm: 4.857464034434557


Epoch 1 of 5 | Iteration:  99%|█████████▉| 1203/1212 [09:13<00:04,  1.92it/s]

Gradient norm: 5.680220165636753


Epoch 1 of 5 | Iteration:  99%|█████████▉| 1204/1212 [09:14<00:04,  1.92it/s]

Gradient norm: 7.812578558109057


Epoch 1 of 5 | Iteration:  99%|█████████▉| 1205/1212 [09:14<00:03,  2.09it/s]

Gradient norm: 8.267550530146087


Epoch 1 of 5 | Iteration: 100%|█████████▉| 1206/1212 [09:14<00:02,  2.20it/s]

Gradient norm: 12.293135750736853


Epoch 1 of 5 | Iteration: 100%|█████████▉| 1207/1212 [09:15<00:02,  2.08it/s]

Gradient norm: 12.229128912499172


Epoch 1 of 5 | Iteration: 100%|█████████▉| 1208/1212 [09:15<00:01,  2.20it/s]

Gradient norm: 26.84286420582782


Epoch 1 of 5 | Iteration: 100%|█████████▉| 1209/1212 [09:16<00:01,  2.31it/s]

Gradient norm: 28.017523453012057


Epoch 1 of 5 | Iteration: 100%|█████████▉| 1210/1212 [09:16<00:00,  2.36it/s]

Gradient norm: 28.015829658801188


Epoch 1 of 5 | Iteration: 100%|█████████▉| 1211/1212 [09:17<00:00,  2.31it/s]

Gradient norm: 29.69639783768993


Epoch 1 of 5 | Iteration: 100%|██████████| 1212/1212 [09:17<00:00,  2.17it/s]


Gradient norm: 29.764787396388797


100%|██████████| 1212/1212 [05:03<00:00,  4.00it/s]


Epoch 1/5, Training Loss: 2.0159, Validation Loss: 1.9142
Validation top k acc: 0.7863
              precision    recall  f1-score   support

           0       0.91      0.77      0.83     10666
           1       0.31      0.57      0.40      1947

    accuracy                           0.74     12613
   macro avg       0.61      0.67      0.62     12613
weighted avg       0.82      0.74      0.77     12613



Epoch 2 of 5 | Iteration:   0%|          | 0/1212 [00:00<?, ?it/s]

Train ...


Epoch 2 of 5 | Iteration:   0%|          | 1/1212 [00:00<11:38,  1.73it/s]

Gradient norm: 26.263353533378964


Epoch 2 of 5 | Iteration:   0%|          | 2/1212 [00:01<11:10,  1.81it/s]

Gradient norm: 26.625718718847764


Epoch 2 of 5 | Iteration:   0%|          | 3/1212 [00:01<09:44,  2.07it/s]

Gradient norm: 26.285039109019632


Epoch 2 of 5 | Iteration:   0%|          | 4/1212 [00:01<09:10,  2.19it/s]

Gradient norm: 26.24832377921264


Epoch 2 of 5 | Iteration:   0%|          | 5/1212 [00:02<08:57,  2.24it/s]

Gradient norm: 26.48738192530622


Epoch 2 of 5 | Iteration:   0%|          | 6/1212 [00:02<08:18,  2.42it/s]

Gradient norm: 26.60512944724505


Epoch 2 of 5 | Iteration:   1%|          | 7/1212 [00:03<07:58,  2.52it/s]

Gradient norm: 26.645167208586688


Epoch 2 of 5 | Iteration:   1%|          | 8/1212 [00:03<07:48,  2.57it/s]

Gradient norm: 25.631397986019877


Epoch 2 of 5 | Iteration:   1%|          | 9/1212 [00:03<07:57,  2.52it/s]

Gradient norm: 25.77469859670748


Epoch 2 of 5 | Iteration:   1%|          | 10/1212 [00:04<07:50,  2.56it/s]

Gradient norm: 25.835611709101258


Epoch 2 of 5 | Iteration:   1%|          | 11/1212 [00:04<07:45,  2.58it/s]

Gradient norm: 25.843172790595894


Epoch 2 of 5 | Iteration:   1%|          | 12/1212 [00:05<08:05,  2.47it/s]

Gradient norm: 25.85616471776449


Epoch 2 of 5 | Iteration:   1%|          | 13/1212 [00:05<08:32,  2.34it/s]

Gradient norm: 25.85624099364342


Epoch 2 of 5 | Iteration:   1%|          | 14/1212 [00:05<08:28,  2.36it/s]

Gradient norm: 26.274046032632203


Epoch 2 of 5 | Iteration:   1%|          | 15/1212 [00:06<08:57,  2.23it/s]

Gradient norm: 27.71904424837902


Epoch 2 of 5 | Iteration:   1%|▏         | 16/1212 [00:06<09:23,  2.12it/s]

Gradient norm: 27.779393878037112


Epoch 2 of 5 | Iteration:   1%|▏         | 17/1212 [00:07<10:13,  1.95it/s]

Gradient norm: 1.8980772739462368


Epoch 2 of 5 | Iteration:   1%|▏         | 18/1212 [00:08<10:07,  1.96it/s]

Gradient norm: 1.89420552896233


Epoch 2 of 5 | Iteration:   2%|▏         | 19/1212 [00:08<10:00,  1.99it/s]

Gradient norm: 423.0754816493903


Epoch 2 of 5 | Iteration:   2%|▏         | 20/1212 [00:09<10:09,  1.96it/s]

Gradient norm: 564.096333964395


Epoch 2 of 5 | Iteration:   2%|▏         | 21/1212 [00:09<10:25,  1.90it/s]

Gradient norm: 565.0600981079323


Epoch 2 of 5 | Iteration:   2%|▏         | 22/1212 [00:10<10:47,  1.84it/s]

Gradient norm: 568.739222657257


Epoch 2 of 5 | Iteration:   2%|▏         | 23/1212 [00:10<09:44,  2.04it/s]

Gradient norm: 568.469240453658


Epoch 2 of 5 | Iteration:   2%|▏         | 24/1212 [00:11<09:11,  2.16it/s]

Gradient norm: 568.5241677185955


Epoch 2 of 5 | Iteration:   2%|▏         | 25/1212 [00:11<08:29,  2.33it/s]

Gradient norm: 567.8709647374707


Epoch 2 of 5 | Iteration:   2%|▏         | 26/1212 [00:11<08:35,  2.30it/s]

Gradient norm: 567.8323039101854


Epoch 2 of 5 | Iteration:   2%|▏         | 27/1212 [00:12<08:24,  2.35it/s]

Gradient norm: 571.2932769209672


Epoch 2 of 5 | Iteration:   2%|▏         | 28/1212 [00:12<07:57,  2.48it/s]

Gradient norm: 559.584044090573


Epoch 2 of 5 | Iteration:   2%|▏         | 29/1212 [00:13<08:07,  2.42it/s]

Gradient norm: 557.4519748119782


Epoch 2 of 5 | Iteration:   2%|▏         | 30/1212 [00:13<07:50,  2.51it/s]

Gradient norm: 562.8848401813035


Epoch 2 of 5 | Iteration:   3%|▎         | 31/1212 [00:13<07:52,  2.50it/s]

Gradient norm: 563.1404179350907


Epoch 2 of 5 | Iteration:   3%|▎         | 32/1212 [00:14<07:49,  2.51it/s]

Gradient norm: 563.3292390143857


Epoch 2 of 5 | Iteration:   3%|▎         | 33/1212 [00:14<07:30,  2.62it/s]

Gradient norm: 2.2645189747242096


Epoch 2 of 5 | Iteration:   3%|▎         | 34/1212 [00:14<07:30,  2.62it/s]

Gradient norm: 3.855986858684348


Epoch 2 of 5 | Iteration:   3%|▎         | 35/1212 [00:15<07:22,  2.66it/s]

Gradient norm: 4.292546535206778


Epoch 2 of 5 | Iteration:   3%|▎         | 36/1212 [00:15<07:17,  2.69it/s]

Gradient norm: 8.381908200413298


Epoch 2 of 5 | Iteration:   3%|▎         | 37/1212 [00:16<08:19,  2.35it/s]

Gradient norm: 8.323928012872033


Epoch 2 of 5 | Iteration:   3%|▎         | 38/1212 [00:16<08:34,  2.28it/s]

Gradient norm: 8.869960430345357


Epoch 2 of 5 | Iteration:   3%|▎         | 39/1212 [00:17<08:15,  2.37it/s]

Gradient norm: 8.916226208410492


Epoch 2 of 5 | Iteration:   3%|▎         | 40/1212 [00:17<08:04,  2.42it/s]

Gradient norm: 10.829507455278883


Epoch 2 of 5 | Iteration:   3%|▎         | 41/1212 [00:17<08:15,  2.36it/s]

Gradient norm: 10.962678208617472


Epoch 2 of 5 | Iteration:   3%|▎         | 42/1212 [00:18<07:53,  2.47it/s]

Gradient norm: 146.02518235757952


Epoch 2 of 5 | Iteration:   4%|▎         | 43/1212 [00:18<08:33,  2.28it/s]

Gradient norm: 145.97470306218912


Epoch 2 of 5 | Iteration:   4%|▎         | 44/1212 [00:19<08:48,  2.21it/s]

Gradient norm: 145.90772523918767


Epoch 2 of 5 | Iteration:   4%|▎         | 45/1212 [00:19<08:16,  2.35it/s]

Gradient norm: 146.899938386189


Epoch 2 of 5 | Iteration:   4%|▍         | 46/1212 [00:20<08:51,  2.20it/s]

Gradient norm: 143.22689116666328


Epoch 2 of 5 | Iteration:   4%|▍         | 47/1212 [00:20<09:17,  2.09it/s]

Gradient norm: 142.7976902379163


Epoch 2 of 5 | Iteration:   4%|▍         | 48/1212 [00:21<09:42,  2.00it/s]

Gradient norm: 142.34352885542205


Epoch 2 of 5 | Iteration:   4%|▍         | 49/1212 [00:21<09:51,  1.97it/s]

Gradient norm: 3.9912805524531096


Epoch 2 of 5 | Iteration:   4%|▍         | 50/1212 [00:22<09:46,  1.98it/s]

Gradient norm: 8.187019374090085


Epoch 2 of 5 | Iteration:   4%|▍         | 51/1212 [00:22<09:57,  1.94it/s]

Gradient norm: 9.717404166202709


Epoch 2 of 5 | Iteration:   4%|▍         | 52/1212 [00:23<10:54,  1.77it/s]

Gradient norm: 15.680444971786807


Epoch 2 of 5 | Iteration:   4%|▍         | 53/1212 [00:23<10:12,  1.89it/s]

Gradient norm: 16.366223627894946


Epoch 2 of 5 | Iteration:   4%|▍         | 54/1212 [00:24<10:02,  1.92it/s]

Gradient norm: 16.402854394626743


Epoch 2 of 5 | Iteration:   5%|▍         | 55/1212 [00:24<09:09,  2.11it/s]

Gradient norm: 19.435211113679827


Epoch 2 of 5 | Iteration:   5%|▍         | 56/1212 [00:25<08:37,  2.23it/s]

Gradient norm: 86.56918481184718


Epoch 2 of 5 | Iteration:   5%|▍         | 57/1212 [00:25<08:35,  2.24it/s]

Gradient norm: 85.8228755590603


Epoch 2 of 5 | Iteration:   5%|▍         | 58/1212 [00:26<08:37,  2.23it/s]

Gradient norm: 86.91640988572824


Epoch 2 of 5 | Iteration:   5%|▍         | 59/1212 [00:26<08:12,  2.34it/s]

Gradient norm: 84.35519743819432


Epoch 2 of 5 | Iteration:   5%|▍         | 60/1212 [00:26<08:28,  2.26it/s]

Gradient norm: 84.46405615548933


Epoch 2 of 5 | Iteration:   5%|▌         | 61/1212 [00:27<08:33,  2.24it/s]

Gradient norm: 84.3577114060609


Epoch 2 of 5 | Iteration:   5%|▌         | 62/1212 [00:27<08:33,  2.24it/s]

Gradient norm: 84.39296129497191


Epoch 2 of 5 | Iteration:   5%|▌         | 63/1212 [00:28<08:32,  2.24it/s]

Gradient norm: 85.19728536211188


Epoch 2 of 5 | Iteration:   5%|▌         | 64/1212 [00:28<09:07,  2.10it/s]

Gradient norm: 85.09856884382462


Epoch 2 of 5 | Iteration:   5%|▌         | 65/1212 [00:29<09:28,  2.02it/s]

Gradient norm: 31.284498909354728


Epoch 2 of 5 | Iteration:   5%|▌         | 66/1212 [00:29<10:08,  1.88it/s]

Gradient norm: 32.18455712780721


Epoch 2 of 5 | Iteration:   6%|▌         | 67/1212 [00:30<09:13,  2.07it/s]

Gradient norm: 32.165648625405424


Epoch 2 of 5 | Iteration:   6%|▌         | 68/1212 [00:30<10:08,  1.88it/s]

Gradient norm: 148.70101600347024


Epoch 2 of 5 | Iteration:   6%|▌         | 69/1212 [00:31<09:12,  2.07it/s]

Gradient norm: 148.6595273035772


Epoch 2 of 5 | Iteration:   6%|▌         | 70/1212 [00:31<09:54,  1.92it/s]

Gradient norm: 148.5720594170454


Epoch 2 of 5 | Iteration:   6%|▌         | 71/1212 [00:32<08:59,  2.12it/s]

Gradient norm: 148.6020062420527


Epoch 2 of 5 | Iteration:   6%|▌         | 72/1212 [00:32<10:04,  1.89it/s]

Gradient norm: 148.39235787824623


Epoch 2 of 5 | Iteration:   6%|▌         | 73/1212 [00:33<10:26,  1.82it/s]

Gradient norm: 147.62863770948482


Epoch 2 of 5 | Iteration:   6%|▌         | 74/1212 [00:34<10:02,  1.89it/s]

Gradient norm: 147.79346267854214


Epoch 2 of 5 | Iteration:   6%|▌         | 75/1212 [00:34<09:59,  1.90it/s]

Gradient norm: 147.81202568987635


Epoch 2 of 5 | Iteration:   6%|▋         | 76/1212 [00:35<09:49,  1.93it/s]

Gradient norm: 147.83101910500355


Epoch 2 of 5 | Iteration:   6%|▋         | 77/1212 [00:35<09:46,  1.93it/s]

Gradient norm: 148.29131086187547


Epoch 2 of 5 | Iteration:   6%|▋         | 78/1212 [00:36<10:08,  1.86it/s]

Gradient norm: 148.33208063022875


Epoch 2 of 5 | Iteration:   7%|▋         | 79/1212 [00:36<09:21,  2.02it/s]

Gradient norm: 148.36509463276874


Epoch 2 of 5 | Iteration:   7%|▋         | 80/1212 [00:36<08:55,  2.11it/s]

Gradient norm: 151.01825714139426


Epoch 2 of 5 | Iteration:   7%|▋         | 81/1212 [00:37<08:14,  2.29it/s]

Gradient norm: 2.208145011397987


Epoch 2 of 5 | Iteration:   7%|▋         | 82/1212 [00:37<08:31,  2.21it/s]

Gradient norm: 5.750956428353975


Epoch 2 of 5 | Iteration:   7%|▋         | 83/1212 [00:38<08:46,  2.14it/s]

Gradient norm: 12.217696998513441


Epoch 2 of 5 | Iteration:   7%|▋         | 84/1212 [00:38<08:21,  2.25it/s]

Gradient norm: 12.347627000015256


Epoch 2 of 5 | Iteration:   7%|▋         | 85/1212 [00:39<08:14,  2.28it/s]

Gradient norm: 15.854531696789312


Epoch 2 of 5 | Iteration:   7%|▋         | 86/1212 [00:39<07:51,  2.39it/s]

Gradient norm: 16.174305702614642


Epoch 2 of 5 | Iteration:   7%|▋         | 87/1212 [00:39<07:58,  2.35it/s]

Gradient norm: 16.137048144313553


Epoch 2 of 5 | Iteration:   7%|▋         | 88/1212 [00:40<07:34,  2.47it/s]

Gradient norm: 16.49089607522219


Epoch 2 of 5 | Iteration:   7%|▋         | 89/1212 [00:40<07:21,  2.54it/s]

Gradient norm: 22.12534236561345


Epoch 2 of 5 | Iteration:   7%|▋         | 90/1212 [00:41<07:57,  2.35it/s]

Gradient norm: 22.252142726801953


Epoch 2 of 5 | Iteration:   8%|▊         | 91/1212 [00:41<07:37,  2.45it/s]

Gradient norm: 22.52813357366871


Epoch 2 of 5 | Iteration:   8%|▊         | 92/1212 [00:42<08:16,  2.26it/s]

Gradient norm: 42.1572961989969


Epoch 2 of 5 | Iteration:   8%|▊         | 93/1212 [00:42<08:18,  2.24it/s]

Gradient norm: 42.25931930306976


Epoch 2 of 5 | Iteration:   8%|▊         | 94/1212 [00:42<07:57,  2.34it/s]

Gradient norm: 42.37338948617424


Epoch 2 of 5 | Iteration:   8%|▊         | 95/1212 [00:43<09:04,  2.05it/s]

Gradient norm: 42.24027489353055


Epoch 2 of 5 | Iteration:   8%|▊         | 96/1212 [00:43<08:43,  2.13it/s]

Gradient norm: 42.135437177426475


Epoch 2 of 5 | Iteration:   8%|▊         | 97/1212 [00:44<08:17,  2.24it/s]

Gradient norm: 1.581591064117734


Epoch 2 of 5 | Iteration:   8%|▊         | 98/1212 [00:44<07:49,  2.37it/s]

Gradient norm: 1.6640926026592446


Epoch 2 of 5 | Iteration:   8%|▊         | 99/1212 [00:45<07:44,  2.40it/s]

Gradient norm: 3.7541399970790974


Epoch 2 of 5 | Iteration:   8%|▊         | 100/1212 [00:45<07:23,  2.51it/s]

Gradient norm: 4.756152260674233


Epoch 2 of 5 | Iteration:   8%|▊         | 101/1212 [00:45<07:11,  2.58it/s]

Gradient norm: 4.759358790430982


Epoch 2 of 5 | Iteration:   8%|▊         | 102/1212 [00:46<07:05,  2.61it/s]

Gradient norm: 4.603445461661349


Epoch 2 of 5 | Iteration:   8%|▊         | 103/1212 [00:46<08:42,  2.12it/s]

Gradient norm: 84.33849088366475


Epoch 2 of 5 | Iteration:   9%|▊         | 104/1212 [00:47<08:55,  2.07it/s]

Gradient norm: 84.29023343014828


Epoch 2 of 5 | Iteration:   9%|▊         | 105/1212 [00:48<09:53,  1.87it/s]

Gradient norm: 84.32679536456547


Epoch 2 of 5 | Iteration:   9%|▊         | 106/1212 [00:48<09:38,  1.91it/s]

Gradient norm: 84.76036566433746


Epoch 2 of 5 | Iteration:   9%|▉         | 107/1212 [00:49<09:40,  1.90it/s]

Gradient norm: 84.45290656423072


Epoch 2 of 5 | Iteration:   9%|▉         | 108/1212 [00:49<09:52,  1.86it/s]

Gradient norm: 84.6944189956511


Epoch 2 of 5 | Iteration:   9%|▉         | 109/1212 [00:50<09:16,  1.98it/s]

Gradient norm: 92.48945248922873


Epoch 2 of 5 | Iteration:   9%|▉         | 110/1212 [00:50<09:00,  2.04it/s]

Gradient norm: 91.64080477745499


Epoch 2 of 5 | Iteration:   9%|▉         | 111/1212 [00:50<08:15,  2.22it/s]

Gradient norm: 91.499919916182


Epoch 2 of 5 | Iteration:   9%|▉         | 112/1212 [00:51<08:03,  2.28it/s]

Gradient norm: 91.64235610421251


Epoch 2 of 5 | Iteration:   9%|▉         | 113/1212 [00:51<08:46,  2.09it/s]

Gradient norm: 8.613442015981287


Epoch 2 of 5 | Iteration:   9%|▉         | 114/1212 [00:52<08:43,  2.10it/s]

Gradient norm: 8.654350253192389


Epoch 2 of 5 | Iteration:   9%|▉         | 115/1212 [00:52<09:37,  1.90it/s]

Gradient norm: 8.574008589820236


Epoch 2 of 5 | Iteration:  10%|▉         | 116/1212 [00:53<09:44,  1.87it/s]

Gradient norm: 540.4867029390277


Epoch 2 of 5 | Iteration:  10%|▉         | 117/1212 [00:53<08:51,  2.06it/s]

Gradient norm: 540.6894116535461


Epoch 2 of 5 | Iteration:  10%|▉         | 118/1212 [00:54<08:38,  2.11it/s]

Gradient norm: 542.635400306045


Epoch 2 of 5 | Iteration:  10%|▉         | 119/1212 [00:54<08:10,  2.23it/s]

Gradient norm: 542.5013654283216


Epoch 2 of 5 | Iteration:  10%|▉         | 120/1212 [00:55<07:46,  2.34it/s]

Gradient norm: 542.7672329311243


Epoch 2 of 5 | Iteration:  10%|▉         | 121/1212 [00:55<07:26,  2.44it/s]

Gradient norm: 543.3906225612302


Epoch 2 of 5 | Iteration:  10%|█         | 122/1212 [00:55<07:38,  2.38it/s]

Gradient norm: 543.0654513720241


Epoch 2 of 5 | Iteration:  10%|█         | 123/1212 [00:56<07:41,  2.36it/s]

Gradient norm: 543.0710956788944


Epoch 2 of 5 | Iteration:  10%|█         | 124/1212 [00:56<07:16,  2.49it/s]

Gradient norm: 543.1490748866491


Epoch 2 of 5 | Iteration:  10%|█         | 125/1212 [00:57<07:15,  2.50it/s]

Gradient norm: 543.1409068030489


Epoch 2 of 5 | Iteration:  10%|█         | 126/1212 [00:57<07:08,  2.53it/s]

Gradient norm: 542.3210406106738


Epoch 2 of 5 | Iteration:  10%|█         | 127/1212 [00:57<07:13,  2.50it/s]

Gradient norm: 541.7680094004783


Epoch 2 of 5 | Iteration:  11%|█         | 128/1212 [00:58<07:17,  2.48it/s]

Gradient norm: 541.5944420237311


Epoch 2 of 5 | Iteration:  11%|█         | 129/1212 [00:59<09:13,  1.96it/s]

Gradient norm: 315.43844896302767


Epoch 2 of 5 | Iteration:  11%|█         | 130/1212 [00:59<08:23,  2.15it/s]

Gradient norm: 315.78440330666774


Epoch 2 of 5 | Iteration:  11%|█         | 131/1212 [00:59<08:23,  2.15it/s]

Gradient norm: 315.8290029092954


Epoch 2 of 5 | Iteration:  11%|█         | 132/1212 [01:00<08:37,  2.09it/s]

Gradient norm: 317.1756215596237


Epoch 2 of 5 | Iteration:  11%|█         | 133/1212 [01:00<08:37,  2.09it/s]

Gradient norm: 318.4204022490747


Epoch 2 of 5 | Iteration:  11%|█         | 134/1212 [01:01<08:40,  2.07it/s]

Gradient norm: 318.4452725538956


Epoch 2 of 5 | Iteration:  11%|█         | 135/1212 [01:01<09:02,  1.98it/s]

Gradient norm: 317.56021461677256


Epoch 2 of 5 | Iteration:  11%|█         | 136/1212 [01:02<09:09,  1.96it/s]

Gradient norm: 317.769314873891


Epoch 2 of 5 | Iteration:  11%|█▏        | 137/1212 [01:03<09:31,  1.88it/s]

Gradient norm: 317.7693148606511


Epoch 2 of 5 | Iteration:  11%|█▏        | 138/1212 [01:03<09:32,  1.88it/s]

Gradient norm: 529.6696321059326


Epoch 2 of 5 | Iteration:  11%|█▏        | 139/1212 [01:04<09:33,  1.87it/s]

Gradient norm: 529.631918667068


Epoch 2 of 5 | Iteration:  12%|█▏        | 140/1212 [01:04<09:03,  1.97it/s]

Gradient norm: 529.6290670870811


Epoch 2 of 5 | Iteration:  12%|█▏        | 141/1212 [01:04<08:08,  2.19it/s]

Gradient norm: 531.6666335001479


Epoch 2 of 5 | Iteration:  12%|█▏        | 142/1212 [01:05<08:23,  2.13it/s]

Gradient norm: 532.2652799490511


Epoch 2 of 5 | Iteration:  12%|█▏        | 143/1212 [01:06<09:06,  1.96it/s]

Gradient norm: 532.2652321671694


Epoch 2 of 5 | Iteration:  12%|█▏        | 144/1212 [01:06<08:32,  2.08it/s]

Gradient norm: 532.2615344449082


Epoch 2 of 5 | Iteration:  12%|█▏        | 145/1212 [01:06<07:55,  2.24it/s]

Gradient norm: 12.621757152345078


Epoch 2 of 5 | Iteration:  12%|█▏        | 146/1212 [01:07<07:32,  2.36it/s]

Gradient norm: 12.827382407884999


Epoch 2 of 5 | Iteration:  12%|█▏        | 147/1212 [01:07<07:41,  2.31it/s]

Gradient norm: 15.984820156358586


Epoch 2 of 5 | Iteration:  12%|█▏        | 148/1212 [01:08<07:33,  2.34it/s]

Gradient norm: 16.403672546184133


Epoch 2 of 5 | Iteration:  12%|█▏        | 149/1212 [01:08<07:14,  2.45it/s]

Gradient norm: 16.51914146649439


Epoch 2 of 5 | Iteration:  12%|█▏        | 150/1212 [01:08<07:03,  2.51it/s]

Gradient norm: 18.60370334600614


Epoch 2 of 5 | Iteration:  12%|█▏        | 151/1212 [01:09<07:04,  2.50it/s]

Gradient norm: 19.314938559411853


Epoch 2 of 5 | Iteration:  13%|█▎        | 152/1212 [01:09<06:57,  2.54it/s]

Gradient norm: 20.023895762613773


Epoch 2 of 5 | Iteration:  13%|█▎        | 153/1212 [01:09<06:53,  2.56it/s]

Gradient norm: 20.027809524977503


Epoch 2 of 5 | Iteration:  13%|█▎        | 154/1212 [01:10<06:43,  2.62it/s]

Gradient norm: 21.077904901681883


Epoch 2 of 5 | Iteration:  13%|█▎        | 155/1212 [01:10<07:11,  2.45it/s]

Gradient norm: 21.103729108298026


Epoch 2 of 5 | Iteration:  13%|█▎        | 156/1212 [01:11<07:01,  2.51it/s]

Gradient norm: 21.19956300086757


Epoch 2 of 5 | Iteration:  13%|█▎        | 157/1212 [01:11<07:01,  2.50it/s]

Gradient norm: 21.17309025337796


Epoch 2 of 5 | Iteration:  13%|█▎        | 158/1212 [01:12<08:18,  2.11it/s]

Gradient norm: 20.786733980970894


Epoch 2 of 5 | Iteration:  13%|█▎        | 159/1212 [01:12<08:13,  2.14it/s]

Gradient norm: 21.094297085384333


Epoch 2 of 5 | Iteration:  13%|█▎        | 160/1212 [01:13<07:58,  2.20it/s]

Gradient norm: 20.91244215292348


Epoch 2 of 5 | Iteration:  13%|█▎        | 161/1212 [01:13<08:45,  2.00it/s]

Gradient norm: 4.689672002224977


Epoch 2 of 5 | Iteration:  13%|█▎        | 162/1212 [01:14<08:41,  2.01it/s]

Gradient norm: 4.950458505808448


Epoch 2 of 5 | Iteration:  13%|█▎        | 163/1212 [01:14<08:51,  1.97it/s]

Gradient norm: 5.661596305680552


Epoch 2 of 5 | Iteration:  14%|█▎        | 164/1212 [01:15<08:50,  1.98it/s]

Gradient norm: 6.093816670995251


Epoch 2 of 5 | Iteration:  14%|█▎        | 165/1212 [01:15<09:26,  1.85it/s]

Gradient norm: 6.614254373308475


Epoch 2 of 5 | Iteration:  14%|█▎        | 166/1212 [01:16<09:31,  1.83it/s]

Gradient norm: 6.677092642707903


Epoch 2 of 5 | Iteration:  14%|█▍        | 167/1212 [01:16<09:06,  1.91it/s]

Gradient norm: 7.583085028639145


Epoch 2 of 5 | Iteration:  14%|█▍        | 168/1212 [01:17<08:13,  2.12it/s]

Gradient norm: 8.51986446185933


Epoch 2 of 5 | Iteration:  14%|█▍        | 169/1212 [01:17<08:38,  2.01it/s]

Gradient norm: 8.455930732147413


Epoch 2 of 5 | Iteration:  14%|█▍        | 170/1212 [01:18<09:16,  1.87it/s]

Gradient norm: 10.517192494973717


Epoch 2 of 5 | Iteration:  14%|█▍        | 171/1212 [01:18<09:23,  1.85it/s]

Gradient norm: 11.437439083100232


Epoch 2 of 5 | Iteration:  14%|█▍        | 172/1212 [01:19<09:15,  1.87it/s]

Gradient norm: 13.26369356829506


Epoch 2 of 5 | Iteration:  14%|█▍        | 173/1212 [01:19<08:22,  2.07it/s]

Gradient norm: 14.113291500188327


Epoch 2 of 5 | Iteration:  14%|█▍        | 174/1212 [01:20<09:15,  1.87it/s]

Gradient norm: 14.27205643467871


Epoch 2 of 5 | Iteration:  14%|█▍        | 175/1212 [01:21<09:16,  1.87it/s]

Gradient norm: 19.176901789032176


Epoch 2 of 5 | Iteration:  15%|█▍        | 176/1212 [01:21<09:14,  1.87it/s]

Gradient norm: 19.03411781365921


Epoch 2 of 5 | Iteration:  15%|█▍        | 177/1212 [01:22<08:49,  1.95it/s]

Gradient norm: 2.302696046327408


Epoch 2 of 5 | Iteration:  15%|█▍        | 178/1212 [01:22<09:26,  1.83it/s]

Gradient norm: 5.802159745757234


Epoch 2 of 5 | Iteration:  15%|█▍        | 179/1212 [01:23<09:25,  1.83it/s]

Gradient norm: 6.203962363343594


Epoch 2 of 5 | Iteration:  15%|█▍        | 180/1212 [01:23<08:26,  2.04it/s]

Gradient norm: 6.145075214856124


Epoch 2 of 5 | Iteration:  15%|█▍        | 181/1212 [01:23<07:46,  2.21it/s]

Gradient norm: 10.222133621483568


Epoch 2 of 5 | Iteration:  15%|█▌        | 182/1212 [01:24<07:34,  2.27it/s]

Gradient norm: 10.484246383619343


Epoch 2 of 5 | Iteration:  15%|█▌        | 183/1212 [01:24<07:51,  2.18it/s]

Gradient norm: 10.433670802005759


Epoch 2 of 5 | Iteration:  15%|█▌        | 184/1212 [01:25<07:51,  2.18it/s]

Gradient norm: 10.860398760987975


Epoch 2 of 5 | Iteration:  15%|█▌        | 185/1212 [01:25<07:39,  2.24it/s]

Gradient norm: 16.653375766412243


Epoch 2 of 5 | Iteration:  15%|█▌        | 186/1212 [01:26<07:12,  2.37it/s]

Gradient norm: 26.04058632034397


Epoch 2 of 5 | Iteration:  15%|█▌        | 187/1212 [01:26<07:51,  2.18it/s]

Gradient norm: 26.08352796148614


Epoch 2 of 5 | Iteration:  16%|█▌        | 188/1212 [01:27<08:08,  2.10it/s]

Gradient norm: 31.158733294924687


Epoch 2 of 5 | Iteration:  16%|█▌        | 189/1212 [01:27<08:06,  2.10it/s]

Gradient norm: 28.795106908243877


Epoch 2 of 5 | Iteration:  16%|█▌        | 190/1212 [01:28<09:14,  1.84it/s]

Gradient norm: 28.187828211442945


Epoch 2 of 5 | Iteration:  16%|█▌        | 191/1212 [01:28<08:56,  1.90it/s]

Gradient norm: 134.7166938556475


Epoch 2 of 5 | Iteration:  16%|█▌        | 192/1212 [01:29<09:08,  1.86it/s]

Gradient norm: 134.81669326647088


Epoch 2 of 5 | Iteration:  16%|█▌        | 193/1212 [01:29<09:05,  1.87it/s]

Gradient norm: 3.2516451543337572


Epoch 2 of 5 | Iteration:  16%|█▌        | 194/1212 [01:30<08:27,  2.01it/s]

Gradient norm: 4.54929848767498


Epoch 2 of 5 | Iteration:  16%|█▌        | 195/1212 [01:30<09:04,  1.87it/s]

Gradient norm: 6.184300296459975


Epoch 2 of 5 | Iteration:  16%|█▌        | 196/1212 [01:31<09:20,  1.81it/s]

Gradient norm: 9.011827019106864


Epoch 2 of 5 | Iteration:  16%|█▋        | 197/1212 [01:31<08:15,  2.05it/s]

Gradient norm: 9.042785249934527


Epoch 2 of 5 | Iteration:  16%|█▋        | 198/1212 [01:32<08:40,  1.95it/s]

Gradient norm: 9.129336161288581


Epoch 2 of 5 | Iteration:  16%|█▋        | 199/1212 [01:32<07:48,  2.16it/s]

Gradient norm: 10.074749334559602


Epoch 2 of 5 | Iteration:  17%|█▋        | 200/1212 [01:33<07:13,  2.34it/s]

Gradient norm: 11.197765284657098


Epoch 2 of 5 | Iteration:  17%|█▋        | 201/1212 [01:33<07:18,  2.30it/s]

Gradient norm: 11.70370259083813


Epoch 2 of 5 | Iteration:  17%|█▋        | 202/1212 [01:33<06:50,  2.46it/s]

Gradient norm: 11.941551966865205


Epoch 2 of 5 | Iteration:  17%|█▋        | 203/1212 [01:34<07:41,  2.19it/s]

Gradient norm: 11.999248893647714


Epoch 2 of 5 | Iteration:  17%|█▋        | 204/1212 [01:34<07:09,  2.35it/s]

Gradient norm: 12.320625289911128


Epoch 2 of 5 | Iteration:  17%|█▋        | 205/1212 [01:35<07:58,  2.10it/s]

Gradient norm: 13.867445352655192


Epoch 2 of 5 | Iteration:  17%|█▋        | 206/1212 [01:36<08:49,  1.90it/s]

Gradient norm: 13.911343508575138


Epoch 2 of 5 | Iteration:  17%|█▋        | 207/1212 [01:36<08:20,  2.01it/s]

Gradient norm: 14.007318422021715


Epoch 2 of 5 | Iteration:  17%|█▋        | 208/1212 [01:36<07:53,  2.12it/s]

Gradient norm: 16.567714439065305


Epoch 2 of 5 | Iteration:  17%|█▋        | 209/1212 [01:37<07:17,  2.29it/s]

Gradient norm: 75.66287724435533


Epoch 2 of 5 | Iteration:  17%|█▋        | 210/1212 [01:37<07:50,  2.13it/s]

Gradient norm: 82.44809762977323


Epoch 2 of 5 | Iteration:  17%|█▋        | 211/1212 [01:38<08:41,  1.92it/s]

Gradient norm: 144.25596822244793


Epoch 2 of 5 | Iteration:  17%|█▋        | 212/1212 [01:39<08:52,  1.88it/s]

Gradient norm: 144.8372662383938


Epoch 2 of 5 | Iteration:  18%|█▊        | 213/1212 [01:39<08:02,  2.07it/s]

Gradient norm: 144.66629030276567


Epoch 2 of 5 | Iteration:  18%|█▊        | 214/1212 [01:39<07:36,  2.19it/s]

Gradient norm: 144.7563735364708


Epoch 2 of 5 | Iteration:  18%|█▊        | 215/1212 [01:40<08:15,  2.01it/s]

Gradient norm: 143.991499800501


Epoch 2 of 5 | Iteration:  18%|█▊        | 216/1212 [01:40<08:52,  1.87it/s]

Gradient norm: 144.0911379942925


Epoch 2 of 5 | Iteration:  18%|█▊        | 217/1212 [01:41<08:47,  1.89it/s]

Gradient norm: 143.20345532538855


Epoch 2 of 5 | Iteration:  18%|█▊        | 218/1212 [01:42<08:40,  1.91it/s]

Gradient norm: 143.74447713155254


Epoch 2 of 5 | Iteration:  18%|█▊        | 219/1212 [01:42<09:15,  1.79it/s]

Gradient norm: 145.0911419050949


Epoch 2 of 5 | Iteration:  18%|█▊        | 220/1212 [01:43<09:13,  1.79it/s]

Gradient norm: 144.92125305973417


Epoch 2 of 5 | Iteration:  18%|█▊        | 221/1212 [01:43<08:18,  1.99it/s]

Gradient norm: 144.9630498587038


Epoch 2 of 5 | Iteration:  18%|█▊        | 222/1212 [01:43<07:37,  2.16it/s]

Gradient norm: 147.35182225349996


Epoch 2 of 5 | Iteration:  18%|█▊        | 223/1212 [01:44<07:20,  2.25it/s]

Gradient norm: 147.5781472522961


Epoch 2 of 5 | Iteration:  18%|█▊        | 224/1212 [01:44<07:12,  2.29it/s]

Gradient norm: 146.62274986498295


Epoch 2 of 5 | Iteration:  19%|█▊        | 225/1212 [01:45<07:31,  2.19it/s]

Gradient norm: 5.118069792291057


Epoch 2 of 5 | Iteration:  19%|█▊        | 226/1212 [01:45<07:00,  2.34it/s]

Gradient norm: 5.931436721510878


Epoch 2 of 5 | Iteration:  19%|█▊        | 227/1212 [01:46<06:57,  2.36it/s]

Gradient norm: 10.389512979247586


Epoch 2 of 5 | Iteration:  19%|█▉        | 228/1212 [01:46<07:03,  2.32it/s]

Gradient norm: 22.457186603747804


Epoch 2 of 5 | Iteration:  19%|█▉        | 229/1212 [01:46<07:11,  2.28it/s]

Gradient norm: 22.2064310991318


Epoch 2 of 5 | Iteration:  19%|█▉        | 230/1212 [01:47<06:50,  2.39it/s]

Gradient norm: 27.87808245504072


Epoch 2 of 5 | Iteration:  19%|█▉        | 231/1212 [01:47<06:51,  2.39it/s]

Gradient norm: 27.95689752270979


Epoch 2 of 5 | Iteration:  19%|█▉        | 232/1212 [01:48<07:16,  2.24it/s]

Gradient norm: 28.46878947325872


Epoch 2 of 5 | Iteration:  19%|█▉        | 233/1212 [01:48<08:01,  2.03it/s]

Gradient norm: 28.709137755286047


Epoch 2 of 5 | Iteration:  19%|█▉        | 234/1212 [01:49<07:45,  2.10it/s]

Gradient norm: 28.465592787192243


Epoch 2 of 5 | Iteration:  19%|█▉        | 235/1212 [01:49<08:11,  1.99it/s]

Gradient norm: 35.44043428504844


Epoch 2 of 5 | Iteration:  19%|█▉        | 236/1212 [01:50<07:38,  2.13it/s]

Gradient norm: 126.57812375393657


Epoch 2 of 5 | Iteration:  20%|█▉        | 237/1212 [01:50<08:05,  2.01it/s]

Gradient norm: 126.6607734016743


Epoch 2 of 5 | Iteration:  20%|█▉        | 238/1212 [01:51<07:30,  2.16it/s]

Gradient norm: 154.99589983323924


Epoch 2 of 5 | Iteration:  20%|█▉        | 239/1212 [01:51<06:56,  2.33it/s]

Gradient norm: 155.03065434773256


Epoch 2 of 5 | Iteration:  20%|█▉        | 240/1212 [01:51<07:01,  2.31it/s]

Gradient norm: 155.57649920935006


Epoch 2 of 5 | Iteration:  20%|█▉        | 241/1212 [01:52<06:41,  2.42it/s]

Gradient norm: 7.254034203154877


Epoch 2 of 5 | Iteration:  20%|█▉        | 242/1212 [01:52<06:26,  2.51it/s]

Gradient norm: 7.173002725311747


Epoch 2 of 5 | Iteration:  20%|██        | 243/1212 [01:53<06:22,  2.54it/s]

Gradient norm: 7.3974853240220515


Epoch 2 of 5 | Iteration:  20%|██        | 244/1212 [01:53<07:18,  2.21it/s]

Gradient norm: 9.373497904920454


Epoch 2 of 5 | Iteration:  20%|██        | 245/1212 [01:54<08:03,  2.00it/s]

Gradient norm: 19.333726720213686


Epoch 2 of 5 | Iteration:  20%|██        | 246/1212 [01:54<08:05,  1.99it/s]

Gradient norm: 19.63148157921672


Epoch 2 of 5 | Iteration:  20%|██        | 247/1212 [01:55<08:09,  1.97it/s]

Gradient norm: 19.58623854229205


Epoch 2 of 5 | Iteration:  20%|██        | 248/1212 [01:55<08:11,  1.96it/s]

Gradient norm: 22.22059848235255


Epoch 2 of 5 | Iteration:  21%|██        | 249/1212 [01:56<09:04,  1.77it/s]

Gradient norm: 22.298472052516086


Epoch 2 of 5 | Iteration:  21%|██        | 250/1212 [01:57<08:41,  1.85it/s]

Gradient norm: 22.66435344483749


Epoch 2 of 5 | Iteration:  21%|██        | 251/1212 [01:57<07:56,  2.02it/s]

Gradient norm: 22.919881230282055


Epoch 2 of 5 | Iteration:  21%|██        | 252/1212 [01:57<07:13,  2.22it/s]

Gradient norm: 22.800974076431604


Epoch 2 of 5 | Iteration:  21%|██        | 253/1212 [01:58<07:50,  2.04it/s]

Gradient norm: 23.00112040703398


Epoch 2 of 5 | Iteration:  21%|██        | 254/1212 [01:58<07:38,  2.09it/s]

Gradient norm: 24.764440330946865


Epoch 2 of 5 | Iteration:  21%|██        | 255/1212 [01:59<07:09,  2.23it/s]

Gradient norm: 35.53067361081554


Epoch 2 of 5 | Iteration:  21%|██        | 256/1212 [01:59<07:00,  2.27it/s]

Gradient norm: 35.80219716202148


Epoch 2 of 5 | Iteration:  21%|██        | 257/1212 [01:59<06:41,  2.38it/s]

Gradient norm: 0.4365240159284783


Epoch 2 of 5 | Iteration:  21%|██▏       | 258/1212 [02:00<07:22,  2.15it/s]

Gradient norm: 49.782288883715026


Epoch 2 of 5 | Iteration:  21%|██▏       | 259/1212 [02:00<07:06,  2.23it/s]

Gradient norm: 49.72469518842768


Epoch 2 of 5 | Iteration:  21%|██▏       | 260/1212 [02:01<06:55,  2.29it/s]

Gradient norm: 49.88460173925114


Epoch 2 of 5 | Iteration:  22%|██▏       | 261/1212 [02:01<06:29,  2.44it/s]

Gradient norm: 50.77811951958628


Epoch 2 of 5 | Iteration:  22%|██▏       | 262/1212 [02:02<06:23,  2.48it/s]

Gradient norm: 50.469407610506124


Epoch 2 of 5 | Iteration:  22%|██▏       | 263/1212 [02:02<06:49,  2.32it/s]

Gradient norm: 50.38906703797573


Epoch 2 of 5 | Iteration:  22%|██▏       | 264/1212 [02:02<06:30,  2.43it/s]

Gradient norm: 50.677639199548544


Epoch 2 of 5 | Iteration:  22%|██▏       | 265/1212 [02:03<06:21,  2.48it/s]

Gradient norm: 50.64910103750166


Epoch 2 of 5 | Iteration:  22%|██▏       | 266/1212 [02:03<06:39,  2.37it/s]

Gradient norm: 50.44061618122582


Epoch 2 of 5 | Iteration:  22%|██▏       | 267/1212 [02:04<06:57,  2.27it/s]

Gradient norm: 50.42199621807654


Epoch 2 of 5 | Iteration:  22%|██▏       | 268/1212 [02:04<06:38,  2.37it/s]

Gradient norm: 51.9225132112157


Epoch 2 of 5 | Iteration:  22%|██▏       | 269/1212 [02:05<06:20,  2.48it/s]

Gradient norm: 52.10214319304986


Epoch 2 of 5 | Iteration:  22%|██▏       | 270/1212 [02:05<06:10,  2.54it/s]

Gradient norm: 52.13801291804583


Epoch 2 of 5 | Iteration:  22%|██▏       | 271/1212 [02:05<06:01,  2.61it/s]

Gradient norm: 52.16928290378435


Epoch 2 of 5 | Iteration:  22%|██▏       | 272/1212 [02:06<07:11,  2.18it/s]

Gradient norm: 52.1385375222049


Epoch 2 of 5 | Iteration:  23%|██▎       | 273/1212 [02:06<07:39,  2.05it/s]

Gradient norm: 11.030626947132035


Epoch 2 of 5 | Iteration:  23%|██▎       | 274/1212 [02:07<07:57,  1.96it/s]

Gradient norm: 11.204148624509255


Epoch 2 of 5 | Iteration:  23%|██▎       | 275/1212 [02:08<07:58,  1.96it/s]

Gradient norm: 42.18791286148105


Epoch 2 of 5 | Iteration:  23%|██▎       | 276/1212 [02:08<07:53,  1.98it/s]

Gradient norm: 42.21654487089895


Epoch 2 of 5 | Iteration:  23%|██▎       | 277/1212 [02:09<07:50,  1.99it/s]

Gradient norm: 42.94100623672861


Epoch 2 of 5 | Iteration:  23%|██▎       | 278/1212 [02:09<08:02,  1.94it/s]

Gradient norm: 42.934790296097766


Epoch 2 of 5 | Iteration:  23%|██▎       | 279/1212 [02:10<07:41,  2.02it/s]

Gradient norm: 42.779038440770336


Epoch 2 of 5 | Iteration:  23%|██▎       | 280/1212 [02:10<06:59,  2.22it/s]

Gradient norm: 42.578750340160674


Epoch 2 of 5 | Iteration:  23%|██▎       | 281/1212 [02:10<06:48,  2.28it/s]

Gradient norm: 42.94942497960509


Epoch 2 of 5 | Iteration:  23%|██▎       | 282/1212 [02:11<06:24,  2.42it/s]

Gradient norm: 316.006986612569


Epoch 2 of 5 | Iteration:  23%|██▎       | 283/1212 [02:11<07:28,  2.07it/s]

Gradient norm: 315.61007166120095


Epoch 2 of 5 | Iteration:  23%|██▎       | 284/1212 [02:12<07:10,  2.15it/s]

Gradient norm: 320.04257239741895


Epoch 2 of 5 | Iteration:  24%|██▎       | 285/1212 [02:12<07:21,  2.10it/s]

Gradient norm: 320.0870525076524


Epoch 2 of 5 | Iteration:  24%|██▎       | 286/1212 [02:13<06:57,  2.22it/s]

Gradient norm: 319.64628768143746


Epoch 2 of 5 | Iteration:  24%|██▎       | 287/1212 [02:13<06:32,  2.36it/s]

Gradient norm: 319.7465939212479


Epoch 2 of 5 | Iteration:  24%|██▍       | 288/1212 [02:13<06:44,  2.28it/s]

Gradient norm: 320.1422207276874


Epoch 2 of 5 | Iteration:  24%|██▍       | 289/1212 [02:14<06:57,  2.21it/s]

Gradient norm: 12.522333367307647


Epoch 2 of 5 | Iteration:  24%|██▍       | 290/1212 [02:14<06:34,  2.34it/s]

Gradient norm: 12.452805554148753


Epoch 2 of 5 | Iteration:  24%|██▍       | 291/1212 [02:15<06:59,  2.20it/s]

Gradient norm: 12.958870881766625


Epoch 2 of 5 | Iteration:  24%|██▍       | 292/1212 [02:15<06:36,  2.32it/s]

Gradient norm: 13.794951932218023


Epoch 2 of 5 | Iteration:  24%|██▍       | 293/1212 [02:16<06:26,  2.38it/s]

Gradient norm: 17.39100201047069


Epoch 2 of 5 | Iteration:  24%|██▍       | 294/1212 [02:16<06:52,  2.23it/s]

Gradient norm: 28.073326157575377


Epoch 2 of 5 | Iteration:  24%|██▍       | 295/1212 [02:17<07:06,  2.15it/s]

Gradient norm: 28.13388840400277


Epoch 2 of 5 | Iteration:  24%|██▍       | 296/1212 [02:17<06:40,  2.29it/s]

Gradient norm: 28.000823382781032


Epoch 2 of 5 | Iteration:  25%|██▍       | 297/1212 [02:18<07:16,  2.10it/s]

Gradient norm: 32.57860851722918


Epoch 2 of 5 | Iteration:  25%|██▍       | 298/1212 [02:18<07:07,  2.14it/s]

Gradient norm: 32.91913886374067


Epoch 2 of 5 | Iteration:  25%|██▍       | 299/1212 [02:18<06:34,  2.31it/s]

Gradient norm: 32.76724901895138


Epoch 2 of 5 | Iteration:  25%|██▍       | 300/1212 [02:19<06:16,  2.42it/s]

Gradient norm: 32.498717041483935


Epoch 2 of 5 | Iteration:  25%|██▍       | 301/1212 [02:19<06:02,  2.52it/s]

Gradient norm: 32.5973374903238


Epoch 2 of 5 | Iteration:  25%|██▍       | 302/1212 [02:19<06:07,  2.47it/s]

Gradient norm: 30.91040339284697


Epoch 2 of 5 | Iteration:  25%|██▌       | 303/1212 [02:20<06:32,  2.32it/s]

Gradient norm: 30.904041887379854


Epoch 2 of 5 | Iteration:  25%|██▌       | 304/1212 [02:20<06:52,  2.20it/s]

Gradient norm: 30.88397349290427


Epoch 2 of 5 | Iteration:  25%|██▌       | 305/1212 [02:21<07:07,  2.12it/s]

Gradient norm: 2.7720493518908484


Epoch 2 of 5 | Iteration:  25%|██▌       | 306/1212 [02:21<07:03,  2.14it/s]

Gradient norm: 7.104847385574939


Epoch 2 of 5 | Iteration:  25%|██▌       | 307/1212 [02:22<07:26,  2.03it/s]

Gradient norm: 7.5693739983244575


Epoch 2 of 5 | Iteration:  25%|██▌       | 308/1212 [02:23<07:54,  1.91it/s]

Gradient norm: 7.765044814401468


Epoch 2 of 5 | Iteration:  25%|██▌       | 309/1212 [02:23<07:18,  2.06it/s]

Gradient norm: 7.711621774170877


Epoch 2 of 5 | Iteration:  26%|██▌       | 310/1212 [02:23<06:42,  2.24it/s]

Gradient norm: 8.248487591220387


Epoch 2 of 5 | Iteration:  26%|██▌       | 311/1212 [02:24<07:30,  2.00it/s]

Gradient norm: 13.336873092864689


Epoch 2 of 5 | Iteration:  26%|██▌       | 312/1212 [02:24<07:39,  1.96it/s]

Gradient norm: 313.06237005515027


Epoch 2 of 5 | Iteration:  26%|██▌       | 313/1212 [02:25<07:40,  1.95it/s]

Gradient norm: 313.23728251913855


Epoch 2 of 5 | Iteration:  26%|██▌       | 314/1212 [02:26<07:49,  1.91it/s]

Gradient norm: 313.2760900339608


Epoch 2 of 5 | Iteration:  26%|██▌       | 315/1212 [02:26<07:17,  2.05it/s]

Gradient norm: 317.3101320474944


Epoch 2 of 5 | Iteration:  26%|██▌       | 316/1212 [02:26<07:07,  2.10it/s]

Gradient norm: 317.2515683927747


Epoch 2 of 5 | Iteration:  26%|██▌       | 317/1212 [02:27<06:44,  2.21it/s]

Gradient norm: 315.7241506661039


Epoch 2 of 5 | Iteration:  26%|██▌       | 318/1212 [02:27<06:47,  2.20it/s]

Gradient norm: 315.35578731825797


Epoch 2 of 5 | Iteration:  26%|██▋       | 319/1212 [02:28<07:09,  2.08it/s]

Gradient norm: 315.3469039169858


Epoch 2 of 5 | Iteration:  26%|██▋       | 320/1212 [02:28<06:39,  2.23it/s]

Gradient norm: 315.50317450415076


Epoch 2 of 5 | Iteration:  26%|██▋       | 321/1212 [02:29<07:06,  2.09it/s]

Gradient norm: 2.074760296122182


Epoch 2 of 5 | Iteration:  27%|██▋       | 322/1212 [02:29<06:34,  2.26it/s]

Gradient norm: 11.059848954353917


Epoch 2 of 5 | Iteration:  27%|██▋       | 323/1212 [02:29<06:17,  2.36it/s]

Gradient norm: 12.29510308737825


Epoch 2 of 5 | Iteration:  27%|██▋       | 324/1212 [02:30<06:30,  2.27it/s]

Gradient norm: 12.663274743102198


Epoch 2 of 5 | Iteration:  27%|██▋       | 325/1212 [02:30<06:03,  2.44it/s]

Gradient norm: 12.779118275358812


Epoch 2 of 5 | Iteration:  27%|██▋       | 326/1212 [02:31<05:52,  2.51it/s]

Gradient norm: 32.00368817313589


Epoch 2 of 5 | Iteration:  27%|██▋       | 327/1212 [02:31<07:01,  2.10it/s]

Gradient norm: 32.84842772010699


Epoch 2 of 5 | Iteration:  27%|██▋       | 328/1212 [02:32<07:15,  2.03it/s]

Gradient norm: 32.90166600735026


Epoch 2 of 5 | Iteration:  27%|██▋       | 329/1212 [02:32<07:02,  2.09it/s]

Gradient norm: 32.92858593590447


Epoch 2 of 5 | Iteration:  27%|██▋       | 330/1212 [02:33<07:09,  2.05it/s]

Gradient norm: 33.07719693339042


Epoch 2 of 5 | Iteration:  27%|██▋       | 331/1212 [02:33<07:22,  1.99it/s]

Gradient norm: 33.24058368979973


Epoch 2 of 5 | Iteration:  27%|██▋       | 332/1212 [02:34<07:27,  1.96it/s]

Gradient norm: 33.12891239544304


Epoch 2 of 5 | Iteration:  27%|██▋       | 333/1212 [02:34<07:53,  1.86it/s]

Gradient norm: 33.052503490494104


Epoch 2 of 5 | Iteration:  28%|██▊       | 334/1212 [02:35<07:54,  1.85it/s]

Gradient norm: 33.41624829183834


Epoch 2 of 5 | Iteration:  28%|██▊       | 335/1212 [02:36<08:09,  1.79it/s]

Gradient norm: 74.77240620527442


Epoch 2 of 5 | Iteration:  28%|██▊       | 336/1212 [02:36<08:25,  1.73it/s]

Gradient norm: 74.80458690364641


Epoch 2 of 5 | Iteration:  28%|██▊       | 337/1212 [02:37<07:51,  1.86it/s]

Gradient norm: 3.8887006015137637


Epoch 2 of 5 | Iteration:  28%|██▊       | 338/1212 [02:37<08:52,  1.64it/s]

Gradient norm: 4.385710957278266


Epoch 2 of 5 | Iteration:  28%|██▊       | 339/1212 [02:38<08:47,  1.65it/s]

Gradient norm: 8.698433522846292


Epoch 2 of 5 | Iteration:  28%|██▊       | 340/1212 [02:38<07:42,  1.89it/s]

Gradient norm: 8.678770823652426


Epoch 2 of 5 | Iteration:  28%|██▊       | 341/1212 [02:39<08:10,  1.78it/s]

Gradient norm: 8.359874077645504


Epoch 2 of 5 | Iteration:  28%|██▊       | 342/1212 [02:40<07:45,  1.87it/s]

Gradient norm: 8.517226568135278


Epoch 2 of 5 | Iteration:  28%|██▊       | 343/1212 [02:40<08:13,  1.76it/s]

Gradient norm: 15.214721914080627


Epoch 2 of 5 | Iteration:  28%|██▊       | 344/1212 [02:41<07:20,  1.97it/s]

Gradient norm: 59.53609685986315


Epoch 2 of 5 | Iteration:  28%|██▊       | 345/1212 [02:41<07:03,  2.05it/s]

Gradient norm: 58.41516615824687


Epoch 2 of 5 | Iteration:  29%|██▊       | 346/1212 [02:41<06:33,  2.20it/s]

Gradient norm: 58.334572436785095


Epoch 2 of 5 | Iteration:  29%|██▊       | 347/1212 [02:42<06:11,  2.33it/s]

Gradient norm: 81.7149673510882


Epoch 2 of 5 | Iteration:  29%|██▊       | 348/1212 [02:42<06:18,  2.28it/s]

Gradient norm: 82.17086724935459


Epoch 2 of 5 | Iteration:  29%|██▉       | 349/1212 [02:43<06:25,  2.24it/s]

Gradient norm: 82.01127496365449


Epoch 2 of 5 | Iteration:  29%|██▉       | 350/1212 [02:43<06:06,  2.35it/s]

Gradient norm: 82.2807960094024


Epoch 2 of 5 | Iteration:  29%|██▉       | 351/1212 [02:43<05:49,  2.46it/s]

Gradient norm: 82.37318815861282


Epoch 2 of 5 | Iteration:  29%|██▉       | 352/1212 [02:44<06:43,  2.13it/s]

Gradient norm: 82.22273524803843


Epoch 2 of 5 | Iteration:  29%|██▉       | 353/1212 [02:44<06:21,  2.25it/s]

Gradient norm: 4.493530868308858


Epoch 2 of 5 | Iteration:  29%|██▉       | 354/1212 [02:45<06:14,  2.29it/s]

Gradient norm: 4.473272342106655


Epoch 2 of 5 | Iteration:  29%|██▉       | 355/1212 [02:45<06:21,  2.25it/s]

Gradient norm: 4.172883813252178


Epoch 2 of 5 | Iteration:  29%|██▉       | 356/1212 [02:46<06:07,  2.33it/s]

Gradient norm: 8.596787644431961


Epoch 2 of 5 | Iteration:  29%|██▉       | 357/1212 [02:46<05:58,  2.38it/s]

Gradient norm: 8.843789219853768


Epoch 2 of 5 | Iteration:  30%|██▉       | 358/1212 [02:47<06:14,  2.28it/s]

Gradient norm: 10.55651368187942


Epoch 2 of 5 | Iteration:  30%|██▉       | 359/1212 [02:47<06:22,  2.23it/s]

Gradient norm: 10.599181729806519


Epoch 2 of 5 | Iteration:  30%|██▉       | 360/1212 [02:48<06:44,  2.11it/s]

Gradient norm: 17.866254663477882


Epoch 2 of 5 | Iteration:  30%|██▉       | 361/1212 [02:48<06:44,  2.10it/s]

Gradient norm: 17.84718067122604


Epoch 2 of 5 | Iteration:  30%|██▉       | 362/1212 [02:49<07:21,  1.92it/s]

Gradient norm: 18.23370281595282


Epoch 2 of 5 | Iteration:  30%|██▉       | 363/1212 [02:49<08:05,  1.75it/s]

Gradient norm: 18.183900699078443


Epoch 2 of 5 | Iteration:  30%|███       | 364/1212 [02:50<07:26,  1.90it/s]

Gradient norm: 180.9372497051919


Epoch 2 of 5 | Iteration:  30%|███       | 365/1212 [02:50<07:30,  1.88it/s]

Gradient norm: 181.35621553776926


Epoch 2 of 5 | Iteration:  30%|███       | 366/1212 [02:51<07:17,  1.94it/s]

Gradient norm: 181.48150029208793


Epoch 2 of 5 | Iteration:  30%|███       | 367/1212 [02:51<06:38,  2.12it/s]

Gradient norm: 181.4939560407584


Epoch 2 of 5 | Iteration:  30%|███       | 368/1212 [02:52<06:18,  2.23it/s]

Gradient norm: 320.5871430732118


Epoch 2 of 5 | Iteration:  30%|███       | 369/1212 [02:52<05:57,  2.36it/s]

Gradient norm: 7.832703270596166


Epoch 2 of 5 | Iteration:  31%|███       | 370/1212 [02:52<05:42,  2.46it/s]

Gradient norm: 10.236335022306212


Epoch 2 of 5 | Iteration:  31%|███       | 371/1212 [02:53<06:07,  2.29it/s]

Gradient norm: 13.046125055189906


Epoch 2 of 5 | Iteration:  31%|███       | 372/1212 [02:53<05:52,  2.38it/s]

Gradient norm: 12.927728052770695


Epoch 2 of 5 | Iteration:  31%|███       | 373/1212 [02:54<06:13,  2.24it/s]

Gradient norm: 13.612138351228753


Epoch 2 of 5 | Iteration:  31%|███       | 374/1212 [02:54<05:51,  2.39it/s]

Gradient norm: 13.664410100617848


Epoch 2 of 5 | Iteration:  31%|███       | 375/1212 [02:54<05:35,  2.50it/s]

Gradient norm: 13.206221859655606


Epoch 2 of 5 | Iteration:  31%|███       | 376/1212 [02:55<05:28,  2.54it/s]

Gradient norm: 13.16165526771184


Epoch 2 of 5 | Iteration:  31%|███       | 377/1212 [02:55<06:08,  2.27it/s]

Gradient norm: 13.165103185837397


Epoch 2 of 5 | Iteration:  31%|███       | 378/1212 [02:56<05:56,  2.34it/s]

Gradient norm: 18.370608400745432


Epoch 2 of 5 | Iteration:  31%|███▏      | 379/1212 [02:56<06:07,  2.26it/s]

Gradient norm: 18.588986296857193


Epoch 2 of 5 | Iteration:  31%|███▏      | 380/1212 [02:57<07:09,  1.94it/s]

Gradient norm: 18.649435921252795


Epoch 2 of 5 | Iteration:  31%|███▏      | 381/1212 [02:57<06:29,  2.14it/s]

Gradient norm: 18.87586212285339


Epoch 2 of 5 | Iteration:  32%|███▏      | 382/1212 [02:58<06:04,  2.28it/s]

Gradient norm: 19.53476435711635


Epoch 2 of 5 | Iteration:  32%|███▏      | 383/1212 [02:58<06:13,  2.22it/s]

Gradient norm: 20.24783078173999


Epoch 2 of 5 | Iteration:  32%|███▏      | 384/1212 [02:59<06:03,  2.28it/s]

Gradient norm: 54.06759438908092


Epoch 2 of 5 | Iteration:  32%|███▏      | 385/1212 [02:59<06:26,  2.14it/s]

Gradient norm: 225.86714864432386


Epoch 2 of 5 | Iteration:  32%|███▏      | 386/1212 [03:00<07:48,  1.76it/s]

Gradient norm: 227.00353863275103


Epoch 2 of 5 | Iteration:  32%|███▏      | 387/1212 [03:00<07:31,  1.83it/s]

Gradient norm: 277.8997615681667


Epoch 2 of 5 | Iteration:  32%|███▏      | 388/1212 [03:01<07:40,  1.79it/s]

Gradient norm: 272.42978293619024


Epoch 2 of 5 | Iteration:  32%|███▏      | 389/1212 [03:01<07:25,  1.85it/s]

Gradient norm: 272.30030253147476


Epoch 2 of 5 | Iteration:  32%|███▏      | 390/1212 [03:02<07:24,  1.85it/s]

Gradient norm: 272.52892986734537


Epoch 2 of 5 | Iteration:  32%|███▏      | 391/1212 [03:03<07:49,  1.75it/s]

Gradient norm: 271.9669505674343


Epoch 2 of 5 | Iteration:  32%|███▏      | 392/1212 [03:03<07:30,  1.82it/s]

Gradient norm: 272.0750815621204


Epoch 2 of 5 | Iteration:  32%|███▏      | 393/1212 [03:04<07:07,  1.92it/s]

Gradient norm: 272.17500446496496


Epoch 2 of 5 | Iteration:  33%|███▎      | 394/1212 [03:04<06:26,  2.11it/s]

Gradient norm: 271.998626376455


Epoch 2 of 5 | Iteration:  33%|███▎      | 395/1212 [03:04<06:01,  2.26it/s]

Gradient norm: 272.087732771957


Epoch 2 of 5 | Iteration:  33%|███▎      | 396/1212 [03:05<06:22,  2.13it/s]

Gradient norm: 272.46298941106136


Epoch 2 of 5 | Iteration:  33%|███▎      | 397/1212 [03:05<05:59,  2.27it/s]

Gradient norm: 272.68186647457634


Epoch 2 of 5 | Iteration:  33%|███▎      | 398/1212 [03:06<05:53,  2.31it/s]

Gradient norm: 272.8169389394861


Epoch 2 of 5 | Iteration:  33%|███▎      | 399/1212 [03:06<05:48,  2.33it/s]

Gradient norm: 273.7897368055214


Epoch 2 of 5 | Iteration:  33%|███▎      | 400/1212 [03:06<05:34,  2.43it/s]

Gradient norm: 275.1754384165121


Epoch 2 of 5 | Iteration:  33%|███▎      | 401/1212 [03:07<05:21,  2.52it/s]

Gradient norm: 3.5688800612344074


Epoch 2 of 5 | Iteration:  33%|███▎      | 402/1212 [03:07<05:14,  2.58it/s]

Gradient norm: 4.671050946856276


Epoch 2 of 5 | Iteration:  33%|███▎      | 403/1212 [03:08<05:29,  2.46it/s]

Gradient norm: 7.410569861124514


Epoch 2 of 5 | Iteration:  33%|███▎      | 404/1212 [03:08<05:19,  2.53it/s]

Gradient norm: 14.362110824810975


Epoch 2 of 5 | Iteration:  33%|███▎      | 405/1212 [03:08<05:45,  2.33it/s]

Gradient norm: 14.591798680360432


Epoch 2 of 5 | Iteration:  33%|███▎      | 406/1212 [03:09<05:42,  2.36it/s]

Gradient norm: 20.71015751765886


Epoch 2 of 5 | Iteration:  34%|███▎      | 407/1212 [03:09<05:32,  2.42it/s]

Gradient norm: 133.15935409923358


Epoch 2 of 5 | Iteration:  34%|███▎      | 408/1212 [03:10<05:46,  2.32it/s]

Gradient norm: 134.42035707520643


Epoch 2 of 5 | Iteration:  34%|███▎      | 409/1212 [03:10<05:28,  2.45it/s]

Gradient norm: 134.43374434637508


Epoch 2 of 5 | Iteration:  34%|███▍      | 410/1212 [03:10<05:20,  2.50it/s]

Gradient norm: 134.2733931743463


Epoch 2 of 5 | Iteration:  34%|███▍      | 411/1212 [03:11<06:12,  2.15it/s]

Gradient norm: 134.5556738770463


Epoch 2 of 5 | Iteration:  34%|███▍      | 412/1212 [03:12<06:15,  2.13it/s]

Gradient norm: 134.77419672278756


Epoch 2 of 5 | Iteration:  34%|███▍      | 413/1212 [03:12<06:23,  2.08it/s]

Gradient norm: 134.84154114177775


Epoch 2 of 5 | Iteration:  34%|███▍      | 414/1212 [03:12<05:59,  2.22it/s]

Gradient norm: 134.86257746166754


Epoch 2 of 5 | Iteration:  34%|███▍      | 415/1212 [03:13<05:56,  2.24it/s]

Gradient norm: 134.6823068994005


Epoch 2 of 5 | Iteration:  34%|███▍      | 416/1212 [03:13<06:20,  2.09it/s]

Gradient norm: 134.89128344576991


Epoch 2 of 5 | Iteration:  34%|███▍      | 417/1212 [03:14<06:31,  2.03it/s]

Gradient norm: 3.262423717100872


Epoch 2 of 5 | Iteration:  34%|███▍      | 418/1212 [03:14<06:40,  1.98it/s]

Gradient norm: 3.294264296419925


Epoch 2 of 5 | Iteration:  35%|███▍      | 419/1212 [03:15<06:43,  1.97it/s]

Gradient norm: 164.24415421818026


Epoch 2 of 5 | Iteration:  35%|███▍      | 420/1212 [03:16<07:35,  1.74it/s]

Gradient norm: 162.60156008078195


Epoch 2 of 5 | Iteration:  35%|███▍      | 421/1212 [03:16<07:48,  1.69it/s]

Gradient norm: 161.04972878508391


Epoch 2 of 5 | Iteration:  35%|███▍      | 422/1212 [03:17<07:05,  1.86it/s]

Gradient norm: 161.35607738409573


Epoch 2 of 5 | Iteration:  35%|███▍      | 423/1212 [03:17<07:07,  1.85it/s]

Gradient norm: 161.6738533853567


Epoch 2 of 5 | Iteration:  35%|███▍      | 424/1212 [03:18<06:36,  1.99it/s]

Gradient norm: 161.57055521756206


Epoch 2 of 5 | Iteration:  35%|███▌      | 425/1212 [03:18<06:36,  1.99it/s]

Gradient norm: 161.9829046044293


Epoch 2 of 5 | Iteration:  35%|███▌      | 426/1212 [03:19<06:03,  2.17it/s]

Gradient norm: 162.39712401850247


Epoch 2 of 5 | Iteration:  35%|███▌      | 427/1212 [03:19<07:11,  1.82it/s]

Gradient norm: 162.62688532665325


Epoch 2 of 5 | Iteration:  35%|███▌      | 428/1212 [03:20<07:05,  1.84it/s]

Gradient norm: 161.84559755125395


Epoch 2 of 5 | Iteration:  35%|███▌      | 429/1212 [03:20<06:33,  1.99it/s]

Gradient norm: 160.98852555082306


Epoch 2 of 5 | Iteration:  35%|███▌      | 430/1212 [03:21<06:02,  2.16it/s]

Gradient norm: 160.9387443669375


Epoch 2 of 5 | Iteration:  36%|███▌      | 431/1212 [03:21<05:57,  2.19it/s]

Gradient norm: 161.22866041794506


Epoch 2 of 5 | Iteration:  36%|███▌      | 432/1212 [03:22<05:38,  2.31it/s]

Gradient norm: 161.29538255747858


Epoch 2 of 5 | Iteration:  36%|███▌      | 433/1212 [03:22<05:18,  2.44it/s]

Gradient norm: 2.480289404225463


Epoch 2 of 5 | Iteration:  36%|███▌      | 434/1212 [03:22<05:25,  2.39it/s]

Gradient norm: 2.872947662613358


Epoch 2 of 5 | Iteration:  36%|███▌      | 435/1212 [03:23<05:20,  2.42it/s]

Gradient norm: 2.888408698179304


Epoch 2 of 5 | Iteration:  36%|███▌      | 436/1212 [03:23<05:51,  2.21it/s]

Gradient norm: 15.799715889289729


Epoch 2 of 5 | Iteration:  36%|███▌      | 437/1212 [03:24<05:49,  2.22it/s]

Gradient norm: 15.84716482215921


Epoch 2 of 5 | Iteration:  36%|███▌      | 438/1212 [03:24<06:35,  1.96it/s]

Gradient norm: 19.31027860502338


Epoch 2 of 5 | Iteration:  36%|███▌      | 439/1212 [03:25<06:39,  1.93it/s]

Gradient norm: 19.36017059134592


Epoch 2 of 5 | Iteration:  36%|███▋      | 440/1212 [03:26<07:19,  1.76it/s]

Gradient norm: 19.46471185252967


Epoch 2 of 5 | Iteration:  36%|███▋      | 441/1212 [03:26<07:37,  1.69it/s]

Gradient norm: 47.46400358043299


Epoch 2 of 5 | Iteration:  36%|███▋      | 442/1212 [03:27<07:22,  1.74it/s]

Gradient norm: 47.50650448389875


Epoch 2 of 5 | Iteration:  37%|███▋      | 443/1212 [03:27<06:57,  1.84it/s]

Gradient norm: 47.53632961929376


Epoch 2 of 5 | Iteration:  37%|███▋      | 444/1212 [03:28<07:34,  1.69it/s]

Gradient norm: 47.694809013220784


Epoch 2 of 5 | Iteration:  37%|███▋      | 445/1212 [03:28<07:06,  1.80it/s]

Gradient norm: 47.17002642485158


Epoch 2 of 5 | Iteration:  37%|███▋      | 446/1212 [03:29<06:58,  1.83it/s]

Gradient norm: 49.873856429430965


Epoch 2 of 5 | Iteration:  37%|███▋      | 447/1212 [03:29<07:02,  1.81it/s]

Gradient norm: 50.26996424235628


Epoch 2 of 5 | Iteration:  37%|███▋      | 448/1212 [03:30<06:35,  1.93it/s]

Gradient norm: 50.32301728548597


Epoch 2 of 5 | Iteration:  37%|███▋      | 449/1212 [03:30<06:32,  1.94it/s]

Gradient norm: 6.57666942471718


Epoch 2 of 5 | Iteration:  37%|███▋      | 450/1212 [03:31<06:22,  1.99it/s]

Gradient norm: 6.582962086872311


Epoch 2 of 5 | Iteration:  37%|███▋      | 451/1212 [03:31<05:53,  2.15it/s]

Gradient norm: 10.497342234729146


Epoch 2 of 5 | Iteration:  37%|███▋      | 452/1212 [03:32<05:31,  2.29it/s]

Gradient norm: 11.413362649802693


Epoch 2 of 5 | Iteration:  37%|███▋      | 453/1212 [03:32<05:17,  2.39it/s]

Gradient norm: 13.374175128389007


Epoch 2 of 5 | Iteration:  37%|███▋      | 454/1212 [03:32<05:04,  2.49it/s]

Gradient norm: 13.775285285426227


Epoch 2 of 5 | Iteration:  38%|███▊      | 455/1212 [03:33<04:55,  2.56it/s]

Gradient norm: 13.550635018252187


Epoch 2 of 5 | Iteration:  38%|███▊      | 456/1212 [03:33<04:50,  2.60it/s]

Gradient norm: 14.275097853722498


Epoch 2 of 5 | Iteration:  38%|███▊      | 457/1212 [03:33<04:47,  2.62it/s]

Gradient norm: 14.526974066055354


Epoch 2 of 5 | Iteration:  38%|███▊      | 458/1212 [03:34<04:41,  2.68it/s]

Gradient norm: 14.612873895313452


Epoch 2 of 5 | Iteration:  38%|███▊      | 459/1212 [03:34<04:49,  2.60it/s]

Gradient norm: 14.647707776653961


Epoch 2 of 5 | Iteration:  38%|███▊      | 460/1212 [03:35<05:08,  2.44it/s]

Gradient norm: 14.652679424276661


Epoch 2 of 5 | Iteration:  38%|███▊      | 461/1212 [03:35<05:01,  2.49it/s]

Gradient norm: 19.3375944647431


Epoch 2 of 5 | Iteration:  38%|███▊      | 462/1212 [03:35<04:48,  2.60it/s]

Gradient norm: 19.708058775935527


Epoch 2 of 5 | Iteration:  38%|███▊      | 463/1212 [03:36<05:29,  2.27it/s]

Gradient norm: 19.78181977378831


Epoch 2 of 5 | Iteration:  38%|███▊      | 464/1212 [03:37<05:55,  2.11it/s]

Gradient norm: 33.221140683975676


Epoch 2 of 5 | Iteration:  38%|███▊      | 465/1212 [03:37<06:04,  2.05it/s]

Gradient norm: 5.716347775467432


Epoch 2 of 5 | Iteration:  38%|███▊      | 466/1212 [03:37<05:41,  2.19it/s]

Gradient norm: 9.227857129140235


Epoch 2 of 5 | Iteration:  39%|███▊      | 467/1212 [03:38<05:17,  2.35it/s]

Gradient norm: 9.31697574751406


Epoch 2 of 5 | Iteration:  39%|███▊      | 468/1212 [03:38<05:07,  2.42it/s]

Gradient norm: 9.235467712385798


Epoch 2 of 5 | Iteration:  39%|███▊      | 469/1212 [03:39<04:58,  2.49it/s]

Gradient norm: 9.406478599415623


Epoch 2 of 5 | Iteration:  39%|███▉      | 470/1212 [03:39<04:54,  2.52it/s]

Gradient norm: 8.558369643329984


Epoch 2 of 5 | Iteration:  39%|███▉      | 471/1212 [03:39<05:19,  2.32it/s]

Gradient norm: 21.10445144480983


Epoch 2 of 5 | Iteration:  39%|███▉      | 472/1212 [03:40<06:13,  1.98it/s]

Gradient norm: 22.984660536329635


Epoch 2 of 5 | Iteration:  39%|███▉      | 473/1212 [03:41<06:16,  1.96it/s]

Gradient norm: 23.1296958900558


Epoch 2 of 5 | Iteration:  39%|███▉      | 474/1212 [03:41<06:53,  1.78it/s]

Gradient norm: 24.061577354630913


Epoch 2 of 5 | Iteration:  39%|███▉      | 475/1212 [03:42<06:38,  1.85it/s]

Gradient norm: 24.190384411394696


Epoch 2 of 5 | Iteration:  39%|███▉      | 476/1212 [03:42<06:54,  1.78it/s]

Gradient norm: 24.47079269339601


Epoch 2 of 5 | Iteration:  39%|███▉      | 477/1212 [03:43<07:06,  1.72it/s]

Gradient norm: 26.657168886757752


Epoch 2 of 5 | Iteration:  39%|███▉      | 478/1212 [03:44<06:32,  1.87it/s]

Gradient norm: 28.007477698934544


Epoch 2 of 5 | Iteration:  40%|███▉      | 479/1212 [03:44<06:14,  1.96it/s]

Gradient norm: 27.742944099930543


Epoch 2 of 5 | Iteration:  40%|███▉      | 480/1212 [03:44<06:05,  2.00it/s]

Gradient norm: 28.698617420956598


Epoch 2 of 5 | Iteration:  40%|███▉      | 481/1212 [03:45<05:38,  2.16it/s]

Gradient norm: 120.54389439524924


Epoch 2 of 5 | Iteration:  40%|███▉      | 482/1212 [03:45<05:19,  2.28it/s]

Gradient norm: 120.08142444644227


Epoch 2 of 5 | Iteration:  40%|███▉      | 483/1212 [03:46<05:25,  2.24it/s]

Gradient norm: 437.0806931273013


Epoch 2 of 5 | Iteration:  40%|███▉      | 484/1212 [03:46<05:17,  2.30it/s]

Gradient norm: 438.422328611232


Epoch 2 of 5 | Iteration:  40%|████      | 485/1212 [03:47<05:32,  2.18it/s]

Gradient norm: 438.48698191534385


Epoch 2 of 5 | Iteration:  40%|████      | 486/1212 [03:47<05:09,  2.34it/s]

Gradient norm: 443.1978096560907


Epoch 2 of 5 | Iteration:  40%|████      | 487/1212 [03:48<05:35,  2.16it/s]

Gradient norm: 455.364949843522


Epoch 2 of 5 | Iteration:  40%|████      | 488/1212 [03:48<05:22,  2.25it/s]

Gradient norm: 454.70166887727015


Epoch 2 of 5 | Iteration:  40%|████      | 489/1212 [03:48<05:02,  2.39it/s]

Gradient norm: 459.0658052890176


Epoch 2 of 5 | Iteration:  40%|████      | 490/1212 [03:49<05:10,  2.33it/s]

Gradient norm: 459.50730564569443


Epoch 2 of 5 | Iteration:  41%|████      | 491/1212 [03:49<05:26,  2.21it/s]

Gradient norm: 459.04738043614736


Epoch 2 of 5 | Iteration:  41%|████      | 492/1212 [03:50<05:05,  2.36it/s]

Gradient norm: 459.0908131365955


Epoch 2 of 5 | Iteration:  41%|████      | 493/1212 [03:50<04:56,  2.42it/s]

Gradient norm: 458.92602875911774


Epoch 2 of 5 | Iteration:  41%|████      | 494/1212 [03:51<05:27,  2.19it/s]

Gradient norm: 458.8026150845416


Epoch 2 of 5 | Iteration:  41%|████      | 495/1212 [03:51<05:36,  2.13it/s]

Gradient norm: 458.2976172590007


Epoch 2 of 5 | Iteration:  41%|████      | 496/1212 [03:52<05:51,  2.04it/s]

Gradient norm: 460.356826461705


Epoch 2 of 5 | Iteration:  41%|████      | 497/1212 [03:52<05:34,  2.14it/s]

Gradient norm: 1.6736442416271602


Epoch 2 of 5 | Iteration:  41%|████      | 498/1212 [03:53<06:12,  1.92it/s]

Gradient norm: 2.487859677650426


Epoch 2 of 5 | Iteration:  41%|████      | 499/1212 [03:53<05:39,  2.10it/s]

Gradient norm: 6.063486750497


Epoch 2 of 5 | Iteration:  41%|████▏     | 500/1212 [03:54<05:47,  2.05it/s]

Gradient norm: 9.996127565099396


Epoch 2 of 5 | Iteration:  41%|████▏     | 501/1212 [03:54<05:43,  2.07it/s]

Gradient norm: 11.493396835233723


Epoch 2 of 5 | Iteration:  41%|████▏     | 502/1212 [03:55<06:09,  1.92it/s]

Gradient norm: 104.81135971255385


Epoch 2 of 5 | Iteration:  42%|████▏     | 503/1212 [03:55<06:21,  1.86it/s]

Gradient norm: 105.18247715892392


Epoch 2 of 5 | Iteration:  42%|████▏     | 504/1212 [03:56<06:05,  1.93it/s]

Gradient norm: 104.48668744285936


Epoch 2 of 5 | Iteration:  42%|████▏     | 505/1212 [03:56<06:13,  1.90it/s]

Gradient norm: 104.13443990711343


Epoch 2 of 5 | Iteration:  42%|████▏     | 506/1212 [03:57<06:05,  1.93it/s]

Gradient norm: 104.20780470481479


Epoch 2 of 5 | Iteration:  42%|████▏     | 507/1212 [03:57<05:33,  2.11it/s]

Gradient norm: 108.00648211765612


Epoch 2 of 5 | Iteration:  42%|████▏     | 508/1212 [03:57<05:22,  2.18it/s]

Gradient norm: 107.88550454522516


Epoch 2 of 5 | Iteration:  42%|████▏     | 509/1212 [03:58<05:06,  2.29it/s]

Gradient norm: 609.7231220919389


Epoch 2 of 5 | Iteration:  42%|████▏     | 510/1212 [03:58<04:59,  2.34it/s]

Gradient norm: 609.6340526173672


Epoch 2 of 5 | Iteration:  42%|████▏     | 511/1212 [03:59<05:03,  2.31it/s]

Gradient norm: 609.8793178248484


Epoch 2 of 5 | Iteration:  42%|████▏     | 512/1212 [03:59<04:51,  2.40it/s]

Gradient norm: 609.9476031465699


Epoch 2 of 5 | Iteration:  42%|████▏     | 513/1212 [03:59<04:37,  2.52it/s]

Gradient norm: 0.18079585376887947


Epoch 2 of 5 | Iteration:  42%|████▏     | 514/1212 [04:00<04:30,  2.58it/s]

Gradient norm: 1126.980964979715


Epoch 2 of 5 | Iteration:  42%|████▏     | 515/1212 [04:00<04:25,  2.62it/s]

Gradient norm: 1126.795984906472


Epoch 2 of 5 | Iteration:  43%|████▎     | 516/1212 [04:01<04:27,  2.60it/s]

Gradient norm: 1126.775090895022


Epoch 2 of 5 | Iteration:  43%|████▎     | 517/1212 [04:01<05:06,  2.27it/s]

Gradient norm: 1126.9942919067332


Epoch 2 of 5 | Iteration:  43%|████▎     | 518/1212 [04:01<04:47,  2.41it/s]

Gradient norm: 1127.5388877008063


Epoch 2 of 5 | Iteration:  43%|████▎     | 519/1212 [04:02<04:36,  2.50it/s]

Gradient norm: 1127.5235268449978


Epoch 2 of 5 | Iteration:  43%|████▎     | 520/1212 [04:02<04:33,  2.53it/s]

Gradient norm: 1127.5823655779939


Epoch 2 of 5 | Iteration:  43%|████▎     | 521/1212 [04:03<04:24,  2.61it/s]

Gradient norm: 1118.3307692242531


Epoch 2 of 5 | Iteration:  43%|████▎     | 522/1212 [04:03<04:34,  2.51it/s]

Gradient norm: 1118.0962271816022


Epoch 2 of 5 | Iteration:  43%|████▎     | 523/1212 [04:04<05:07,  2.24it/s]

Gradient norm: 1118.4724733320145


Epoch 2 of 5 | Iteration:  43%|████▎     | 524/1212 [04:04<05:27,  2.10it/s]

Gradient norm: 1118.5583349473725


Epoch 2 of 5 | Iteration:  43%|████▎     | 525/1212 [04:05<05:43,  2.00it/s]

Gradient norm: 1120.9331299933415


Epoch 2 of 5 | Iteration:  43%|████▎     | 526/1212 [04:05<06:02,  1.89it/s]

Gradient norm: 1121.1258033676593


Epoch 2 of 5 | Iteration:  43%|████▎     | 527/1212 [04:06<05:46,  1.98it/s]

Gradient norm: 1120.367812316565


Epoch 2 of 5 | Iteration:  44%|████▎     | 528/1212 [04:06<05:32,  2.06it/s]

Gradient norm: 1120.4821606609933


Epoch 2 of 5 | Iteration:  44%|████▎     | 529/1212 [04:07<06:00,  1.89it/s]

Gradient norm: 5.768864632720574


Epoch 2 of 5 | Iteration:  44%|████▎     | 530/1212 [04:07<05:47,  1.96it/s]

Gradient norm: 26.360352778090892


Epoch 2 of 5 | Iteration:  44%|████▍     | 531/1212 [04:08<05:50,  1.94it/s]

Gradient norm: 26.48175834891858


Epoch 2 of 5 | Iteration:  44%|████▍     | 532/1212 [04:08<05:52,  1.93it/s]

Gradient norm: 28.89144607579219


Epoch 2 of 5 | Iteration:  44%|████▍     | 533/1212 [04:09<05:52,  1.93it/s]

Gradient norm: 28.96825966129034


Epoch 2 of 5 | Iteration:  44%|████▍     | 534/1212 [04:10<06:40,  1.69it/s]

Gradient norm: 29.057142268324274


Epoch 2 of 5 | Iteration:  44%|████▍     | 535/1212 [04:10<06:04,  1.86it/s]

Gradient norm: 29.09892912873725


Epoch 2 of 5 | Iteration:  44%|████▍     | 536/1212 [04:10<05:38,  2.00it/s]

Gradient norm: 29.075633157291335


Epoch 2 of 5 | Iteration:  44%|████▍     | 537/1212 [04:11<05:10,  2.17it/s]

Gradient norm: 36.418239042905725


Epoch 2 of 5 | Iteration:  44%|████▍     | 538/1212 [04:11<05:08,  2.19it/s]

Gradient norm: 36.21952886707128


Epoch 2 of 5 | Iteration:  44%|████▍     | 539/1212 [04:12<04:49,  2.33it/s]

Gradient norm: 36.13032836494771


Epoch 2 of 5 | Iteration:  45%|████▍     | 540/1212 [04:12<04:36,  2.43it/s]

Gradient norm: 36.12190670784669


Epoch 2 of 5 | Iteration:  45%|████▍     | 541/1212 [04:12<04:30,  2.48it/s]

Gradient norm: 39.67574175158891


Epoch 2 of 5 | Iteration:  45%|████▍     | 542/1212 [04:13<04:33,  2.45it/s]

Gradient norm: 39.586026948483436


Epoch 2 of 5 | Iteration:  45%|████▍     | 543/1212 [04:13<04:22,  2.55it/s]

Gradient norm: 39.59341450382508


Epoch 2 of 5 | Iteration:  45%|████▍     | 544/1212 [04:14<04:27,  2.49it/s]

Gradient norm: 39.68281212355707


Epoch 2 of 5 | Iteration:  45%|████▍     | 545/1212 [04:14<05:16,  2.11it/s]

Gradient norm: 0.8546991041049437


Epoch 2 of 5 | Iteration:  45%|████▌     | 546/1212 [04:15<04:53,  2.27it/s]

Gradient norm: 0.8508866323095209


Epoch 2 of 5 | Iteration:  45%|████▌     | 547/1212 [04:15<04:39,  2.38it/s]

Gradient norm: 2.1545642440307473


Epoch 2 of 5 | Iteration:  45%|████▌     | 548/1212 [04:15<04:38,  2.39it/s]

Gradient norm: 3.3778284715505347


Epoch 2 of 5 | Iteration:  45%|████▌     | 549/1212 [04:16<05:47,  1.91it/s]

Gradient norm: 13.984194001005315


Epoch 2 of 5 | Iteration:  45%|████▌     | 550/1212 [04:17<05:19,  2.07it/s]

Gradient norm: 15.77062311563179


Epoch 2 of 5 | Iteration:  45%|████▌     | 551/1212 [04:17<04:56,  2.23it/s]

Gradient norm: 15.93284911676047


Epoch 2 of 5 | Iteration:  46%|████▌     | 552/1212 [04:17<04:42,  2.33it/s]

Gradient norm: 15.750051271216917


Epoch 2 of 5 | Iteration:  46%|████▌     | 553/1212 [04:18<04:31,  2.43it/s]

Gradient norm: 15.908137489770048


Epoch 2 of 5 | Iteration:  46%|████▌     | 554/1212 [04:18<04:37,  2.37it/s]

Gradient norm: 15.976657049207288


Epoch 2 of 5 | Iteration:  46%|████▌     | 555/1212 [04:19<05:03,  2.16it/s]

Gradient norm: 16.797072570582642


Epoch 2 of 5 | Iteration:  46%|████▌     | 556/1212 [04:19<05:03,  2.16it/s]

Gradient norm: 22.861903905314474


Epoch 2 of 5 | Iteration:  46%|████▌     | 557/1212 [04:19<04:48,  2.27it/s]

Gradient norm: 23.418781723786545


Epoch 2 of 5 | Iteration:  46%|████▌     | 558/1212 [04:20<04:58,  2.19it/s]

Gradient norm: 23.638888118171103


Epoch 2 of 5 | Iteration:  46%|████▌     | 559/1212 [04:20<05:01,  2.17it/s]

Gradient norm: 81.2106056956563


Epoch 2 of 5 | Iteration:  46%|████▌     | 560/1212 [04:21<05:12,  2.09it/s]

Gradient norm: 81.21912003636265


Epoch 2 of 5 | Iteration:  46%|████▋     | 561/1212 [04:21<05:15,  2.06it/s]

Gradient norm: 8.45148243863465


Epoch 2 of 5 | Iteration:  46%|████▋     | 562/1212 [04:22<05:22,  2.01it/s]

Gradient norm: 8.708574863396873


Epoch 2 of 5 | Iteration:  46%|████▋     | 563/1212 [04:23<05:23,  2.01it/s]

Gradient norm: 8.957626770623058


Epoch 2 of 5 | Iteration:  47%|████▋     | 564/1212 [04:23<05:32,  1.95it/s]

Gradient norm: 41.67644565519975


Epoch 2 of 5 | Iteration:  47%|████▋     | 565/1212 [04:24<06:21,  1.69it/s]

Gradient norm: 41.651986022227526


Epoch 2 of 5 | Iteration:  47%|████▋     | 566/1212 [04:24<06:02,  1.78it/s]

Gradient norm: 41.379030418195825


Epoch 2 of 5 | Iteration:  47%|████▋     | 567/1212 [04:25<05:30,  1.95it/s]

Gradient norm: 40.82160650802361


Epoch 2 of 5 | Iteration:  47%|████▋     | 568/1212 [04:25<05:18,  2.02it/s]

Gradient norm: 40.89198347904365


Epoch 2 of 5 | Iteration:  47%|████▋     | 569/1212 [04:26<04:52,  2.20it/s]

Gradient norm: 40.34267826739167


Epoch 2 of 5 | Iteration:  47%|████▋     | 570/1212 [04:26<04:35,  2.33it/s]

Gradient norm: 40.61825153784795


Epoch 2 of 5 | Iteration:  47%|████▋     | 571/1212 [04:26<04:22,  2.45it/s]

Gradient norm: 40.26354393747904


Epoch 2 of 5 | Iteration:  47%|████▋     | 572/1212 [04:27<04:13,  2.53it/s]

Gradient norm: 40.44998671535896


Epoch 2 of 5 | Iteration:  47%|████▋     | 573/1212 [04:27<04:10,  2.55it/s]

Gradient norm: 42.05699598110832


Epoch 2 of 5 | Iteration:  47%|████▋     | 574/1212 [04:27<04:04,  2.60it/s]

Gradient norm: 42.111108162960456


Epoch 2 of 5 | Iteration:  47%|████▋     | 575/1212 [04:28<04:02,  2.63it/s]

Gradient norm: 41.89857955108472


Epoch 2 of 5 | Iteration:  48%|████▊     | 576/1212 [04:28<04:13,  2.51it/s]

Gradient norm: 41.855599067478416


Epoch 2 of 5 | Iteration:  48%|████▊     | 577/1212 [04:29<04:15,  2.49it/s]

Gradient norm: 2.999360659109029


Epoch 2 of 5 | Iteration:  48%|████▊     | 578/1212 [04:29<04:40,  2.26it/s]

Gradient norm: 4408.478084003579


Epoch 2 of 5 | Iteration:  48%|████▊     | 579/1212 [04:29<04:25,  2.39it/s]

Gradient norm: 4408.522518408506


Epoch 2 of 5 | Iteration:  48%|████▊     | 580/1212 [04:30<04:23,  2.40it/s]

Gradient norm: 4407.9093618604065


Epoch 2 of 5 | Iteration:  48%|████▊     | 581/1212 [04:30<04:12,  2.50it/s]

Gradient norm: 4407.158048523653


Epoch 2 of 5 | Iteration:  48%|████▊     | 582/1212 [04:31<04:03,  2.58it/s]

Gradient norm: 4404.827819779423


Epoch 2 of 5 | Iteration:  48%|████▊     | 583/1212 [04:31<04:02,  2.59it/s]

Gradient norm: 4406.182209215259


Epoch 2 of 5 | Iteration:  48%|████▊     | 584/1212 [04:32<04:30,  2.32it/s]

Gradient norm: 4409.425693120405


Epoch 2 of 5 | Iteration:  48%|████▊     | 585/1212 [04:32<04:21,  2.40it/s]

Gradient norm: 4409.708235942648


Epoch 2 of 5 | Iteration:  48%|████▊     | 586/1212 [04:32<04:28,  2.33it/s]

Gradient norm: 4409.853002940167


Epoch 2 of 5 | Iteration:  48%|████▊     | 587/1212 [04:33<04:15,  2.45it/s]

Gradient norm: 4410.498682221455


Epoch 2 of 5 | Iteration:  49%|████▊     | 588/1212 [04:33<04:19,  2.40it/s]

Gradient norm: 4410.478684156774


Epoch 2 of 5 | Iteration:  49%|████▊     | 589/1212 [04:34<04:32,  2.29it/s]

Gradient norm: 4410.510304843073


Epoch 2 of 5 | Iteration:  49%|████▊     | 590/1212 [04:34<04:40,  2.22it/s]

Gradient norm: 4434.819107994683


Epoch 2 of 5 | Iteration:  49%|████▉     | 591/1212 [04:35<04:59,  2.07it/s]

Gradient norm: 4434.853901776353


Epoch 2 of 5 | Iteration:  49%|████▉     | 592/1212 [04:35<05:04,  2.04it/s]

Gradient norm: 4436.577188888178


Epoch 2 of 5 | Iteration:  49%|████▉     | 593/1212 [04:36<05:19,  1.94it/s]

Gradient norm: 26.794053796423057


Epoch 2 of 5 | Iteration:  49%|████▉     | 594/1212 [04:36<05:31,  1.86it/s]

Gradient norm: 29.204125604081696


Epoch 2 of 5 | Iteration:  49%|████▉     | 595/1212 [04:37<04:59,  2.06it/s]

Gradient norm: 29.016572890992695


Epoch 2 of 5 | Iteration:  49%|████▉     | 596/1212 [04:37<05:11,  1.97it/s]

Gradient norm: 29.430860537268476


Epoch 2 of 5 | Iteration:  49%|████▉     | 597/1212 [04:38<05:02,  2.03it/s]

Gradient norm: 29.78234598974489


Epoch 2 of 5 | Iteration:  49%|████▉     | 598/1212 [04:38<04:44,  2.16it/s]

Gradient norm: 29.823098018996156


Epoch 2 of 5 | Iteration:  49%|████▉     | 599/1212 [04:39<04:25,  2.31it/s]

Gradient norm: 29.899348954445593


Epoch 2 of 5 | Iteration:  50%|████▉     | 600/1212 [04:39<04:44,  2.15it/s]

Gradient norm: 29.870838695363787


Epoch 2 of 5 | Iteration:  50%|████▉     | 601/1212 [04:39<04:37,  2.21it/s]

Gradient norm: 30.00106521980331


Epoch 2 of 5 | Iteration:  50%|████▉     | 602/1212 [04:40<04:35,  2.22it/s]

Gradient norm: 30.000554140396666


Epoch 2 of 5 | Iteration:  50%|████▉     | 603/1212 [04:41<05:09,  1.97it/s]

Gradient norm: 30.617675219279345


Epoch 2 of 5 | Iteration:  50%|████▉     | 604/1212 [04:41<04:45,  2.13it/s]

Gradient norm: 34.46816955145925


Epoch 2 of 5 | Iteration:  50%|████▉     | 605/1212 [04:41<04:26,  2.28it/s]

Gradient norm: 35.125012496525784


Epoch 2 of 5 | Iteration:  50%|█████     | 606/1212 [04:42<04:12,  2.40it/s]

Gradient norm: 35.201799435539094


Epoch 2 of 5 | Iteration:  50%|█████     | 607/1212 [04:42<04:01,  2.50it/s]

Gradient norm: 35.5847643930545


Epoch 2 of 5 | Iteration:  50%|█████     | 608/1212 [04:42<04:00,  2.51it/s]

Gradient norm: 35.82871865736424


Epoch 2 of 5 | Iteration:  50%|█████     | 609/1212 [04:43<04:14,  2.37it/s]

Gradient norm: 3.610909549434826


Epoch 2 of 5 | Iteration:  50%|█████     | 610/1212 [04:43<04:39,  2.15it/s]

Gradient norm: 5.446169096754125


Epoch 2 of 5 | Iteration:  50%|█████     | 611/1212 [04:44<04:44,  2.11it/s]

Gradient norm: 138.37553403389396


Epoch 2 of 5 | Iteration:  50%|█████     | 612/1212 [04:44<04:23,  2.28it/s]

Gradient norm: 138.16747929824876


Epoch 2 of 5 | Iteration:  51%|█████     | 613/1212 [04:45<04:23,  2.28it/s]

Gradient norm: 138.15720479277616


Epoch 2 of 5 | Iteration:  51%|█████     | 614/1212 [04:45<04:37,  2.16it/s]

Gradient norm: 138.08042021978088


Epoch 2 of 5 | Iteration:  51%|█████     | 615/1212 [04:46<04:20,  2.29it/s]

Gradient norm: 138.10382936404966


Epoch 2 of 5 | Iteration:  51%|█████     | 616/1212 [04:46<04:26,  2.23it/s]

Gradient norm: 137.90457947477225


Epoch 2 of 5 | Iteration:  51%|█████     | 617/1212 [04:47<04:28,  2.22it/s]

Gradient norm: 138.43979948186927


Epoch 2 of 5 | Iteration:  51%|█████     | 618/1212 [04:47<04:43,  2.10it/s]

Gradient norm: 138.3024866834228


Epoch 2 of 5 | Iteration:  51%|█████     | 619/1212 [04:48<04:54,  2.01it/s]

Gradient norm: 138.1861519321065


Epoch 2 of 5 | Iteration:  51%|█████     | 620/1212 [04:48<04:57,  1.99it/s]

Gradient norm: 138.47530032077603


Epoch 2 of 5 | Iteration:  51%|█████     | 621/1212 [04:49<04:58,  1.98it/s]

Gradient norm: 138.47530032077603


Epoch 2 of 5 | Iteration:  51%|█████▏    | 622/1212 [04:49<05:20,  1.84it/s]

Gradient norm: 138.2395956924059


Epoch 2 of 5 | Iteration:  51%|█████▏    | 623/1212 [04:50<05:15,  1.87it/s]

Gradient norm: 138.1513689562901


Epoch 2 of 5 | Iteration:  51%|█████▏    | 624/1212 [04:50<04:45,  2.06it/s]

Gradient norm: 138.34177165339045


Epoch 2 of 5 | Iteration:  52%|█████▏    | 625/1212 [04:51<04:25,  2.21it/s]

Gradient norm: 3.089833751432894


Epoch 2 of 5 | Iteration:  52%|█████▏    | 626/1212 [04:51<04:26,  2.20it/s]

Gradient norm: 10.515792011818792


Epoch 2 of 5 | Iteration:  52%|█████▏    | 627/1212 [04:51<04:14,  2.30it/s]

Gradient norm: 10.543882163193937


Epoch 2 of 5 | Iteration:  52%|█████▏    | 628/1212 [04:52<04:26,  2.20it/s]

Gradient norm: 15.6264196270509


Epoch 2 of 5 | Iteration:  52%|█████▏    | 629/1212 [04:52<04:31,  2.14it/s]

Gradient norm: 16.886985541799106


Epoch 2 of 5 | Iteration:  52%|█████▏    | 630/1212 [04:53<05:04,  1.91it/s]

Gradient norm: 16.825281620204166


Epoch 2 of 5 | Iteration:  52%|█████▏    | 631/1212 [04:53<04:34,  2.12it/s]

Gradient norm: 17.68049744426606


Epoch 2 of 5 | Iteration:  52%|█████▏    | 632/1212 [04:54<04:17,  2.25it/s]

Gradient norm: 18.28173050889266


Epoch 2 of 5 | Iteration:  52%|█████▏    | 633/1212 [04:54<04:01,  2.39it/s]

Gradient norm: 33.065190218507134


Epoch 2 of 5 | Iteration:  52%|█████▏    | 634/1212 [04:55<03:57,  2.44it/s]

Gradient norm: 33.153599136233105


Epoch 2 of 5 | Iteration:  52%|█████▏    | 635/1212 [04:55<04:10,  2.31it/s]

Gradient norm: 33.66967753634713


Epoch 2 of 5 | Iteration:  52%|█████▏    | 636/1212 [04:55<03:59,  2.40it/s]

Gradient norm: 40.27386226796584


Epoch 2 of 5 | Iteration:  53%|█████▎    | 637/1212 [04:56<03:57,  2.42it/s]

Gradient norm: 40.15838864362592


Epoch 2 of 5 | Iteration:  53%|█████▎    | 638/1212 [04:56<03:56,  2.42it/s]

Gradient norm: 39.91850528712124


Epoch 2 of 5 | Iteration:  53%|█████▎    | 639/1212 [04:57<03:53,  2.46it/s]

Gradient norm: 40.152647739615674


Epoch 2 of 5 | Iteration:  53%|█████▎    | 640/1212 [04:57<04:10,  2.28it/s]

Gradient norm: 40.82436948658463


Epoch 2 of 5 | Iteration:  53%|█████▎    | 641/1212 [04:58<03:57,  2.40it/s]

Gradient norm: 2.193302580871582


Epoch 2 of 5 | Iteration:  53%|█████▎    | 642/1212 [04:58<04:17,  2.21it/s]

Gradient norm: 2.6739185048832455


Epoch 2 of 5 | Iteration:  53%|█████▎    | 643/1212 [04:59<04:23,  2.16it/s]

Gradient norm: 2.7591322848353936


Epoch 2 of 5 | Iteration:  53%|█████▎    | 644/1212 [04:59<04:53,  1.94it/s]

Gradient norm: 54.03873791367794


Epoch 2 of 5 | Iteration:  53%|█████▎    | 645/1212 [05:00<04:25,  2.14it/s]

Gradient norm: 65.03059564739796


Epoch 2 of 5 | Iteration:  53%|█████▎    | 646/1212 [05:00<04:54,  1.92it/s]

Gradient norm: 64.95339916404802


Epoch 2 of 5 | Iteration:  53%|█████▎    | 647/1212 [05:01<04:49,  1.95it/s]

Gradient norm: 65.50256328387715


Epoch 2 of 5 | Iteration:  53%|█████▎    | 648/1212 [05:01<04:51,  1.93it/s]

Gradient norm: 65.83902551420718


Epoch 2 of 5 | Iteration:  54%|█████▎    | 649/1212 [05:02<04:43,  1.99it/s]

Gradient norm: 65.91308746381557


Epoch 2 of 5 | Iteration:  54%|█████▎    | 650/1212 [05:02<04:41,  1.99it/s]

Gradient norm: 65.85217344638028


Epoch 2 of 5 | Iteration:  54%|█████▎    | 651/1212 [05:03<04:51,  1.92it/s]

Gradient norm: 64.99734827582712


Epoch 2 of 5 | Iteration:  54%|█████▍    | 652/1212 [05:03<04:46,  1.95it/s]

Gradient norm: 65.52106298556723


Epoch 2 of 5 | Iteration:  54%|█████▍    | 653/1212 [05:04<04:19,  2.15it/s]

Gradient norm: 65.14499760903962


Epoch 2 of 5 | Iteration:  54%|█████▍    | 654/1212 [05:04<04:05,  2.28it/s]

Gradient norm: 66.75660490171644


Epoch 2 of 5 | Iteration:  54%|█████▍    | 655/1212 [05:04<03:53,  2.38it/s]

Gradient norm: 67.26043557604856


Epoch 2 of 5 | Iteration:  54%|█████▍    | 656/1212 [05:05<04:03,  2.29it/s]

Gradient norm: 67.74363379492084


Epoch 2 of 5 | Iteration:  54%|█████▍    | 657/1212 [05:05<03:54,  2.37it/s]

Gradient norm: 38.193487285349036


Epoch 2 of 5 | Iteration:  54%|█████▍    | 658/1212 [05:06<03:42,  2.49it/s]

Gradient norm: 38.68568783381344


Epoch 2 of 5 | Iteration:  54%|█████▍    | 659/1212 [05:06<04:23,  2.10it/s]

Gradient norm: 45.466690120515835


Epoch 2 of 5 | Iteration:  54%|█████▍    | 660/1212 [05:07<04:25,  2.08it/s]

Gradient norm: 46.139426226972155


Epoch 2 of 5 | Iteration:  55%|█████▍    | 661/1212 [05:07<04:16,  2.15it/s]

Gradient norm: 46.21033884963317


Epoch 2 of 5 | Iteration:  55%|█████▍    | 662/1212 [05:08<04:24,  2.08it/s]

Gradient norm: 46.42801753757878


Epoch 2 of 5 | Iteration:  55%|█████▍    | 663/1212 [05:08<04:07,  2.22it/s]

Gradient norm: 52.28897292077073


Epoch 2 of 5 | Iteration:  55%|█████▍    | 664/1212 [05:09<04:15,  2.15it/s]

Gradient norm: 51.674052453563796


Epoch 2 of 5 | Iteration:  55%|█████▍    | 665/1212 [05:09<04:43,  1.93it/s]

Gradient norm: 51.543590987180124


Epoch 2 of 5 | Iteration:  55%|█████▍    | 666/1212 [05:10<04:33,  1.99it/s]

Gradient norm: 81.28749882755844


Epoch 2 of 5 | Iteration:  55%|█████▌    | 667/1212 [05:10<04:31,  2.01it/s]

Gradient norm: 83.15022682573058


Epoch 2 of 5 | Iteration:  55%|█████▌    | 668/1212 [05:11<04:27,  2.03it/s]

Gradient norm: 83.37992363955739


Epoch 2 of 5 | Iteration:  55%|█████▌    | 669/1212 [05:11<04:07,  2.20it/s]

Gradient norm: 83.3188697264626


Epoch 2 of 5 | Iteration:  55%|█████▌    | 670/1212 [05:11<04:02,  2.23it/s]

Gradient norm: 83.16754680396537


Epoch 2 of 5 | Iteration:  55%|█████▌    | 671/1212 [05:12<04:13,  2.13it/s]

Gradient norm: 83.3635179937129


Epoch 2 of 5 | Iteration:  55%|█████▌    | 672/1212 [05:12<04:01,  2.24it/s]

Gradient norm: 92.3819531915686


Epoch 2 of 5 | Iteration:  56%|█████▌    | 673/1212 [05:13<03:49,  2.35it/s]

Gradient norm: 0.9003318761298734


Epoch 2 of 5 | Iteration:  56%|█████▌    | 674/1212 [05:13<03:58,  2.25it/s]

Gradient norm: 2.304108378233068


Epoch 2 of 5 | Iteration:  56%|█████▌    | 675/1212 [05:14<04:13,  2.12it/s]

Gradient norm: 2.4294555929802426


Epoch 2 of 5 | Iteration:  56%|█████▌    | 676/1212 [05:14<04:35,  1.94it/s]

Gradient norm: 12.384977151124936


Epoch 2 of 5 | Iteration:  56%|█████▌    | 677/1212 [05:15<04:35,  1.94it/s]

Gradient norm: 14.345194862467048


Epoch 2 of 5 | Iteration:  56%|█████▌    | 678/1212 [05:15<04:39,  1.91it/s]

Gradient norm: 17.71383526260168


Epoch 2 of 5 | Iteration:  56%|█████▌    | 679/1212 [05:16<04:38,  1.91it/s]

Gradient norm: 182.4173569813105


Epoch 2 of 5 | Iteration:  56%|█████▌    | 680/1212 [05:17<04:55,  1.80it/s]

Gradient norm: 170.17021934338135


Epoch 2 of 5 | Iteration:  56%|█████▌    | 681/1212 [05:17<04:24,  2.01it/s]

Gradient norm: 170.54180044347677


Epoch 2 of 5 | Iteration:  56%|█████▋    | 682/1212 [05:17<04:01,  2.19it/s]

Gradient norm: 170.86861701355966


Epoch 2 of 5 | Iteration:  56%|█████▋    | 683/1212 [05:18<04:14,  2.08it/s]

Gradient norm: 170.23127871957425


Epoch 2 of 5 | Iteration:  56%|█████▋    | 684/1212 [05:18<04:20,  2.03it/s]

Gradient norm: 170.3149051904719


Epoch 2 of 5 | Iteration:  57%|█████▋    | 685/1212 [05:19<04:00,  2.19it/s]

Gradient norm: 169.46774117863845


Epoch 2 of 5 | Iteration:  57%|█████▋    | 686/1212 [05:19<03:45,  2.33it/s]

Gradient norm: 169.40154993855487


Epoch 2 of 5 | Iteration:  57%|█████▋    | 687/1212 [05:19<03:33,  2.46it/s]

Gradient norm: 169.4626329085925


Epoch 2 of 5 | Iteration:  57%|█████▋    | 688/1212 [05:20<03:33,  2.45it/s]

Gradient norm: 169.4140669168296


Epoch 2 of 5 | Iteration:  57%|█████▋    | 689/1212 [05:20<03:25,  2.55it/s]

Gradient norm: 1.3367209991592026


Epoch 2 of 5 | Iteration:  57%|█████▋    | 690/1212 [05:21<03:57,  2.20it/s]

Gradient norm: 2.366104492629674


Epoch 2 of 5 | Iteration:  57%|█████▋    | 691/1212 [05:21<03:42,  2.34it/s]

Gradient norm: 7.148360984430404


Epoch 2 of 5 | Iteration:  57%|█████▋    | 692/1212 [05:22<03:35,  2.42it/s]

Gradient norm: 8.426814627335505


Epoch 2 of 5 | Iteration:  57%|█████▋    | 693/1212 [05:22<03:28,  2.48it/s]

Gradient norm: 92.94124299618535


Epoch 2 of 5 | Iteration:  57%|█████▋    | 694/1212 [05:22<03:23,  2.55it/s]

Gradient norm: 93.83912524507748


Epoch 2 of 5 | Iteration:  57%|█████▋    | 695/1212 [05:23<03:34,  2.41it/s]

Gradient norm: 93.06874218805525


Epoch 2 of 5 | Iteration:  57%|█████▋    | 696/1212 [05:23<03:30,  2.45it/s]

Gradient norm: 94.14449524735703


Epoch 2 of 5 | Iteration:  58%|█████▊    | 697/1212 [05:24<03:33,  2.42it/s]

Gradient norm: 95.13270641835928


Epoch 2 of 5 | Iteration:  58%|█████▊    | 698/1212 [05:24<04:01,  2.13it/s]

Gradient norm: 94.2377133816522


Epoch 2 of 5 | Iteration:  58%|█████▊    | 699/1212 [05:25<03:59,  2.14it/s]

Gradient norm: 94.29157902263721


Epoch 2 of 5 | Iteration:  58%|█████▊    | 700/1212 [05:25<04:12,  2.03it/s]

Gradient norm: 96.57729563940413


Epoch 2 of 5 | Iteration:  58%|█████▊    | 701/1212 [05:26<04:08,  2.06it/s]

Gradient norm: 98.15946926302983


Epoch 2 of 5 | Iteration:  58%|█████▊    | 702/1212 [05:26<03:52,  2.20it/s]

Gradient norm: 98.13939049819466


Epoch 2 of 5 | Iteration:  58%|█████▊    | 703/1212 [05:26<03:50,  2.21it/s]

Gradient norm: 98.16089402097309


Epoch 2 of 5 | Iteration:  58%|█████▊    | 704/1212 [05:27<03:59,  2.12it/s]

Gradient norm: 97.50261642699496


Epoch 2 of 5 | Iteration:  58%|█████▊    | 705/1212 [05:28<04:19,  1.96it/s]

Gradient norm: 31.954869632135182


Epoch 2 of 5 | Iteration:  58%|█████▊    | 706/1212 [05:28<04:36,  1.83it/s]

Gradient norm: 31.50169261289186


Epoch 2 of 5 | Iteration:  58%|█████▊    | 707/1212 [05:29<04:26,  1.89it/s]

Gradient norm: 31.841834798994498


Epoch 2 of 5 | Iteration:  58%|█████▊    | 708/1212 [05:29<04:18,  1.95it/s]

Gradient norm: 36.857440499308744


Epoch 2 of 5 | Iteration:  58%|█████▊    | 709/1212 [05:30<04:22,  1.91it/s]

Gradient norm: 38.14278877121142


Epoch 2 of 5 | Iteration:  59%|█████▊    | 710/1212 [05:30<04:27,  1.88it/s]

Gradient norm: 39.1240228190259


Epoch 2 of 5 | Iteration:  59%|█████▊    | 711/1212 [05:31<04:16,  1.95it/s]

Gradient norm: 37.38278879602987


Epoch 2 of 5 | Iteration:  59%|█████▊    | 712/1212 [05:31<03:55,  2.13it/s]

Gradient norm: 37.38547591145346


Epoch 2 of 5 | Iteration:  59%|█████▉    | 713/1212 [05:31<03:37,  2.30it/s]

Gradient norm: 38.090974407110444


Epoch 2 of 5 | Iteration:  59%|█████▉    | 714/1212 [05:32<04:03,  2.05it/s]

Gradient norm: 38.93129371725109


Epoch 2 of 5 | Iteration:  59%|█████▉    | 715/1212 [05:32<03:50,  2.16it/s]

Gradient norm: 46.47314257066862


Epoch 2 of 5 | Iteration:  59%|█████▉    | 716/1212 [05:33<03:35,  2.30it/s]

Gradient norm: 46.44457250737504


Epoch 2 of 5 | Iteration:  59%|█████▉    | 717/1212 [05:33<03:24,  2.42it/s]

Gradient norm: 46.490007659965904


Epoch 2 of 5 | Iteration:  59%|█████▉    | 718/1212 [05:34<03:54,  2.11it/s]

Gradient norm: 51.90899017612094


Epoch 2 of 5 | Iteration:  59%|█████▉    | 719/1212 [05:34<03:38,  2.26it/s]

Gradient norm: 51.9862773325198


Epoch 2 of 5 | Iteration:  59%|█████▉    | 720/1212 [05:35<04:04,  2.01it/s]

Gradient norm: 52.23547259672432


Epoch 2 of 5 | Iteration:  59%|█████▉    | 721/1212 [05:35<04:06,  1.99it/s]

Gradient norm: 8.080790296505969


Epoch 2 of 5 | Iteration:  60%|█████▉    | 722/1212 [05:36<04:28,  1.82it/s]

Gradient norm: 16.043485986892808


Epoch 2 of 5 | Iteration:  60%|█████▉    | 723/1212 [05:36<04:01,  2.02it/s]

Gradient norm: 21.021021838847137


Epoch 2 of 5 | Iteration:  60%|█████▉    | 724/1212 [05:37<04:12,  1.93it/s]

Gradient norm: 21.871040758339664


Epoch 2 of 5 | Iteration:  60%|█████▉    | 725/1212 [05:37<03:55,  2.07it/s]

Gradient norm: 91.31404075734562


Epoch 2 of 5 | Iteration:  60%|█████▉    | 726/1212 [05:38<03:36,  2.24it/s]

Gradient norm: 91.04497771681183


Epoch 2 of 5 | Iteration:  60%|█████▉    | 727/1212 [05:38<03:47,  2.13it/s]

Gradient norm: 91.16337449936174


Epoch 2 of 5 | Iteration:  60%|██████    | 728/1212 [05:39<04:10,  1.93it/s]

Gradient norm: 91.09757761948062


Epoch 2 of 5 | Iteration:  60%|██████    | 729/1212 [05:40<04:28,  1.80it/s]

Gradient norm: 90.66324911570669


Epoch 2 of 5 | Iteration:  60%|██████    | 730/1212 [05:40<04:07,  1.95it/s]

Gradient norm: 93.09480139284312


Epoch 2 of 5 | Iteration:  60%|██████    | 731/1212 [05:40<04:09,  1.93it/s]

Gradient norm: 92.5214688613359


Epoch 2 of 5 | Iteration:  60%|██████    | 732/1212 [05:41<04:04,  1.96it/s]

Gradient norm: 92.2203595226193


Epoch 2 of 5 | Iteration:  60%|██████    | 733/1212 [05:41<04:04,  1.96it/s]

Gradient norm: 93.45950748869018


Epoch 2 of 5 | Iteration:  61%|██████    | 734/1212 [05:42<04:00,  1.99it/s]

Gradient norm: 93.82425803821596


Epoch 2 of 5 | Iteration:  61%|██████    | 735/1212 [05:42<03:57,  2.01it/s]

Gradient norm: 93.50366584426737


Epoch 2 of 5 | Iteration:  61%|██████    | 736/1212 [05:43<04:10,  1.90it/s]

Gradient norm: 94.29040223119871


Epoch 2 of 5 | Iteration:  61%|██████    | 737/1212 [05:44<04:09,  1.90it/s]

Gradient norm: 8.451293132197279


Epoch 2 of 5 | Iteration:  61%|██████    | 738/1212 [05:44<04:18,  1.83it/s]

Gradient norm: 7.803180956251574


Epoch 2 of 5 | Iteration:  61%|██████    | 739/1212 [05:45<03:54,  2.02it/s]

Gradient norm: 8.14979493407404


Epoch 2 of 5 | Iteration:  61%|██████    | 740/1212 [05:45<03:41,  2.13it/s]

Gradient norm: 11.751697809194162


Epoch 2 of 5 | Iteration:  61%|██████    | 741/1212 [05:45<03:29,  2.25it/s]

Gradient norm: 11.927289582564736


Epoch 2 of 5 | Iteration:  61%|██████    | 742/1212 [05:46<03:18,  2.37it/s]

Gradient norm: 12.36638714339155


Epoch 2 of 5 | Iteration:  61%|██████▏   | 743/1212 [05:46<03:11,  2.44it/s]

Gradient norm: 12.471928267178017


Epoch 2 of 5 | Iteration:  61%|██████▏   | 744/1212 [05:47<03:40,  2.12it/s]

Gradient norm: 12.701730062221012


Epoch 2 of 5 | Iteration:  61%|██████▏   | 745/1212 [05:47<03:59,  1.95it/s]

Gradient norm: 12.934609992070433


Epoch 2 of 5 | Iteration:  62%|██████▏   | 746/1212 [05:48<03:42,  2.09it/s]

Gradient norm: 23.024077227969705


Epoch 2 of 5 | Iteration:  62%|██████▏   | 747/1212 [05:48<03:45,  2.06it/s]

Gradient norm: 22.995608801994234


Epoch 2 of 5 | Iteration:  62%|██████▏   | 748/1212 [05:49<03:45,  2.06it/s]

Gradient norm: 23.434804604654992


Epoch 2 of 5 | Iteration:  62%|██████▏   | 749/1212 [05:49<03:27,  2.23it/s]

Gradient norm: 37.6453116259909


Epoch 2 of 5 | Iteration:  62%|██████▏   | 750/1212 [05:49<03:15,  2.36it/s]

Gradient norm: 37.86125583373996


Epoch 2 of 5 | Iteration:  62%|██████▏   | 751/1212 [05:50<03:08,  2.45it/s]

Gradient norm: 37.88002541525766


Epoch 2 of 5 | Iteration:  62%|██████▏   | 752/1212 [05:50<03:02,  2.52it/s]

Gradient norm: 38.1772100560755


Epoch 2 of 5 | Iteration:  62%|██████▏   | 753/1212 [05:51<03:01,  2.53it/s]

Gradient norm: 3.5107239309715315


Epoch 2 of 5 | Iteration:  62%|██████▏   | 754/1212 [05:51<03:41,  2.07it/s]

Gradient norm: 4.914641480952927


Epoch 2 of 5 | Iteration:  62%|██████▏   | 755/1212 [05:52<03:25,  2.22it/s]

Gradient norm: 5.166398864062981


Epoch 2 of 5 | Iteration:  62%|██████▏   | 756/1212 [05:52<03:16,  2.32it/s]

Gradient norm: 14.636906246654178


Epoch 2 of 5 | Iteration:  62%|██████▏   | 757/1212 [05:52<03:12,  2.36it/s]

Gradient norm: 489.17872276399817


Epoch 2 of 5 | Iteration:  63%|██████▎   | 758/1212 [05:53<03:15,  2.32it/s]

Gradient norm: 489.2510714052049


Epoch 2 of 5 | Iteration:  63%|██████▎   | 759/1212 [05:53<03:08,  2.40it/s]

Gradient norm: 489.53457379925476


Epoch 2 of 5 | Iteration:  63%|██████▎   | 760/1212 [05:54<03:16,  2.30it/s]

Gradient norm: 489.4933441541792


Epoch 2 of 5 | Iteration:  63%|██████▎   | 761/1212 [05:54<03:24,  2.20it/s]

Gradient norm: 489.4438141552226


Epoch 2 of 5 | Iteration:  63%|██████▎   | 762/1212 [05:55<03:33,  2.11it/s]

Gradient norm: 489.16173093124746


Epoch 2 of 5 | Iteration:  63%|██████▎   | 763/1212 [05:55<03:37,  2.06it/s]

Gradient norm: 489.0591805766324


Epoch 2 of 5 | Iteration:  63%|██████▎   | 764/1212 [05:56<03:48,  1.96it/s]

Gradient norm: 489.0720309143806


Epoch 2 of 5 | Iteration:  63%|██████▎   | 765/1212 [05:56<03:44,  1.99it/s]

Gradient norm: 488.887720559662


Epoch 2 of 5 | Iteration:  63%|██████▎   | 766/1212 [05:57<03:51,  1.93it/s]

Gradient norm: 488.89159960017173


Epoch 2 of 5 | Iteration:  63%|██████▎   | 767/1212 [05:57<03:36,  2.05it/s]

Gradient norm: 488.7341178227313


Epoch 2 of 5 | Iteration:  63%|██████▎   | 768/1212 [05:58<04:00,  1.85it/s]

Gradient norm: 492.91766112698804


Epoch 2 of 5 | Iteration:  63%|██████▎   | 769/1212 [05:58<03:39,  2.02it/s]

Gradient norm: 35.97498079091051


Epoch 2 of 5 | Iteration:  64%|██████▎   | 770/1212 [05:59<03:53,  1.89it/s]

Gradient norm: 36.313392961962535


Epoch 2 of 5 | Iteration:  64%|██████▎   | 771/1212 [05:59<03:31,  2.08it/s]

Gradient norm: 36.27438555382771


Epoch 2 of 5 | Iteration:  64%|██████▎   | 772/1212 [06:00<03:15,  2.25it/s]

Gradient norm: 36.037837669972724


Epoch 2 of 5 | Iteration:  64%|██████▍   | 773/1212 [06:00<03:03,  2.39it/s]

Gradient norm: 44.74827608746224


Epoch 2 of 5 | Iteration:  64%|██████▍   | 774/1212 [06:00<02:56,  2.48it/s]

Gradient norm: 81.66162518550263


Epoch 2 of 5 | Iteration:  64%|██████▍   | 775/1212 [06:01<03:16,  2.22it/s]

Gradient norm: 81.98550588155182


Epoch 2 of 5 | Iteration:  64%|██████▍   | 776/1212 [06:02<03:34,  2.03it/s]

Gradient norm: 78.04999628533518


Epoch 2 of 5 | Iteration:  64%|██████▍   | 777/1212 [06:02<03:48,  1.90it/s]

Gradient norm: 77.25134811352049


Epoch 2 of 5 | Iteration:  64%|██████▍   | 778/1212 [06:03<03:47,  1.91it/s]

Gradient norm: 77.59740616038073


Epoch 2 of 5 | Iteration:  64%|██████▍   | 779/1212 [06:03<03:28,  2.07it/s]

Gradient norm: 503.0871588919202


Epoch 2 of 5 | Iteration:  64%|██████▍   | 780/1212 [06:03<03:14,  2.22it/s]

Gradient norm: 504.6027699245669


Epoch 2 of 5 | Iteration:  64%|██████▍   | 781/1212 [06:04<03:04,  2.34it/s]

Gradient norm: 504.08152271401593


Epoch 2 of 5 | Iteration:  65%|██████▍   | 782/1212 [06:04<02:57,  2.42it/s]

Gradient norm: 504.57058125333845


Epoch 2 of 5 | Iteration:  65%|██████▍   | 783/1212 [06:05<02:50,  2.52it/s]

Gradient norm: 517.072695771526


Epoch 2 of 5 | Iteration:  65%|██████▍   | 784/1212 [06:05<03:01,  2.35it/s]

Gradient norm: 517.4851432508591


Epoch 2 of 5 | Iteration:  65%|██████▍   | 785/1212 [06:06<03:21,  2.12it/s]

Gradient norm: 2.2321594037201704


Epoch 2 of 5 | Iteration:  65%|██████▍   | 786/1212 [06:06<03:11,  2.23it/s]

Gradient norm: 3.7602321400834953


Epoch 2 of 5 | Iteration:  65%|██████▍   | 787/1212 [06:06<03:01,  2.34it/s]

Gradient norm: 12.456824701265878


Epoch 2 of 5 | Iteration:  65%|██████▌   | 788/1212 [06:07<02:56,  2.41it/s]

Gradient norm: 31.016620261244537


Epoch 2 of 5 | Iteration:  65%|██████▌   | 789/1212 [06:07<03:03,  2.30it/s]

Gradient norm: 1634.151018875414


Epoch 2 of 5 | Iteration:  65%|██████▌   | 790/1212 [06:08<03:15,  2.16it/s]

Gradient norm: 1633.969018321213


Epoch 2 of 5 | Iteration:  65%|██████▌   | 791/1212 [06:08<03:19,  2.11it/s]

Gradient norm: 1633.0847785964531


Epoch 2 of 5 | Iteration:  65%|██████▌   | 792/1212 [06:09<03:35,  1.95it/s]

Gradient norm: 1633.1327696485146


Epoch 2 of 5 | Iteration:  65%|██████▌   | 793/1212 [06:09<03:30,  1.99it/s]

Gradient norm: 1633.1615543015696


Epoch 2 of 5 | Iteration:  66%|██████▌   | 794/1212 [06:10<03:27,  2.01it/s]

Gradient norm: 1633.1215355385539


Epoch 2 of 5 | Iteration:  66%|██████▌   | 795/1212 [06:10<03:34,  1.94it/s]

Gradient norm: 1633.1320517074032


Epoch 2 of 5 | Iteration:  66%|██████▌   | 796/1212 [06:11<03:21,  2.06it/s]

Gradient norm: 1629.9944319125864


Epoch 2 of 5 | Iteration:  66%|██████▌   | 797/1212 [06:11<03:19,  2.08it/s]

Gradient norm: 1635.5086944830648


Epoch 2 of 5 | Iteration:  66%|██████▌   | 798/1212 [06:12<03:04,  2.24it/s]

Gradient norm: 1635.6346532318178


Epoch 2 of 5 | Iteration:  66%|██████▌   | 799/1212 [06:12<02:56,  2.34it/s]

Gradient norm: 1635.8537742993667


Epoch 2 of 5 | Iteration:  66%|██████▌   | 800/1212 [06:12<02:56,  2.33it/s]

Gradient norm: 1635.55594235853


Epoch 2 of 5 | Iteration:  66%|██████▌   | 801/1212 [06:13<02:48,  2.44it/s]

Gradient norm: 0.6718149320082658


Epoch 2 of 5 | Iteration:  66%|██████▌   | 802/1212 [06:13<02:56,  2.32it/s]

Gradient norm: 235.04321741689265


Epoch 2 of 5 | Iteration:  66%|██████▋   | 803/1212 [06:14<02:49,  2.42it/s]

Gradient norm: 234.94591422131964


Epoch 2 of 5 | Iteration:  66%|██████▋   | 804/1212 [06:14<02:44,  2.48it/s]

Gradient norm: 235.2974803372392


Epoch 2 of 5 | Iteration:  66%|██████▋   | 805/1212 [06:14<02:50,  2.39it/s]

Gradient norm: 235.3258806302948


Epoch 2 of 5 | Iteration:  67%|██████▋   | 806/1212 [06:15<02:55,  2.32it/s]

Gradient norm: 235.3593550763136


Epoch 2 of 5 | Iteration:  67%|██████▋   | 807/1212 [06:15<02:57,  2.28it/s]

Gradient norm: 235.1963932805831


Epoch 2 of 5 | Iteration:  67%|██████▋   | 808/1212 [06:16<03:03,  2.21it/s]

Gradient norm: 234.83297747925195


Epoch 2 of 5 | Iteration:  67%|██████▋   | 809/1212 [06:16<02:52,  2.34it/s]

Gradient norm: 234.72226322943922


Epoch 2 of 5 | Iteration:  67%|██████▋   | 810/1212 [06:17<02:51,  2.35it/s]

Gradient norm: 234.75795662003824


Epoch 2 of 5 | Iteration:  67%|██████▋   | 811/1212 [06:17<02:49,  2.37it/s]

Gradient norm: 234.6235714549175


Epoch 2 of 5 | Iteration:  67%|██████▋   | 812/1212 [06:18<02:46,  2.40it/s]

Gradient norm: 235.12529876414848


Epoch 2 of 5 | Iteration:  67%|██████▋   | 813/1212 [06:18<03:05,  2.15it/s]

Gradient norm: 235.0837543271539


Epoch 2 of 5 | Iteration:  67%|██████▋   | 814/1212 [06:18<02:53,  2.29it/s]

Gradient norm: 234.97126707446927


Epoch 2 of 5 | Iteration:  67%|██████▋   | 815/1212 [06:19<02:48,  2.36it/s]

Gradient norm: 236.8382928956419


Epoch 2 of 5 | Iteration:  67%|██████▋   | 816/1212 [06:19<03:01,  2.19it/s]

Gradient norm: 236.68528333004713


Epoch 2 of 5 | Iteration:  67%|██████▋   | 817/1212 [06:20<02:52,  2.29it/s]

Gradient norm: 2.664269314987797


Epoch 2 of 5 | Iteration:  67%|██████▋   | 818/1212 [06:20<02:59,  2.19it/s]

Gradient norm: 7.258529647281575


Epoch 2 of 5 | Iteration:  68%|██████▊   | 819/1212 [06:21<03:13,  2.03it/s]

Gradient norm: 14.678158506407076


Epoch 2 of 5 | Iteration:  68%|██████▊   | 820/1212 [06:21<03:15,  2.00it/s]

Gradient norm: 15.356300296257318


Epoch 2 of 5 | Iteration:  68%|██████▊   | 821/1212 [06:22<03:29,  1.87it/s]

Gradient norm: 15.697650332230435


Epoch 2 of 5 | Iteration:  68%|██████▊   | 822/1212 [06:22<03:25,  1.90it/s]

Gradient norm: 16.08370304383634


Epoch 2 of 5 | Iteration:  68%|██████▊   | 823/1212 [06:23<03:49,  1.70it/s]

Gradient norm: 18.623390685289692


Epoch 2 of 5 | Iteration:  68%|██████▊   | 824/1212 [06:24<03:56,  1.64it/s]

Gradient norm: 20.881676473635963


Epoch 2 of 5 | Iteration:  68%|██████▊   | 825/1212 [06:25<04:05,  1.58it/s]

Gradient norm: 20.74607647540912


Epoch 2 of 5 | Iteration:  68%|██████▊   | 826/1212 [06:25<03:34,  1.80it/s]

Gradient norm: 21.668065406697444


Epoch 2 of 5 | Iteration:  68%|██████▊   | 827/1212 [06:25<03:33,  1.81it/s]

Gradient norm: 21.565895353886233


Epoch 2 of 5 | Iteration:  68%|██████▊   | 828/1212 [06:26<03:28,  1.84it/s]

Gradient norm: 102.20152417145964


Epoch 2 of 5 | Iteration:  68%|██████▊   | 829/1212 [06:26<03:21,  1.90it/s]

Gradient norm: 103.52459053528288


Epoch 2 of 5 | Iteration:  68%|██████▊   | 830/1212 [06:27<03:13,  1.97it/s]

Gradient norm: 108.06279596597082


Epoch 2 of 5 | Iteration:  69%|██████▊   | 831/1212 [06:27<02:57,  2.15it/s]

Gradient norm: 108.09710973604354


Epoch 2 of 5 | Iteration:  69%|██████▊   | 832/1212 [06:28<02:47,  2.27it/s]

Gradient norm: 108.19984360758309


Epoch 2 of 5 | Iteration:  69%|██████▊   | 833/1212 [06:28<02:56,  2.15it/s]

Gradient norm: 5.9602752891670105


Epoch 2 of 5 | Iteration:  69%|██████▉   | 834/1212 [06:29<02:47,  2.26it/s]

Gradient norm: 6.088749423200348


Epoch 2 of 5 | Iteration:  69%|██████▉   | 835/1212 [06:29<02:45,  2.27it/s]

Gradient norm: 6.908022432934559


Epoch 2 of 5 | Iteration:  69%|██████▉   | 836/1212 [06:29<02:37,  2.39it/s]

Gradient norm: 108.83894326266866


Epoch 2 of 5 | Iteration:  69%|██████▉   | 837/1212 [06:30<02:34,  2.43it/s]

Gradient norm: 108.77751480849466


Epoch 2 of 5 | Iteration:  69%|██████▉   | 838/1212 [06:30<02:39,  2.34it/s]

Gradient norm: 108.6501525933895


Epoch 2 of 5 | Iteration:  69%|██████▉   | 839/1212 [06:31<02:47,  2.23it/s]

Gradient norm: 109.02419792415903


Epoch 2 of 5 | Iteration:  69%|██████▉   | 840/1212 [06:31<02:37,  2.36it/s]

Gradient norm: 108.44516167268148


Epoch 2 of 5 | Iteration:  69%|██████▉   | 841/1212 [06:32<02:49,  2.19it/s]

Gradient norm: 108.47172412978507


Epoch 2 of 5 | Iteration:  69%|██████▉   | 842/1212 [06:32<02:44,  2.25it/s]

Gradient norm: 108.60235797211988


Epoch 2 of 5 | Iteration:  70%|██████▉   | 843/1212 [06:33<02:49,  2.18it/s]

Gradient norm: 140.6424417380477


Epoch 2 of 5 | Iteration:  70%|██████▉   | 844/1212 [06:33<02:44,  2.24it/s]

Gradient norm: 140.59389938656122


Epoch 2 of 5 | Iteration:  70%|██████▉   | 845/1212 [06:33<02:36,  2.34it/s]

Gradient norm: 140.21248028044192


Epoch 2 of 5 | Iteration:  70%|██████▉   | 846/1212 [06:34<02:30,  2.43it/s]

Gradient norm: 139.19206190354728


Epoch 2 of 5 | Iteration:  70%|██████▉   | 847/1212 [06:34<02:32,  2.39it/s]

Gradient norm: 139.32793770573852


Epoch 2 of 5 | Iteration:  70%|██████▉   | 848/1212 [06:35<02:44,  2.21it/s]

Gradient norm: 142.13829900613936


Epoch 2 of 5 | Iteration:  70%|███████   | 849/1212 [06:35<02:51,  2.11it/s]

Gradient norm: 23.366106470504594


Epoch 2 of 5 | Iteration:  70%|███████   | 850/1212 [06:36<03:02,  1.99it/s]

Gradient norm: 29.55989077934094


Epoch 2 of 5 | Iteration:  70%|███████   | 851/1212 [06:36<02:59,  2.01it/s]

Gradient norm: 29.779266879500195


Epoch 2 of 5 | Iteration:  70%|███████   | 852/1212 [06:37<03:04,  1.95it/s]

Gradient norm: 30.870454946045953


Epoch 2 of 5 | Iteration:  70%|███████   | 853/1212 [06:37<03:09,  1.90it/s]

Gradient norm: 30.812576057142337


Epoch 2 of 5 | Iteration:  70%|███████   | 854/1212 [06:38<03:02,  1.96it/s]

Gradient norm: 31.62598737669107


Epoch 2 of 5 | Iteration:  71%|███████   | 855/1212 [06:38<03:12,  1.86it/s]

Gradient norm: 31.06115950790768


Epoch 2 of 5 | Iteration:  71%|███████   | 856/1212 [06:39<03:19,  1.78it/s]

Gradient norm: 56.63326002337046


Epoch 2 of 5 | Iteration:  71%|███████   | 857/1212 [06:40<03:13,  1.84it/s]

Gradient norm: 56.34678986890402


Epoch 2 of 5 | Iteration:  71%|███████   | 858/1212 [06:40<02:58,  1.99it/s]

Gradient norm: 56.10947486161493


Epoch 2 of 5 | Iteration:  71%|███████   | 859/1212 [06:40<02:43,  2.16it/s]

Gradient norm: 56.15165258602596


Epoch 2 of 5 | Iteration:  71%|███████   | 860/1212 [06:41<02:42,  2.16it/s]

Gradient norm: 78.44868228841631


Epoch 2 of 5 | Iteration:  71%|███████   | 861/1212 [06:41<02:35,  2.26it/s]

Gradient norm: 82.44382796970494


Epoch 2 of 5 | Iteration:  71%|███████   | 862/1212 [06:42<02:45,  2.11it/s]

Gradient norm: 229.41823744339308


Epoch 2 of 5 | Iteration:  71%|███████   | 863/1212 [06:42<02:53,  2.01it/s]

Gradient norm: 251.79533875897042


Epoch 2 of 5 | Iteration:  71%|███████▏  | 864/1212 [06:43<02:41,  2.16it/s]

Gradient norm: 251.6108194244638


Epoch 2 of 5 | Iteration:  71%|███████▏  | 865/1212 [06:43<02:30,  2.30it/s]

Gradient norm: 3.7343088183579956


Epoch 2 of 5 | Iteration:  71%|███████▏  | 866/1212 [06:44<02:36,  2.22it/s]

Gradient norm: 3.9460905295614777


Epoch 2 of 5 | Iteration:  72%|███████▏  | 867/1212 [06:44<02:44,  2.09it/s]

Gradient norm: 4.868484272427708


Epoch 2 of 5 | Iteration:  72%|███████▏  | 868/1212 [06:45<02:37,  2.19it/s]

Gradient norm: 7.569622718994511


Epoch 2 of 5 | Iteration:  72%|███████▏  | 869/1212 [06:45<02:39,  2.16it/s]

Gradient norm: 7.923257496762707


Epoch 2 of 5 | Iteration:  72%|███████▏  | 870/1212 [06:45<02:31,  2.26it/s]

Gradient norm: 8.388194330685822


Epoch 2 of 5 | Iteration:  72%|███████▏  | 871/1212 [06:46<02:32,  2.24it/s]

Gradient norm: 23.994794134770544


Epoch 2 of 5 | Iteration:  72%|███████▏  | 872/1212 [06:46<02:51,  1.98it/s]

Gradient norm: 24.065532986224706


Epoch 2 of 5 | Iteration:  72%|███████▏  | 873/1212 [06:47<02:46,  2.03it/s]

Gradient norm: 24.260668849287818


Epoch 2 of 5 | Iteration:  72%|███████▏  | 874/1212 [06:47<02:43,  2.06it/s]

Gradient norm: 24.259508150823027


Epoch 2 of 5 | Iteration:  72%|███████▏  | 875/1212 [06:48<02:50,  1.98it/s]

Gradient norm: 24.95914686267367


Epoch 2 of 5 | Iteration:  72%|███████▏  | 876/1212 [06:48<02:48,  2.00it/s]

Gradient norm: 26.061612708780125


Epoch 2 of 5 | Iteration:  72%|███████▏  | 877/1212 [06:49<02:51,  1.96it/s]

Gradient norm: 25.86174379302852


Epoch 2 of 5 | Iteration:  72%|███████▏  | 878/1212 [06:50<02:49,  1.97it/s]

Gradient norm: 378.16556665801414


Epoch 2 of 5 | Iteration:  73%|███████▎  | 879/1212 [06:50<02:49,  1.96it/s]

Gradient norm: 377.86399183570677


Epoch 2 of 5 | Iteration:  73%|███████▎  | 880/1212 [06:51<02:57,  1.87it/s]

Gradient norm: 376.43421649178185


Epoch 2 of 5 | Iteration:  73%|███████▎  | 881/1212 [06:51<03:03,  1.81it/s]

Gradient norm: 14.722581807322214


Epoch 2 of 5 | Iteration:  73%|███████▎  | 882/1212 [06:52<02:49,  1.95it/s]

Gradient norm: 15.25487450600188


Epoch 2 of 5 | Iteration:  73%|███████▎  | 883/1212 [06:52<02:34,  2.13it/s]

Gradient norm: 15.144406738212988


Epoch 2 of 5 | Iteration:  73%|███████▎  | 884/1212 [06:52<02:22,  2.30it/s]

Gradient norm: 15.029243594336917


Epoch 2 of 5 | Iteration:  73%|███████▎  | 885/1212 [06:53<02:15,  2.41it/s]

Gradient norm: 439.96578653554667


Epoch 2 of 5 | Iteration:  73%|███████▎  | 886/1212 [06:53<02:10,  2.51it/s]

Gradient norm: 447.35326931234323


Epoch 2 of 5 | Iteration:  73%|███████▎  | 887/1212 [06:54<02:17,  2.37it/s]

Gradient norm: 449.77091345380074


Epoch 2 of 5 | Iteration:  73%|███████▎  | 888/1212 [06:54<02:12,  2.44it/s]

Gradient norm: 450.95264525611003


Epoch 2 of 5 | Iteration:  73%|███████▎  | 889/1212 [06:54<02:13,  2.43it/s]

Gradient norm: 450.46552812414325


Epoch 2 of 5 | Iteration:  73%|███████▎  | 890/1212 [06:55<02:11,  2.45it/s]

Gradient norm: 468.14339081872845


Epoch 2 of 5 | Iteration:  74%|███████▎  | 891/1212 [06:55<02:06,  2.54it/s]

Gradient norm: 562.1766857193858


Epoch 2 of 5 | Iteration:  74%|███████▎  | 892/1212 [06:55<02:04,  2.58it/s]

Gradient norm: 562.1460079907141


Epoch 2 of 5 | Iteration:  74%|███████▎  | 893/1212 [06:56<02:03,  2.58it/s]

Gradient norm: 561.1629683061097


Epoch 2 of 5 | Iteration:  74%|███████▍  | 894/1212 [06:56<02:01,  2.61it/s]

Gradient norm: 561.9908949337797


Epoch 2 of 5 | Iteration:  74%|███████▍  | 895/1212 [06:57<02:01,  2.60it/s]

Gradient norm: 561.6658210608948


Epoch 2 of 5 | Iteration:  74%|███████▍  | 896/1212 [06:57<02:12,  2.38it/s]

Gradient norm: 558.6079366072545


Epoch 2 of 5 | Iteration:  74%|███████▍  | 897/1212 [06:58<02:07,  2.47it/s]

Gradient norm: 1.517759101719408


Epoch 2 of 5 | Iteration:  74%|███████▍  | 898/1212 [06:58<02:17,  2.28it/s]

Gradient norm: 2.3337569534486486


Epoch 2 of 5 | Iteration:  74%|███████▍  | 899/1212 [06:58<02:11,  2.38it/s]

Gradient norm: 6.082057290464602


Epoch 2 of 5 | Iteration:  74%|███████▍  | 900/1212 [06:59<02:21,  2.20it/s]

Gradient norm: 6.080525626027642


Epoch 2 of 5 | Iteration:  74%|███████▍  | 901/1212 [06:59<02:12,  2.35it/s]

Gradient norm: 17.796519551617717


Epoch 2 of 5 | Iteration:  74%|███████▍  | 902/1212 [07:00<02:11,  2.36it/s]

Gradient norm: 18.348708759596178


Epoch 2 of 5 | Iteration:  75%|███████▍  | 903/1212 [07:00<02:10,  2.36it/s]

Gradient norm: 18.498101981568013


Epoch 2 of 5 | Iteration:  75%|███████▍  | 904/1212 [07:01<02:05,  2.46it/s]

Gradient norm: 80.95609199911483


Epoch 2 of 5 | Iteration:  75%|███████▍  | 905/1212 [07:01<02:05,  2.45it/s]

Gradient norm: 80.963204515044


Epoch 2 of 5 | Iteration:  75%|███████▍  | 906/1212 [07:02<02:32,  2.00it/s]

Gradient norm: 81.08618893490578


Epoch 2 of 5 | Iteration:  75%|███████▍  | 907/1212 [07:02<02:40,  1.90it/s]

Gradient norm: 80.56537720134678


Epoch 2 of 5 | Iteration:  75%|███████▍  | 908/1212 [07:03<02:36,  1.95it/s]

Gradient norm: 83.13314883204562


Epoch 2 of 5 | Iteration:  75%|███████▌  | 909/1212 [07:03<02:45,  1.83it/s]

Gradient norm: 83.27409933945201


Epoch 2 of 5 | Iteration:  75%|███████▌  | 910/1212 [07:04<02:45,  1.83it/s]

Gradient norm: 83.82876757564651


Epoch 2 of 5 | Iteration:  75%|███████▌  | 911/1212 [07:04<02:41,  1.86it/s]

Gradient norm: 113.32225388972081


Epoch 2 of 5 | Iteration:  75%|███████▌  | 912/1212 [07:05<02:36,  1.92it/s]

Gradient norm: 113.05456611869992


Epoch 2 of 5 | Iteration:  75%|███████▌  | 913/1212 [07:05<02:22,  2.10it/s]

Gradient norm: 0.7249167962495218


Epoch 2 of 5 | Iteration:  75%|███████▌  | 914/1212 [07:06<02:10,  2.28it/s]

Gradient norm: 0.7639301011024452


Epoch 2 of 5 | Iteration:  75%|███████▌  | 915/1212 [07:06<02:24,  2.06it/s]

Gradient norm: 5.98144504035432


Epoch 2 of 5 | Iteration:  76%|███████▌  | 916/1212 [07:07<02:32,  1.95it/s]

Gradient norm: 6.4523621574082215


Epoch 2 of 5 | Iteration:  76%|███████▌  | 917/1212 [07:07<02:18,  2.13it/s]

Gradient norm: 6.776063428119056


Epoch 2 of 5 | Iteration:  76%|███████▌  | 918/1212 [07:08<02:25,  2.01it/s]

Gradient norm: 36.51276850509194


Epoch 2 of 5 | Iteration:  76%|███████▌  | 919/1212 [07:08<02:14,  2.19it/s]

Gradient norm: 36.46358789815516


Epoch 2 of 5 | Iteration:  76%|███████▌  | 920/1212 [07:08<02:05,  2.32it/s]

Gradient norm: 36.75568271939275


Epoch 2 of 5 | Iteration:  76%|███████▌  | 921/1212 [07:09<02:01,  2.40it/s]

Gradient norm: 37.6294654854503


Epoch 2 of 5 | Iteration:  76%|███████▌  | 922/1212 [07:09<02:20,  2.06it/s]

Gradient norm: 37.97158016167071


Epoch 2 of 5 | Iteration:  76%|███████▌  | 923/1212 [07:10<02:11,  2.20it/s]

Gradient norm: 67.92210675400082


Epoch 2 of 5 | Iteration:  76%|███████▌  | 924/1212 [07:10<02:12,  2.17it/s]

Gradient norm: 67.85608245297891


Epoch 2 of 5 | Iteration:  76%|███████▋  | 925/1212 [07:11<02:16,  2.11it/s]

Gradient norm: 67.68785433679976


Epoch 2 of 5 | Iteration:  76%|███████▋  | 926/1212 [07:11<02:06,  2.26it/s]

Gradient norm: 70.20736175552308


Epoch 2 of 5 | Iteration:  76%|███████▋  | 927/1212 [07:12<01:59,  2.38it/s]

Gradient norm: 69.65490715986643


Epoch 2 of 5 | Iteration:  77%|███████▋  | 928/1212 [07:12<01:56,  2.45it/s]

Gradient norm: 69.56848778876802


Epoch 2 of 5 | Iteration:  77%|███████▋  | 929/1212 [07:12<01:53,  2.50it/s]

Gradient norm: 10064.870773370032


Epoch 2 of 5 | Iteration:  77%|███████▋  | 930/1212 [07:13<02:03,  2.28it/s]

Gradient norm: 10064.87923649021


Epoch 2 of 5 | Iteration:  77%|███████▋  | 931/1212 [07:13<02:01,  2.30it/s]

Gradient norm: 10064.652275284081


Epoch 2 of 5 | Iteration:  77%|███████▋  | 932/1212 [07:14<01:56,  2.40it/s]

Gradient norm: 10064.14879036321


Epoch 2 of 5 | Iteration:  77%|███████▋  | 933/1212 [07:14<01:51,  2.51it/s]

Gradient norm: 10065.526645970953


Epoch 2 of 5 | Iteration:  77%|███████▋  | 934/1212 [07:14<01:52,  2.48it/s]

Gradient norm: 10066.00349456645


Epoch 2 of 5 | Iteration:  77%|███████▋  | 935/1212 [07:15<02:00,  2.29it/s]

Gradient norm: 10065.954312821921


Epoch 2 of 5 | Iteration:  77%|███████▋  | 936/1212 [07:16<02:24,  1.91it/s]

Gradient norm: 10066.30887200916


Epoch 2 of 5 | Iteration:  77%|███████▋  | 937/1212 [07:16<02:24,  1.90it/s]

Gradient norm: 10065.718951124005


Epoch 2 of 5 | Iteration:  77%|███████▋  | 938/1212 [07:17<02:23,  1.91it/s]

Gradient norm: 10066.011900889898


Epoch 2 of 5 | Iteration:  77%|███████▋  | 939/1212 [07:17<02:38,  1.73it/s]

Gradient norm: 10068.939715970157


Epoch 2 of 5 | Iteration:  78%|███████▊  | 940/1212 [07:18<02:40,  1.70it/s]

Gradient norm: 10068.973143592684


Epoch 2 of 5 | Iteration:  78%|███████▊  | 941/1212 [07:18<02:24,  1.87it/s]

Gradient norm: 10069.853669000182


Epoch 2 of 5 | Iteration:  78%|███████▊  | 942/1212 [07:19<02:13,  2.02it/s]

Gradient norm: 10070.756275232701


Epoch 2 of 5 | Iteration:  78%|███████▊  | 943/1212 [07:19<02:02,  2.20it/s]

Gradient norm: 10070.720430375872


Epoch 2 of 5 | Iteration:  78%|███████▊  | 944/1212 [07:20<02:05,  2.13it/s]

Gradient norm: 10070.821023844896


Epoch 2 of 5 | Iteration:  78%|███████▊  | 945/1212 [07:20<02:04,  2.14it/s]

Gradient norm: 30.980155898750684


Epoch 2 of 5 | Iteration:  78%|███████▊  | 946/1212 [07:21<02:02,  2.17it/s]

Gradient norm: 31.25526584462993


Epoch 2 of 5 | Iteration:  78%|███████▊  | 947/1212 [07:21<02:07,  2.08it/s]

Gradient norm: 64.78932157770411


Epoch 2 of 5 | Iteration:  78%|███████▊  | 948/1212 [07:22<02:04,  2.12it/s]

Gradient norm: 106.98123668892654


Epoch 2 of 5 | Iteration:  78%|███████▊  | 949/1212 [07:22<02:04,  2.11it/s]

Gradient norm: 106.90587150499164


Epoch 2 of 5 | Iteration:  78%|███████▊  | 950/1212 [07:23<02:10,  2.01it/s]

Gradient norm: 106.33633420519625


Epoch 2 of 5 | Iteration:  78%|███████▊  | 951/1212 [07:23<01:59,  2.18it/s]

Gradient norm: 109.77447388459159


Epoch 2 of 5 | Iteration:  79%|███████▊  | 952/1212 [07:23<01:52,  2.32it/s]

Gradient norm: 110.1415122835765


Epoch 2 of 5 | Iteration:  79%|███████▊  | 953/1212 [07:24<01:56,  2.22it/s]

Gradient norm: 110.64972168643939


Epoch 2 of 5 | Iteration:  79%|███████▊  | 954/1212 [07:24<01:49,  2.36it/s]

Gradient norm: 111.58468148136606


Epoch 2 of 5 | Iteration:  79%|███████▉  | 955/1212 [07:25<01:45,  2.43it/s]

Gradient norm: 111.97312902626135


Epoch 2 of 5 | Iteration:  79%|███████▉  | 956/1212 [07:25<01:43,  2.48it/s]

Gradient norm: 112.33286092037399


Epoch 2 of 5 | Iteration:  79%|███████▉  | 957/1212 [07:25<01:43,  2.46it/s]

Gradient norm: 112.37932136597512


Epoch 2 of 5 | Iteration:  79%|███████▉  | 958/1212 [07:26<01:45,  2.41it/s]

Gradient norm: 112.39382293264111


Epoch 2 of 5 | Iteration:  79%|███████▉  | 959/1212 [07:26<01:51,  2.26it/s]

Gradient norm: 114.2936661529726


Epoch 2 of 5 | Iteration:  79%|███████▉  | 960/1212 [07:27<01:53,  2.21it/s]

Gradient norm: 114.83463610401165


Epoch 2 of 5 | Iteration:  79%|███████▉  | 961/1212 [07:27<01:47,  2.33it/s]

Gradient norm: 0.0


Epoch 2 of 5 | Iteration:  79%|███████▉  | 962/1212 [07:28<02:02,  2.04it/s]

Gradient norm: 7.483853428425049


Epoch 2 of 5 | Iteration:  79%|███████▉  | 963/1212 [07:28<02:03,  2.02it/s]

Gradient norm: 7.57933866442172


Epoch 2 of 5 | Iteration:  80%|███████▉  | 964/1212 [07:29<02:07,  1.95it/s]

Gradient norm: 9.147499804290101


Epoch 2 of 5 | Iteration:  80%|███████▉  | 965/1212 [07:29<02:12,  1.87it/s]

Gradient norm: 17.95597182559169


Epoch 2 of 5 | Iteration:  80%|███████▉  | 966/1212 [07:30<02:08,  1.91it/s]

Gradient norm: 17.94596439047489


Epoch 2 of 5 | Iteration:  80%|███████▉  | 967/1212 [07:30<02:03,  1.98it/s]

Gradient norm: 17.90915240048834


Epoch 2 of 5 | Iteration:  80%|███████▉  | 968/1212 [07:31<02:06,  1.93it/s]

Gradient norm: 20.052616021791682


Epoch 2 of 5 | Iteration:  80%|███████▉  | 969/1212 [07:32<02:08,  1.89it/s]

Gradient norm: 20.572066890762287


Epoch 2 of 5 | Iteration:  80%|████████  | 970/1212 [07:32<02:03,  1.96it/s]

Gradient norm: 20.512711295941692


Epoch 2 of 5 | Iteration:  80%|████████  | 971/1212 [07:32<01:53,  2.13it/s]

Gradient norm: 21.924564009821125


Epoch 2 of 5 | Iteration:  80%|████████  | 972/1212 [07:33<01:54,  2.09it/s]

Gradient norm: 21.833056569927457


Epoch 2 of 5 | Iteration:  80%|████████  | 973/1212 [07:34<02:06,  1.89it/s]

Gradient norm: 21.885158382876508


Epoch 2 of 5 | Iteration:  80%|████████  | 974/1212 [07:34<01:58,  2.01it/s]

Gradient norm: 21.828392158564746


Epoch 2 of 5 | Iteration:  80%|████████  | 975/1212 [07:34<01:48,  2.18it/s]

Gradient norm: 85.12431650948719


Epoch 2 of 5 | Iteration:  81%|████████  | 976/1212 [07:35<01:54,  2.06it/s]

Gradient norm: 85.19141862583513


Epoch 2 of 5 | Iteration:  81%|████████  | 977/1212 [07:35<01:51,  2.10it/s]

Gradient norm: 1.596408264874689


Epoch 2 of 5 | Iteration:  81%|████████  | 978/1212 [07:36<01:44,  2.24it/s]

Gradient norm: 2.9272019443725603


Epoch 2 of 5 | Iteration:  81%|████████  | 979/1212 [07:36<01:38,  2.36it/s]

Gradient norm: 28.23431171683203


Epoch 2 of 5 | Iteration:  81%|████████  | 980/1212 [07:36<01:33,  2.49it/s]

Gradient norm: 31.849149972575887


Epoch 2 of 5 | Iteration:  81%|████████  | 981/1212 [07:37<01:34,  2.44it/s]

Gradient norm: 31.96440535514253


Epoch 2 of 5 | Iteration:  81%|████████  | 982/1212 [07:37<01:35,  2.40it/s]

Gradient norm: 32.30788377303154


Epoch 2 of 5 | Iteration:  81%|████████  | 983/1212 [07:38<01:53,  2.01it/s]

Gradient norm: 32.602051367211004


Epoch 2 of 5 | Iteration:  81%|████████  | 984/1212 [07:38<01:44,  2.17it/s]

Gradient norm: 33.48253056610791


Epoch 2 of 5 | Iteration:  81%|████████▏ | 985/1212 [07:39<01:45,  2.15it/s]

Gradient norm: 34.021817366358526


Epoch 2 of 5 | Iteration:  81%|████████▏ | 986/1212 [07:39<01:40,  2.26it/s]

Gradient norm: 711.0523252607627


Epoch 2 of 5 | Iteration:  81%|████████▏ | 987/1212 [07:40<01:40,  2.25it/s]

Gradient norm: 711.1140314325314


Epoch 2 of 5 | Iteration:  82%|████████▏ | 988/1212 [07:40<01:34,  2.37it/s]

Gradient norm: 1076.3289427788234


Epoch 2 of 5 | Iteration:  82%|████████▏ | 989/1212 [07:40<01:30,  2.47it/s]

Gradient norm: 1076.2386671916665


Epoch 2 of 5 | Iteration:  82%|████████▏ | 990/1212 [07:41<01:26,  2.56it/s]

Gradient norm: 1076.217465392492


Epoch 2 of 5 | Iteration:  82%|████████▏ | 991/1212 [07:41<01:33,  2.37it/s]

Gradient norm: 1075.8608295252527


Epoch 2 of 5 | Iteration:  82%|████████▏ | 992/1212 [07:42<01:40,  2.20it/s]

Gradient norm: 1076.3912313230874


Epoch 2 of 5 | Iteration:  82%|████████▏ | 993/1212 [07:42<01:42,  2.13it/s]

Gradient norm: 3.7235583853372227


Epoch 2 of 5 | Iteration:  82%|████████▏ | 994/1212 [07:43<01:47,  2.03it/s]

Gradient norm: 4.368967206149947


Epoch 2 of 5 | Iteration:  82%|████████▏ | 995/1212 [07:43<01:48,  2.01it/s]

Gradient norm: 4.394638735086071


Epoch 2 of 5 | Iteration:  82%|████████▏ | 996/1212 [07:44<01:47,  2.01it/s]

Gradient norm: 7.167750218669396


Epoch 2 of 5 | Iteration:  82%|████████▏ | 997/1212 [07:44<01:50,  1.95it/s]

Gradient norm: 6.870426241649062


Epoch 2 of 5 | Iteration:  82%|████████▏ | 998/1212 [07:45<01:48,  1.98it/s]

Gradient norm: 7.206025610188242


Epoch 2 of 5 | Iteration:  82%|████████▏ | 999/1212 [07:45<01:39,  2.15it/s]

Gradient norm: 10.87408982015466


Epoch 2 of 5 | Iteration:  83%|████████▎ | 1000/1212 [07:46<01:38,  2.15it/s]

Gradient norm: 10.799442685081903


Epoch 2 of 5 | Iteration:  83%|████████▎ | 1001/1212 [07:46<01:36,  2.18it/s]

Gradient norm: 16.006262233019964


Epoch 2 of 5 | Iteration:  83%|████████▎ | 1002/1212 [07:47<01:34,  2.21it/s]

Gradient norm: 15.90898973960897


Epoch 2 of 5 | Iteration:  83%|████████▎ | 1003/1212 [07:47<01:34,  2.22it/s]

Gradient norm: 16.11235198155089


Epoch 2 of 5 | Iteration:  83%|████████▎ | 1004/1212 [07:48<01:39,  2.08it/s]

Gradient norm: 19.5850744796469


Epoch 2 of 5 | Iteration:  83%|████████▎ | 1005/1212 [07:48<01:49,  1.89it/s]

Gradient norm: 20.097516926458358


Epoch 2 of 5 | Iteration:  83%|████████▎ | 1006/1212 [07:49<01:38,  2.09it/s]

Gradient norm: 40.87890644850826


Epoch 2 of 5 | Iteration:  83%|████████▎ | 1007/1212 [07:49<01:39,  2.07it/s]

Gradient norm: 40.810952009594196


Epoch 2 of 5 | Iteration:  83%|████████▎ | 1008/1212 [07:50<01:37,  2.09it/s]

Gradient norm: 40.05771867564974


Epoch 2 of 5 | Iteration:  83%|████████▎ | 1009/1212 [07:50<01:30,  2.24it/s]

Gradient norm: 32.26239536817775


Epoch 2 of 5 | Iteration:  83%|████████▎ | 1010/1212 [07:50<01:24,  2.39it/s]

Gradient norm: 32.31806840525887


Epoch 2 of 5 | Iteration:  83%|████████▎ | 1011/1212 [07:51<01:26,  2.32it/s]

Gradient norm: 34.40650555808082


Epoch 2 of 5 | Iteration:  83%|████████▎ | 1012/1212 [07:51<01:27,  2.29it/s]

Gradient norm: 34.33795781159372


Epoch 2 of 5 | Iteration:  84%|████████▎ | 1013/1212 [07:52<01:22,  2.40it/s]

Gradient norm: 37.49327282127901


Epoch 2 of 5 | Iteration:  84%|████████▎ | 1014/1212 [07:52<01:28,  2.23it/s]

Gradient norm: 37.513561236799994


Epoch 2 of 5 | Iteration:  84%|████████▎ | 1015/1212 [07:53<01:34,  2.09it/s]

Gradient norm: 37.629721418862715


Epoch 2 of 5 | Iteration:  84%|████████▍ | 1016/1212 [07:53<01:32,  2.12it/s]

Gradient norm: 37.48517928010494


Epoch 2 of 5 | Iteration:  84%|████████▍ | 1017/1212 [07:53<01:25,  2.27it/s]

Gradient norm: 42.8505229265039


Epoch 2 of 5 | Iteration:  84%|████████▍ | 1018/1212 [07:54<01:21,  2.38it/s]

Gradient norm: 44.40238336145368


Epoch 2 of 5 | Iteration:  84%|████████▍ | 1019/1212 [07:54<01:17,  2.48it/s]

Gradient norm: 43.795069752624016


Epoch 2 of 5 | Iteration:  84%|████████▍ | 1020/1212 [07:55<01:15,  2.55it/s]

Gradient norm: 43.71164233285257


Epoch 2 of 5 | Iteration:  84%|████████▍ | 1021/1212 [07:55<01:24,  2.27it/s]

Gradient norm: 43.901038643756415


Epoch 2 of 5 | Iteration:  84%|████████▍ | 1022/1212 [07:56<01:28,  2.14it/s]

Gradient norm: 43.982627616283885


Epoch 2 of 5 | Iteration:  84%|████████▍ | 1023/1212 [07:56<01:30,  2.08it/s]

Gradient norm: 43.763981209876164


Epoch 2 of 5 | Iteration:  84%|████████▍ | 1024/1212 [07:57<01:30,  2.07it/s]

Gradient norm: 253.25897337312747


Epoch 2 of 5 | Iteration:  85%|████████▍ | 1025/1212 [07:57<01:35,  1.97it/s]

Gradient norm: 11.59023336292755


Epoch 2 of 5 | Iteration:  85%|████████▍ | 1026/1212 [07:58<01:37,  1.90it/s]

Gradient norm: 12.235844776379839


Epoch 2 of 5 | Iteration:  85%|████████▍ | 1027/1212 [07:58<01:40,  1.83it/s]

Gradient norm: 12.289257582096658


Epoch 2 of 5 | Iteration:  85%|████████▍ | 1028/1212 [07:59<01:29,  2.05it/s]

Gradient norm: 12.380485269709501


Epoch 2 of 5 | Iteration:  85%|████████▍ | 1029/1212 [07:59<01:32,  1.98it/s]

Gradient norm: 19.707575045604198


Epoch 2 of 5 | Iteration:  85%|████████▍ | 1030/1212 [08:00<01:27,  2.08it/s]

Gradient norm: 19.678537175145458


Epoch 2 of 5 | Iteration:  85%|████████▌ | 1031/1212 [08:00<01:26,  2.10it/s]

Gradient norm: 19.673032904015145


Epoch 2 of 5 | Iteration:  85%|████████▌ | 1032/1212 [08:01<01:24,  2.14it/s]

Gradient norm: 31.76283795594901


Epoch 2 of 5 | Iteration:  85%|████████▌ | 1033/1212 [08:01<01:28,  2.03it/s]

Gradient norm: 54.90025462692905


Epoch 2 of 5 | Iteration:  85%|████████▌ | 1034/1212 [08:01<01:20,  2.22it/s]

Gradient norm: 54.935341785838766


Epoch 2 of 5 | Iteration:  85%|████████▌ | 1035/1212 [08:02<01:19,  2.21it/s]

Gradient norm: 54.876960135757905


Epoch 2 of 5 | Iteration:  85%|████████▌ | 1036/1212 [08:03<01:27,  2.02it/s]

Gradient norm: 58.02834530371911


Epoch 2 of 5 | Iteration:  86%|████████▌ | 1037/1212 [08:03<01:20,  2.17it/s]

Gradient norm: 57.99202642959499


Epoch 2 of 5 | Iteration:  86%|████████▌ | 1038/1212 [08:03<01:17,  2.24it/s]

Gradient norm: 57.9606713181754


Epoch 2 of 5 | Iteration:  86%|████████▌ | 1039/1212 [08:04<01:13,  2.36it/s]

Gradient norm: 58.22173983391545


Epoch 2 of 5 | Iteration:  86%|████████▌ | 1040/1212 [08:04<01:30,  1.89it/s]

Gradient norm: 57.92666955017809


Epoch 2 of 5 | Iteration:  86%|████████▌ | 1041/1212 [08:05<01:32,  1.85it/s]

Gradient norm: 86.69771845537431


Epoch 2 of 5 | Iteration:  86%|████████▌ | 1042/1212 [08:06<01:32,  1.85it/s]

Gradient norm: 86.54182001010125


Epoch 2 of 5 | Iteration:  86%|████████▌ | 1043/1212 [08:06<01:26,  1.96it/s]

Gradient norm: 450.13099516942094


Epoch 2 of 5 | Iteration:  86%|████████▌ | 1044/1212 [08:06<01:18,  2.15it/s]

Gradient norm: 449.98536616748834


Epoch 2 of 5 | Iteration:  86%|████████▌ | 1045/1212 [08:07<01:32,  1.80it/s]

Gradient norm: 677.3252335876545


Epoch 2 of 5 | Iteration:  86%|████████▋ | 1046/1212 [08:08<01:25,  1.94it/s]

Gradient norm: 677.0966448270253


Epoch 2 of 5 | Iteration:  86%|████████▋ | 1047/1212 [08:08<01:26,  1.92it/s]

Gradient norm: 677.1999929181846


Epoch 2 of 5 | Iteration:  86%|████████▋ | 1048/1212 [08:09<01:24,  1.93it/s]

Gradient norm: 677.1653092332541


Epoch 2 of 5 | Iteration:  87%|████████▋ | 1049/1212 [08:09<01:22,  1.97it/s]

Gradient norm: 677.0992371033756


Epoch 2 of 5 | Iteration:  87%|████████▋ | 1050/1212 [08:10<01:24,  1.91it/s]

Gradient norm: 677.0023363169012


Epoch 2 of 5 | Iteration:  87%|████████▋ | 1051/1212 [08:10<01:23,  1.93it/s]

Gradient norm: 677.0172515082357


Epoch 2 of 5 | Iteration:  87%|████████▋ | 1052/1212 [08:11<01:28,  1.81it/s]

Gradient norm: 677.2369876617972


Epoch 2 of 5 | Iteration:  87%|████████▋ | 1053/1212 [08:11<01:29,  1.78it/s]

Gradient norm: 677.6493015556194


Epoch 2 of 5 | Iteration:  87%|████████▋ | 1054/1212 [08:12<01:34,  1.67it/s]

Gradient norm: 680.6301429853329


Epoch 2 of 5 | Iteration:  87%|████████▋ | 1055/1212 [08:12<01:24,  1.87it/s]

Gradient norm: 680.0853516143907


Epoch 2 of 5 | Iteration:  87%|████████▋ | 1056/1212 [08:13<01:29,  1.75it/s]

Gradient norm: 680.6006135572933


Epoch 2 of 5 | Iteration:  87%|████████▋ | 1057/1212 [08:14<01:20,  1.93it/s]

Gradient norm: 0.550839534364786


Epoch 2 of 5 | Iteration:  87%|████████▋ | 1058/1212 [08:14<01:12,  2.12it/s]

Gradient norm: 9.32976727355122


Epoch 2 of 5 | Iteration:  87%|████████▋ | 1059/1212 [08:14<01:07,  2.26it/s]

Gradient norm: 44.07602185527319


Epoch 2 of 5 | Iteration:  87%|████████▋ | 1060/1212 [08:15<01:03,  2.39it/s]

Gradient norm: 43.79079556720338


Epoch 2 of 5 | Iteration:  88%|████████▊ | 1061/1212 [08:15<01:08,  2.19it/s]

Gradient norm: 44.0533418533256


Epoch 2 of 5 | Iteration:  88%|████████▊ | 1062/1212 [08:16<01:05,  2.29it/s]

Gradient norm: 268.8698806702438


Epoch 2 of 5 | Iteration:  88%|████████▊ | 1063/1212 [08:16<01:04,  2.31it/s]

Gradient norm: 267.3467584342527


Epoch 2 of 5 | Iteration:  88%|████████▊ | 1064/1212 [08:16<01:02,  2.35it/s]

Gradient norm: 267.35409438522686


Epoch 2 of 5 | Iteration:  88%|████████▊ | 1065/1212 [08:17<01:00,  2.45it/s]

Gradient norm: 267.4297964488537


Epoch 2 of 5 | Iteration:  88%|████████▊ | 1066/1212 [08:17<01:02,  2.33it/s]

Gradient norm: 267.3235101062519


Epoch 2 of 5 | Iteration:  88%|████████▊ | 1067/1212 [08:18<00:59,  2.42it/s]

Gradient norm: 267.46551260791387


Epoch 2 of 5 | Iteration:  88%|████████▊ | 1068/1212 [08:18<00:59,  2.41it/s]

Gradient norm: 269.25347947292795


Epoch 2 of 5 | Iteration:  88%|████████▊ | 1069/1212 [08:18<00:57,  2.49it/s]

Gradient norm: 268.99050647012405


Epoch 2 of 5 | Iteration:  88%|████████▊ | 1070/1212 [08:19<00:57,  2.45it/s]

Gradient norm: 268.580188657224


Epoch 2 of 5 | Iteration:  88%|████████▊ | 1071/1212 [08:19<00:58,  2.42it/s]

Gradient norm: 268.8197717315892


Epoch 2 of 5 | Iteration:  88%|████████▊ | 1072/1212 [08:20<00:57,  2.45it/s]

Gradient norm: 279.60908874125323


Epoch 2 of 5 | Iteration:  89%|████████▊ | 1073/1212 [08:20<00:54,  2.53it/s]

Gradient norm: 45.15809493359084


Epoch 2 of 5 | Iteration:  89%|████████▊ | 1074/1212 [08:20<00:58,  2.35it/s]

Gradient norm: 44.973307389794144


Epoch 2 of 5 | Iteration:  89%|████████▊ | 1075/1212 [08:21<01:00,  2.28it/s]

Gradient norm: 45.02751606848067


Epoch 2 of 5 | Iteration:  89%|████████▉ | 1076/1212 [08:21<00:57,  2.38it/s]

Gradient norm: 44.052097274549496


Epoch 2 of 5 | Iteration:  89%|████████▉ | 1077/1212 [08:22<00:59,  2.26it/s]

Gradient norm: 43.73624893815185


Epoch 2 of 5 | Iteration:  89%|████████▉ | 1078/1212 [08:22<01:03,  2.10it/s]

Gradient norm: 44.213488136160635


Epoch 2 of 5 | Iteration:  89%|████████▉ | 1079/1212 [08:23<01:07,  1.96it/s]

Gradient norm: 48.495207774953194


Epoch 2 of 5 | Iteration:  89%|████████▉ | 1080/1212 [08:23<01:05,  2.02it/s]

Gradient norm: 48.52197809773997


Epoch 2 of 5 | Iteration:  89%|████████▉ | 1081/1212 [08:24<01:05,  2.00it/s]

Gradient norm: 63.77364956675069


Epoch 2 of 5 | Iteration:  89%|████████▉ | 1082/1212 [08:25<01:09,  1.88it/s]

Gradient norm: 63.87350331407102


Epoch 2 of 5 | Iteration:  89%|████████▉ | 1083/1212 [08:25<01:07,  1.90it/s]

Gradient norm: 63.939773549923565


Epoch 2 of 5 | Iteration:  89%|████████▉ | 1084/1212 [08:25<01:00,  2.10it/s]

Gradient norm: 73.20396545793565


Epoch 2 of 5 | Iteration:  90%|████████▉ | 1085/1212 [08:26<00:57,  2.20it/s]

Gradient norm: 75.41556520674833


Epoch 2 of 5 | Iteration:  90%|████████▉ | 1086/1212 [08:26<00:59,  2.11it/s]

Gradient norm: 75.41547643187654


Epoch 2 of 5 | Iteration:  90%|████████▉ | 1087/1212 [08:27<00:56,  2.23it/s]

Gradient norm: 75.6115661959867


Epoch 2 of 5 | Iteration:  90%|████████▉ | 1088/1212 [08:27<00:53,  2.32it/s]

Gradient norm: 76.22558274990618


Epoch 2 of 5 | Iteration:  90%|████████▉ | 1089/1212 [08:28<00:53,  2.32it/s]

Gradient norm: 14.585228677175394


Epoch 2 of 5 | Iteration:  90%|████████▉ | 1090/1212 [08:28<00:50,  2.42it/s]

Gradient norm: 68.83720981529933


Epoch 2 of 5 | Iteration:  90%|█████████ | 1091/1212 [08:28<00:51,  2.34it/s]

Gradient norm: 69.03040838978991


Epoch 2 of 5 | Iteration:  90%|█████████ | 1092/1212 [08:29<00:53,  2.25it/s]

Gradient norm: 69.74882679405759


Epoch 2 of 5 | Iteration:  90%|█████████ | 1093/1212 [08:29<00:50,  2.36it/s]

Gradient norm: 75.78839360178112


Epoch 2 of 5 | Iteration:  90%|█████████ | 1094/1212 [08:30<00:47,  2.47it/s]

Gradient norm: 76.01909629016335


Epoch 2 of 5 | Iteration:  90%|█████████ | 1095/1212 [08:30<00:46,  2.49it/s]

Gradient norm: 217.65430052085014


Epoch 2 of 5 | Iteration:  90%|█████████ | 1096/1212 [08:30<00:48,  2.41it/s]

Gradient norm: 217.62853742879176


Epoch 2 of 5 | Iteration:  91%|█████████ | 1097/1212 [08:31<00:47,  2.43it/s]

Gradient norm: 220.849249101467


Epoch 2 of 5 | Iteration:  91%|█████████ | 1098/1212 [08:31<00:44,  2.54it/s]

Gradient norm: 220.70958715548954


Epoch 2 of 5 | Iteration:  91%|█████████ | 1099/1212 [08:32<00:54,  2.07it/s]

Gradient norm: 220.78230091433832


Epoch 2 of 5 | Iteration:  91%|█████████ | 1100/1212 [08:33<00:58,  1.90it/s]

Gradient norm: 221.23892929238355


Epoch 2 of 5 | Iteration:  91%|█████████ | 1101/1212 [08:33<00:53,  2.07it/s]

Gradient norm: 221.7163738495704


Epoch 2 of 5 | Iteration:  91%|█████████ | 1102/1212 [08:33<00:52,  2.09it/s]

Gradient norm: 234.73272218575391


Epoch 2 of 5 | Iteration:  91%|█████████ | 1103/1212 [08:34<00:51,  2.12it/s]

Gradient norm: 236.20442142218317


Epoch 2 of 5 | Iteration:  91%|█████████ | 1104/1212 [08:34<00:52,  2.06it/s]

Gradient norm: 236.45014875057998


Epoch 2 of 5 | Iteration:  91%|█████████ | 1105/1212 [08:35<00:54,  1.97it/s]

Gradient norm: 4.914811586695927


Epoch 2 of 5 | Iteration:  91%|█████████▏| 1106/1212 [08:35<00:53,  1.98it/s]

Gradient norm: 6.125883236110405


Epoch 2 of 5 | Iteration:  91%|█████████▏| 1107/1212 [08:36<00:53,  1.98it/s]

Gradient norm: 7.316652369163289


Epoch 2 of 5 | Iteration:  91%|█████████▏| 1108/1212 [08:37<00:58,  1.79it/s]

Gradient norm: 9.596106990470718


Epoch 2 of 5 | Iteration:  92%|█████████▏| 1109/1212 [08:37<00:57,  1.80it/s]

Gradient norm: 146.8811033081954


Epoch 2 of 5 | Iteration:  92%|█████████▏| 1110/1212 [08:38<01:01,  1.66it/s]

Gradient norm: 154.84880257951588


Epoch 2 of 5 | Iteration:  92%|█████████▏| 1111/1212 [08:38<00:59,  1.71it/s]

Gradient norm: 156.55365444529698


Epoch 2 of 5 | Iteration:  92%|█████████▏| 1112/1212 [08:39<00:52,  1.92it/s]

Gradient norm: 156.26477849535428


Epoch 2 of 5 | Iteration:  92%|█████████▏| 1113/1212 [08:39<00:46,  2.11it/s]

Gradient norm: 156.01951656174643


Epoch 2 of 5 | Iteration:  92%|█████████▏| 1114/1212 [08:40<00:48,  2.01it/s]

Gradient norm: 156.18082726968765


Epoch 2 of 5 | Iteration:  92%|█████████▏| 1115/1212 [08:40<00:44,  2.16it/s]

Gradient norm: 156.20300846101873


Epoch 2 of 5 | Iteration:  92%|█████████▏| 1116/1212 [08:40<00:41,  2.30it/s]

Gradient norm: 156.10291128683863


Epoch 2 of 5 | Iteration:  92%|█████████▏| 1117/1212 [08:41<00:42,  2.25it/s]

Gradient norm: 156.6121541746386


Epoch 2 of 5 | Iteration:  92%|█████████▏| 1118/1212 [08:41<00:39,  2.38it/s]

Gradient norm: 156.7010626887244


Epoch 2 of 5 | Iteration:  92%|█████████▏| 1119/1212 [08:42<00:40,  2.31it/s]

Gradient norm: 167.61903410754005


Epoch 2 of 5 | Iteration:  92%|█████████▏| 1120/1212 [08:42<00:38,  2.36it/s]

Gradient norm: 189.94041698892707


Epoch 2 of 5 | Iteration:  92%|█████████▏| 1121/1212 [08:43<00:40,  2.27it/s]

Gradient norm: 5.606374813411398


Epoch 2 of 5 | Iteration:  93%|█████████▎| 1122/1212 [08:43<00:41,  2.15it/s]

Gradient norm: 5.321393038465149


Epoch 2 of 5 | Iteration:  93%|█████████▎| 1123/1212 [08:44<00:46,  1.93it/s]

Gradient norm: 6.642876721393091


Epoch 2 of 5 | Iteration:  93%|█████████▎| 1124/1212 [08:44<00:41,  2.12it/s]

Gradient norm: 6.653045382187144


Epoch 2 of 5 | Iteration:  93%|█████████▎| 1125/1212 [08:45<00:38,  2.25it/s]

Gradient norm: 14.960229342154058


Epoch 2 of 5 | Iteration:  93%|█████████▎| 1126/1212 [08:45<00:36,  2.35it/s]

Gradient norm: 15.027670917263128


Epoch 2 of 5 | Iteration:  93%|█████████▎| 1127/1212 [08:45<00:34,  2.44it/s]

Gradient norm: 15.050097694700087


Epoch 2 of 5 | Iteration:  93%|█████████▎| 1128/1212 [08:46<00:33,  2.52it/s]

Gradient norm: 22.415594697223824


Epoch 2 of 5 | Iteration:  93%|█████████▎| 1129/1212 [08:46<00:34,  2.41it/s]

Gradient norm: 26.731561065820795


Epoch 2 of 5 | Iteration:  93%|█████████▎| 1130/1212 [08:46<00:33,  2.47it/s]

Gradient norm: 27.99168042357851


Epoch 2 of 5 | Iteration:  93%|█████████▎| 1131/1212 [08:47<00:34,  2.35it/s]

Gradient norm: 28.759583260447858


Epoch 2 of 5 | Iteration:  93%|█████████▎| 1132/1212 [08:47<00:32,  2.43it/s]

Gradient norm: 29.408640945393284


Epoch 2 of 5 | Iteration:  93%|█████████▎| 1133/1212 [08:48<00:31,  2.51it/s]

Gradient norm: 29.92410505295008


Epoch 2 of 5 | Iteration:  94%|█████████▎| 1134/1212 [08:48<00:31,  2.49it/s]

Gradient norm: 29.786412602019883


Epoch 2 of 5 | Iteration:  94%|█████████▎| 1135/1212 [08:49<00:35,  2.20it/s]

Gradient norm: 29.972100178934216


Epoch 2 of 5 | Iteration:  94%|█████████▎| 1136/1212 [08:49<00:38,  1.97it/s]

Gradient norm: 29.873545001951033


Epoch 2 of 5 | Iteration:  94%|█████████▍| 1137/1212 [08:50<00:40,  1.86it/s]

Gradient norm: 4.9616214966357886


Epoch 2 of 5 | Iteration:  94%|█████████▍| 1138/1212 [08:50<00:39,  1.86it/s]

Gradient norm: 6.775870979476951


Epoch 2 of 5 | Iteration:  94%|█████████▍| 1139/1212 [08:51<00:38,  1.90it/s]

Gradient norm: 9.557883782836342


Epoch 2 of 5 | Iteration:  94%|█████████▍| 1140/1212 [08:52<00:43,  1.66it/s]

Gradient norm: 10.111606064316383


Epoch 2 of 5 | Iteration:  94%|█████████▍| 1141/1212 [08:52<00:40,  1.76it/s]

Gradient norm: 10.24825713866193


Epoch 2 of 5 | Iteration:  94%|█████████▍| 1142/1212 [08:53<00:35,  1.96it/s]

Gradient norm: 10.501260335879682


Epoch 2 of 5 | Iteration:  94%|█████████▍| 1143/1212 [08:53<00:36,  1.89it/s]

Gradient norm: 432.6433262611705


Epoch 2 of 5 | Iteration:  94%|█████████▍| 1144/1212 [08:54<00:32,  2.09it/s]

Gradient norm: 432.35321671942097


Epoch 2 of 5 | Iteration:  94%|█████████▍| 1145/1212 [08:54<00:31,  2.15it/s]

Gradient norm: 431.62186334361274


Epoch 2 of 5 | Iteration:  95%|█████████▍| 1146/1212 [08:54<00:29,  2.23it/s]

Gradient norm: 622.4106885356414


Epoch 2 of 5 | Iteration:  95%|█████████▍| 1147/1212 [08:55<00:30,  2.14it/s]

Gradient norm: 622.5192604610986


Epoch 2 of 5 | Iteration:  95%|█████████▍| 1148/1212 [08:55<00:30,  2.13it/s]

Gradient norm: 622.4240800476714


Epoch 2 of 5 | Iteration:  95%|█████████▍| 1149/1212 [08:56<00:28,  2.19it/s]

Gradient norm: 621.3310268027919


Epoch 2 of 5 | Iteration:  95%|█████████▍| 1150/1212 [08:56<00:26,  2.33it/s]

Gradient norm: 621.31944512459


Epoch 2 of 5 | Iteration:  95%|█████████▍| 1151/1212 [08:57<00:26,  2.29it/s]

Gradient norm: 621.7643326724839


Epoch 2 of 5 | Iteration:  95%|█████████▌| 1152/1212 [08:57<00:27,  2.21it/s]

Gradient norm: 621.8347680836073


Epoch 2 of 5 | Iteration:  95%|█████████▌| 1153/1212 [08:58<00:26,  2.25it/s]

Gradient norm: 5.425632718749504


Epoch 2 of 5 | Iteration:  95%|█████████▌| 1154/1212 [08:58<00:24,  2.36it/s]

Gradient norm: 47.18226204032792


Epoch 2 of 5 | Iteration:  95%|█████████▌| 1155/1212 [08:58<00:26,  2.14it/s]

Gradient norm: 48.05052472772159


Epoch 2 of 5 | Iteration:  95%|█████████▌| 1156/1212 [08:59<00:27,  2.03it/s]

Gradient norm: 47.903948795209445


Epoch 2 of 5 | Iteration:  95%|█████████▌| 1157/1212 [08:59<00:25,  2.17it/s]

Gradient norm: 88.63174455712573


Epoch 2 of 5 | Iteration:  96%|█████████▌| 1158/1212 [09:00<00:27,  1.94it/s]

Gradient norm: 90.38483483558957


Epoch 2 of 5 | Iteration:  96%|█████████▌| 1159/1212 [09:00<00:25,  2.12it/s]

Gradient norm: 93.02524680116967


Epoch 2 of 5 | Iteration:  96%|█████████▌| 1160/1212 [09:01<00:23,  2.24it/s]

Gradient norm: 190.65468015741678


Epoch 2 of 5 | Iteration:  96%|█████████▌| 1161/1212 [09:01<00:23,  2.13it/s]

Gradient norm: 190.6646028539641


Epoch 2 of 5 | Iteration:  96%|█████████▌| 1162/1212 [09:02<00:21,  2.29it/s]

Gradient norm: 189.5350319944912


Epoch 2 of 5 | Iteration:  96%|█████████▌| 1163/1212 [09:02<00:22,  2.17it/s]

Gradient norm: 189.165607850912


Epoch 2 of 5 | Iteration:  96%|█████████▌| 1164/1212 [09:03<00:22,  2.15it/s]

Gradient norm: 189.10908965443065


Epoch 2 of 5 | Iteration:  96%|█████████▌| 1165/1212 [09:03<00:22,  2.07it/s]

Gradient norm: 189.05461508718375


Epoch 2 of 5 | Iteration:  96%|█████████▌| 1166/1212 [09:04<00:22,  2.04it/s]

Gradient norm: 189.01102136185244


Epoch 2 of 5 | Iteration:  96%|█████████▋| 1167/1212 [09:04<00:22,  1.96it/s]

Gradient norm: 189.00876081201977


Epoch 2 of 5 | Iteration:  96%|█████████▋| 1168/1212 [09:05<00:24,  1.82it/s]

Gradient norm: 188.60016407822994


Epoch 2 of 5 | Iteration:  96%|█████████▋| 1169/1212 [09:05<00:22,  1.93it/s]

Gradient norm: 0.8156799241598301


Epoch 2 of 5 | Iteration:  97%|█████████▋| 1170/1212 [09:06<00:20,  2.06it/s]

Gradient norm: 2.2870963841224645


Epoch 2 of 5 | Iteration:  97%|█████████▋| 1171/1212 [09:06<00:18,  2.23it/s]

Gradient norm: 3.0044280329795745


Epoch 2 of 5 | Iteration:  97%|█████████▋| 1172/1212 [09:07<00:16,  2.37it/s]

Gradient norm: 6.440652391484083


Epoch 2 of 5 | Iteration:  97%|█████████▋| 1173/1212 [09:07<00:15,  2.44it/s]

Gradient norm: 6.913251827320031


Epoch 2 of 5 | Iteration:  97%|█████████▋| 1174/1212 [09:08<00:18,  2.07it/s]

Gradient norm: 8.045947063864748


Epoch 2 of 5 | Iteration:  97%|█████████▋| 1175/1212 [09:08<00:18,  2.01it/s]

Gradient norm: 29.257342409189306


Epoch 2 of 5 | Iteration:  97%|█████████▋| 1176/1212 [09:08<00:16,  2.16it/s]

Gradient norm: 30.046571101793237


Epoch 2 of 5 | Iteration:  97%|█████████▋| 1177/1212 [09:09<00:18,  1.89it/s]

Gradient norm: 31.277466890752446


Epoch 2 of 5 | Iteration:  97%|█████████▋| 1178/1212 [09:10<00:16,  2.07it/s]

Gradient norm: 31.32646337144543


Epoch 2 of 5 | Iteration:  97%|█████████▋| 1179/1212 [09:10<00:14,  2.21it/s]

Gradient norm: 29.095212345409898


Epoch 2 of 5 | Iteration:  97%|█████████▋| 1180/1212 [09:10<00:13,  2.34it/s]

Gradient norm: 28.915541498171915


Epoch 2 of 5 | Iteration:  97%|█████████▋| 1181/1212 [09:11<00:13,  2.30it/s]

Gradient norm: 28.773152681814803


Epoch 2 of 5 | Iteration:  98%|█████████▊| 1182/1212 [09:11<00:12,  2.36it/s]

Gradient norm: 29.54410504168816


Epoch 2 of 5 | Iteration:  98%|█████████▊| 1183/1212 [09:12<00:12,  2.38it/s]

Gradient norm: 29.471217394650335


Epoch 2 of 5 | Iteration:  98%|█████████▊| 1184/1212 [09:12<00:11,  2.41it/s]

Gradient norm: 32.5155455744513


Epoch 2 of 5 | Iteration:  98%|█████████▊| 1185/1212 [09:12<00:11,  2.45it/s]

Gradient norm: 8.80113050265091


Epoch 2 of 5 | Iteration:  98%|█████████▊| 1186/1212 [09:13<00:10,  2.45it/s]

Gradient norm: 284.99174613767167


Epoch 2 of 5 | Iteration:  98%|█████████▊| 1187/1212 [09:13<00:10,  2.44it/s]

Gradient norm: 286.46024499660587


Epoch 2 of 5 | Iteration:  98%|█████████▊| 1188/1212 [09:14<00:10,  2.25it/s]

Gradient norm: 286.47436228632205


Epoch 2 of 5 | Iteration:  98%|█████████▊| 1189/1212 [09:14<00:11,  2.00it/s]

Gradient norm: 286.5374104175784


Epoch 2 of 5 | Iteration:  98%|█████████▊| 1190/1212 [09:15<00:09,  2.21it/s]

Gradient norm: 287.1688551760441


Epoch 2 of 5 | Iteration:  98%|█████████▊| 1191/1212 [09:15<00:09,  2.18it/s]

Gradient norm: 286.93146100293666


Epoch 2 of 5 | Iteration:  98%|█████████▊| 1192/1212 [09:16<00:09,  2.16it/s]

Gradient norm: 286.97768668642817


Epoch 2 of 5 | Iteration:  98%|█████████▊| 1193/1212 [09:16<00:08,  2.14it/s]

Gradient norm: 287.1902414071628


Epoch 2 of 5 | Iteration:  99%|█████████▊| 1194/1212 [09:17<00:08,  2.04it/s]

Gradient norm: 287.21944286853847


Epoch 2 of 5 | Iteration:  99%|█████████▊| 1195/1212 [09:17<00:08,  2.00it/s]

Gradient norm: 287.44530219216057


Epoch 2 of 5 | Iteration:  99%|█████████▊| 1196/1212 [09:18<00:07,  2.01it/s]

Gradient norm: 287.4690601532781


Epoch 2 of 5 | Iteration:  99%|█████████▉| 1197/1212 [09:18<00:07,  1.99it/s]

Gradient norm: 287.8224105071205


Epoch 2 of 5 | Iteration:  99%|█████████▉| 1198/1212 [09:19<00:07,  1.94it/s]

Gradient norm: 287.90350909604365


Epoch 2 of 5 | Iteration:  99%|█████████▉| 1199/1212 [09:19<00:07,  1.80it/s]

Gradient norm: 287.5396484340112


Epoch 2 of 5 | Iteration:  99%|█████████▉| 1200/1212 [09:20<00:06,  1.93it/s]

Gradient norm: 287.4454059895659


Epoch 2 of 5 | Iteration:  99%|█████████▉| 1201/1212 [09:20<00:05,  2.12it/s]

Gradient norm: 3.432226306017237


Epoch 2 of 5 | Iteration:  99%|█████████▉| 1202/1212 [09:20<00:04,  2.29it/s]

Gradient norm: 4.340470284893775


Epoch 2 of 5 | Iteration:  99%|█████████▉| 1203/1212 [09:21<00:03,  2.37it/s]

Gradient norm: 9.245256120841901


Epoch 2 of 5 | Iteration:  99%|█████████▉| 1204/1212 [09:21<00:03,  2.43it/s]

Gradient norm: 560.5131086917573


Epoch 2 of 5 | Iteration:  99%|█████████▉| 1205/1212 [09:22<00:02,  2.43it/s]

Gradient norm: 560.2173057305147


Epoch 2 of 5 | Iteration: 100%|█████████▉| 1206/1212 [09:22<00:02,  2.24it/s]

Gradient norm: 560.0946888357831


Epoch 2 of 5 | Iteration: 100%|█████████▉| 1207/1212 [09:23<00:02,  2.38it/s]

Gradient norm: 560.0205687785359


Epoch 2 of 5 | Iteration: 100%|█████████▉| 1208/1212 [09:23<00:01,  2.32it/s]

Gradient norm: 560.0092092034074


Epoch 2 of 5 | Iteration: 100%|█████████▉| 1209/1212 [09:23<00:01,  2.43it/s]

Gradient norm: 559.8898139752185


Epoch 2 of 5 | Iteration: 100%|█████████▉| 1210/1212 [09:24<00:00,  2.43it/s]

Gradient norm: 593.6057374537602


Epoch 2 of 5 | Iteration: 100%|█████████▉| 1211/1212 [09:24<00:00,  2.27it/s]

Gradient norm: 591.8839321355432


Epoch 2 of 5 | Iteration: 100%|██████████| 1212/1212 [09:25<00:00,  2.14it/s]


Gradient norm: 602.9137891217497


100%|██████████| 1212/1212 [05:09<00:00,  3.91it/s]


Epoch 2/5, Training Loss: 1.8174, Validation Loss: 1.7050
Validation top k acc: 0.8061
              precision    recall  f1-score   support

           0       0.91      0.77      0.83     10666
           1       0.32      0.58      0.41      1947

    accuracy                           0.74     12613
   macro avg       0.61      0.68      0.62     12613
weighted avg       0.82      0.74      0.77     12613



Epoch 3 of 5 | Iteration:   0%|          | 0/1212 [00:00<?, ?it/s]

Train ...


Epoch 3 of 5 | Iteration:   0%|          | 1/1212 [00:00<13:04,  1.54it/s]

Gradient norm: 601.1847877216209


Epoch 3 of 5 | Iteration:   0%|          | 2/1212 [00:01<11:52,  1.70it/s]

Gradient norm: 601.2044507876121


Epoch 3 of 5 | Iteration:   0%|          | 3/1212 [00:01<10:58,  1.84it/s]

Gradient norm: 602.0108874604033


Epoch 3 of 5 | Iteration:   0%|          | 4/1212 [00:02<11:52,  1.70it/s]

Gradient norm: 600.6322753630365


Epoch 3 of 5 | Iteration:   0%|          | 5/1212 [00:02<10:51,  1.85it/s]

Gradient norm: 600.6269637476215


Epoch 3 of 5 | Iteration:   0%|          | 6/1212 [00:03<09:53,  2.03it/s]

Gradient norm: 600.7768955792469


Epoch 3 of 5 | Iteration:   1%|          | 7/1212 [00:03<09:22,  2.14it/s]

Gradient norm: 600.9312656730617


Epoch 3 of 5 | Iteration:   1%|          | 8/1212 [00:04<09:59,  2.01it/s]

Gradient norm: 600.9663880343732


Epoch 3 of 5 | Iteration:   1%|          | 9/1212 [00:04<09:14,  2.17it/s]

Gradient norm: 599.7599336719181


Epoch 3 of 5 | Iteration:   1%|          | 10/1212 [00:05<10:13,  1.96it/s]

Gradient norm: 599.8110958843714


Epoch 3 of 5 | Iteration:   1%|          | 11/1212 [00:05<09:22,  2.13it/s]

Gradient norm: 599.5989473739851


Epoch 3 of 5 | Iteration:   1%|          | 12/1212 [00:06<10:10,  1.97it/s]

Gradient norm: 599.6630641307571


Epoch 3 of 5 | Iteration:   1%|          | 13/1212 [00:06<10:14,  1.95it/s]

Gradient norm: 609.3534452300413


Epoch 3 of 5 | Iteration:   1%|          | 14/1212 [00:07<10:25,  1.92it/s]

Gradient norm: 593.0982394172388


Epoch 3 of 5 | Iteration:   1%|          | 15/1212 [00:08<12:22,  1.61it/s]

Gradient norm: 589.3831467858041


Epoch 3 of 5 | Iteration:   1%|▏         | 16/1212 [00:08<13:36,  1.47it/s]

Gradient norm: 589.2431466300847


Epoch 3 of 5 | Iteration:   1%|▏         | 17/1212 [00:09<13:32,  1.47it/s]

Gradient norm: 0.9903689402221983


Epoch 3 of 5 | Iteration:   1%|▏         | 18/1212 [00:10<12:27,  1.60it/s]

Gradient norm: 4.7684030293466515


Epoch 3 of 5 | Iteration:   2%|▏         | 19/1212 [00:10<11:15,  1.76it/s]

Gradient norm: 4.7633979231438754


Epoch 3 of 5 | Iteration:   2%|▏         | 20/1212 [00:11<10:56,  1.82it/s]

Gradient norm: 4.913660260922326


Epoch 3 of 5 | Iteration:   2%|▏         | 21/1212 [00:11<09:58,  1.99it/s]

Gradient norm: 7.377528490152415


Epoch 3 of 5 | Iteration:   2%|▏         | 22/1212 [00:11<09:57,  1.99it/s]

Gradient norm: 6472.5117262362455


Epoch 3 of 5 | Iteration:   2%|▏         | 23/1212 [00:12<10:48,  1.83it/s]

Gradient norm: 6475.16555360962


Epoch 3 of 5 | Iteration:   2%|▏         | 24/1212 [00:12<10:02,  1.97it/s]

Gradient norm: 6475.276561179478


Epoch 3 of 5 | Iteration:   2%|▏         | 25/1212 [00:13<09:31,  2.08it/s]

Gradient norm: 6474.108391448118


Epoch 3 of 5 | Iteration:   2%|▏         | 26/1212 [00:14<10:29,  1.89it/s]

Gradient norm: 6472.334469879725


Epoch 3 of 5 | Iteration:   2%|▏         | 27/1212 [00:14<09:43,  2.03it/s]

Gradient norm: 6472.407293462533


Epoch 3 of 5 | Iteration:   2%|▏         | 28/1212 [00:14<09:00,  2.19it/s]

Gradient norm: 6472.792616605814


Epoch 3 of 5 | Iteration:   2%|▏         | 29/1212 [00:15<08:33,  2.30it/s]

Gradient norm: 6471.5603855779245


Epoch 3 of 5 | Iteration:   2%|▏         | 30/1212 [00:15<08:13,  2.39it/s]

Gradient norm: 6471.611517128365


Epoch 3 of 5 | Iteration:   3%|▎         | 31/1212 [00:15<07:59,  2.46it/s]

Gradient norm: 6469.588617779242


Epoch 3 of 5 | Iteration:   3%|▎         | 32/1212 [00:16<07:56,  2.48it/s]

Gradient norm: 6470.9518029181445


Epoch 3 of 5 | Iteration:   3%|▎         | 33/1212 [00:16<07:41,  2.56it/s]

Gradient norm: 2.218356868193972


Epoch 3 of 5 | Iteration:   3%|▎         | 34/1212 [00:17<09:17,  2.11it/s]

Gradient norm: 4.021058908632263


Epoch 3 of 5 | Iteration:   3%|▎         | 35/1212 [00:17<09:44,  2.02it/s]

Gradient norm: 14613.932011769171


Epoch 3 of 5 | Iteration:   3%|▎         | 36/1212 [00:18<09:18,  2.11it/s]

Gradient norm: 14614.069447837624


Epoch 3 of 5 | Iteration:   3%|▎         | 37/1212 [00:18<08:41,  2.25it/s]

Gradient norm: 14614.07192318421


Epoch 3 of 5 | Iteration:   3%|▎         | 38/1212 [00:19<08:55,  2.19it/s]

Gradient norm: 14609.673956651208


Epoch 3 of 5 | Iteration:   3%|▎         | 39/1212 [00:20<11:04,  1.77it/s]

Gradient norm: 14609.626985251913


Epoch 3 of 5 | Iteration:   3%|▎         | 40/1212 [00:20<10:45,  1.81it/s]

Gradient norm: 14577.429704863493


Epoch 3 of 5 | Iteration:   3%|▎         | 41/1212 [00:21<11:53,  1.64it/s]

Gradient norm: 14578.36511323826


Epoch 3 of 5 | Iteration:   3%|▎         | 42/1212 [00:21<11:21,  1.72it/s]

Gradient norm: 14578.928446122076


Epoch 3 of 5 | Iteration:   4%|▎         | 43/1212 [00:22<11:09,  1.75it/s]

Gradient norm: 14578.928142273235


Epoch 3 of 5 | Iteration:   4%|▎         | 44/1212 [00:22<11:22,  1.71it/s]

Gradient norm: 14614.645710458857


Epoch 3 of 5 | Iteration:   4%|▎         | 45/1212 [00:23<11:29,  1.69it/s]

Gradient norm: 14614.82813987933


Epoch 3 of 5 | Iteration:   4%|▍         | 46/1212 [00:24<11:02,  1.76it/s]

Gradient norm: 14615.037671047889


Epoch 3 of 5 | Iteration:   4%|▍         | 47/1212 [00:24<10:28,  1.85it/s]

Gradient norm: 14612.80645172486


Epoch 3 of 5 | Iteration:   4%|▍         | 48/1212 [00:25<10:01,  1.93it/s]

Gradient norm: 14612.601043544428


Epoch 3 of 5 | Iteration:   4%|▍         | 49/1212 [00:25<10:13,  1.90it/s]

Gradient norm: 68.673393649299


Epoch 3 of 5 | Iteration:   4%|▍         | 50/1212 [00:26<10:11,  1.90it/s]

Gradient norm: 68.83782288231835


Epoch 3 of 5 | Iteration:   4%|▍         | 51/1212 [00:26<10:11,  1.90it/s]

Gradient norm: 69.22376484714904


Epoch 3 of 5 | Iteration:   4%|▍         | 52/1212 [00:27<09:55,  1.95it/s]

Gradient norm: 69.32082260347188


Epoch 3 of 5 | Iteration:   4%|▍         | 53/1212 [00:27<09:11,  2.10it/s]

Gradient norm: 69.32958342432275


Epoch 3 of 5 | Iteration:   4%|▍         | 54/1212 [00:27<09:10,  2.10it/s]

Gradient norm: 69.28463304524372


Epoch 3 of 5 | Iteration:   5%|▍         | 55/1212 [00:28<10:09,  1.90it/s]

Gradient norm: 69.57736385898214


Epoch 3 of 5 | Iteration:   5%|▍         | 56/1212 [00:29<09:44,  1.98it/s]

Gradient norm: 69.49524476838252


Epoch 3 of 5 | Iteration:   5%|▍         | 57/1212 [00:29<09:27,  2.03it/s]

Gradient norm: 69.45634730818395


Epoch 3 of 5 | Iteration:   5%|▍         | 58/1212 [00:29<08:49,  2.18it/s]

Gradient norm: 69.36593468870656


Epoch 3 of 5 | Iteration:   5%|▍         | 59/1212 [00:30<09:20,  2.06it/s]

Gradient norm: 69.30642086865078


Epoch 3 of 5 | Iteration:   5%|▍         | 60/1212 [00:30<08:56,  2.15it/s]

Gradient norm: 69.37024038783755


Epoch 3 of 5 | Iteration:   5%|▌         | 61/1212 [00:31<10:01,  1.91it/s]

Gradient norm: 75.89806336271721


Epoch 3 of 5 | Iteration:   5%|▌         | 62/1212 [00:31<09:15,  2.07it/s]

Gradient norm: 76.74633290553467


Epoch 3 of 5 | Iteration:   5%|▌         | 63/1212 [00:32<10:12,  1.88it/s]

Gradient norm: 75.6828110625384


Epoch 3 of 5 | Iteration:   5%|▌         | 64/1212 [00:33<09:55,  1.93it/s]

Gradient norm: 119.14027021719033


Epoch 3 of 5 | Iteration:   5%|▌         | 65/1212 [00:33<10:41,  1.79it/s]

Gradient norm: 11.493643522845417


Epoch 3 of 5 | Iteration:   5%|▌         | 66/1212 [00:34<10:52,  1.76it/s]

Gradient norm: 15.422901389489208


Epoch 3 of 5 | Iteration:   6%|▌         | 67/1212 [00:34<10:48,  1.77it/s]

Gradient norm: 15.209119903824753


Epoch 3 of 5 | Iteration:   6%|▌         | 68/1212 [00:35<10:36,  1.80it/s]

Gradient norm: 17.567001084074047


Epoch 3 of 5 | Iteration:   6%|▌         | 69/1212 [00:35<10:41,  1.78it/s]

Gradient norm: 17.526763705902084


Epoch 3 of 5 | Iteration:   6%|▌         | 70/1212 [00:36<10:41,  1.78it/s]

Gradient norm: 18.085462413568997


Epoch 3 of 5 | Iteration:   6%|▌         | 71/1212 [00:37<10:57,  1.74it/s]

Gradient norm: 18.387309702361463


Epoch 3 of 5 | Iteration:   6%|▌         | 72/1212 [00:37<09:50,  1.93it/s]

Gradient norm: 24.48041130992473


Epoch 3 of 5 | Iteration:   6%|▌         | 73/1212 [00:37<08:57,  2.12it/s]

Gradient norm: 24.436023835229125


Epoch 3 of 5 | Iteration:   6%|▌         | 74/1212 [00:38<08:30,  2.23it/s]

Gradient norm: 25.672531318480623


Epoch 3 of 5 | Iteration:   6%|▌         | 75/1212 [00:38<08:27,  2.24it/s]

Gradient norm: 25.80645178324617


Epoch 3 of 5 | Iteration:   6%|▋         | 76/1212 [00:39<08:03,  2.35it/s]

Gradient norm: 26.227348125023756


Epoch 3 of 5 | Iteration:   6%|▋         | 77/1212 [00:39<08:21,  2.26it/s]

Gradient norm: 26.40817245674528


Epoch 3 of 5 | Iteration:   6%|▋         | 78/1212 [00:39<07:51,  2.40it/s]

Gradient norm: 27.47605091992543


Epoch 3 of 5 | Iteration:   7%|▋         | 79/1212 [00:40<07:44,  2.44it/s]

Gradient norm: 161.37370058067495


Epoch 3 of 5 | Iteration:   7%|▋         | 80/1212 [00:40<08:27,  2.23it/s]

Gradient norm: 161.61078779212392


Epoch 3 of 5 | Iteration:   7%|▋         | 81/1212 [00:41<08:09,  2.31it/s]

Gradient norm: 13.554421360747277


Epoch 3 of 5 | Iteration:   7%|▋         | 82/1212 [00:41<07:48,  2.41it/s]

Gradient norm: 21.609894900576858


Epoch 3 of 5 | Iteration:   7%|▋         | 83/1212 [00:42<07:32,  2.49it/s]

Gradient norm: 27.731262413451518


Epoch 3 of 5 | Iteration:   7%|▋         | 84/1212 [00:42<07:24,  2.54it/s]

Gradient norm: 28.132874933571877


Epoch 3 of 5 | Iteration:   7%|▋         | 85/1212 [00:42<07:42,  2.44it/s]

Gradient norm: 27.81507344658875


Epoch 3 of 5 | Iteration:   7%|▋         | 86/1212 [00:43<09:01,  2.08it/s]

Gradient norm: 51.120521082141074


Epoch 3 of 5 | Iteration:   7%|▋         | 87/1212 [00:43<08:20,  2.25it/s]

Gradient norm: 52.381242829518584


Epoch 3 of 5 | Iteration:   7%|▋         | 88/1212 [00:44<08:05,  2.32it/s]

Gradient norm: 57.50503243362109


Epoch 3 of 5 | Iteration:   7%|▋         | 89/1212 [00:44<09:16,  2.02it/s]

Gradient norm: 58.73968377172058


Epoch 3 of 5 | Iteration:   7%|▋         | 90/1212 [00:45<08:40,  2.16it/s]

Gradient norm: 58.76959868533037


Epoch 3 of 5 | Iteration:   8%|▊         | 91/1212 [00:45<08:04,  2.31it/s]

Gradient norm: 59.103486106593124


Epoch 3 of 5 | Iteration:   8%|▊         | 92/1212 [00:46<07:41,  2.43it/s]

Gradient norm: 77.52182234417842


Epoch 3 of 5 | Iteration:   8%|▊         | 93/1212 [00:46<07:31,  2.48it/s]

Gradient norm: 77.34170683323553


Epoch 3 of 5 | Iteration:   8%|▊         | 94/1212 [00:46<07:20,  2.54it/s]

Gradient norm: 77.38018097082966


Epoch 3 of 5 | Iteration:   8%|▊         | 95/1212 [00:47<09:34,  1.94it/s]

Gradient norm: 77.31080713490381


Epoch 3 of 5 | Iteration:   8%|▊         | 96/1212 [00:48<10:28,  1.77it/s]

Gradient norm: 77.17852695761758


Epoch 3 of 5 | Iteration:   8%|▊         | 97/1212 [00:48<10:38,  1.75it/s]

Gradient norm: 14.375455800529757


Epoch 3 of 5 | Iteration:   8%|▊         | 98/1212 [00:49<10:20,  1.80it/s]

Gradient norm: 319.43816803449613


Epoch 3 of 5 | Iteration:   8%|▊         | 99/1212 [00:49<10:26,  1.78it/s]

Gradient norm: 320.44496985657935


Epoch 3 of 5 | Iteration:   8%|▊         | 100/1212 [00:50<10:23,  1.78it/s]

Gradient norm: 320.55090706310233


Epoch 3 of 5 | Iteration:   8%|▊         | 101/1212 [00:51<11:10,  1.66it/s]

Gradient norm: 320.33194179999197


Epoch 3 of 5 | Iteration:   8%|▊         | 102/1212 [00:51<09:55,  1.86it/s]

Gradient norm: 320.1720909695937


Epoch 3 of 5 | Iteration:   8%|▊         | 103/1212 [00:51<09:03,  2.04it/s]

Gradient norm: 322.0750320638721


Epoch 3 of 5 | Iteration:   9%|▊         | 104/1212 [00:52<09:55,  1.86it/s]

Gradient norm: 320.8822106039462


Epoch 3 of 5 | Iteration:   9%|▊         | 105/1212 [00:52<09:03,  2.04it/s]

Gradient norm: 320.62637557795307


Epoch 3 of 5 | Iteration:   9%|▊         | 106/1212 [00:53<08:33,  2.15it/s]

Gradient norm: 317.61931731628056


Epoch 3 of 5 | Iteration:   9%|▉         | 107/1212 [00:53<08:21,  2.21it/s]

Gradient norm: 317.6101273514614


Epoch 3 of 5 | Iteration:   9%|▉         | 108/1212 [00:54<07:52,  2.34it/s]

Gradient norm: 316.4330348481087


Epoch 3 of 5 | Iteration:   9%|▉         | 109/1212 [00:54<08:31,  2.16it/s]

Gradient norm: 316.1883034546732


Epoch 3 of 5 | Iteration:   9%|▉         | 110/1212 [00:55<09:17,  1.98it/s]

Gradient norm: 316.18950348407816


Epoch 3 of 5 | Iteration:   9%|▉         | 111/1212 [00:55<08:37,  2.13it/s]

Gradient norm: 319.1734222433323


Epoch 3 of 5 | Iteration:   9%|▉         | 112/1212 [00:56<08:10,  2.24it/s]

Gradient norm: 319.9456082440421


Epoch 3 of 5 | Iteration:   9%|▉         | 113/1212 [00:56<07:48,  2.35it/s]

Gradient norm: 2.672206786856156


Epoch 3 of 5 | Iteration:   9%|▉         | 114/1212 [00:56<07:33,  2.42it/s]

Gradient norm: 19.425560269673085


Epoch 3 of 5 | Iteration:   9%|▉         | 115/1212 [00:57<07:21,  2.49it/s]

Gradient norm: 250.26896440292583


Epoch 3 of 5 | Iteration:  10%|▉         | 116/1212 [00:57<07:19,  2.49it/s]

Gradient norm: 250.65316895056705


Epoch 3 of 5 | Iteration:  10%|▉         | 117/1212 [00:58<07:09,  2.55it/s]

Gradient norm: 251.28343051937568


Epoch 3 of 5 | Iteration:  10%|▉         | 118/1212 [00:58<07:26,  2.45it/s]

Gradient norm: 255.6584412066288


Epoch 3 of 5 | Iteration:  10%|▉         | 119/1212 [00:58<07:20,  2.48it/s]

Gradient norm: 256.75067585927525


Epoch 3 of 5 | Iteration:  10%|▉         | 120/1212 [00:59<07:08,  2.55it/s]

Gradient norm: 384.7295594705703


Epoch 3 of 5 | Iteration:  10%|▉         | 121/1212 [00:59<07:37,  2.38it/s]

Gradient norm: 384.6862811202009


Epoch 3 of 5 | Iteration:  10%|█         | 122/1212 [01:00<07:17,  2.49it/s]

Gradient norm: 376.21678511921385


Epoch 3 of 5 | Iteration:  10%|█         | 123/1212 [01:00<07:06,  2.55it/s]

Gradient norm: 375.50220629181985


Epoch 3 of 5 | Iteration:  10%|█         | 124/1212 [01:00<07:37,  2.38it/s]

Gradient norm: 376.6242514021107


Epoch 3 of 5 | Iteration:  10%|█         | 125/1212 [01:01<08:17,  2.19it/s]

Gradient norm: 376.6261943591554


Epoch 3 of 5 | Iteration:  10%|█         | 126/1212 [01:02<09:12,  1.96it/s]

Gradient norm: 376.4706018627177


Epoch 3 of 5 | Iteration:  10%|█         | 127/1212 [01:02<09:07,  1.98it/s]

Gradient norm: 386.74007819028674


Epoch 3 of 5 | Iteration:  11%|█         | 128/1212 [01:03<09:16,  1.95it/s]

Gradient norm: 386.95222117937067


Epoch 3 of 5 | Iteration:  11%|█         | 129/1212 [01:03<09:11,  1.97it/s]

Gradient norm: 17.023097848121054


Epoch 3 of 5 | Iteration:  11%|█         | 130/1212 [01:04<09:36,  1.88it/s]

Gradient norm: 22.06430752769736


Epoch 3 of 5 | Iteration:  11%|█         | 131/1212 [01:04<09:01,  2.00it/s]

Gradient norm: 22.575210920341952


Epoch 3 of 5 | Iteration:  11%|█         | 132/1212 [01:05<08:21,  2.15it/s]

Gradient norm: 22.465130438687435


Epoch 3 of 5 | Iteration:  11%|█         | 133/1212 [01:05<07:54,  2.27it/s]

Gradient norm: 23.347514683979544


Epoch 3 of 5 | Iteration:  11%|█         | 134/1212 [01:05<07:41,  2.34it/s]

Gradient norm: 24.22895359463747


Epoch 3 of 5 | Iteration:  11%|█         | 135/1212 [01:06<08:16,  2.17it/s]

Gradient norm: 185.32930293275908


Epoch 3 of 5 | Iteration:  11%|█         | 136/1212 [01:06<08:20,  2.15it/s]

Gradient norm: 185.18016572305456


Epoch 3 of 5 | Iteration:  11%|█▏        | 137/1212 [01:07<08:03,  2.23it/s]

Gradient norm: 187.9497920512589


Epoch 3 of 5 | Iteration:  11%|█▏        | 138/1212 [01:07<08:24,  2.13it/s]

Gradient norm: 188.06390607465747


Epoch 3 of 5 | Iteration:  11%|█▏        | 139/1212 [01:08<07:52,  2.27it/s]

Gradient norm: 188.019085455966


Epoch 3 of 5 | Iteration:  12%|█▏        | 140/1212 [01:08<08:34,  2.08it/s]

Gradient norm: 188.26523413524637


Epoch 3 of 5 | Iteration:  12%|█▏        | 141/1212 [01:09<08:11,  2.18it/s]

Gradient norm: 188.40066877468303


Epoch 3 of 5 | Iteration:  12%|█▏        | 142/1212 [01:09<07:55,  2.25it/s]

Gradient norm: 188.13855780555312


Epoch 3 of 5 | Iteration:  12%|█▏        | 143/1212 [01:10<08:16,  2.15it/s]

Gradient norm: 188.7042402469808


Epoch 3 of 5 | Iteration:  12%|█▏        | 144/1212 [01:10<07:55,  2.25it/s]

Gradient norm: 190.90173455623417


Epoch 3 of 5 | Iteration:  12%|█▏        | 145/1212 [01:10<08:18,  2.14it/s]

Gradient norm: 4.5115323625666255


Epoch 3 of 5 | Iteration:  12%|█▏        | 146/1212 [01:11<09:15,  1.92it/s]

Gradient norm: 74.73314690200938


Epoch 3 of 5 | Iteration:  12%|█▏        | 147/1212 [01:11<08:27,  2.10it/s]

Gradient norm: 73.93841791047194


Epoch 3 of 5 | Iteration:  12%|█▏        | 148/1212 [01:12<08:07,  2.18it/s]

Gradient norm: 77.38259185345163


Epoch 3 of 5 | Iteration:  12%|█▏        | 149/1212 [01:13<09:10,  1.93it/s]

Gradient norm: 70.0595202807032


Epoch 3 of 5 | Iteration:  12%|█▏        | 150/1212 [01:13<08:33,  2.07it/s]

Gradient norm: 70.95937822311629


Epoch 3 of 5 | Iteration:  12%|█▏        | 151/1212 [01:13<07:59,  2.21it/s]

Gradient norm: 70.9765534565948


Epoch 3 of 5 | Iteration:  13%|█▎        | 152/1212 [01:14<07:59,  2.21it/s]

Gradient norm: 70.9281837124242


Epoch 3 of 5 | Iteration:  13%|█▎        | 153/1212 [01:14<08:25,  2.09it/s]

Gradient norm: 70.86226239984379


Epoch 3 of 5 | Iteration:  13%|█▎        | 154/1212 [01:15<08:32,  2.07it/s]

Gradient norm: 70.77074212646178


Epoch 3 of 5 | Iteration:  13%|█▎        | 155/1212 [01:15<08:46,  2.01it/s]

Gradient norm: 78.96135843752843


Epoch 3 of 5 | Iteration:  13%|█▎        | 156/1212 [01:16<10:01,  1.76it/s]

Gradient norm: 81.46456333514487


Epoch 3 of 5 | Iteration:  13%|█▎        | 157/1212 [01:17<09:39,  1.82it/s]

Gradient norm: 142.60127095511544


Epoch 3 of 5 | Iteration:  13%|█▎        | 158/1212 [01:17<09:25,  1.86it/s]

Gradient norm: 139.50757172793237


Epoch 3 of 5 | Iteration:  13%|█▎        | 159/1212 [01:18<09:45,  1.80it/s]

Gradient norm: 140.23066314148133


Epoch 3 of 5 | Iteration:  13%|█▎        | 160/1212 [01:18<10:55,  1.60it/s]

Gradient norm: 140.23252062512458


Epoch 3 of 5 | Iteration:  13%|█▎        | 161/1212 [01:19<09:46,  1.79it/s]

Gradient norm: 64.31598096334928


Epoch 3 of 5 | Iteration:  13%|█▎        | 162/1212 [01:19<09:24,  1.86it/s]

Gradient norm: 64.17253843634776


Epoch 3 of 5 | Iteration:  13%|█▎        | 163/1212 [01:20<08:48,  1.99it/s]

Gradient norm: 64.37888532408624


Epoch 3 of 5 | Iteration:  14%|█▎        | 164/1212 [01:20<09:02,  1.93it/s]

Gradient norm: 64.20447569971225


Epoch 3 of 5 | Iteration:  14%|█▎        | 165/1212 [01:21<08:46,  1.99it/s]

Gradient norm: 64.16554266611102


Epoch 3 of 5 | Iteration:  14%|█▎        | 166/1212 [01:21<09:05,  1.92it/s]

Gradient norm: 63.85641801609585


Epoch 3 of 5 | Iteration:  14%|█▍        | 167/1212 [01:22<08:16,  2.11it/s]

Gradient norm: 63.973782545649456


Epoch 3 of 5 | Iteration:  14%|█▍        | 168/1212 [01:22<07:54,  2.20it/s]

Gradient norm: 64.10078088549778


Epoch 3 of 5 | Iteration:  14%|█▍        | 169/1212 [01:23<07:27,  2.33it/s]

Gradient norm: 64.13150477887687


Epoch 3 of 5 | Iteration:  14%|█▍        | 170/1212 [01:23<07:34,  2.29it/s]

Gradient norm: 64.22250008053419


Epoch 3 of 5 | Iteration:  14%|█▍        | 171/1212 [01:23<07:44,  2.24it/s]

Gradient norm: 64.96836913814516


Epoch 3 of 5 | Iteration:  14%|█▍        | 172/1212 [01:24<07:38,  2.27it/s]

Gradient norm: 95.0453246999368


Epoch 3 of 5 | Iteration:  14%|█▍        | 173/1212 [01:24<08:12,  2.11it/s]

Gradient norm: 95.5639864364638


Epoch 3 of 5 | Iteration:  14%|█▍        | 174/1212 [01:25<07:46,  2.23it/s]

Gradient norm: 92.58596507307959


Epoch 3 of 5 | Iteration:  14%|█▍        | 175/1212 [01:25<07:29,  2.31it/s]

Gradient norm: 92.17888729917809


Epoch 3 of 5 | Iteration:  15%|█▍        | 176/1212 [01:26<07:46,  2.22it/s]

Gradient norm: 92.19627483313377


Epoch 3 of 5 | Iteration:  15%|█▍        | 177/1212 [01:26<07:35,  2.27it/s]

Gradient norm: 5.719736783234842


Epoch 3 of 5 | Iteration:  15%|█▍        | 178/1212 [01:27<07:43,  2.23it/s]

Gradient norm: 6.7344524721911085


Epoch 3 of 5 | Iteration:  15%|█▍        | 179/1212 [01:27<07:54,  2.18it/s]

Gradient norm: 9.530849295434395


Epoch 3 of 5 | Iteration:  15%|█▍        | 180/1212 [01:28<09:11,  1.87it/s]

Gradient norm: 21.184277359093887


Epoch 3 of 5 | Iteration:  15%|█▍        | 181/1212 [01:28<09:29,  1.81it/s]

Gradient norm: 21.823907308111277


Epoch 3 of 5 | Iteration:  15%|█▌        | 182/1212 [01:29<09:19,  1.84it/s]

Gradient norm: 22.698626544685894


Epoch 3 of 5 | Iteration:  15%|█▌        | 183/1212 [01:29<09:12,  1.86it/s]

Gradient norm: 23.007665811825888


Epoch 3 of 5 | Iteration:  15%|█▌        | 184/1212 [01:30<08:59,  1.90it/s]

Gradient norm: 23.03338343776484


Epoch 3 of 5 | Iteration:  15%|█▌        | 185/1212 [01:31<09:41,  1.77it/s]

Gradient norm: 22.946798990348007


Epoch 3 of 5 | Iteration:  15%|█▌        | 186/1212 [01:31<09:57,  1.72it/s]

Gradient norm: 23.243113603633127


Epoch 3 of 5 | Iteration:  15%|█▌        | 187/1212 [01:32<09:45,  1.75it/s]

Gradient norm: 42.510936329863945


Epoch 3 of 5 | Iteration:  16%|█▌        | 188/1212 [01:32<10:30,  1.62it/s]

Gradient norm: 42.54896978006061


Epoch 3 of 5 | Iteration:  16%|█▌        | 189/1212 [01:33<09:58,  1.71it/s]

Gradient norm: 43.32292976228449


Epoch 3 of 5 | Iteration:  16%|█▌        | 190/1212 [01:33<09:21,  1.82it/s]

Gradient norm: 41.237566339466476


Epoch 3 of 5 | Iteration:  16%|█▌        | 191/1212 [01:34<08:51,  1.92it/s]

Gradient norm: 41.211260112489725


Epoch 3 of 5 | Iteration:  16%|█▌        | 192/1212 [01:34<08:12,  2.07it/s]

Gradient norm: 39.97946073227156


Epoch 3 of 5 | Iteration:  16%|█▌        | 193/1212 [01:35<07:46,  2.18it/s]

Gradient norm: 136.38565813404776


Epoch 3 of 5 | Iteration:  16%|█▌        | 194/1212 [01:35<07:46,  2.18it/s]

Gradient norm: 140.77413170466403


Epoch 3 of 5 | Iteration:  16%|█▌        | 195/1212 [01:36<07:44,  2.19it/s]

Gradient norm: 140.771333387801


Epoch 3 of 5 | Iteration:  16%|█▌        | 196/1212 [01:36<07:43,  2.19it/s]

Gradient norm: 140.83049847659174


Epoch 3 of 5 | Iteration:  16%|█▋        | 197/1212 [01:36<07:18,  2.32it/s]

Gradient norm: 140.8415664178103


Epoch 3 of 5 | Iteration:  16%|█▋        | 198/1212 [01:37<07:09,  2.36it/s]

Gradient norm: 160.16330948237172


Epoch 3 of 5 | Iteration:  16%|█▋        | 199/1212 [01:37<06:57,  2.42it/s]

Gradient norm: 202.55333088741259


Epoch 3 of 5 | Iteration:  17%|█▋        | 200/1212 [01:38<07:09,  2.36it/s]

Gradient norm: 202.30448813697154


Epoch 3 of 5 | Iteration:  17%|█▋        | 201/1212 [01:38<06:53,  2.44it/s]

Gradient norm: 207.17996677096082


Epoch 3 of 5 | Iteration:  17%|█▋        | 202/1212 [01:38<06:41,  2.52it/s]

Gradient norm: 207.72998623023986


Epoch 3 of 5 | Iteration:  17%|█▋        | 203/1212 [01:39<06:39,  2.52it/s]

Gradient norm: 207.72998623023986


Epoch 3 of 5 | Iteration:  17%|█▋        | 204/1212 [01:39<06:33,  2.56it/s]

Gradient norm: 208.42829678173266


Epoch 3 of 5 | Iteration:  17%|█▋        | 205/1212 [01:40<07:31,  2.23it/s]

Gradient norm: 206.6191343454191


Epoch 3 of 5 | Iteration:  17%|█▋        | 206/1212 [01:40<07:43,  2.17it/s]

Gradient norm: 206.58468553282424


Epoch 3 of 5 | Iteration:  17%|█▋        | 207/1212 [01:41<07:23,  2.27it/s]

Gradient norm: 204.932273136063


Epoch 3 of 5 | Iteration:  17%|█▋        | 208/1212 [01:41<07:11,  2.32it/s]

Gradient norm: 206.1661709892263


Epoch 3 of 5 | Iteration:  17%|█▋        | 209/1212 [01:42<07:37,  2.19it/s]

Gradient norm: 2.8017794418890367


Epoch 3 of 5 | Iteration:  17%|█▋        | 210/1212 [01:42<08:20,  2.00it/s]

Gradient norm: 5.654501336266319


Epoch 3 of 5 | Iteration:  17%|█▋        | 211/1212 [01:43<08:50,  1.89it/s]

Gradient norm: 9.286002311455242


Epoch 3 of 5 | Iteration:  17%|█▋        | 212/1212 [01:43<08:41,  1.92it/s]

Gradient norm: 9.490023598262399


Epoch 3 of 5 | Iteration:  18%|█▊        | 213/1212 [01:44<09:12,  1.81it/s]

Gradient norm: 18.119153855594142


Epoch 3 of 5 | Iteration:  18%|█▊        | 214/1212 [01:44<09:17,  1.79it/s]

Gradient norm: 29.01184334115055


Epoch 3 of 5 | Iteration:  18%|█▊        | 215/1212 [01:45<08:48,  1.89it/s]

Gradient norm: 28.749687162775874


Epoch 3 of 5 | Iteration:  18%|█▊        | 216/1212 [01:45<07:56,  2.09it/s]

Gradient norm: 40.75059980911793


Epoch 3 of 5 | Iteration:  18%|█▊        | 217/1212 [01:46<07:56,  2.09it/s]

Gradient norm: 43.814657938013454


Epoch 3 of 5 | Iteration:  18%|█▊        | 218/1212 [01:46<07:29,  2.21it/s]

Gradient norm: 44.17251206392964


Epoch 3 of 5 | Iteration:  18%|█▊        | 219/1212 [01:47<07:06,  2.33it/s]

Gradient norm: 44.3239658814133


Epoch 3 of 5 | Iteration:  18%|█▊        | 220/1212 [01:47<06:53,  2.40it/s]

Gradient norm: 44.384987852629926


Epoch 3 of 5 | Iteration:  18%|█▊        | 221/1212 [01:47<06:37,  2.50it/s]

Gradient norm: 46.33012111759804


Epoch 3 of 5 | Iteration:  18%|█▊        | 222/1212 [01:48<06:27,  2.56it/s]

Gradient norm: 46.49168075280448


Epoch 3 of 5 | Iteration:  18%|█▊        | 223/1212 [01:48<06:23,  2.58it/s]

Gradient norm: 46.10700808755077


Epoch 3 of 5 | Iteration:  18%|█▊        | 224/1212 [01:48<06:24,  2.57it/s]

Gradient norm: 46.02194644559408


Epoch 3 of 5 | Iteration:  19%|█▊        | 225/1212 [01:49<06:43,  2.45it/s]

Gradient norm: 3.168622054762459


Epoch 3 of 5 | Iteration:  19%|█▊        | 226/1212 [01:49<07:20,  2.24it/s]

Gradient norm: 4.916089236712388


Epoch 3 of 5 | Iteration:  19%|█▊        | 227/1212 [01:50<07:00,  2.34it/s]

Gradient norm: 41.45607927218984


Epoch 3 of 5 | Iteration:  19%|█▉        | 228/1212 [01:50<06:43,  2.44it/s]

Gradient norm: 41.52774222285335


Epoch 3 of 5 | Iteration:  19%|█▉        | 229/1212 [01:51<07:44,  2.12it/s]

Gradient norm: 49.45387041981219


Epoch 3 of 5 | Iteration:  19%|█▉        | 230/1212 [01:51<07:15,  2.26it/s]

Gradient norm: 49.52112637248046


Epoch 3 of 5 | Iteration:  19%|█▉        | 231/1212 [01:52<07:03,  2.32it/s]

Gradient norm: 54.77491962407505


Epoch 3 of 5 | Iteration:  19%|█▉        | 232/1212 [01:52<07:41,  2.12it/s]

Gradient norm: 56.263661468633096


Epoch 3 of 5 | Iteration:  19%|█▉        | 233/1212 [01:52<07:10,  2.27it/s]

Gradient norm: 55.967712584031155


Epoch 3 of 5 | Iteration:  19%|█▉        | 234/1212 [01:53<08:05,  2.01it/s]

Gradient norm: 56.192579349171034


Epoch 3 of 5 | Iteration:  19%|█▉        | 235/1212 [01:53<07:27,  2.19it/s]

Gradient norm: 59.42575245414122


Epoch 3 of 5 | Iteration:  19%|█▉        | 236/1212 [01:54<07:34,  2.15it/s]

Gradient norm: 57.69930825900684


Epoch 3 of 5 | Iteration:  20%|█▉        | 237/1212 [01:54<07:21,  2.21it/s]

Gradient norm: 57.31784898941064


Epoch 3 of 5 | Iteration:  20%|█▉        | 238/1212 [01:55<07:25,  2.19it/s]

Gradient norm: 56.72721685797871


Epoch 3 of 5 | Iteration:  20%|█▉        | 239/1212 [01:55<07:43,  2.10it/s]

Gradient norm: 55.31527393187329


Epoch 3 of 5 | Iteration:  20%|█▉        | 240/1212 [01:56<08:02,  2.01it/s]

Gradient norm: 244.62304620425985


Epoch 3 of 5 | Iteration:  20%|█▉        | 241/1212 [01:57<08:46,  1.84it/s]

Gradient norm: 1.251697919355889


Epoch 3 of 5 | Iteration:  20%|█▉        | 242/1212 [01:57<09:32,  1.69it/s]

Gradient norm: 20.29313693618522


Epoch 3 of 5 | Iteration:  20%|██        | 243/1212 [01:58<10:22,  1.56it/s]

Gradient norm: 22.574322399228205


Epoch 3 of 5 | Iteration:  20%|██        | 244/1212 [01:59<09:42,  1.66it/s]

Gradient norm: 22.51963636423382


Epoch 3 of 5 | Iteration:  20%|██        | 245/1212 [01:59<09:03,  1.78it/s]

Gradient norm: 22.258213867001054


Epoch 3 of 5 | Iteration:  20%|██        | 246/1212 [02:00<09:08,  1.76it/s]

Gradient norm: 36.1317249580495


Epoch 3 of 5 | Iteration:  20%|██        | 247/1212 [02:00<08:07,  1.98it/s]

Gradient norm: 43.30021182529357


Epoch 3 of 5 | Iteration:  20%|██        | 248/1212 [02:00<07:59,  2.01it/s]

Gradient norm: 47.05123750191793


Epoch 3 of 5 | Iteration:  21%|██        | 249/1212 [02:01<07:38,  2.10it/s]

Gradient norm: 48.75004811029556


Epoch 3 of 5 | Iteration:  21%|██        | 250/1212 [02:01<07:10,  2.24it/s]

Gradient norm: 48.55136485801141


Epoch 3 of 5 | Iteration:  21%|██        | 251/1212 [02:02<06:49,  2.35it/s]

Gradient norm: 50.331074627355356


Epoch 3 of 5 | Iteration:  21%|██        | 252/1212 [02:02<06:38,  2.41it/s]

Gradient norm: 57.24356599311384


Epoch 3 of 5 | Iteration:  21%|██        | 253/1212 [02:02<06:54,  2.31it/s]

Gradient norm: 57.44577398240694


Epoch 3 of 5 | Iteration:  21%|██        | 254/1212 [02:03<07:05,  2.25it/s]

Gradient norm: 57.85919057869784


Epoch 3 of 5 | Iteration:  21%|██        | 255/1212 [02:03<06:50,  2.33it/s]

Gradient norm: 59.68857459749856


Epoch 3 of 5 | Iteration:  21%|██        | 256/1212 [02:04<07:28,  2.13it/s]

Gradient norm: 57.52077584774309


Epoch 3 of 5 | Iteration:  21%|██        | 257/1212 [02:05<08:09,  1.95it/s]

Gradient norm: 2.9494156672979273


Epoch 3 of 5 | Iteration:  21%|██▏       | 258/1212 [02:05<07:25,  2.14it/s]

Gradient norm: 121.88132650061175


Epoch 3 of 5 | Iteration:  21%|██▏       | 259/1212 [02:05<07:01,  2.26it/s]

Gradient norm: 122.18960765101032


Epoch 3 of 5 | Iteration:  21%|██▏       | 260/1212 [02:06<06:41,  2.37it/s]

Gradient norm: 122.30826231433343


Epoch 3 of 5 | Iteration:  22%|██▏       | 261/1212 [02:06<07:44,  2.05it/s]

Gradient norm: 122.65489356486253


Epoch 3 of 5 | Iteration:  22%|██▏       | 262/1212 [02:07<07:13,  2.19it/s]

Gradient norm: 122.61253305495742


Epoch 3 of 5 | Iteration:  22%|██▏       | 263/1212 [02:07<06:52,  2.30it/s]

Gradient norm: 123.0390626877521


Epoch 3 of 5 | Iteration:  22%|██▏       | 264/1212 [02:07<06:30,  2.43it/s]

Gradient norm: 123.26469883000797


Epoch 3 of 5 | Iteration:  22%|██▏       | 265/1212 [02:08<07:01,  2.25it/s]

Gradient norm: 123.23785694093927


Epoch 3 of 5 | Iteration:  22%|██▏       | 266/1212 [02:09<07:40,  2.05it/s]

Gradient norm: 123.23617743365747


Epoch 3 of 5 | Iteration:  22%|██▏       | 267/1212 [02:09<07:45,  2.03it/s]

Gradient norm: 122.98032933146047


Epoch 3 of 5 | Iteration:  22%|██▏       | 268/1212 [02:09<07:37,  2.06it/s]

Gradient norm: 123.08130156633936


Epoch 3 of 5 | Iteration:  22%|██▏       | 269/1212 [02:10<08:07,  1.93it/s]

Gradient norm: 123.5399419710612


Epoch 3 of 5 | Iteration:  22%|██▏       | 270/1212 [02:11<08:05,  1.94it/s]

Gradient norm: 179.77795611137677


Epoch 3 of 5 | Iteration:  22%|██▏       | 271/1212 [02:11<08:10,  1.92it/s]

Gradient norm: 179.79469349517598


Epoch 3 of 5 | Iteration:  22%|██▏       | 272/1212 [02:12<08:45,  1.79it/s]

Gradient norm: 180.11071744101696


Epoch 3 of 5 | Iteration:  23%|██▎       | 273/1212 [02:12<08:25,  1.86it/s]

Gradient norm: 74.1396577779391


Epoch 3 of 5 | Iteration:  23%|██▎       | 274/1212 [02:13<07:39,  2.04it/s]

Gradient norm: 73.98888820850263


Epoch 3 of 5 | Iteration:  23%|██▎       | 275/1212 [02:13<07:54,  1.98it/s]

Gradient norm: 74.14659532593834


Epoch 3 of 5 | Iteration:  23%|██▎       | 276/1212 [02:14<09:03,  1.72it/s]

Gradient norm: 81.66463222964092


Epoch 3 of 5 | Iteration:  23%|██▎       | 277/1212 [02:14<08:11,  1.90it/s]

Gradient norm: 250.89674140926977


Epoch 3 of 5 | Iteration:  23%|██▎       | 278/1212 [02:15<08:17,  1.88it/s]

Gradient norm: 257.24605678800145


Epoch 3 of 5 | Iteration:  23%|██▎       | 279/1212 [02:15<08:08,  1.91it/s]

Gradient norm: 258.08826721369326


Epoch 3 of 5 | Iteration:  23%|██▎       | 280/1212 [02:16<08:18,  1.87it/s]

Gradient norm: 258.94018176163905


Epoch 3 of 5 | Iteration:  23%|██▎       | 281/1212 [02:16<07:42,  2.01it/s]

Gradient norm: 258.91667658592377


Epoch 3 of 5 | Iteration:  23%|██▎       | 282/1212 [02:17<07:37,  2.03it/s]

Gradient norm: 256.732137584426


Epoch 3 of 5 | Iteration:  23%|██▎       | 283/1212 [02:17<07:12,  2.15it/s]

Gradient norm: 259.72793434090346


Epoch 3 of 5 | Iteration:  23%|██▎       | 284/1212 [02:18<07:29,  2.06it/s]

Gradient norm: 247.76433449110942


Epoch 3 of 5 | Iteration:  24%|██▎       | 285/1212 [02:18<06:57,  2.22it/s]

Gradient norm: 248.25320618886477


Epoch 3 of 5 | Iteration:  24%|██▎       | 286/1212 [02:19<07:11,  2.15it/s]

Gradient norm: 247.27535475866128


Epoch 3 of 5 | Iteration:  24%|██▎       | 287/1212 [02:19<07:21,  2.10it/s]

Gradient norm: 247.71113518546295


Epoch 3 of 5 | Iteration:  24%|██▍       | 288/1212 [02:20<06:59,  2.20it/s]

Gradient norm: 247.63008175736607


Epoch 3 of 5 | Iteration:  24%|██▍       | 289/1212 [02:20<07:49,  1.97it/s]

Gradient norm: 3.666642355627236


Epoch 3 of 5 | Iteration:  24%|██▍       | 290/1212 [02:21<07:14,  2.12it/s]

Gradient norm: 17.93933420967177


Epoch 3 of 5 | Iteration:  24%|██▍       | 291/1212 [02:21<07:29,  2.05it/s]

Gradient norm: 17.82809410340213


Epoch 3 of 5 | Iteration:  24%|██▍       | 292/1212 [02:21<06:58,  2.20it/s]

Gradient norm: 18.263187212282663


Epoch 3 of 5 | Iteration:  24%|██▍       | 293/1212 [02:22<06:59,  2.19it/s]

Gradient norm: 1604.3375486707275


Epoch 3 of 5 | Iteration:  24%|██▍       | 294/1212 [02:22<07:14,  2.11it/s]

Gradient norm: 1609.1786657127361


Epoch 3 of 5 | Iteration:  24%|██▍       | 295/1212 [02:23<07:24,  2.06it/s]

Gradient norm: 1608.2926398434395


Epoch 3 of 5 | Iteration:  24%|██▍       | 296/1212 [02:24<07:51,  1.94it/s]

Gradient norm: 1608.126626351648


Epoch 3 of 5 | Iteration:  25%|██▍       | 297/1212 [02:24<07:46,  1.96it/s]

Gradient norm: 1607.5142330043166


Epoch 3 of 5 | Iteration:  25%|██▍       | 298/1212 [02:25<08:17,  1.84it/s]

Gradient norm: 1607.4362484732287


Epoch 3 of 5 | Iteration:  25%|██▍       | 299/1212 [02:25<08:16,  1.84it/s]

Gradient norm: 1607.3302238147082


Epoch 3 of 5 | Iteration:  25%|██▍       | 300/1212 [02:26<08:18,  1.83it/s]

Gradient norm: 1607.236824814555


Epoch 3 of 5 | Iteration:  25%|██▍       | 301/1212 [02:26<07:34,  2.00it/s]

Gradient norm: 1607.1303126748508


Epoch 3 of 5 | Iteration:  25%|██▍       | 302/1212 [02:27<07:06,  2.13it/s]

Gradient norm: 1606.6498542663232


Epoch 3 of 5 | Iteration:  25%|██▌       | 303/1212 [02:27<06:52,  2.21it/s]

Gradient norm: 1606.3705298793109


Epoch 3 of 5 | Iteration:  25%|██▌       | 304/1212 [02:27<06:33,  2.31it/s]

Gradient norm: 1606.2451317860896


Epoch 3 of 5 | Iteration:  25%|██▌       | 305/1212 [02:28<07:06,  2.13it/s]

Gradient norm: 0.09596758055451686


Epoch 3 of 5 | Iteration:  25%|██▌       | 306/1212 [02:28<06:36,  2.29it/s]

Gradient norm: 0.45718879646276817


Epoch 3 of 5 | Iteration:  25%|██▌       | 307/1212 [02:29<06:34,  2.29it/s]

Gradient norm: 3.946086038236683


Epoch 3 of 5 | Iteration:  25%|██▌       | 308/1212 [02:29<06:15,  2.41it/s]

Gradient norm: 4.6140097934587585


Epoch 3 of 5 | Iteration:  25%|██▌       | 309/1212 [02:29<06:07,  2.46it/s]

Gradient norm: 5.28214858486452


Epoch 3 of 5 | Iteration:  26%|██▌       | 310/1212 [02:30<05:53,  2.55it/s]

Gradient norm: 5.594604720495127


Epoch 3 of 5 | Iteration:  26%|██▌       | 311/1212 [02:30<06:00,  2.50it/s]

Gradient norm: 7.005530689021229


Epoch 3 of 5 | Iteration:  26%|██▌       | 312/1212 [02:31<06:26,  2.33it/s]

Gradient norm: 7.354370220666491


Epoch 3 of 5 | Iteration:  26%|██▌       | 313/1212 [02:31<06:31,  2.30it/s]

Gradient norm: 7.517619548446458


Epoch 3 of 5 | Iteration:  26%|██▌       | 314/1212 [02:32<06:16,  2.38it/s]

Gradient norm: 9.449252170856518


Epoch 3 of 5 | Iteration:  26%|██▌       | 315/1212 [02:32<06:33,  2.28it/s]

Gradient norm: 12.61158579376984


Epoch 3 of 5 | Iteration:  26%|██▌       | 316/1212 [02:32<06:18,  2.36it/s]

Gradient norm: 12.627437220232302


Epoch 3 of 5 | Iteration:  26%|██▌       | 317/1212 [02:33<06:06,  2.44it/s]

Gradient norm: 13.36401382691567


Epoch 3 of 5 | Iteration:  26%|██▌       | 318/1212 [02:33<06:01,  2.48it/s]

Gradient norm: 13.146671472583133


Epoch 3 of 5 | Iteration:  26%|██▋       | 319/1212 [02:34<06:11,  2.40it/s]

Gradient norm: 14.213142180761421


Epoch 3 of 5 | Iteration:  26%|██▋       | 320/1212 [02:34<06:09,  2.42it/s]

Gradient norm: 14.356355541580731


Epoch 3 of 5 | Iteration:  26%|██▋       | 321/1212 [02:34<05:58,  2.48it/s]

Gradient norm: 3.3468232371309874


Epoch 3 of 5 | Iteration:  27%|██▋       | 322/1212 [02:35<05:53,  2.52it/s]

Gradient norm: 6.451956617515093


Epoch 3 of 5 | Iteration:  27%|██▋       | 323/1212 [02:35<06:06,  2.43it/s]

Gradient norm: 7.386764775940096


Epoch 3 of 5 | Iteration:  27%|██▋       | 324/1212 [02:36<06:50,  2.17it/s]

Gradient norm: 7.192852199035378


Epoch 3 of 5 | Iteration:  27%|██▋       | 325/1212 [02:36<07:12,  2.05it/s]

Gradient norm: 9.43253899390919


Epoch 3 of 5 | Iteration:  27%|██▋       | 326/1212 [02:37<08:01,  1.84it/s]

Gradient norm: 10.934774816768096


Epoch 3 of 5 | Iteration:  27%|██▋       | 327/1212 [02:38<07:43,  1.91it/s]

Gradient norm: 11.007298371138141


Epoch 3 of 5 | Iteration:  27%|██▋       | 328/1212 [02:38<08:02,  1.83it/s]

Gradient norm: 34.97465092080834


Epoch 3 of 5 | Iteration:  27%|██▋       | 329/1212 [02:39<08:10,  1.80it/s]

Gradient norm: 34.96261459780562


Epoch 3 of 5 | Iteration:  27%|██▋       | 330/1212 [02:39<07:58,  1.84it/s]

Gradient norm: 35.22529200011432


Epoch 3 of 5 | Iteration:  27%|██▋       | 331/1212 [02:40<07:19,  2.00it/s]

Gradient norm: 35.09898181232744


Epoch 3 of 5 | Iteration:  27%|██▋       | 332/1212 [02:40<06:58,  2.10it/s]

Gradient norm: 35.41619352974306


Epoch 3 of 5 | Iteration:  27%|██▋       | 333/1212 [02:41<07:08,  2.05it/s]

Gradient norm: 38.54056606440825


Epoch 3 of 5 | Iteration:  28%|██▊       | 334/1212 [02:41<06:41,  2.19it/s]

Gradient norm: 177.58701666315602


Epoch 3 of 5 | Iteration:  28%|██▊       | 335/1212 [02:41<06:15,  2.33it/s]

Gradient norm: 180.4555354674571


Epoch 3 of 5 | Iteration:  28%|██▊       | 336/1212 [02:42<06:23,  2.29it/s]

Gradient norm: 182.81071653128424


Epoch 3 of 5 | Iteration:  28%|██▊       | 337/1212 [02:42<06:06,  2.39it/s]

Gradient norm: 49.23254648009484


Epoch 3 of 5 | Iteration:  28%|██▊       | 338/1212 [02:43<05:58,  2.44it/s]

Gradient norm: 53.62268030470457


Epoch 3 of 5 | Iteration:  28%|██▊       | 339/1212 [02:43<06:14,  2.33it/s]

Gradient norm: 52.71409178057649


Epoch 3 of 5 | Iteration:  28%|██▊       | 340/1212 [02:43<06:13,  2.34it/s]

Gradient norm: 53.48121755827867


Epoch 3 of 5 | Iteration:  28%|██▊       | 341/1212 [02:44<06:11,  2.35it/s]

Gradient norm: 53.23142422332748


Epoch 3 of 5 | Iteration:  28%|██▊       | 342/1212 [02:44<06:42,  2.16it/s]

Gradient norm: 54.59254234176783


Epoch 3 of 5 | Iteration:  28%|██▊       | 343/1212 [02:45<06:21,  2.28it/s]

Gradient norm: 53.076536235663994


Epoch 3 of 5 | Iteration:  28%|██▊       | 344/1212 [02:45<06:47,  2.13it/s]

Gradient norm: 49.1611847371733


Epoch 3 of 5 | Iteration:  28%|██▊       | 345/1212 [02:46<07:20,  1.97it/s]

Gradient norm: 49.573973424497375


Epoch 3 of 5 | Iteration:  29%|██▊       | 346/1212 [02:46<07:27,  1.94it/s]

Gradient norm: 48.85018688788281


Epoch 3 of 5 | Iteration:  29%|██▊       | 347/1212 [02:47<06:58,  2.07it/s]

Gradient norm: 440.00539106466874


Epoch 3 of 5 | Iteration:  29%|██▊       | 348/1212 [02:47<06:34,  2.19it/s]

Gradient norm: 441.78766483030904


Epoch 3 of 5 | Iteration:  29%|██▉       | 349/1212 [02:48<06:28,  2.22it/s]

Gradient norm: 443.488424492161


Epoch 3 of 5 | Iteration:  29%|██▉       | 350/1212 [02:48<06:11,  2.32it/s]

Gradient norm: 443.5851964075355


Epoch 3 of 5 | Iteration:  29%|██▉       | 351/1212 [02:48<05:53,  2.44it/s]

Gradient norm: 449.88270859974773


Epoch 3 of 5 | Iteration:  29%|██▉       | 352/1212 [02:49<07:01,  2.04it/s]

Gradient norm: 448.423347159234


Epoch 3 of 5 | Iteration:  29%|██▉       | 353/1212 [02:50<07:29,  1.91it/s]

Gradient norm: 2.582942218439411


Epoch 3 of 5 | Iteration:  29%|██▉       | 354/1212 [02:50<07:24,  1.93it/s]

Gradient norm: 21.915499493650067


Epoch 3 of 5 | Iteration:  29%|██▉       | 355/1212 [02:51<07:40,  1.86it/s]

Gradient norm: 26.817615398651625


Epoch 3 of 5 | Iteration:  29%|██▉       | 356/1212 [02:51<07:36,  1.88it/s]

Gradient norm: 26.83500965523489


Epoch 3 of 5 | Iteration:  29%|██▉       | 357/1212 [02:52<07:20,  1.94it/s]

Gradient norm: 62.8102337967672


Epoch 3 of 5 | Iteration:  30%|██▉       | 358/1212 [02:52<07:27,  1.91it/s]

Gradient norm: 62.77532175736419


Epoch 3 of 5 | Iteration:  30%|██▉       | 359/1212 [02:53<07:29,  1.90it/s]

Gradient norm: 83.88929388817677


Epoch 3 of 5 | Iteration:  30%|██▉       | 360/1212 [02:53<07:24,  1.92it/s]

Gradient norm: 83.78764315659554


Epoch 3 of 5 | Iteration:  30%|██▉       | 361/1212 [02:54<06:50,  2.07it/s]

Gradient norm: 83.61556316675293


Epoch 3 of 5 | Iteration:  30%|██▉       | 362/1212 [02:54<06:27,  2.19it/s]

Gradient norm: 95.41462333686738


Epoch 3 of 5 | Iteration:  30%|██▉       | 363/1212 [02:55<06:37,  2.14it/s]

Gradient norm: 95.40188110599473


Epoch 3 of 5 | Iteration:  30%|███       | 364/1212 [02:55<06:22,  2.22it/s]

Gradient norm: 95.7565909891835


Epoch 3 of 5 | Iteration:  30%|███       | 365/1212 [02:55<06:07,  2.31it/s]

Gradient norm: 95.76142648744504


Epoch 3 of 5 | Iteration:  30%|███       | 366/1212 [02:56<05:57,  2.37it/s]

Gradient norm: 91.49614572603454


Epoch 3 of 5 | Iteration:  30%|███       | 367/1212 [02:56<06:41,  2.11it/s]

Gradient norm: 90.98418240306721


Epoch 3 of 5 | Iteration:  30%|███       | 368/1212 [02:57<06:19,  2.22it/s]

Gradient norm: 90.87137721132913


Epoch 3 of 5 | Iteration:  30%|███       | 369/1212 [02:57<06:33,  2.14it/s]

Gradient norm: 1.9599261656469966


Epoch 3 of 5 | Iteration:  31%|███       | 370/1212 [02:58<06:23,  2.20it/s]

Gradient norm: 2.308987809890497


Epoch 3 of 5 | Iteration:  31%|███       | 371/1212 [02:58<06:09,  2.28it/s]

Gradient norm: 11.821276848537249


Epoch 3 of 5 | Iteration:  31%|███       | 372/1212 [02:59<06:43,  2.08it/s]

Gradient norm: 12.861010298191047


Epoch 3 of 5 | Iteration:  31%|███       | 373/1212 [02:59<06:30,  2.15it/s]

Gradient norm: 23.292446289700795


Epoch 3 of 5 | Iteration:  31%|███       | 374/1212 [03:00<06:11,  2.26it/s]

Gradient norm: 26.731013174240445


Epoch 3 of 5 | Iteration:  31%|███       | 375/1212 [03:00<05:51,  2.38it/s]

Gradient norm: 26.921148551454074


Epoch 3 of 5 | Iteration:  31%|███       | 376/1212 [03:00<05:36,  2.48it/s]

Gradient norm: 29.0085689113524


Epoch 3 of 5 | Iteration:  31%|███       | 377/1212 [03:01<05:32,  2.51it/s]

Gradient norm: 33.41098551043697


Epoch 3 of 5 | Iteration:  31%|███       | 378/1212 [03:01<05:26,  2.55it/s]

Gradient norm: 34.16429712647559


Epoch 3 of 5 | Iteration:  31%|███▏      | 379/1212 [03:02<05:31,  2.51it/s]

Gradient norm: 34.934856381219525


Epoch 3 of 5 | Iteration:  31%|███▏      | 380/1212 [03:02<05:29,  2.52it/s]

Gradient norm: 35.057598300714986


Epoch 3 of 5 | Iteration:  31%|███▏      | 381/1212 [03:02<05:22,  2.58it/s]

Gradient norm: 35.78160253483162


Epoch 3 of 5 | Iteration:  32%|███▏      | 382/1212 [03:03<06:14,  2.22it/s]

Gradient norm: 39.397328897939566


Epoch 3 of 5 | Iteration:  32%|███▏      | 383/1212 [03:03<06:28,  2.14it/s]

Gradient norm: 40.25781973951821


Epoch 3 of 5 | Iteration:  32%|███▏      | 384/1212 [03:04<06:33,  2.10it/s]

Gradient norm: 40.26185237863836


Epoch 3 of 5 | Iteration:  32%|███▏      | 385/1212 [03:04<06:50,  2.01it/s]

Gradient norm: 2.02241468358473


Epoch 3 of 5 | Iteration:  32%|███▏      | 386/1212 [03:05<07:12,  1.91it/s]

Gradient norm: 32.70121578368781


Epoch 3 of 5 | Iteration:  32%|███▏      | 387/1212 [03:06<07:14,  1.90it/s]

Gradient norm: 32.16273553227434


Epoch 3 of 5 | Iteration:  32%|███▏      | 388/1212 [03:06<07:28,  1.84it/s]

Gradient norm: 32.45808226070164


Epoch 3 of 5 | Iteration:  32%|███▏      | 389/1212 [03:07<07:37,  1.80it/s]

Gradient norm: 32.334382163911116


Epoch 3 of 5 | Iteration:  32%|███▏      | 390/1212 [03:07<06:55,  1.98it/s]

Gradient norm: 31.91782916381454


Epoch 3 of 5 | Iteration:  32%|███▏      | 391/1212 [03:07<06:30,  2.10it/s]

Gradient norm: 32.44940123976154


Epoch 3 of 5 | Iteration:  32%|███▏      | 392/1212 [03:08<06:16,  2.18it/s]

Gradient norm: 36.53494821227292


Epoch 3 of 5 | Iteration:  32%|███▏      | 393/1212 [03:09<07:05,  1.92it/s]

Gradient norm: 96.60928897436015


Epoch 3 of 5 | Iteration:  33%|███▎      | 394/1212 [03:09<07:11,  1.90it/s]

Gradient norm: 96.7811277982535


Epoch 3 of 5 | Iteration:  33%|███▎      | 395/1212 [03:09<06:32,  2.08it/s]

Gradient norm: 96.7619529093346


Epoch 3 of 5 | Iteration:  33%|███▎      | 396/1212 [03:10<06:19,  2.15it/s]

Gradient norm: 98.1530695715745


Epoch 3 of 5 | Iteration:  33%|███▎      | 397/1212 [03:10<06:07,  2.22it/s]

Gradient norm: 99.500749787871


Epoch 3 of 5 | Iteration:  33%|███▎      | 398/1212 [03:11<06:17,  2.16it/s]

Gradient norm: 99.89589615806194


Epoch 3 of 5 | Iteration:  33%|███▎      | 399/1212 [03:11<06:39,  2.04it/s]

Gradient norm: 102.36711214817517


Epoch 3 of 5 | Iteration:  33%|███▎      | 400/1212 [03:12<06:24,  2.11it/s]

Gradient norm: 102.63436570257265


Epoch 3 of 5 | Iteration:  33%|███▎      | 401/1212 [03:12<06:43,  2.01it/s]

Gradient norm: 8.694116091894745


Epoch 3 of 5 | Iteration:  33%|███▎      | 402/1212 [03:13<07:27,  1.81it/s]

Gradient norm: 88.35839513080117


Epoch 3 of 5 | Iteration:  33%|███▎      | 403/1212 [03:13<06:53,  1.96it/s]

Gradient norm: 88.36446645630019


Epoch 3 of 5 | Iteration:  33%|███▎      | 404/1212 [03:14<06:17,  2.14it/s]

Gradient norm: 88.54938796010977


Epoch 3 of 5 | Iteration:  33%|███▎      | 405/1212 [03:14<06:03,  2.22it/s]

Gradient norm: 88.50320168680094


Epoch 3 of 5 | Iteration:  33%|███▎      | 406/1212 [03:15<05:47,  2.32it/s]

Gradient norm: 81.72468703845135


Epoch 3 of 5 | Iteration:  34%|███▎      | 407/1212 [03:15<05:37,  2.39it/s]

Gradient norm: 81.62362923174123


Epoch 3 of 5 | Iteration:  34%|███▎      | 408/1212 [03:15<05:23,  2.49it/s]

Gradient norm: 81.79158974740515


Epoch 3 of 5 | Iteration:  34%|███▎      | 409/1212 [03:16<05:48,  2.30it/s]

Gradient norm: 81.61726096396538


Epoch 3 of 5 | Iteration:  34%|███▍      | 410/1212 [03:16<05:34,  2.39it/s]

Gradient norm: 81.74086941195422


Epoch 3 of 5 | Iteration:  34%|███▍      | 411/1212 [03:17<06:14,  2.14it/s]

Gradient norm: 93.03967570068562


Epoch 3 of 5 | Iteration:  34%|███▍      | 412/1212 [03:17<06:17,  2.12it/s]

Gradient norm: 92.81342352972362


Epoch 3 of 5 | Iteration:  34%|███▍      | 413/1212 [03:18<06:27,  2.06it/s]

Gradient norm: 587.2179807448614


Epoch 3 of 5 | Iteration:  34%|███▍      | 414/1212 [03:18<06:43,  1.98it/s]

Gradient norm: 587.6445272400524


Epoch 3 of 5 | Iteration:  34%|███▍      | 415/1212 [03:19<07:29,  1.77it/s]

Gradient norm: 566.8413466708257


Epoch 3 of 5 | Iteration:  34%|███▍      | 416/1212 [03:20<07:51,  1.69it/s]

Gradient norm: 561.6704268931745


Epoch 3 of 5 | Iteration:  34%|███▍      | 417/1212 [03:20<07:41,  1.72it/s]

Gradient norm: 2.4644509357247055


Epoch 3 of 5 | Iteration:  34%|███▍      | 418/1212 [03:21<06:56,  1.91it/s]

Gradient norm: 35.927268419418624


Epoch 3 of 5 | Iteration:  35%|███▍      | 419/1212 [03:21<06:27,  2.05it/s]

Gradient norm: 110.16288781537641


Epoch 3 of 5 | Iteration:  35%|███▍      | 420/1212 [03:22<06:07,  2.15it/s]

Gradient norm: 109.88392430414464


Epoch 3 of 5 | Iteration:  35%|███▍      | 421/1212 [03:22<06:16,  2.10it/s]

Gradient norm: 110.94122262612683


Epoch 3 of 5 | Iteration:  35%|███▍      | 422/1212 [03:22<05:51,  2.25it/s]

Gradient norm: 147.8583606285062


Epoch 3 of 5 | Iteration:  35%|███▍      | 423/1212 [03:23<05:34,  2.36it/s]

Gradient norm: 147.94952829441544


Epoch 3 of 5 | Iteration:  35%|███▍      | 424/1212 [03:23<06:25,  2.05it/s]

Gradient norm: 149.2666254104073


Epoch 3 of 5 | Iteration:  35%|███▌      | 425/1212 [03:24<06:11,  2.12it/s]

Gradient norm: 153.3237931845814


Epoch 3 of 5 | Iteration:  35%|███▌      | 426/1212 [03:24<05:44,  2.28it/s]

Gradient norm: 178.45226991031154


Epoch 3 of 5 | Iteration:  35%|███▌      | 427/1212 [03:25<05:35,  2.34it/s]

Gradient norm: 178.44124651729607


Epoch 3 of 5 | Iteration:  35%|███▌      | 428/1212 [03:25<06:13,  2.10it/s]

Gradient norm: 205.57844921096586


Epoch 3 of 5 | Iteration:  35%|███▌      | 429/1212 [03:26<06:00,  2.17it/s]

Gradient norm: 205.85781448228218


Epoch 3 of 5 | Iteration:  35%|███▌      | 430/1212 [03:26<05:37,  2.32it/s]

Gradient norm: 205.97921028192093


Epoch 3 of 5 | Iteration:  36%|███▌      | 431/1212 [03:26<05:24,  2.41it/s]

Gradient norm: 206.18173788304279


Epoch 3 of 5 | Iteration:  36%|███▌      | 432/1212 [03:27<05:19,  2.44it/s]

Gradient norm: 206.07256451651858


Epoch 3 of 5 | Iteration:  36%|███▌      | 433/1212 [03:27<05:30,  2.35it/s]

Gradient norm: 5269.1370186989


Epoch 3 of 5 | Iteration:  36%|███▌      | 434/1212 [03:28<05:24,  2.40it/s]

Gradient norm: 5269.382136354907


Epoch 3 of 5 | Iteration:  36%|███▌      | 435/1212 [03:28<05:12,  2.49it/s]

Gradient norm: 5270.782082144844


Epoch 3 of 5 | Iteration:  36%|███▌      | 436/1212 [03:28<05:07,  2.52it/s]

Gradient norm: 5270.7883644233725


Epoch 3 of 5 | Iteration:  36%|███▌      | 437/1212 [03:29<05:05,  2.54it/s]

Gradient norm: 5271.763741442639


Epoch 3 of 5 | Iteration:  36%|███▌      | 438/1212 [03:29<05:00,  2.58it/s]

Gradient norm: 5271.7064841153315


Epoch 3 of 5 | Iteration:  36%|███▌      | 439/1212 [03:30<05:11,  2.48it/s]

Gradient norm: 5264.697489863


Epoch 3 of 5 | Iteration:  36%|███▋      | 440/1212 [03:30<05:03,  2.55it/s]

Gradient norm: 5264.159044572801


Epoch 3 of 5 | Iteration:  36%|███▋      | 441/1212 [03:30<05:13,  2.46it/s]

Gradient norm: 5264.1351396722785


Epoch 3 of 5 | Iteration:  36%|███▋      | 442/1212 [03:31<05:56,  2.16it/s]

Gradient norm: 5274.526687176873


Epoch 3 of 5 | Iteration:  37%|███▋      | 443/1212 [03:31<06:08,  2.09it/s]

Gradient norm: 5273.992811447896


Epoch 3 of 5 | Iteration:  37%|███▋      | 444/1212 [03:32<06:17,  2.04it/s]

Gradient norm: 5274.087502823345


Epoch 3 of 5 | Iteration:  37%|███▋      | 445/1212 [03:33<06:26,  1.98it/s]

Gradient norm: 5273.854097907729


Epoch 3 of 5 | Iteration:  37%|███▋      | 446/1212 [03:33<06:42,  1.90it/s]

Gradient norm: 5273.958466468654


Epoch 3 of 5 | Iteration:  37%|███▋      | 447/1212 [03:34<06:53,  1.85it/s]

Gradient norm: 5274.562118484573


Epoch 3 of 5 | Iteration:  37%|███▋      | 448/1212 [03:34<06:43,  1.90it/s]

Gradient norm: 5276.478996622684


Epoch 3 of 5 | Iteration:  37%|███▋      | 449/1212 [03:35<06:46,  1.88it/s]

Gradient norm: 19.888212844165842


Epoch 3 of 5 | Iteration:  37%|███▋      | 450/1212 [03:35<06:20,  2.00it/s]

Gradient norm: 20.212632348243613


Epoch 3 of 5 | Iteration:  37%|███▋      | 451/1212 [03:36<06:00,  2.11it/s]

Gradient norm: 20.26017710583273


Epoch 3 of 5 | Iteration:  37%|███▋      | 452/1212 [03:36<06:18,  2.01it/s]

Gradient norm: 20.36479532953315


Epoch 3 of 5 | Iteration:  37%|███▋      | 453/1212 [03:36<05:47,  2.19it/s]

Gradient norm: 20.399361046747465


Epoch 3 of 5 | Iteration:  37%|███▋      | 454/1212 [03:37<05:46,  2.19it/s]

Gradient norm: 20.19734788169768


Epoch 3 of 5 | Iteration:  38%|███▊      | 455/1212 [03:37<05:54,  2.13it/s]

Gradient norm: 48.20659283476923


Epoch 3 of 5 | Iteration:  38%|███▊      | 456/1212 [03:38<07:03,  1.78it/s]

Gradient norm: 49.719999575467305


Epoch 3 of 5 | Iteration:  38%|███▊      | 457/1212 [03:39<06:46,  1.86it/s]

Gradient norm: 49.21175454095259


Epoch 3 of 5 | Iteration:  38%|███▊      | 458/1212 [03:39<06:19,  1.99it/s]

Gradient norm: 48.96124389163847


Epoch 3 of 5 | Iteration:  38%|███▊      | 459/1212 [03:40<06:25,  1.95it/s]

Gradient norm: 47.73804974743137


Epoch 3 of 5 | Iteration:  38%|███▊      | 460/1212 [03:40<06:15,  2.00it/s]

Gradient norm: 56.1838123619646


Epoch 3 of 5 | Iteration:  38%|███▊      | 461/1212 [03:40<05:47,  2.16it/s]

Gradient norm: 55.122207611820194


Epoch 3 of 5 | Iteration:  38%|███▊      | 462/1212 [03:41<05:26,  2.30it/s]

Gradient norm: 54.27196449542558


Epoch 3 of 5 | Iteration:  38%|███▊      | 463/1212 [03:41<05:15,  2.38it/s]

Gradient norm: 55.671404650216324


Epoch 3 of 5 | Iteration:  38%|███▊      | 464/1212 [03:42<05:08,  2.42it/s]

Gradient norm: 55.64346404279119


Epoch 3 of 5 | Iteration:  38%|███▊      | 465/1212 [03:42<05:11,  2.40it/s]

Gradient norm: 9.0207074936174


Epoch 3 of 5 | Iteration:  38%|███▊      | 466/1212 [03:42<05:02,  2.46it/s]

Gradient norm: 14.636168870241244


Epoch 3 of 5 | Iteration:  39%|███▊      | 467/1212 [03:43<05:51,  2.12it/s]

Gradient norm: 43.03327386787786


Epoch 3 of 5 | Iteration:  39%|███▊      | 468/1212 [03:43<05:32,  2.24it/s]

Gradient norm: 43.08857581922263


Epoch 3 of 5 | Iteration:  39%|███▊      | 469/1212 [03:44<05:22,  2.30it/s]

Gradient norm: 63.78747319745707


Epoch 3 of 5 | Iteration:  39%|███▉      | 470/1212 [03:44<05:37,  2.20it/s]

Gradient norm: 64.67009273290151


Epoch 3 of 5 | Iteration:  39%|███▉      | 471/1212 [03:45<05:42,  2.17it/s]

Gradient norm: 63.126237354220414


Epoch 3 of 5 | Iteration:  39%|███▉      | 472/1212 [03:45<05:50,  2.11it/s]

Gradient norm: 65.8417136429126


Epoch 3 of 5 | Iteration:  39%|███▉      | 473/1212 [03:46<05:56,  2.07it/s]

Gradient norm: 67.3615451216545


Epoch 3 of 5 | Iteration:  39%|███▉      | 474/1212 [03:46<06:02,  2.04it/s]

Gradient norm: 67.7140985516744


Epoch 3 of 5 | Iteration:  39%|███▉      | 475/1212 [03:47<07:03,  1.74it/s]

Gradient norm: 67.60470927651384


Epoch 3 of 5 | Iteration:  39%|███▉      | 476/1212 [03:48<07:35,  1.61it/s]

Gradient norm: 95.54097937953199


Epoch 3 of 5 | Iteration:  39%|███▉      | 477/1212 [03:48<07:06,  1.72it/s]

Gradient norm: 129.76945838977826


Epoch 3 of 5 | Iteration:  39%|███▉      | 478/1212 [03:49<07:21,  1.66it/s]

Gradient norm: 138.86395561327535


Epoch 3 of 5 | Iteration:  40%|███▉      | 479/1212 [03:49<06:34,  1.86it/s]

Gradient norm: 138.77328359446454


Epoch 3 of 5 | Iteration:  40%|███▉      | 480/1212 [03:50<06:31,  1.87it/s]

Gradient norm: 140.91762055363796


Epoch 3 of 5 | Iteration:  40%|███▉      | 481/1212 [03:50<06:24,  1.90it/s]

Gradient norm: 23.979064019609435


Epoch 3 of 5 | Iteration:  40%|███▉      | 482/1212 [03:51<06:26,  1.89it/s]

Gradient norm: 24.746974119227197


Epoch 3 of 5 | Iteration:  40%|███▉      | 483/1212 [03:51<05:51,  2.07it/s]

Gradient norm: 25.339246797997465


Epoch 3 of 5 | Iteration:  40%|███▉      | 484/1212 [03:52<05:36,  2.16it/s]

Gradient norm: 26.432056698423242


Epoch 3 of 5 | Iteration:  40%|████      | 485/1212 [03:52<05:16,  2.30it/s]

Gradient norm: 26.375715424306343


Epoch 3 of 5 | Iteration:  40%|████      | 486/1212 [03:53<05:10,  2.33it/s]

Gradient norm: 26.328742604598993


Epoch 3 of 5 | Iteration:  40%|████      | 487/1212 [03:53<05:00,  2.41it/s]

Gradient norm: 27.39264768869599


Epoch 3 of 5 | Iteration:  40%|████      | 488/1212 [03:53<04:59,  2.41it/s]

Gradient norm: 27.59754627117765


Epoch 3 of 5 | Iteration:  40%|████      | 489/1212 [03:54<04:53,  2.47it/s]

Gradient norm: 27.58785263084882


Epoch 3 of 5 | Iteration:  40%|████      | 490/1212 [03:54<04:43,  2.54it/s]

Gradient norm: 27.490342067946465


Epoch 3 of 5 | Iteration:  41%|████      | 491/1212 [03:54<04:42,  2.56it/s]

Gradient norm: 28.95443860058179


Epoch 3 of 5 | Iteration:  41%|████      | 492/1212 [03:55<04:44,  2.53it/s]

Gradient norm: 28.98668086048901


Epoch 3 of 5 | Iteration:  41%|████      | 493/1212 [03:55<04:56,  2.42it/s]

Gradient norm: 29.12492860258588


Epoch 3 of 5 | Iteration:  41%|████      | 494/1212 [03:56<05:14,  2.28it/s]

Gradient norm: 29.28250542694306


Epoch 3 of 5 | Iteration:  41%|████      | 495/1212 [03:56<05:34,  2.14it/s]

Gradient norm: 62.8091470114259


Epoch 3 of 5 | Iteration:  41%|████      | 496/1212 [03:57<05:24,  2.21it/s]

Gradient norm: 72.0560524397064


Epoch 3 of 5 | Iteration:  41%|████      | 497/1212 [03:57<06:01,  1.98it/s]

Gradient norm: 1.4264027025245276


Epoch 3 of 5 | Iteration:  41%|████      | 498/1212 [03:58<06:08,  1.94it/s]

Gradient norm: 4.0627217493325425


Epoch 3 of 5 | Iteration:  41%|████      | 499/1212 [03:58<06:13,  1.91it/s]

Gradient norm: 5.239975758662975


Epoch 3 of 5 | Iteration:  41%|████▏     | 500/1212 [03:59<06:39,  1.78it/s]

Gradient norm: 35.14937576499297


Epoch 3 of 5 | Iteration:  41%|████▏     | 501/1212 [04:00<06:34,  1.80it/s]

Gradient norm: 37.11457441850732


Epoch 3 of 5 | Iteration:  41%|████▏     | 502/1212 [04:00<06:15,  1.89it/s]

Gradient norm: 58.224202472995444


Epoch 3 of 5 | Iteration:  42%|████▏     | 503/1212 [04:01<06:40,  1.77it/s]

Gradient norm: 56.35965796875055


Epoch 3 of 5 | Iteration:  42%|████▏     | 504/1212 [04:01<06:35,  1.79it/s]

Gradient norm: 57.414990619268586


Epoch 3 of 5 | Iteration:  42%|████▏     | 505/1212 [04:02<06:16,  1.88it/s]

Gradient norm: 239.9774792855349


Epoch 3 of 5 | Iteration:  42%|████▏     | 506/1212 [04:02<05:40,  2.07it/s]

Gradient norm: 240.03506603935008


Epoch 3 of 5 | Iteration:  42%|████▏     | 507/1212 [04:03<06:02,  1.94it/s]

Gradient norm: 239.54723756519988


Epoch 3 of 5 | Iteration:  42%|████▏     | 508/1212 [04:03<05:38,  2.08it/s]

Gradient norm: 239.55211766431052


Epoch 3 of 5 | Iteration:  42%|████▏     | 509/1212 [04:04<05:15,  2.23it/s]

Gradient norm: 239.49161896960314


Epoch 3 of 5 | Iteration:  42%|████▏     | 510/1212 [04:04<05:00,  2.33it/s]

Gradient norm: 239.55910774271663


Epoch 3 of 5 | Iteration:  42%|████▏     | 511/1212 [04:05<05:42,  2.05it/s]

Gradient norm: 247.46598913411052


Epoch 3 of 5 | Iteration:  42%|████▏     | 512/1212 [04:05<05:26,  2.15it/s]

Gradient norm: 248.6765850619242


Epoch 3 of 5 | Iteration:  42%|████▏     | 513/1212 [04:05<05:05,  2.29it/s]

Gradient norm: 72.78306505363993


Epoch 3 of 5 | Iteration:  42%|████▏     | 514/1212 [04:06<05:00,  2.32it/s]

Gradient norm: 85.75356706607862


Epoch 3 of 5 | Iteration:  42%|████▏     | 515/1212 [04:06<04:53,  2.38it/s]

Gradient norm: 86.6871121869286


Epoch 3 of 5 | Iteration:  43%|████▎     | 516/1212 [04:07<04:50,  2.39it/s]

Gradient norm: 86.47658955780206


Epoch 3 of 5 | Iteration:  43%|████▎     | 517/1212 [04:07<05:21,  2.16it/s]

Gradient norm: 88.29773862513824


Epoch 3 of 5 | Iteration:  43%|████▎     | 518/1212 [04:08<05:07,  2.26it/s]

Gradient norm: 89.41872813854658


Epoch 3 of 5 | Iteration:  43%|████▎     | 519/1212 [04:08<05:02,  2.29it/s]

Gradient norm: 87.5655749872387


Epoch 3 of 5 | Iteration:  43%|████▎     | 520/1212 [04:09<05:47,  1.99it/s]

Gradient norm: 91.74448541876113


Epoch 3 of 5 | Iteration:  43%|████▎     | 521/1212 [04:09<05:57,  1.93it/s]

Gradient norm: 90.76531631934326


Epoch 3 of 5 | Iteration:  43%|████▎     | 522/1212 [04:10<06:04,  1.89it/s]

Gradient norm: 91.66116295746478


Epoch 3 of 5 | Iteration:  43%|████▎     | 523/1212 [04:10<05:41,  2.02it/s]

Gradient norm: 93.43934720286329


Epoch 3 of 5 | Iteration:  43%|████▎     | 524/1212 [04:11<05:25,  2.12it/s]

Gradient norm: 93.31926922088338


Epoch 3 of 5 | Iteration:  43%|████▎     | 525/1212 [04:11<05:49,  1.97it/s]

Gradient norm: 180.99642097548744


Epoch 3 of 5 | Iteration:  43%|████▎     | 526/1212 [04:12<05:56,  1.92it/s]

Gradient norm: 544.5837185219274


Epoch 3 of 5 | Iteration:  43%|████▎     | 527/1212 [04:12<05:56,  1.92it/s]

Gradient norm: 541.1536875784702


Epoch 3 of 5 | Iteration:  44%|████▎     | 528/1212 [04:13<06:03,  1.88it/s]

Gradient norm: 542.5399622965292


Epoch 3 of 5 | Iteration:  44%|████▎     | 529/1212 [04:13<06:11,  1.84it/s]

Gradient norm: 5.286626438586821


Epoch 3 of 5 | Iteration:  44%|████▎     | 530/1212 [04:14<05:59,  1.90it/s]

Gradient norm: 13.283617246826559


Epoch 3 of 5 | Iteration:  44%|████▍     | 531/1212 [04:14<06:06,  1.86it/s]

Gradient norm: 15.38245364394864


Epoch 3 of 5 | Iteration:  44%|████▍     | 532/1212 [04:15<06:09,  1.84it/s]

Gradient norm: 15.427842298303013


Epoch 3 of 5 | Iteration:  44%|████▍     | 533/1212 [04:15<05:58,  1.90it/s]

Gradient norm: 19.845639296108754


Epoch 3 of 5 | Iteration:  44%|████▍     | 534/1212 [04:16<05:33,  2.03it/s]

Gradient norm: 186.30895031084006


Epoch 3 of 5 | Iteration:  44%|████▍     | 535/1212 [04:16<05:28,  2.06it/s]

Gradient norm: 261.0886718783612


Epoch 3 of 5 | Iteration:  44%|████▍     | 536/1212 [04:17<05:27,  2.06it/s]

Gradient norm: 261.05264513806543


Epoch 3 of 5 | Iteration:  44%|████▍     | 537/1212 [04:17<05:14,  2.15it/s]

Gradient norm: 261.0868769626931


Epoch 3 of 5 | Iteration:  44%|████▍     | 538/1212 [04:18<04:59,  2.25it/s]

Gradient norm: 262.1376343732434


Epoch 3 of 5 | Iteration:  44%|████▍     | 539/1212 [04:18<05:01,  2.23it/s]

Gradient norm: 262.059390008753


Epoch 3 of 5 | Iteration:  45%|████▍     | 540/1212 [04:18<04:49,  2.32it/s]

Gradient norm: 262.4670200541359


Epoch 3 of 5 | Iteration:  45%|████▍     | 541/1212 [04:19<04:48,  2.33it/s]

Gradient norm: 262.727740479832


Epoch 3 of 5 | Iteration:  45%|████▍     | 542/1212 [04:19<04:40,  2.39it/s]

Gradient norm: 369.29054940014055


Epoch 3 of 5 | Iteration:  45%|████▍     | 543/1212 [04:20<04:50,  2.30it/s]

Gradient norm: 369.85891061400395


Epoch 3 of 5 | Iteration:  45%|████▍     | 544/1212 [04:20<05:22,  2.07it/s]

Gradient norm: 369.47870348358265


Epoch 3 of 5 | Iteration:  45%|████▍     | 545/1212 [04:21<05:30,  2.02it/s]

Gradient norm: 2.8559237342152106


Epoch 3 of 5 | Iteration:  45%|████▌     | 546/1212 [04:21<05:33,  2.00it/s]

Gradient norm: 7.07064782245389


Epoch 3 of 5 | Iteration:  45%|████▌     | 547/1212 [04:22<05:13,  2.12it/s]

Gradient norm: 8.296293359152697


Epoch 3 of 5 | Iteration:  45%|████▌     | 548/1212 [04:22<04:58,  2.22it/s]

Gradient norm: 9.172002399860034


Epoch 3 of 5 | Iteration:  45%|████▌     | 549/1212 [04:23<05:34,  1.98it/s]

Gradient norm: 9.931281296332102


Epoch 3 of 5 | Iteration:  45%|████▌     | 550/1212 [04:23<05:56,  1.86it/s]

Gradient norm: 11.286669152158487


Epoch 3 of 5 | Iteration:  45%|████▌     | 551/1212 [04:24<05:34,  1.98it/s]

Gradient norm: 23.98840405215084


Epoch 3 of 5 | Iteration:  46%|████▌     | 552/1212 [04:24<05:09,  2.13it/s]

Gradient norm: 23.69730789473589


Epoch 3 of 5 | Iteration:  46%|████▌     | 553/1212 [04:25<05:00,  2.19it/s]

Gradient norm: 28.424202139645605


Epoch 3 of 5 | Iteration:  46%|████▌     | 554/1212 [04:25<05:10,  2.12it/s]

Gradient norm: 28.411596829756355


Epoch 3 of 5 | Iteration:  46%|████▌     | 555/1212 [04:26<05:20,  2.05it/s]

Gradient norm: 28.680128882669113


Epoch 3 of 5 | Iteration:  46%|████▌     | 556/1212 [04:26<06:07,  1.78it/s]

Gradient norm: 34.01047767708573


Epoch 3 of 5 | Iteration:  46%|████▌     | 557/1212 [04:27<06:08,  1.78it/s]

Gradient norm: 1470.560783907398


Epoch 3 of 5 | Iteration:  46%|████▌     | 558/1212 [04:28<06:43,  1.62it/s]

Gradient norm: 1470.8018903055383


Epoch 3 of 5 | Iteration:  46%|████▌     | 559/1212 [04:28<06:28,  1.68it/s]

Gradient norm: 1471.1822782909926


Epoch 3 of 5 | Iteration:  46%|████▌     | 560/1212 [04:29<06:03,  1.79it/s]

Gradient norm: 1471.0780656594122


Epoch 3 of 5 | Iteration:  46%|████▋     | 561/1212 [04:29<05:24,  2.00it/s]

Gradient norm: 3.105102108920674


Epoch 3 of 5 | Iteration:  46%|████▋     | 562/1212 [04:30<05:10,  2.09it/s]

Gradient norm: 3.9409110039204824


Epoch 3 of 5 | Iteration:  46%|████▋     | 563/1212 [04:30<04:51,  2.23it/s]

Gradient norm: 4.256879377856542


Epoch 3 of 5 | Iteration:  47%|████▋     | 564/1212 [04:30<04:44,  2.28it/s]

Gradient norm: 4.74706457479078


Epoch 3 of 5 | Iteration:  47%|████▋     | 565/1212 [04:31<04:46,  2.26it/s]

Gradient norm: 134.63045791715714


Epoch 3 of 5 | Iteration:  47%|████▋     | 566/1212 [04:31<04:33,  2.36it/s]

Gradient norm: 134.56754379544307


Epoch 3 of 5 | Iteration:  47%|████▋     | 567/1212 [04:32<04:24,  2.44it/s]

Gradient norm: 134.2719132802516


Epoch 3 of 5 | Iteration:  47%|████▋     | 568/1212 [04:32<04:25,  2.42it/s]

Gradient norm: 134.56383367647356


Epoch 3 of 5 | Iteration:  47%|████▋     | 569/1212 [04:32<04:25,  2.42it/s]

Gradient norm: 134.60479840296816


Epoch 3 of 5 | Iteration:  47%|████▋     | 570/1212 [04:33<04:20,  2.46it/s]

Gradient norm: 148.7276018362176


Epoch 3 of 5 | Iteration:  47%|████▋     | 571/1212 [04:33<04:22,  2.44it/s]

Gradient norm: 182.0638051328714


Epoch 3 of 5 | Iteration:  47%|████▋     | 572/1212 [04:34<04:31,  2.36it/s]

Gradient norm: 183.33021440450364


Epoch 3 of 5 | Iteration:  47%|████▋     | 573/1212 [04:34<04:30,  2.36it/s]

Gradient norm: 184.07919760103508


Epoch 3 of 5 | Iteration:  47%|████▋     | 574/1212 [04:35<04:29,  2.37it/s]

Gradient norm: 184.02928045273157


Epoch 3 of 5 | Iteration:  47%|████▋     | 575/1212 [04:35<04:32,  2.34it/s]

Gradient norm: 184.66063789473878


Epoch 3 of 5 | Iteration:  48%|████▊     | 576/1212 [04:35<04:24,  2.40it/s]

Gradient norm: 184.56599942990346


Epoch 3 of 5 | Iteration:  48%|████▊     | 577/1212 [04:36<04:43,  2.24it/s]

Gradient norm: 1.41080465599942


Epoch 3 of 5 | Iteration:  48%|████▊     | 578/1212 [04:36<04:34,  2.31it/s]

Gradient norm: 60.385391066108134


Epoch 3 of 5 | Iteration:  48%|████▊     | 579/1212 [04:37<04:34,  2.31it/s]

Gradient norm: 60.800836964855414


Epoch 3 of 5 | Iteration:  48%|████▊     | 580/1212 [04:37<04:26,  2.38it/s]

Gradient norm: 77.6251020432651


Epoch 3 of 5 | Iteration:  48%|████▊     | 581/1212 [04:37<04:19,  2.43it/s]

Gradient norm: 77.7191549422497


Epoch 3 of 5 | Iteration:  48%|████▊     | 582/1212 [04:38<04:19,  2.43it/s]

Gradient norm: 77.66134489783353


Epoch 3 of 5 | Iteration:  48%|████▊     | 583/1212 [04:38<04:30,  2.33it/s]

Gradient norm: 77.92920983687736


Epoch 3 of 5 | Iteration:  48%|████▊     | 584/1212 [04:39<05:08,  2.04it/s]

Gradient norm: 78.00566016459959


Epoch 3 of 5 | Iteration:  48%|████▊     | 585/1212 [04:40<05:13,  2.00it/s]

Gradient norm: 78.2976721897335


Epoch 3 of 5 | Iteration:  48%|████▊     | 586/1212 [04:40<05:36,  1.86it/s]

Gradient norm: 78.8552283355064


Epoch 3 of 5 | Iteration:  48%|████▊     | 587/1212 [04:41<05:36,  1.85it/s]

Gradient norm: 79.93665224842051


Epoch 3 of 5 | Iteration:  49%|████▊     | 588/1212 [04:41<06:02,  1.72it/s]

Gradient norm: 79.81565452443353


Epoch 3 of 5 | Iteration:  49%|████▊     | 589/1212 [04:42<05:59,  1.73it/s]

Gradient norm: 80.3600576499686


Epoch 3 of 5 | Iteration:  49%|████▊     | 590/1212 [04:42<05:37,  1.84it/s]

Gradient norm: 81.13444953986436


Epoch 3 of 5 | Iteration:  49%|████▉     | 591/1212 [04:43<05:06,  2.03it/s]

Gradient norm: 81.1717331151415


Epoch 3 of 5 | Iteration:  49%|████▉     | 592/1212 [04:43<05:01,  2.05it/s]

Gradient norm: 80.9594478052201


Epoch 3 of 5 | Iteration:  49%|████▉     | 593/1212 [04:44<04:44,  2.18it/s]

Gradient norm: 1.0602713821689247


Epoch 3 of 5 | Iteration:  49%|████▉     | 594/1212 [04:44<05:29,  1.87it/s]

Gradient norm: 3.5402903641804655


Epoch 3 of 5 | Iteration:  49%|████▉     | 595/1212 [04:45<05:12,  1.97it/s]

Gradient norm: 61.740408888034175


Epoch 3 of 5 | Iteration:  49%|████▉     | 596/1212 [04:45<04:50,  2.12it/s]

Gradient norm: 61.81391117338725


Epoch 3 of 5 | Iteration:  49%|████▉     | 597/1212 [04:46<04:33,  2.25it/s]

Gradient norm: 61.748643184210174


Epoch 3 of 5 | Iteration:  49%|████▉     | 598/1212 [04:46<04:24,  2.32it/s]

Gradient norm: 71.79697973626793


Epoch 3 of 5 | Iteration:  49%|████▉     | 599/1212 [04:46<04:20,  2.35it/s]

Gradient norm: 71.73088327667689


Epoch 3 of 5 | Iteration:  50%|████▉     | 600/1212 [04:47<04:31,  2.25it/s]

Gradient norm: 103.75249997096832


Epoch 3 of 5 | Iteration:  50%|████▉     | 601/1212 [04:47<04:20,  2.35it/s]

Gradient norm: 104.08224809144536


Epoch 3 of 5 | Iteration:  50%|████▉     | 602/1212 [04:48<04:13,  2.41it/s]

Gradient norm: 166.64033923637265


Epoch 3 of 5 | Iteration:  50%|████▉     | 603/1212 [04:48<04:22,  2.32it/s]

Gradient norm: 166.24648629324437


Epoch 3 of 5 | Iteration:  50%|████▉     | 604/1212 [04:49<04:19,  2.34it/s]

Gradient norm: 165.8392089263751


Epoch 3 of 5 | Iteration:  50%|████▉     | 605/1212 [04:49<04:34,  2.21it/s]

Gradient norm: 165.91244485601015


Epoch 3 of 5 | Iteration:  50%|█████     | 606/1212 [04:49<04:32,  2.23it/s]

Gradient norm: 165.85770930212797


Epoch 3 of 5 | Iteration:  50%|█████     | 607/1212 [04:50<05:05,  1.98it/s]

Gradient norm: 164.34163126054946


Epoch 3 of 5 | Iteration:  50%|█████     | 608/1212 [04:51<04:49,  2.08it/s]

Gradient norm: 163.72270051166157


Epoch 3 of 5 | Iteration:  50%|█████     | 609/1212 [04:51<04:39,  2.16it/s]

Gradient norm: 2.1373481389574667


Epoch 3 of 5 | Iteration:  50%|█████     | 610/1212 [04:51<04:53,  2.05it/s]

Gradient norm: 5.349636093900287


Epoch 3 of 5 | Iteration:  50%|█████     | 611/1212 [04:52<04:33,  2.20it/s]

Gradient norm: 6.318968475285052


Epoch 3 of 5 | Iteration:  50%|█████     | 612/1212 [04:52<04:33,  2.20it/s]

Gradient norm: 6.450161342806339


Epoch 3 of 5 | Iteration:  51%|█████     | 613/1212 [04:53<04:51,  2.06it/s]

Gradient norm: 25.49945210936842


Epoch 3 of 5 | Iteration:  51%|█████     | 614/1212 [04:54<05:17,  1.88it/s]

Gradient norm: 28.85495289243982


Epoch 3 of 5 | Iteration:  51%|█████     | 615/1212 [04:54<05:16,  1.88it/s]

Gradient norm: 29.727849101428163


Epoch 3 of 5 | Iteration:  51%|█████     | 616/1212 [04:55<05:40,  1.75it/s]

Gradient norm: 29.93319410051116


Epoch 3 of 5 | Iteration:  51%|█████     | 617/1212 [04:55<05:34,  1.78it/s]

Gradient norm: 32.936009993695


Epoch 3 of 5 | Iteration:  51%|█████     | 618/1212 [04:56<05:34,  1.78it/s]

Gradient norm: 1153.9627699233201


Epoch 3 of 5 | Iteration:  51%|█████     | 619/1212 [04:56<05:15,  1.88it/s]

Gradient norm: 1153.1617453217093


Epoch 3 of 5 | Iteration:  51%|█████     | 620/1212 [04:57<05:04,  1.95it/s]

Gradient norm: 1153.5154196443684


Epoch 3 of 5 | Iteration:  51%|█████     | 621/1212 [04:57<05:12,  1.89it/s]

Gradient norm: 1152.6936580553177


Epoch 3 of 5 | Iteration:  51%|█████▏    | 622/1212 [04:58<04:47,  2.05it/s]

Gradient norm: 1150.5021058628915


Epoch 3 of 5 | Iteration:  51%|█████▏    | 623/1212 [04:58<04:37,  2.12it/s]

Gradient norm: 1150.553185254736


Epoch 3 of 5 | Iteration:  51%|█████▏    | 624/1212 [04:59<04:37,  2.12it/s]

Gradient norm: 1687.964082894298


Epoch 3 of 5 | Iteration:  52%|█████▏    | 625/1212 [04:59<04:43,  2.07it/s]

Gradient norm: 1.4824216643718175


Epoch 3 of 5 | Iteration:  52%|█████▏    | 626/1212 [05:00<04:28,  2.18it/s]

Gradient norm: 10.055774088360897


Epoch 3 of 5 | Iteration:  52%|█████▏    | 627/1212 [05:00<04:16,  2.28it/s]

Gradient norm: 26.715576126782242


Epoch 3 of 5 | Iteration:  52%|█████▏    | 628/1212 [05:01<04:47,  2.03it/s]

Gradient norm: 26.400681664396746


Epoch 3 of 5 | Iteration:  52%|█████▏    | 629/1212 [05:01<04:33,  2.14it/s]

Gradient norm: 26.389070939142133


Epoch 3 of 5 | Iteration:  52%|█████▏    | 630/1212 [05:01<04:32,  2.13it/s]

Gradient norm: 26.914710469329858


Epoch 3 of 5 | Iteration:  52%|█████▏    | 631/1212 [05:02<04:19,  2.24it/s]

Gradient norm: 28.746001284177026


Epoch 3 of 5 | Iteration:  52%|█████▏    | 632/1212 [05:02<04:53,  1.98it/s]

Gradient norm: 30.483909856683262


Epoch 3 of 5 | Iteration:  52%|█████▏    | 633/1212 [05:03<05:14,  1.84it/s]

Gradient norm: 30.502971826771176


Epoch 3 of 5 | Iteration:  52%|█████▏    | 634/1212 [05:04<05:25,  1.78it/s]

Gradient norm: 30.460459011754015


Epoch 3 of 5 | Iteration:  52%|█████▏    | 635/1212 [05:04<04:55,  1.95it/s]

Gradient norm: 30.265104760785025


Epoch 3 of 5 | Iteration:  52%|█████▏    | 636/1212 [05:04<04:32,  2.12it/s]

Gradient norm: 30.076198283706194


Epoch 3 of 5 | Iteration:  53%|█████▎    | 637/1212 [05:05<04:16,  2.24it/s]

Gradient norm: 31.078069203571086


Epoch 3 of 5 | Iteration:  53%|█████▎    | 638/1212 [05:05<04:06,  2.32it/s]

Gradient norm: 31.198241963283614


Epoch 3 of 5 | Iteration:  53%|█████▎    | 639/1212 [05:06<04:10,  2.29it/s]

Gradient norm: 31.174455506918367


Epoch 3 of 5 | Iteration:  53%|█████▎    | 640/1212 [05:06<04:39,  2.04it/s]

Gradient norm: 31.31377294769622


Epoch 3 of 5 | Iteration:  53%|█████▎    | 641/1212 [05:07<04:48,  1.98it/s]

Gradient norm: 18.368834101496464


Epoch 3 of 5 | Iteration:  53%|█████▎    | 642/1212 [05:07<04:59,  1.91it/s]

Gradient norm: 103.24421362959697


Epoch 3 of 5 | Iteration:  53%|█████▎    | 643/1212 [05:08<04:54,  1.93it/s]

Gradient norm: 103.12460942902277


Epoch 3 of 5 | Iteration:  53%|█████▎    | 644/1212 [05:08<04:52,  1.94it/s]

Gradient norm: 102.95880734072925


Epoch 3 of 5 | Iteration:  53%|█████▎    | 645/1212 [05:09<05:13,  1.81it/s]

Gradient norm: 102.93461395028635


Epoch 3 of 5 | Iteration:  53%|█████▎    | 646/1212 [05:10<05:09,  1.83it/s]

Gradient norm: 103.04572522795418


Epoch 3 of 5 | Iteration:  53%|█████▎    | 647/1212 [05:10<04:39,  2.02it/s]

Gradient norm: 103.08784175629977


Epoch 3 of 5 | Iteration:  53%|█████▎    | 648/1212 [05:10<04:28,  2.10it/s]

Gradient norm: 102.97509085015571


Epoch 3 of 5 | Iteration:  54%|█████▎    | 649/1212 [05:11<04:10,  2.25it/s]

Gradient norm: 103.88401584333596


Epoch 3 of 5 | Iteration:  54%|█████▎    | 650/1212 [05:11<04:29,  2.08it/s]

Gradient norm: 106.04721338192735


Epoch 3 of 5 | Iteration:  54%|█████▎    | 651/1212 [05:12<04:13,  2.21it/s]

Gradient norm: 105.60817865865823


Epoch 3 of 5 | Iteration:  54%|█████▍    | 652/1212 [05:12<04:49,  1.93it/s]

Gradient norm: 105.59104192607884


Epoch 3 of 5 | Iteration:  54%|█████▍    | 653/1212 [05:13<04:55,  1.89it/s]

Gradient norm: 108.72997471942035


Epoch 3 of 5 | Iteration:  54%|█████▍    | 654/1212 [05:13<04:48,  1.93it/s]

Gradient norm: 107.92689190612921


Epoch 3 of 5 | Iteration:  54%|█████▍    | 655/1212 [05:14<04:30,  2.06it/s]

Gradient norm: 153.5568655667651


Epoch 3 of 5 | Iteration:  54%|█████▍    | 656/1212 [05:14<04:18,  2.15it/s]

Gradient norm: 153.7493200448364


Epoch 3 of 5 | Iteration:  54%|█████▍    | 657/1212 [05:15<04:08,  2.23it/s]

Gradient norm: 11.510274052381218


Epoch 3 of 5 | Iteration:  54%|█████▍    | 658/1212 [05:15<04:04,  2.26it/s]

Gradient norm: 13.89502135945648


Epoch 3 of 5 | Iteration:  54%|█████▍    | 659/1212 [05:16<04:01,  2.29it/s]

Gradient norm: 15.363532677117801


Epoch 3 of 5 | Iteration:  54%|█████▍    | 660/1212 [05:16<03:53,  2.37it/s]

Gradient norm: 15.367241408404032


Epoch 3 of 5 | Iteration:  55%|█████▍    | 661/1212 [05:16<03:51,  2.38it/s]

Gradient norm: 15.656769522812787


Epoch 3 of 5 | Iteration:  55%|█████▍    | 662/1212 [05:17<03:58,  2.31it/s]

Gradient norm: 17.038910878446817


Epoch 3 of 5 | Iteration:  55%|█████▍    | 663/1212 [05:17<03:57,  2.31it/s]

Gradient norm: 35.21338301434379


Epoch 3 of 5 | Iteration:  55%|█████▍    | 664/1212 [05:18<03:51,  2.36it/s]

Gradient norm: 35.30837702321707


Epoch 3 of 5 | Iteration:  55%|█████▍    | 665/1212 [05:18<04:11,  2.17it/s]

Gradient norm: 37.31511348135638


Epoch 3 of 5 | Iteration:  55%|█████▍    | 666/1212 [05:19<03:59,  2.28it/s]

Gradient norm: 39.551388159015616


Epoch 3 of 5 | Iteration:  55%|█████▌    | 667/1212 [05:19<03:48,  2.39it/s]

Gradient norm: 39.36829796208159


Epoch 3 of 5 | Iteration:  55%|█████▌    | 668/1212 [05:19<03:50,  2.36it/s]

Gradient norm: 39.387984993909335


Epoch 3 of 5 | Iteration:  55%|█████▌    | 669/1212 [05:20<04:08,  2.18it/s]

Gradient norm: 40.300625145692


Epoch 3 of 5 | Iteration:  55%|█████▌    | 670/1212 [05:20<04:19,  2.09it/s]

Gradient norm: 40.29332022382544


Epoch 3 of 5 | Iteration:  55%|█████▌    | 671/1212 [05:21<04:33,  1.98it/s]

Gradient norm: 40.16756145279606


Epoch 3 of 5 | Iteration:  55%|█████▌    | 672/1212 [05:22<04:38,  1.94it/s]

Gradient norm: 40.342742893039784


Epoch 3 of 5 | Iteration:  56%|█████▌    | 673/1212 [05:22<04:42,  1.91it/s]

Gradient norm: 3.688833760361735


Epoch 3 of 5 | Iteration:  56%|█████▌    | 674/1212 [05:23<04:53,  1.83it/s]

Gradient norm: 3.8074204784223507


Epoch 3 of 5 | Iteration:  56%|█████▌    | 675/1212 [05:23<04:51,  1.84it/s]

Gradient norm: 8.408235477689628


Epoch 3 of 5 | Iteration:  56%|█████▌    | 676/1212 [05:24<04:55,  1.81it/s]

Gradient norm: 11.703128708438998


Epoch 3 of 5 | Iteration:  56%|█████▌    | 677/1212 [05:24<04:27,  2.00it/s]

Gradient norm: 19.778132668919334


Epoch 3 of 5 | Iteration:  56%|█████▌    | 678/1212 [05:25<04:07,  2.15it/s]

Gradient norm: 22.288862222288248


Epoch 3 of 5 | Iteration:  56%|█████▌    | 679/1212 [05:25<04:17,  2.07it/s]

Gradient norm: 25.01807546410016


Epoch 3 of 5 | Iteration:  56%|█████▌    | 680/1212 [05:25<04:02,  2.19it/s]

Gradient norm: 25.91493261833428


Epoch 3 of 5 | Iteration:  56%|█████▌    | 681/1212 [05:26<03:53,  2.27it/s]

Gradient norm: 28.636144423448222


Epoch 3 of 5 | Iteration:  56%|█████▋    | 682/1212 [05:26<03:43,  2.37it/s]

Gradient norm: 29.043119976220506


Epoch 3 of 5 | Iteration:  56%|█████▋    | 683/1212 [05:27<03:55,  2.25it/s]

Gradient norm: 36.61459359410211


Epoch 3 of 5 | Iteration:  56%|█████▋    | 684/1212 [05:27<03:53,  2.26it/s]

Gradient norm: 38.150463204772684


Epoch 3 of 5 | Iteration:  57%|█████▋    | 685/1212 [05:28<04:16,  2.05it/s]

Gradient norm: 39.27252543380996


Epoch 3 of 5 | Iteration:  57%|█████▋    | 686/1212 [05:28<04:01,  2.18it/s]

Gradient norm: 41.00176262334466


Epoch 3 of 5 | Iteration:  57%|█████▋    | 687/1212 [05:29<03:50,  2.28it/s]

Gradient norm: 46.0092868793089


Epoch 3 of 5 | Iteration:  57%|█████▋    | 688/1212 [05:29<03:47,  2.30it/s]

Gradient norm: 46.40532835113096


Epoch 3 of 5 | Iteration:  57%|█████▋    | 689/1212 [05:29<03:38,  2.39it/s]

Gradient norm: 224.75189018603388


Epoch 3 of 5 | Iteration:  57%|█████▋    | 690/1212 [05:30<04:01,  2.16it/s]

Gradient norm: 317.23600615832703


Epoch 3 of 5 | Iteration:  57%|█████▋    | 691/1212 [05:31<04:30,  1.93it/s]

Gradient norm: 315.74767546226474


Epoch 3 of 5 | Iteration:  57%|█████▋    | 692/1212 [05:31<04:24,  1.97it/s]

Gradient norm: 319.3358351517632


Epoch 3 of 5 | Iteration:  57%|█████▋    | 693/1212 [05:32<04:35,  1.89it/s]

Gradient norm: 319.46370851329243


Epoch 3 of 5 | Iteration:  57%|█████▋    | 694/1212 [05:32<04:14,  2.03it/s]

Gradient norm: 317.83510722826855


Epoch 3 of 5 | Iteration:  57%|█████▋    | 695/1212 [05:32<03:58,  2.17it/s]

Gradient norm: 317.292553655141


Epoch 3 of 5 | Iteration:  57%|█████▋    | 696/1212 [05:33<03:48,  2.26it/s]

Gradient norm: 317.22384789793193


Epoch 3 of 5 | Iteration:  58%|█████▊    | 697/1212 [05:33<04:07,  2.08it/s]

Gradient norm: 319.921492694278


Epoch 3 of 5 | Iteration:  58%|█████▊    | 698/1212 [05:34<04:11,  2.05it/s]

Gradient norm: 319.7794617720293


Epoch 3 of 5 | Iteration:  58%|█████▊    | 699/1212 [05:35<04:36,  1.86it/s]

Gradient norm: 346.12217267784075


Epoch 3 of 5 | Iteration:  58%|█████▊    | 700/1212 [05:35<04:41,  1.82it/s]

Gradient norm: 348.5038497678667


Epoch 3 of 5 | Iteration:  58%|█████▊    | 701/1212 [05:36<04:36,  1.84it/s]

Gradient norm: 348.53131060641323


Epoch 3 of 5 | Iteration:  58%|█████▊    | 702/1212 [05:36<04:39,  1.83it/s]

Gradient norm: 350.3846641392271


Epoch 3 of 5 | Iteration:  58%|█████▊    | 703/1212 [05:37<04:40,  1.81it/s]

Gradient norm: 481.0905245443567


Epoch 3 of 5 | Iteration:  58%|█████▊    | 704/1212 [05:37<04:25,  1.91it/s]

Gradient norm: 481.1044211085637


Epoch 3 of 5 | Iteration:  58%|█████▊    | 705/1212 [05:38<04:23,  1.92it/s]

Gradient norm: 2.7793191666101396


Epoch 3 of 5 | Iteration:  58%|█████▊    | 706/1212 [05:38<04:42,  1.79it/s]

Gradient norm: 116.53532030264432


Epoch 3 of 5 | Iteration:  58%|█████▊    | 707/1212 [05:39<04:48,  1.75it/s]

Gradient norm: 116.95471493673571


Epoch 3 of 5 | Iteration:  58%|█████▊    | 708/1212 [05:39<04:20,  1.94it/s]

Gradient norm: 115.97339535401694


Epoch 3 of 5 | Iteration:  58%|█████▊    | 709/1212 [05:40<04:11,  2.00it/s]

Gradient norm: 117.03849931708208


Epoch 3 of 5 | Iteration:  59%|█████▊    | 710/1212 [05:40<03:52,  2.16it/s]

Gradient norm: 118.95184927068743


Epoch 3 of 5 | Iteration:  59%|█████▊    | 711/1212 [05:41<03:45,  2.22it/s]

Gradient norm: 114.43002040602563


Epoch 3 of 5 | Iteration:  59%|█████▊    | 712/1212 [05:41<03:59,  2.08it/s]

Gradient norm: 112.9792257537705


Epoch 3 of 5 | Iteration:  59%|█████▉    | 713/1212 [05:42<03:58,  2.09it/s]

Gradient norm: 113.15849579668473


Epoch 3 of 5 | Iteration:  59%|█████▉    | 714/1212 [05:42<03:46,  2.20it/s]

Gradient norm: 122.43145330542247


Epoch 3 of 5 | Iteration:  59%|█████▉    | 715/1212 [05:42<03:37,  2.28it/s]

Gradient norm: 123.02217322271261


Epoch 3 of 5 | Iteration:  59%|█████▉    | 716/1212 [05:43<03:29,  2.36it/s]

Gradient norm: 123.64544909560236


Epoch 3 of 5 | Iteration:  59%|█████▉    | 717/1212 [05:43<03:34,  2.31it/s]

Gradient norm: 123.705449439745


Epoch 3 of 5 | Iteration:  59%|█████▉    | 718/1212 [05:44<03:33,  2.31it/s]

Gradient norm: 123.62680144856888


Epoch 3 of 5 | Iteration:  59%|█████▉    | 719/1212 [05:44<03:45,  2.18it/s]

Gradient norm: 136.15495764181676


Epoch 3 of 5 | Iteration:  59%|█████▉    | 720/1212 [05:45<03:37,  2.26it/s]

Gradient norm: 136.81775700167816


Epoch 3 of 5 | Iteration:  59%|█████▉    | 721/1212 [05:45<03:28,  2.35it/s]

Gradient norm: 153.1469637209934


Epoch 3 of 5 | Iteration:  60%|█████▉    | 722/1212 [05:45<03:19,  2.46it/s]

Gradient norm: 153.04965701218055


Epoch 3 of 5 | Iteration:  60%|█████▉    | 723/1212 [05:46<03:21,  2.42it/s]

Gradient norm: 153.86497878367518


Epoch 3 of 5 | Iteration:  60%|█████▉    | 724/1212 [05:46<03:15,  2.49it/s]

Gradient norm: 153.3617578032444


Epoch 3 of 5 | Iteration:  60%|█████▉    | 725/1212 [05:47<03:13,  2.51it/s]

Gradient norm: 153.35571039318856


Epoch 3 of 5 | Iteration:  60%|█████▉    | 726/1212 [05:47<03:25,  2.37it/s]

Gradient norm: 153.3877138331317


Epoch 3 of 5 | Iteration:  60%|█████▉    | 727/1212 [05:48<03:36,  2.24it/s]

Gradient norm: 153.24484459085136


Epoch 3 of 5 | Iteration:  60%|██████    | 728/1212 [05:48<04:01,  2.01it/s]

Gradient norm: 153.23965275763982


Epoch 3 of 5 | Iteration:  60%|██████    | 729/1212 [05:49<05:13,  1.54it/s]

Gradient norm: 153.3642201419021


Epoch 3 of 5 | Iteration:  60%|██████    | 730/1212 [05:50<04:56,  1.63it/s]

Gradient norm: 153.15805735131232


Epoch 3 of 5 | Iteration:  60%|██████    | 731/1212 [05:50<04:55,  1.63it/s]

Gradient norm: 152.86176375090983


Epoch 3 of 5 | Iteration:  60%|██████    | 732/1212 [05:51<04:24,  1.81it/s]

Gradient norm: 154.2116684574705


Epoch 3 of 5 | Iteration:  60%|██████    | 733/1212 [05:51<03:59,  2.00it/s]

Gradient norm: 153.9929751779317


Epoch 3 of 5 | Iteration:  61%|██████    | 734/1212 [05:52<03:42,  2.14it/s]

Gradient norm: 154.05017741965392


Epoch 3 of 5 | Iteration:  61%|██████    | 735/1212 [05:52<03:52,  2.05it/s]

Gradient norm: 157.344016800352


Epoch 3 of 5 | Iteration:  61%|██████    | 736/1212 [05:53<03:58,  1.99it/s]

Gradient norm: 161.00352210895636


Epoch 3 of 5 | Iteration:  61%|██████    | 737/1212 [05:53<03:41,  2.14it/s]

Gradient norm: 5.68499020283013


Epoch 3 of 5 | Iteration:  61%|██████    | 738/1212 [05:54<03:54,  2.02it/s]

Gradient norm: 6.072115517628951


Epoch 3 of 5 | Iteration:  61%|██████    | 739/1212 [05:54<03:42,  2.12it/s]

Gradient norm: 29.048637506557025


Epoch 3 of 5 | Iteration:  61%|██████    | 740/1212 [05:54<03:29,  2.26it/s]

Gradient norm: 30.11085200678484


Epoch 3 of 5 | Iteration:  61%|██████    | 741/1212 [05:55<03:37,  2.17it/s]

Gradient norm: 31.066294783681034


Epoch 3 of 5 | Iteration:  61%|██████    | 742/1212 [05:55<03:38,  2.15it/s]

Gradient norm: 31.08474385940331


Epoch 3 of 5 | Iteration:  61%|██████▏   | 743/1212 [05:56<03:45,  2.08it/s]

Gradient norm: 31.068746609698895


Epoch 3 of 5 | Iteration:  61%|██████▏   | 744/1212 [05:56<03:38,  2.15it/s]

Gradient norm: 34.08131213681497


Epoch 3 of 5 | Iteration:  61%|██████▏   | 745/1212 [05:57<03:40,  2.12it/s]

Gradient norm: 36.124042507573066


Epoch 3 of 5 | Iteration:  62%|██████▏   | 746/1212 [05:57<03:27,  2.25it/s]

Gradient norm: 36.82822160201221


Epoch 3 of 5 | Iteration:  62%|██████▏   | 747/1212 [05:58<03:16,  2.37it/s]

Gradient norm: 39.12457956094942


Epoch 3 of 5 | Iteration:  62%|██████▏   | 748/1212 [05:58<03:13,  2.40it/s]

Gradient norm: 46.91328593194007


Epoch 3 of 5 | Iteration:  62%|██████▏   | 749/1212 [05:58<03:14,  2.38it/s]

Gradient norm: 52.41928301380793


Epoch 3 of 5 | Iteration:  62%|██████▏   | 750/1212 [05:59<03:08,  2.46it/s]

Gradient norm: 52.41321940663472


Epoch 3 of 5 | Iteration:  62%|██████▏   | 751/1212 [05:59<03:14,  2.38it/s]

Gradient norm: 52.466382065823275


Epoch 3 of 5 | Iteration:  62%|██████▏   | 752/1212 [06:00<03:18,  2.32it/s]

Gradient norm: 76.67930563403156


Epoch 3 of 5 | Iteration:  62%|██████▏   | 753/1212 [06:00<03:10,  2.41it/s]

Gradient norm: 4.339090472274478


Epoch 3 of 5 | Iteration:  62%|██████▏   | 754/1212 [06:00<03:14,  2.35it/s]

Gradient norm: 4.409598086673017


Epoch 3 of 5 | Iteration:  62%|██████▏   | 755/1212 [06:01<03:44,  2.03it/s]

Gradient norm: 83.94159086743933


Epoch 3 of 5 | Iteration:  62%|██████▏   | 756/1212 [06:02<03:56,  1.92it/s]

Gradient norm: 82.37896907286495


Epoch 3 of 5 | Iteration:  62%|██████▏   | 757/1212 [06:02<04:00,  1.90it/s]

Gradient norm: 82.3368798644667


Epoch 3 of 5 | Iteration:  63%|██████▎   | 758/1212 [06:03<04:30,  1.68it/s]

Gradient norm: 76.11290588280167


Epoch 3 of 5 | Iteration:  63%|██████▎   | 759/1212 [06:04<04:51,  1.55it/s]

Gradient norm: 76.05647384440066


Epoch 3 of 5 | Iteration:  63%|██████▎   | 760/1212 [06:04<04:38,  1.62it/s]

Gradient norm: 76.00209765231587


Epoch 3 of 5 | Iteration:  63%|██████▎   | 761/1212 [06:05<04:20,  1.73it/s]

Gradient norm: 76.70306058761089


Epoch 3 of 5 | Iteration:  63%|██████▎   | 762/1212 [06:05<03:59,  1.88it/s]

Gradient norm: 76.46968968752091


Epoch 3 of 5 | Iteration:  63%|██████▎   | 763/1212 [06:06<03:37,  2.06it/s]

Gradient norm: 80.29370853782835


Epoch 3 of 5 | Iteration:  63%|██████▎   | 764/1212 [06:06<03:29,  2.14it/s]

Gradient norm: 80.40727359396783


Epoch 3 of 5 | Iteration:  63%|██████▎   | 765/1212 [06:06<03:17,  2.26it/s]

Gradient norm: 75.86285355814363


Epoch 3 of 5 | Iteration:  63%|██████▎   | 766/1212 [06:07<03:31,  2.11it/s]

Gradient norm: 76.36517042999611


Epoch 3 of 5 | Iteration:  63%|██████▎   | 767/1212 [06:07<03:19,  2.23it/s]

Gradient norm: 77.58860709823968


Epoch 3 of 5 | Iteration:  63%|██████▎   | 768/1212 [06:08<03:13,  2.30it/s]

Gradient norm: 78.26252661180848


Epoch 3 of 5 | Iteration:  63%|██████▎   | 769/1212 [06:08<03:15,  2.27it/s]

Gradient norm: 3.0671768523673104


Epoch 3 of 5 | Iteration:  64%|██████▎   | 770/1212 [06:09<03:13,  2.28it/s]

Gradient norm: 7.390600513657921


Epoch 3 of 5 | Iteration:  64%|██████▎   | 771/1212 [06:09<03:14,  2.27it/s]

Gradient norm: 8.23131316745269


Epoch 3 of 5 | Iteration:  64%|██████▎   | 772/1212 [06:09<03:07,  2.35it/s]

Gradient norm: 23.518068111677938


Epoch 3 of 5 | Iteration:  64%|██████▍   | 773/1212 [06:10<03:33,  2.05it/s]

Gradient norm: 27.17327675227476


Epoch 3 of 5 | Iteration:  64%|██████▍   | 774/1212 [06:11<03:21,  2.17it/s]

Gradient norm: 26.60258406488896


Epoch 3 of 5 | Iteration:  64%|██████▍   | 775/1212 [06:11<03:09,  2.31it/s]

Gradient norm: 27.980081843761493


Epoch 3 of 5 | Iteration:  64%|██████▍   | 776/1212 [06:11<03:16,  2.22it/s]

Gradient norm: 144.91819587388116


Epoch 3 of 5 | Iteration:  64%|██████▍   | 777/1212 [06:12<03:08,  2.31it/s]

Gradient norm: 145.52611398446248


Epoch 3 of 5 | Iteration:  64%|██████▍   | 778/1212 [06:12<03:07,  2.31it/s]

Gradient norm: 145.8522758486179


Epoch 3 of 5 | Iteration:  64%|██████▍   | 779/1212 [06:13<03:05,  2.33it/s]

Gradient norm: 145.51765722282548


Epoch 3 of 5 | Iteration:  64%|██████▍   | 780/1212 [06:13<02:57,  2.44it/s]

Gradient norm: 152.88576115961143


Epoch 3 of 5 | Iteration:  64%|██████▍   | 781/1212 [06:14<03:27,  2.07it/s]

Gradient norm: 152.6672016298822


Epoch 3 of 5 | Iteration:  65%|██████▍   | 782/1212 [06:14<03:33,  2.02it/s]

Gradient norm: 153.6834721216173


Epoch 3 of 5 | Iteration:  65%|██████▍   | 783/1212 [06:15<03:35,  1.99it/s]

Gradient norm: 153.1492243152899


Epoch 3 of 5 | Iteration:  65%|██████▍   | 784/1212 [06:15<03:44,  1.91it/s]

Gradient norm: 153.49720171916886


Epoch 3 of 5 | Iteration:  65%|██████▍   | 785/1212 [06:16<03:49,  1.86it/s]

Gradient norm: 5.158134395038783


Epoch 3 of 5 | Iteration:  65%|██████▍   | 786/1212 [06:16<03:50,  1.84it/s]

Gradient norm: 18.37730138724198


Epoch 3 of 5 | Iteration:  65%|██████▍   | 787/1212 [06:17<03:46,  1.88it/s]

Gradient norm: 43.338440818379915


Epoch 3 of 5 | Iteration:  65%|██████▌   | 788/1212 [06:18<04:00,  1.76it/s]

Gradient norm: 44.09033409294476


Epoch 3 of 5 | Iteration:  65%|██████▌   | 789/1212 [06:18<03:37,  1.95it/s]

Gradient norm: 50.43123813990498


Epoch 3 of 5 | Iteration:  65%|██████▌   | 790/1212 [06:18<03:25,  2.05it/s]

Gradient norm: 50.33284809226314


Epoch 3 of 5 | Iteration:  65%|██████▌   | 791/1212 [06:19<03:12,  2.19it/s]

Gradient norm: 52.616046080184326


Epoch 3 of 5 | Iteration:  65%|██████▌   | 792/1212 [06:19<03:01,  2.32it/s]

Gradient norm: 95.80678141540479


Epoch 3 of 5 | Iteration:  65%|██████▌   | 793/1212 [06:19<02:53,  2.41it/s]

Gradient norm: 95.87028494415414


Epoch 3 of 5 | Iteration:  66%|██████▌   | 794/1212 [06:20<03:30,  1.98it/s]

Gradient norm: 95.82584277600135


Epoch 3 of 5 | Iteration:  66%|██████▌   | 795/1212 [06:21<03:21,  2.06it/s]

Gradient norm: 95.48663468473913


Epoch 3 of 5 | Iteration:  66%|██████▌   | 796/1212 [06:21<03:29,  1.99it/s]

Gradient norm: 97.17151688732162


Epoch 3 of 5 | Iteration:  66%|██████▌   | 797/1212 [06:22<03:14,  2.14it/s]

Gradient norm: 96.66595311735577


Epoch 3 of 5 | Iteration:  66%|██████▌   | 798/1212 [06:22<03:07,  2.21it/s]

Gradient norm: 96.82939104087151


Epoch 3 of 5 | Iteration:  66%|██████▌   | 799/1212 [06:22<03:03,  2.25it/s]

Gradient norm: 94.60883073842484


Epoch 3 of 5 | Iteration:  66%|██████▌   | 800/1212 [06:23<02:58,  2.31it/s]

Gradient norm: 184.70493964706634


Epoch 3 of 5 | Iteration:  66%|██████▌   | 801/1212 [06:23<02:59,  2.29it/s]

Gradient norm: 480.1111118071296


Epoch 3 of 5 | Iteration:  66%|██████▌   | 802/1212 [06:24<03:01,  2.26it/s]

Gradient norm: 479.979074478189


Epoch 3 of 5 | Iteration:  66%|██████▋   | 803/1212 [06:24<02:59,  2.28it/s]

Gradient norm: 479.81265826008814


Epoch 3 of 5 | Iteration:  66%|██████▋   | 804/1212 [06:25<03:07,  2.18it/s]

Gradient norm: 479.79671752400986


Epoch 3 of 5 | Iteration:  66%|██████▋   | 805/1212 [06:25<02:57,  2.29it/s]

Gradient norm: 480.3257415702869


Epoch 3 of 5 | Iteration:  67%|██████▋   | 806/1212 [06:25<02:48,  2.41it/s]

Gradient norm: 480.15057839765876


Epoch 3 of 5 | Iteration:  67%|██████▋   | 807/1212 [06:26<02:44,  2.46it/s]

Gradient norm: 479.7582046621762


Epoch 3 of 5 | Iteration:  67%|██████▋   | 808/1212 [06:26<02:43,  2.47it/s]

Gradient norm: 478.8749899077567


Epoch 3 of 5 | Iteration:  67%|██████▋   | 809/1212 [06:27<02:47,  2.41it/s]

Gradient norm: 477.773135331161


Epoch 3 of 5 | Iteration:  67%|██████▋   | 810/1212 [06:27<02:44,  2.44it/s]

Gradient norm: 474.5069312105486


Epoch 3 of 5 | Iteration:  67%|██████▋   | 811/1212 [06:27<02:41,  2.48it/s]

Gradient norm: 474.36179078283527


Epoch 3 of 5 | Iteration:  67%|██████▋   | 812/1212 [06:28<03:00,  2.21it/s]

Gradient norm: 475.1296462398432


Epoch 3 of 5 | Iteration:  67%|██████▋   | 813/1212 [06:28<03:07,  2.13it/s]

Gradient norm: 475.08371107455804


Epoch 3 of 5 | Iteration:  67%|██████▋   | 814/1212 [06:29<03:40,  1.81it/s]

Gradient norm: 474.09251229810917


Epoch 3 of 5 | Iteration:  67%|██████▋   | 815/1212 [06:30<03:36,  1.84it/s]

Gradient norm: 473.95841189363495


Epoch 3 of 5 | Iteration:  67%|██████▋   | 816/1212 [06:30<03:44,  1.76it/s]

Gradient norm: 473.17804497835016


Epoch 3 of 5 | Iteration:  67%|██████▋   | 817/1212 [06:31<03:41,  1.79it/s]

Gradient norm: 2.915399167594498


Epoch 3 of 5 | Iteration:  67%|██████▋   | 818/1212 [06:31<03:30,  1.87it/s]

Gradient norm: 5.348282983410779


Epoch 3 of 5 | Iteration:  68%|██████▊   | 819/1212 [06:32<03:21,  1.95it/s]

Gradient norm: 6.383873950622935


Epoch 3 of 5 | Iteration:  68%|██████▊   | 820/1212 [06:32<03:11,  2.04it/s]

Gradient norm: 6.756940324356477


Epoch 3 of 5 | Iteration:  68%|██████▊   | 821/1212 [06:33<03:13,  2.02it/s]

Gradient norm: 7.483005522084605


Epoch 3 of 5 | Iteration:  68%|██████▊   | 822/1212 [06:33<03:00,  2.16it/s]

Gradient norm: 19.874543179657906


Epoch 3 of 5 | Iteration:  68%|██████▊   | 823/1212 [06:34<03:01,  2.15it/s]

Gradient norm: 55.44841356763573


Epoch 3 of 5 | Iteration:  68%|██████▊   | 824/1212 [06:34<03:01,  2.14it/s]

Gradient norm: 56.171428844755354


Epoch 3 of 5 | Iteration:  68%|██████▊   | 825/1212 [06:35<03:25,  1.89it/s]

Gradient norm: 59.2112387191291


Epoch 3 of 5 | Iteration:  68%|██████▊   | 826/1212 [06:35<03:08,  2.05it/s]

Gradient norm: 66.48189923843881


Epoch 3 of 5 | Iteration:  68%|██████▊   | 827/1212 [06:36<03:07,  2.06it/s]

Gradient norm: 65.86731662911707


Epoch 3 of 5 | Iteration:  68%|██████▊   | 828/1212 [06:36<02:54,  2.20it/s]

Gradient norm: 65.90452769100268


Epoch 3 of 5 | Iteration:  68%|██████▊   | 829/1212 [06:36<02:48,  2.27it/s]

Gradient norm: 62.809830418559905


Epoch 3 of 5 | Iteration:  68%|██████▊   | 830/1212 [06:37<02:43,  2.34it/s]

Gradient norm: 63.29058357132875


Epoch 3 of 5 | Iteration:  69%|██████▊   | 831/1212 [06:37<02:37,  2.42it/s]

Gradient norm: 63.43460983645428


Epoch 3 of 5 | Iteration:  69%|██████▊   | 832/1212 [06:38<02:37,  2.41it/s]

Gradient norm: 64.82802187221897


Epoch 3 of 5 | Iteration:  69%|██████▊   | 833/1212 [06:38<02:55,  2.16it/s]

Gradient norm: 2.9602447559049008


Epoch 3 of 5 | Iteration:  69%|██████▉   | 834/1212 [06:39<02:47,  2.26it/s]

Gradient norm: 3.0358047318070147


Epoch 3 of 5 | Iteration:  69%|██████▉   | 835/1212 [06:39<02:49,  2.23it/s]

Gradient norm: 5.706395937401749


Epoch 3 of 5 | Iteration:  69%|██████▉   | 836/1212 [06:40<02:44,  2.28it/s]

Gradient norm: 85.70767230533552


Epoch 3 of 5 | Iteration:  69%|██████▉   | 837/1212 [06:40<02:56,  2.12it/s]

Gradient norm: 84.19223207946567


Epoch 3 of 5 | Iteration:  69%|██████▉   | 838/1212 [06:41<03:04,  2.02it/s]

Gradient norm: 89.14160281587834


Epoch 3 of 5 | Iteration:  69%|██████▉   | 839/1212 [06:41<03:25,  1.81it/s]

Gradient norm: 93.83873308663827


Epoch 3 of 5 | Iteration:  69%|██████▉   | 840/1212 [06:42<03:24,  1.82it/s]

Gradient norm: 94.03683342588664


Epoch 3 of 5 | Iteration:  69%|██████▉   | 841/1212 [06:42<03:16,  1.89it/s]

Gradient norm: 93.96944744181575


Epoch 3 of 5 | Iteration:  69%|██████▉   | 842/1212 [06:43<03:27,  1.79it/s]

Gradient norm: 92.66019044671187


Epoch 3 of 5 | Iteration:  70%|██████▉   | 843/1212 [06:43<03:21,  1.83it/s]

Gradient norm: 93.04322875195952


Epoch 3 of 5 | Iteration:  70%|██████▉   | 844/1212 [06:44<03:22,  1.82it/s]

Gradient norm: 91.97471294328132


Epoch 3 of 5 | Iteration:  70%|██████▉   | 845/1212 [06:45<03:46,  1.62it/s]

Gradient norm: 91.81904142897386


Epoch 3 of 5 | Iteration:  70%|██████▉   | 846/1212 [06:45<03:19,  1.83it/s]

Gradient norm: 972.5216722940568


Epoch 3 of 5 | Iteration:  70%|██████▉   | 847/1212 [06:46<03:01,  2.01it/s]

Gradient norm: 971.8944470329224


Epoch 3 of 5 | Iteration:  70%|██████▉   | 848/1212 [06:46<02:49,  2.15it/s]

Gradient norm: 972.1247176756684


Epoch 3 of 5 | Iteration:  70%|███████   | 849/1212 [06:46<02:50,  2.12it/s]

Gradient norm: 4.412366749489542


Epoch 3 of 5 | Iteration:  70%|███████   | 850/1212 [06:47<02:50,  2.12it/s]

Gradient norm: 4.874205970196931


Epoch 3 of 5 | Iteration:  70%|███████   | 851/1212 [06:47<02:53,  2.08it/s]

Gradient norm: 8.094974467664345


Epoch 3 of 5 | Iteration:  70%|███████   | 852/1212 [06:48<02:53,  2.08it/s]

Gradient norm: 13.861782521997064


Epoch 3 of 5 | Iteration:  70%|███████   | 853/1212 [06:48<02:43,  2.20it/s]

Gradient norm: 14.205384691363543


Epoch 3 of 5 | Iteration:  70%|███████   | 854/1212 [06:49<02:50,  2.10it/s]

Gradient norm: 32.562851050218136


Epoch 3 of 5 | Iteration:  71%|███████   | 855/1212 [06:49<03:09,  1.88it/s]

Gradient norm: 33.96715014855643


Epoch 3 of 5 | Iteration:  71%|███████   | 856/1212 [06:50<03:08,  1.89it/s]

Gradient norm: 33.489924878652374


Epoch 3 of 5 | Iteration:  71%|███████   | 857/1212 [06:50<02:57,  2.00it/s]

Gradient norm: 45.80611885652374


Epoch 3 of 5 | Iteration:  71%|███████   | 858/1212 [06:51<02:47,  2.12it/s]

Gradient norm: 45.79810965439385


Epoch 3 of 5 | Iteration:  71%|███████   | 859/1212 [06:51<02:39,  2.21it/s]

Gradient norm: 45.872582590552774


Epoch 3 of 5 | Iteration:  71%|███████   | 860/1212 [06:52<02:39,  2.21it/s]

Gradient norm: 45.30459247434001


Epoch 3 of 5 | Iteration:  71%|███████   | 861/1212 [06:52<02:46,  2.10it/s]

Gradient norm: 65.88925269609945


Epoch 3 of 5 | Iteration:  71%|███████   | 862/1212 [06:53<02:51,  2.04it/s]

Gradient norm: 66.14559087389044


Epoch 3 of 5 | Iteration:  71%|███████   | 863/1212 [06:53<02:49,  2.06it/s]

Gradient norm: 66.37205028840604


Epoch 3 of 5 | Iteration:  71%|███████▏  | 864/1212 [06:54<02:56,  1.97it/s]

Gradient norm: 66.57562578500453


Epoch 3 of 5 | Iteration:  71%|███████▏  | 865/1212 [06:54<02:43,  2.12it/s]

Gradient norm: 1.0841256489485063


Epoch 3 of 5 | Iteration:  71%|███████▏  | 866/1212 [06:55<02:43,  2.12it/s]

Gradient norm: 115.85833919317623


Epoch 3 of 5 | Iteration:  72%|███████▏  | 867/1212 [06:55<02:56,  1.96it/s]

Gradient norm: 116.96670958056994


Epoch 3 of 5 | Iteration:  72%|███████▏  | 868/1212 [06:56<02:55,  1.96it/s]

Gradient norm: 118.28998256407608


Epoch 3 of 5 | Iteration:  72%|███████▏  | 869/1212 [06:56<02:58,  1.92it/s]

Gradient norm: 121.4127603971535


Epoch 3 of 5 | Iteration:  72%|███████▏  | 870/1212 [06:57<03:00,  1.90it/s]

Gradient norm: 119.40136805680336


Epoch 3 of 5 | Iteration:  72%|███████▏  | 871/1212 [06:57<02:55,  1.94it/s]

Gradient norm: 119.64977483278876


Epoch 3 of 5 | Iteration:  72%|███████▏  | 872/1212 [06:58<03:15,  1.74it/s]

Gradient norm: 122.81429619247824


Epoch 3 of 5 | Iteration:  72%|███████▏  | 873/1212 [06:59<03:14,  1.74it/s]

Gradient norm: 122.93579371306826


Epoch 3 of 5 | Iteration:  72%|███████▏  | 874/1212 [06:59<02:56,  1.92it/s]

Gradient norm: 123.64288466954638


Epoch 3 of 5 | Iteration:  72%|███████▏  | 875/1212 [06:59<02:50,  1.97it/s]

Gradient norm: 122.86803737485778


Epoch 3 of 5 | Iteration:  72%|███████▏  | 876/1212 [07:00<02:54,  1.92it/s]

Gradient norm: 123.00802477914647


Epoch 3 of 5 | Iteration:  72%|███████▏  | 877/1212 [07:01<03:02,  1.83it/s]

Gradient norm: 123.07784986288813


Epoch 3 of 5 | Iteration:  72%|███████▏  | 878/1212 [07:01<02:44,  2.03it/s]

Gradient norm: 123.0852378881666


Epoch 3 of 5 | Iteration:  73%|███████▎  | 879/1212 [07:01<02:37,  2.11it/s]

Gradient norm: 131.4936104565732


Epoch 3 of 5 | Iteration:  73%|███████▎  | 880/1212 [07:02<02:32,  2.17it/s]

Gradient norm: 130.80817205435628


Epoch 3 of 5 | Iteration:  73%|███████▎  | 881/1212 [07:02<02:36,  2.11it/s]

Gradient norm: 9.911494102326268


Epoch 3 of 5 | Iteration:  73%|███████▎  | 882/1212 [07:03<02:28,  2.23it/s]

Gradient norm: 9.815328535024335


Epoch 3 of 5 | Iteration:  73%|███████▎  | 883/1212 [07:03<02:21,  2.33it/s]

Gradient norm: 19.13053651834487


Epoch 3 of 5 | Iteration:  73%|███████▎  | 884/1212 [07:04<02:31,  2.16it/s]

Gradient norm: 20.65430798541481


Epoch 3 of 5 | Iteration:  73%|███████▎  | 885/1212 [07:04<02:30,  2.17it/s]

Gradient norm: 21.35525811609276


Epoch 3 of 5 | Iteration:  73%|███████▎  | 886/1212 [07:05<02:36,  2.08it/s]

Gradient norm: 880.0699741514278


Epoch 3 of 5 | Iteration:  73%|███████▎  | 887/1212 [07:05<02:29,  2.18it/s]

Gradient norm: 880.2105258779297


Epoch 3 of 5 | Iteration:  73%|███████▎  | 888/1212 [07:05<02:21,  2.28it/s]

Gradient norm: 880.1785671202217


Epoch 3 of 5 | Iteration:  73%|███████▎  | 889/1212 [07:06<02:15,  2.39it/s]

Gradient norm: 880.1393252994403


Epoch 3 of 5 | Iteration:  73%|███████▎  | 890/1212 [07:06<02:11,  2.45it/s]

Gradient norm: 879.9863843141727


Epoch 3 of 5 | Iteration:  74%|███████▎  | 891/1212 [07:07<02:09,  2.48it/s]

Gradient norm: 880.6985340041708


Epoch 3 of 5 | Iteration:  74%|███████▎  | 892/1212 [07:07<02:10,  2.44it/s]

Gradient norm: 880.81490308726


Epoch 3 of 5 | Iteration:  74%|███████▎  | 893/1212 [07:08<02:24,  2.20it/s]

Gradient norm: 880.0167405768861


Epoch 3 of 5 | Iteration:  74%|███████▍  | 894/1212 [07:08<02:43,  1.94it/s]

Gradient norm: 880.7400420535268


Epoch 3 of 5 | Iteration:  74%|███████▍  | 895/1212 [07:09<02:41,  1.97it/s]

Gradient norm: 880.3516981337021


Epoch 3 of 5 | Iteration:  74%|███████▍  | 896/1212 [07:09<02:43,  1.93it/s]

Gradient norm: 879.8117541893531


Epoch 3 of 5 | Iteration:  74%|███████▍  | 897/1212 [07:10<02:49,  1.86it/s]

Gradient norm: 3.4600027997146134


Epoch 3 of 5 | Iteration:  74%|███████▍  | 898/1212 [07:10<02:50,  1.84it/s]

Gradient norm: 9.671025719107442


Epoch 3 of 5 | Iteration:  74%|███████▍  | 899/1212 [07:11<02:49,  1.84it/s]

Gradient norm: 10.3624874645189


Epoch 3 of 5 | Iteration:  74%|███████▍  | 900/1212 [07:12<02:51,  1.82it/s]

Gradient norm: 11.24994502704105


Epoch 3 of 5 | Iteration:  74%|███████▍  | 901/1212 [07:12<02:54,  1.78it/s]

Gradient norm: 29.289650933030007


Epoch 3 of 5 | Iteration:  74%|███████▍  | 902/1212 [07:13<02:44,  1.88it/s]

Gradient norm: 29.506013370950694


Epoch 3 of 5 | Iteration:  75%|███████▍  | 903/1212 [07:13<02:55,  1.76it/s]

Gradient norm: 29.209557278564905


Epoch 3 of 5 | Iteration:  75%|███████▍  | 904/1212 [07:14<02:40,  1.92it/s]

Gradient norm: 37.034241835705544


Epoch 3 of 5 | Iteration:  75%|███████▍  | 905/1212 [07:14<02:34,  1.98it/s]

Gradient norm: 36.73109268711777


Epoch 3 of 5 | Iteration:  75%|███████▍  | 906/1212 [07:14<02:22,  2.15it/s]

Gradient norm: 36.454525629771126


Epoch 3 of 5 | Iteration:  75%|███████▍  | 907/1212 [07:15<02:17,  2.22it/s]

Gradient norm: 56.51015241350804


Epoch 3 of 5 | Iteration:  75%|███████▍  | 908/1212 [07:15<02:13,  2.27it/s]

Gradient norm: 63.73302442467118


Epoch 3 of 5 | Iteration:  75%|███████▌  | 909/1212 [07:16<02:09,  2.34it/s]

Gradient norm: 63.247721112722004


Epoch 3 of 5 | Iteration:  75%|███████▌  | 910/1212 [07:16<02:06,  2.38it/s]

Gradient norm: 62.40302207657305


Epoch 3 of 5 | Iteration:  75%|███████▌  | 911/1212 [07:17<02:04,  2.41it/s]

Gradient norm: 79.33533511088406


Epoch 3 of 5 | Iteration:  75%|███████▌  | 912/1212 [07:17<02:04,  2.40it/s]

Gradient norm: 75.98659709111998


Epoch 3 of 5 | Iteration:  75%|███████▌  | 913/1212 [07:17<02:01,  2.46it/s]

Gradient norm: 19.594027885193864


Epoch 3 of 5 | Iteration:  75%|███████▌  | 914/1212 [07:18<02:07,  2.33it/s]

Gradient norm: 22.819303041755397


Epoch 3 of 5 | Iteration:  75%|███████▌  | 915/1212 [07:18<02:04,  2.39it/s]

Gradient norm: 25.095712031795426


Epoch 3 of 5 | Iteration:  76%|███████▌  | 916/1212 [07:19<02:21,  2.09it/s]

Gradient norm: 25.268825289658675


Epoch 3 of 5 | Iteration:  76%|███████▌  | 917/1212 [07:19<02:24,  2.04it/s]

Gradient norm: 26.994206299510303


Epoch 3 of 5 | Iteration:  76%|███████▌  | 918/1212 [07:20<02:17,  2.14it/s]

Gradient norm: 61.826440947841164


Epoch 3 of 5 | Iteration:  76%|███████▌  | 919/1212 [07:20<02:25,  2.02it/s]

Gradient norm: 62.01925952070207


Epoch 3 of 5 | Iteration:  76%|███████▌  | 920/1212 [07:21<02:25,  2.01it/s]

Gradient norm: 62.116373572086104


Epoch 3 of 5 | Iteration:  76%|███████▌  | 921/1212 [07:21<02:15,  2.14it/s]

Gradient norm: 64.0105066639837


Epoch 3 of 5 | Iteration:  76%|███████▌  | 922/1212 [07:22<02:11,  2.20it/s]

Gradient norm: 64.01739726327419


Epoch 3 of 5 | Iteration:  76%|███████▌  | 923/1212 [07:22<02:08,  2.25it/s]

Gradient norm: 63.93822162600229


Epoch 3 of 5 | Iteration:  76%|███████▌  | 924/1212 [07:23<02:20,  2.04it/s]

Gradient norm: 64.63852969930343


Epoch 3 of 5 | Iteration:  76%|███████▋  | 925/1212 [07:23<02:23,  2.00it/s]

Gradient norm: 64.29291035531682


Epoch 3 of 5 | Iteration:  76%|███████▋  | 926/1212 [07:24<02:29,  1.92it/s]

Gradient norm: 64.44374827838315


Epoch 3 of 5 | Iteration:  76%|███████▋  | 927/1212 [07:24<02:28,  1.92it/s]

Gradient norm: 64.42191055228892


Epoch 3 of 5 | Iteration:  77%|███████▋  | 928/1212 [07:25<02:25,  1.96it/s]

Gradient norm: 64.51096049271489


Epoch 3 of 5 | Iteration:  77%|███████▋  | 929/1212 [07:25<02:33,  1.84it/s]

Gradient norm: 46.230484856991495


Epoch 3 of 5 | Iteration:  77%|███████▋  | 930/1212 [07:26<02:42,  1.74it/s]

Gradient norm: 53.7115603966175


Epoch 3 of 5 | Iteration:  77%|███████▋  | 931/1212 [07:26<02:32,  1.84it/s]

Gradient norm: 188.22997327503535


Epoch 3 of 5 | Iteration:  77%|███████▋  | 932/1212 [07:27<02:18,  2.02it/s]

Gradient norm: 191.41383610706313


Epoch 3 of 5 | Iteration:  77%|███████▋  | 933/1212 [07:27<02:10,  2.14it/s]

Gradient norm: 191.57654432298102


Epoch 3 of 5 | Iteration:  77%|███████▋  | 934/1212 [07:28<02:01,  2.29it/s]

Gradient norm: 196.46153156011786


Epoch 3 of 5 | Iteration:  77%|███████▋  | 935/1212 [07:28<02:17,  2.02it/s]

Gradient norm: 198.1141856002407


Epoch 3 of 5 | Iteration:  77%|███████▋  | 936/1212 [07:29<02:30,  1.83it/s]

Gradient norm: 198.3805961769493


Epoch 3 of 5 | Iteration:  77%|███████▋  | 937/1212 [07:29<02:16,  2.01it/s]

Gradient norm: 198.6472917519714


Epoch 3 of 5 | Iteration:  77%|███████▋  | 938/1212 [07:30<02:13,  2.05it/s]

Gradient norm: 198.66348359597418


Epoch 3 of 5 | Iteration:  77%|███████▋  | 939/1212 [07:30<02:05,  2.18it/s]

Gradient norm: 199.1480383205577


Epoch 3 of 5 | Iteration:  78%|███████▊  | 940/1212 [07:31<01:58,  2.29it/s]

Gradient norm: 203.71435641467565


Epoch 3 of 5 | Iteration:  78%|███████▊  | 941/1212 [07:31<02:26,  1.85it/s]

Gradient norm: 203.15940960574986


Epoch 3 of 5 | Iteration:  78%|███████▊  | 942/1212 [07:32<02:29,  1.81it/s]

Gradient norm: 203.0786315384554


Epoch 3 of 5 | Iteration:  78%|███████▊  | 943/1212 [07:32<02:19,  1.93it/s]

Gradient norm: 203.17388221302002


Epoch 3 of 5 | Iteration:  78%|███████▊  | 944/1212 [07:33<02:09,  2.06it/s]

Gradient norm: 212.64096952492642


Epoch 3 of 5 | Iteration:  78%|███████▊  | 945/1212 [07:33<02:13,  2.00it/s]

Gradient norm: 1.4079968761621484


Epoch 3 of 5 | Iteration:  78%|███████▊  | 946/1212 [07:34<02:08,  2.06it/s]

Gradient norm: 3.8334032395560595


Epoch 3 of 5 | Iteration:  78%|███████▊  | 947/1212 [07:34<02:01,  2.19it/s]

Gradient norm: 4.1165578228909805


Epoch 3 of 5 | Iteration:  78%|███████▊  | 948/1212 [07:35<02:03,  2.13it/s]

Gradient norm: 4.962036047576585


Epoch 3 of 5 | Iteration:  78%|███████▊  | 949/1212 [07:35<01:58,  2.23it/s]

Gradient norm: 15.147198762409461


Epoch 3 of 5 | Iteration:  78%|███████▊  | 950/1212 [07:35<01:56,  2.25it/s]

Gradient norm: 16.338109504925516


Epoch 3 of 5 | Iteration:  78%|███████▊  | 951/1212 [07:36<01:51,  2.35it/s]

Gradient norm: 20.675088014805507


Epoch 3 of 5 | Iteration:  79%|███████▊  | 952/1212 [07:36<02:04,  2.09it/s]

Gradient norm: 20.72164566596711


Epoch 3 of 5 | Iteration:  79%|███████▊  | 953/1212 [07:37<02:05,  2.06it/s]

Gradient norm: 23.56976069814891


Epoch 3 of 5 | Iteration:  79%|███████▊  | 954/1212 [07:38<02:17,  1.87it/s]

Gradient norm: 28.699209638616253


Epoch 3 of 5 | Iteration:  79%|███████▉  | 955/1212 [07:38<02:22,  1.80it/s]

Gradient norm: 30.607390459812038


Epoch 3 of 5 | Iteration:  79%|███████▉  | 956/1212 [07:39<02:22,  1.79it/s]

Gradient norm: 30.79560073629143


Epoch 3 of 5 | Iteration:  79%|███████▉  | 957/1212 [07:39<02:30,  1.70it/s]

Gradient norm: 36.20761774236777


Epoch 3 of 5 | Iteration:  79%|███████▉  | 958/1212 [07:40<02:25,  1.75it/s]

Gradient norm: 37.06155258486277


Epoch 3 of 5 | Iteration:  79%|███████▉  | 959/1212 [07:41<02:20,  1.80it/s]

Gradient norm: 37.166565730531346


Epoch 3 of 5 | Iteration:  79%|███████▉  | 960/1212 [07:41<02:17,  1.83it/s]

Gradient norm: 37.21476841045245


Epoch 3 of 5 | Iteration:  79%|███████▉  | 961/1212 [07:41<02:06,  1.98it/s]

Gradient norm: 606.8467111485401


Epoch 3 of 5 | Iteration:  79%|███████▉  | 962/1212 [07:42<02:11,  1.91it/s]

Gradient norm: 605.6355965776283


Epoch 3 of 5 | Iteration:  79%|███████▉  | 963/1212 [07:42<02:00,  2.07it/s]

Gradient norm: 605.639845450903


Epoch 3 of 5 | Iteration:  80%|███████▉  | 964/1212 [07:43<01:54,  2.16it/s]

Gradient norm: 605.5952634671204


Epoch 3 of 5 | Iteration:  80%|███████▉  | 965/1212 [07:43<01:53,  2.17it/s]

Gradient norm: 605.566837532666


Epoch 3 of 5 | Iteration:  80%|███████▉  | 966/1212 [07:44<01:47,  2.28it/s]

Gradient norm: 597.9961018206055


Epoch 3 of 5 | Iteration:  80%|███████▉  | 967/1212 [07:44<01:47,  2.28it/s]

Gradient norm: 598.7688847328573


Epoch 3 of 5 | Iteration:  80%|███████▉  | 968/1212 [07:45<02:02,  1.99it/s]

Gradient norm: 598.794428441564


Epoch 3 of 5 | Iteration:  80%|███████▉  | 969/1212 [07:45<02:00,  2.02it/s]

Gradient norm: 598.287517583127


Epoch 3 of 5 | Iteration:  80%|████████  | 970/1212 [07:46<01:51,  2.16it/s]

Gradient norm: 598.2038365743704


Epoch 3 of 5 | Iteration:  80%|████████  | 971/1212 [07:46<01:48,  2.21it/s]

Gradient norm: 597.9459142786842


Epoch 3 of 5 | Iteration:  80%|████████  | 972/1212 [07:46<01:45,  2.28it/s]

Gradient norm: 597.6851933553861


Epoch 3 of 5 | Iteration:  80%|████████  | 973/1212 [07:47<01:41,  2.35it/s]

Gradient norm: 596.6240542144234


Epoch 3 of 5 | Iteration:  80%|████████  | 974/1212 [07:47<01:48,  2.20it/s]

Gradient norm: 596.680596287928


Epoch 3 of 5 | Iteration:  80%|████████  | 975/1212 [07:48<01:59,  1.98it/s]

Gradient norm: 597.0894786783665


Epoch 3 of 5 | Iteration:  81%|████████  | 976/1212 [07:48<01:52,  2.09it/s]

Gradient norm: 627.8062313763625


Epoch 3 of 5 | Iteration:  81%|████████  | 977/1212 [07:49<01:51,  2.10it/s]

Gradient norm: 3.1216739709621786


Epoch 3 of 5 | Iteration:  81%|████████  | 978/1212 [07:49<01:56,  2.02it/s]

Gradient norm: 9.06801989377159


Epoch 3 of 5 | Iteration:  81%|████████  | 979/1212 [07:50<01:55,  2.02it/s]

Gradient norm: 9.123963593833373


Epoch 3 of 5 | Iteration:  81%|████████  | 980/1212 [07:51<02:11,  1.76it/s]

Gradient norm: 11.058824090460657


Epoch 3 of 5 | Iteration:  81%|████████  | 981/1212 [07:51<02:07,  1.81it/s]

Gradient norm: 18.1549614075144


Epoch 3 of 5 | Iteration:  81%|████████  | 982/1212 [07:52<02:06,  1.81it/s]

Gradient norm: 18.115758387883602


Epoch 3 of 5 | Iteration:  81%|████████  | 983/1212 [07:52<02:04,  1.84it/s]

Gradient norm: 22.745230965224795


Epoch 3 of 5 | Iteration:  81%|████████  | 984/1212 [07:53<02:03,  1.84it/s]

Gradient norm: 22.911469661983837


Epoch 3 of 5 | Iteration:  81%|████████▏ | 985/1212 [07:53<02:07,  1.77it/s]

Gradient norm: 27.49561027738901


Epoch 3 of 5 | Iteration:  81%|████████▏ | 986/1212 [07:54<01:57,  1.92it/s]

Gradient norm: 27.668824746904814


Epoch 3 of 5 | Iteration:  81%|████████▏ | 987/1212 [07:54<01:48,  2.07it/s]

Gradient norm: 27.950980406103962


Epoch 3 of 5 | Iteration:  82%|████████▏ | 988/1212 [07:55<01:58,  1.90it/s]

Gradient norm: 29.41740250727382


Epoch 3 of 5 | Iteration:  82%|████████▏ | 989/1212 [07:55<01:51,  2.00it/s]

Gradient norm: 30.9562925110373


Epoch 3 of 5 | Iteration:  82%|████████▏ | 990/1212 [07:56<01:49,  2.03it/s]

Gradient norm: 30.804257390574726


Epoch 3 of 5 | Iteration:  82%|████████▏ | 991/1212 [07:56<01:41,  2.17it/s]

Gradient norm: 30.82525404136487


Epoch 3 of 5 | Iteration:  82%|████████▏ | 992/1212 [07:57<01:39,  2.21it/s]

Gradient norm: 30.82525404136487


Epoch 3 of 5 | Iteration:  82%|████████▏ | 993/1212 [07:57<01:34,  2.31it/s]

Gradient norm: 2.7351609662159913


Epoch 3 of 5 | Iteration:  82%|████████▏ | 994/1212 [07:58<01:42,  2.12it/s]

Gradient norm: 7.964139123187484


Epoch 3 of 5 | Iteration:  82%|████████▏ | 995/1212 [07:58<01:38,  2.21it/s]

Gradient norm: 279.10219159762323


Epoch 3 of 5 | Iteration:  82%|████████▏ | 996/1212 [07:58<01:41,  2.13it/s]

Gradient norm: 279.02377156984846


Epoch 3 of 5 | Iteration:  82%|████████▏ | 997/1212 [07:59<01:34,  2.28it/s]

Gradient norm: 279.1317862589754


Epoch 3 of 5 | Iteration:  82%|████████▏ | 998/1212 [07:59<01:31,  2.35it/s]

Gradient norm: 435.1664066532551


Epoch 3 of 5 | Iteration:  82%|████████▏ | 999/1212 [08:00<01:34,  2.25it/s]

Gradient norm: 434.81156910091886


Epoch 3 of 5 | Iteration:  83%|████████▎ | 1000/1212 [08:00<01:32,  2.30it/s]

Gradient norm: 435.2991533165382


Epoch 3 of 5 | Iteration:  83%|████████▎ | 1001/1212 [08:01<01:39,  2.12it/s]

Gradient norm: 432.73044956580236


Epoch 3 of 5 | Iteration:  83%|████████▎ | 1002/1212 [08:01<01:39,  2.11it/s]

Gradient norm: 432.62949677429236


Epoch 3 of 5 | Iteration:  83%|████████▎ | 1003/1212 [08:02<01:34,  2.21it/s]

Gradient norm: 432.5904123933842


Epoch 3 of 5 | Iteration:  83%|████████▎ | 1004/1212 [08:02<01:36,  2.16it/s]

Gradient norm: 432.5999963608333


Epoch 3 of 5 | Iteration:  83%|████████▎ | 1005/1212 [08:03<01:38,  2.10it/s]

Gradient norm: 430.1331267261125


Epoch 3 of 5 | Iteration:  83%|████████▎ | 1006/1212 [08:03<01:31,  2.25it/s]

Gradient norm: 430.81915241434336


Epoch 3 of 5 | Iteration:  83%|████████▎ | 1007/1212 [08:04<01:43,  1.98it/s]

Gradient norm: 430.9964802227888


Epoch 3 of 5 | Iteration:  83%|████████▎ | 1008/1212 [08:04<01:45,  1.93it/s]

Gradient norm: 428.87091222680056


Epoch 3 of 5 | Iteration:  83%|████████▎ | 1009/1212 [08:05<01:51,  1.82it/s]

Gradient norm: 6.542531181300605


Epoch 3 of 5 | Iteration:  83%|████████▎ | 1010/1212 [08:05<01:49,  1.85it/s]

Gradient norm: 8.653179292321177


Epoch 3 of 5 | Iteration:  83%|████████▎ | 1011/1212 [08:06<01:51,  1.80it/s]

Gradient norm: 9.645917508807946


Epoch 3 of 5 | Iteration:  83%|████████▎ | 1012/1212 [08:06<01:47,  1.86it/s]

Gradient norm: 10.237565510636916


Epoch 3 of 5 | Iteration:  84%|████████▎ | 1013/1212 [08:07<01:51,  1.78it/s]

Gradient norm: 18.799617007380977


Epoch 3 of 5 | Iteration:  84%|████████▎ | 1014/1212 [08:07<01:48,  1.83it/s]

Gradient norm: 19.211464239470903


Epoch 3 of 5 | Iteration:  84%|████████▎ | 1015/1212 [08:08<01:43,  1.90it/s]

Gradient norm: 20.920965233491074


Epoch 3 of 5 | Iteration:  84%|████████▍ | 1016/1212 [08:08<01:36,  2.03it/s]

Gradient norm: 32.15596517448513


Epoch 3 of 5 | Iteration:  84%|████████▍ | 1017/1212 [08:09<01:40,  1.95it/s]

Gradient norm: 32.82368104978926


Epoch 3 of 5 | Iteration:  84%|████████▍ | 1018/1212 [08:09<01:42,  1.90it/s]

Gradient norm: 32.525284249351415


Epoch 3 of 5 | Iteration:  84%|████████▍ | 1019/1212 [08:10<01:43,  1.87it/s]

Gradient norm: 31.606955886329295


Epoch 3 of 5 | Iteration:  84%|████████▍ | 1020/1212 [08:10<01:36,  1.99it/s]

Gradient norm: 32.34651802557496


Epoch 3 of 5 | Iteration:  84%|████████▍ | 1021/1212 [08:11<01:44,  1.83it/s]

Gradient norm: 124.1961012968113


Epoch 3 of 5 | Iteration:  84%|████████▍ | 1022/1212 [08:11<01:34,  2.01it/s]

Gradient norm: 126.3404227859593


Epoch 3 of 5 | Iteration:  84%|████████▍ | 1023/1212 [08:12<01:32,  2.05it/s]

Gradient norm: 131.42921088137348


Epoch 3 of 5 | Iteration:  84%|████████▍ | 1024/1212 [08:12<01:26,  2.18it/s]

Gradient norm: 131.41454024739184


Epoch 3 of 5 | Iteration:  85%|████████▍ | 1025/1212 [08:13<01:21,  2.30it/s]

Gradient norm: 3.883129356199298


Epoch 3 of 5 | Iteration:  85%|████████▍ | 1026/1212 [08:13<01:29,  2.08it/s]

Gradient norm: 3.957260211528527


Epoch 3 of 5 | Iteration:  85%|████████▍ | 1027/1212 [08:14<01:26,  2.14it/s]

Gradient norm: 4.706751766835224


Epoch 3 of 5 | Iteration:  85%|████████▍ | 1028/1212 [08:14<01:22,  2.24it/s]

Gradient norm: 18.353274321793474


Epoch 3 of 5 | Iteration:  85%|████████▍ | 1029/1212 [08:15<01:33,  1.96it/s]

Gradient norm: 45.06144417329481


Epoch 3 of 5 | Iteration:  85%|████████▍ | 1030/1212 [08:15<01:27,  2.09it/s]

Gradient norm: 44.98745539721278


Epoch 3 of 5 | Iteration:  85%|████████▌ | 1031/1212 [08:16<01:25,  2.13it/s]

Gradient norm: 45.05014195957522


Epoch 3 of 5 | Iteration:  85%|████████▌ | 1032/1212 [08:16<01:26,  2.07it/s]

Gradient norm: 45.29703332561888


Epoch 3 of 5 | Iteration:  85%|████████▌ | 1033/1212 [08:17<01:20,  2.22it/s]

Gradient norm: 45.63074235747828


Epoch 3 of 5 | Iteration:  85%|████████▌ | 1034/1212 [08:17<01:16,  2.33it/s]

Gradient norm: 249.7841618660122


Epoch 3 of 5 | Iteration:  85%|████████▌ | 1035/1212 [08:17<01:17,  2.30it/s]

Gradient norm: 250.61229373585667


Epoch 3 of 5 | Iteration:  85%|████████▌ | 1036/1212 [08:18<01:21,  2.16it/s]

Gradient norm: 250.19951275135998


Epoch 3 of 5 | Iteration:  86%|████████▌ | 1037/1212 [08:18<01:23,  2.10it/s]

Gradient norm: 247.06770143991983


Epoch 3 of 5 | Iteration:  86%|████████▌ | 1038/1212 [08:19<01:27,  1.98it/s]

Gradient norm: 247.20725729132838


Epoch 3 of 5 | Iteration:  86%|████████▌ | 1039/1212 [08:19<01:28,  1.96it/s]

Gradient norm: 247.21159150740678


Epoch 3 of 5 | Iteration:  86%|████████▌ | 1040/1212 [08:20<01:35,  1.80it/s]

Gradient norm: 247.6190611599472


Epoch 3 of 5 | Iteration:  86%|████████▌ | 1041/1212 [08:21<01:36,  1.77it/s]

Gradient norm: 4.096446764176881


Epoch 3 of 5 | Iteration:  86%|████████▌ | 1042/1212 [08:21<01:28,  1.92it/s]

Gradient norm: 21.910616428413487


Epoch 3 of 5 | Iteration:  86%|████████▌ | 1043/1212 [08:22<01:20,  2.09it/s]

Gradient norm: 27.63617027780592


Epoch 3 of 5 | Iteration:  86%|████████▌ | 1044/1212 [08:22<01:15,  2.22it/s]

Gradient norm: 35.31292244611085


Epoch 3 of 5 | Iteration:  86%|████████▌ | 1045/1212 [08:22<01:13,  2.27it/s]

Gradient norm: 35.86287730435962


Epoch 3 of 5 | Iteration:  86%|████████▋ | 1046/1212 [08:23<01:14,  2.23it/s]

Gradient norm: 33.84762196895533


Epoch 3 of 5 | Iteration:  86%|████████▋ | 1047/1212 [08:23<01:14,  2.22it/s]

Gradient norm: 35.08560390398523


Epoch 3 of 5 | Iteration:  86%|████████▋ | 1048/1212 [08:24<01:17,  2.12it/s]

Gradient norm: 35.07109559147488


Epoch 3 of 5 | Iteration:  87%|████████▋ | 1049/1212 [08:24<01:23,  1.95it/s]

Gradient norm: 35.08948431505816


Epoch 3 of 5 | Iteration:  87%|████████▋ | 1050/1212 [08:25<01:17,  2.09it/s]

Gradient norm: 35.21963745228802


Epoch 3 of 5 | Iteration:  87%|████████▋ | 1051/1212 [08:25<01:19,  2.04it/s]

Gradient norm: 49.345912804656166


Epoch 3 of 5 | Iteration:  87%|████████▋ | 1052/1212 [08:26<01:15,  2.13it/s]

Gradient norm: 49.404939326141736


Epoch 3 of 5 | Iteration:  87%|████████▋ | 1053/1212 [08:26<01:11,  2.23it/s]

Gradient norm: 48.79391407876075


Epoch 3 of 5 | Iteration:  87%|████████▋ | 1054/1212 [08:27<01:07,  2.33it/s]

Gradient norm: 48.77019667153967


Epoch 3 of 5 | Iteration:  87%|████████▋ | 1055/1212 [08:27<01:04,  2.42it/s]

Gradient norm: 48.645890675191104


Epoch 3 of 5 | Iteration:  87%|████████▋ | 1056/1212 [08:27<01:10,  2.22it/s]

Gradient norm: 48.12387663885252


Epoch 3 of 5 | Iteration:  87%|████████▋ | 1057/1212 [08:28<01:06,  2.32it/s]

Gradient norm: 6.541204041793068


Epoch 3 of 5 | Iteration:  87%|████████▋ | 1058/1212 [08:28<01:04,  2.37it/s]

Gradient norm: 6.714647751318844


Epoch 3 of 5 | Iteration:  87%|████████▋ | 1059/1212 [08:29<01:10,  2.16it/s]

Gradient norm: 88.48205171653106


Epoch 3 of 5 | Iteration:  87%|████████▋ | 1060/1212 [08:29<01:06,  2.28it/s]

Gradient norm: 89.60836953878623


Epoch 3 of 5 | Iteration:  88%|████████▊ | 1061/1212 [08:30<01:05,  2.29it/s]

Gradient norm: 89.58040396504296


Epoch 3 of 5 | Iteration:  88%|████████▊ | 1062/1212 [08:30<01:17,  1.94it/s]

Gradient norm: 90.53391885549327


Epoch 3 of 5 | Iteration:  88%|████████▊ | 1063/1212 [08:31<01:22,  1.82it/s]

Gradient norm: 97.6381459630915


Epoch 3 of 5 | Iteration:  88%|████████▊ | 1064/1212 [08:31<01:22,  1.80it/s]

Gradient norm: 97.72591326738588


Epoch 3 of 5 | Iteration:  88%|████████▊ | 1065/1212 [08:32<01:21,  1.80it/s]

Gradient norm: 97.50851013232582


Epoch 3 of 5 | Iteration:  88%|████████▊ | 1066/1212 [08:33<01:22,  1.76it/s]

Gradient norm: 97.95356961059454


Epoch 3 of 5 | Iteration:  88%|████████▊ | 1067/1212 [08:33<01:19,  1.83it/s]

Gradient norm: 97.72449804677689


Epoch 3 of 5 | Iteration:  88%|████████▊ | 1068/1212 [08:34<01:22,  1.74it/s]

Gradient norm: 98.18538801261725


Epoch 3 of 5 | Iteration:  88%|████████▊ | 1069/1212 [08:34<01:24,  1.69it/s]

Gradient norm: 98.44254960840004


Epoch 3 of 5 | Iteration:  88%|████████▊ | 1070/1212 [08:35<01:22,  1.73it/s]

Gradient norm: 98.50989265950618


Epoch 3 of 5 | Iteration:  88%|████████▊ | 1071/1212 [08:35<01:12,  1.94it/s]

Gradient norm: 114.39539970937095


Epoch 3 of 5 | Iteration:  88%|████████▊ | 1072/1212 [08:36<01:07,  2.07it/s]

Gradient norm: 114.46136108084777


Epoch 3 of 5 | Iteration:  89%|████████▊ | 1073/1212 [08:36<01:04,  2.17it/s]

Gradient norm: 7.462569031251163


Epoch 3 of 5 | Iteration:  89%|████████▊ | 1074/1212 [08:37<01:00,  2.27it/s]

Gradient norm: 7.370082485678451


Epoch 3 of 5 | Iteration:  89%|████████▊ | 1075/1212 [08:37<01:09,  1.97it/s]

Gradient norm: 7.774852920702297


Epoch 3 of 5 | Iteration:  89%|████████▉ | 1076/1212 [08:38<01:03,  2.13it/s]

Gradient norm: 7.892534245575525


Epoch 3 of 5 | Iteration:  89%|████████▉ | 1077/1212 [08:38<01:01,  2.20it/s]

Gradient norm: 8.683532120748866


Epoch 3 of 5 | Iteration:  89%|████████▉ | 1078/1212 [08:38<00:59,  2.26it/s]

Gradient norm: 10.447569532802317


Epoch 3 of 5 | Iteration:  89%|████████▉ | 1079/1212 [08:39<00:56,  2.35it/s]

Gradient norm: 11.550671207090023


Epoch 3 of 5 | Iteration:  89%|████████▉ | 1080/1212 [08:39<01:01,  2.14it/s]

Gradient norm: 11.610732938328866


Epoch 3 of 5 | Iteration:  89%|████████▉ | 1081/1212 [08:40<01:07,  1.95it/s]

Gradient norm: 11.481225914116237


Epoch 3 of 5 | Iteration:  89%|████████▉ | 1082/1212 [08:40<01:03,  2.05it/s]

Gradient norm: 11.353497251785592


Epoch 3 of 5 | Iteration:  89%|████████▉ | 1083/1212 [08:41<01:01,  2.08it/s]

Gradient norm: 14.615103589670495


Epoch 3 of 5 | Iteration:  89%|████████▉ | 1084/1212 [08:41<00:58,  2.21it/s]

Gradient norm: 14.846996001854707


Epoch 3 of 5 | Iteration:  90%|████████▉ | 1085/1212 [08:42<00:55,  2.28it/s]

Gradient norm: 16.272538818675216


Epoch 3 of 5 | Iteration:  90%|████████▉ | 1086/1212 [08:42<00:56,  2.23it/s]

Gradient norm: 16.82718693956678


Epoch 3 of 5 | Iteration:  90%|████████▉ | 1087/1212 [08:43<00:53,  2.32it/s]

Gradient norm: 23.27363946872352


Epoch 3 of 5 | Iteration:  90%|████████▉ | 1088/1212 [08:43<00:53,  2.32it/s]

Gradient norm: 22.989046530713697


Epoch 3 of 5 | Iteration:  90%|████████▉ | 1089/1212 [08:43<00:50,  2.42it/s]

Gradient norm: 2.370673717901456


Epoch 3 of 5 | Iteration:  90%|████████▉ | 1090/1212 [08:44<00:54,  2.23it/s]

Gradient norm: 12.9168407869481


Epoch 3 of 5 | Iteration:  90%|█████████ | 1091/1212 [08:44<00:59,  2.02it/s]

Gradient norm: 25.490759382248317


Epoch 3 of 5 | Iteration:  90%|█████████ | 1092/1212 [08:45<01:00,  1.98it/s]

Gradient norm: 41.11721012545505


Epoch 3 of 5 | Iteration:  90%|█████████ | 1093/1212 [08:46<01:05,  1.82it/s]

Gradient norm: 59.63091814623953


Epoch 3 of 5 | Iteration:  90%|█████████ | 1094/1212 [08:46<01:07,  1.76it/s]

Gradient norm: 61.79230261198685


Epoch 3 of 5 | Iteration:  90%|█████████ | 1095/1212 [08:47<01:05,  1.78it/s]

Gradient norm: 64.94218093732026


Epoch 3 of 5 | Iteration:  90%|█████████ | 1096/1212 [08:47<01:04,  1.80it/s]

Gradient norm: 66.16767410929548


Epoch 3 of 5 | Iteration:  91%|█████████ | 1097/1212 [08:48<01:05,  1.74it/s]

Gradient norm: 68.50946188997719


Epoch 3 of 5 | Iteration:  91%|█████████ | 1098/1212 [08:48<01:03,  1.80it/s]

Gradient norm: 68.25911224469745


Epoch 3 of 5 | Iteration:  91%|█████████ | 1099/1212 [08:49<00:57,  1.98it/s]

Gradient norm: 69.1293841256031


Epoch 3 of 5 | Iteration:  91%|█████████ | 1100/1212 [08:49<00:57,  1.96it/s]

Gradient norm: 69.59470253189687


Epoch 3 of 5 | Iteration:  91%|█████████ | 1101/1212 [08:50<00:55,  2.02it/s]

Gradient norm: 70.13497065655207


Epoch 3 of 5 | Iteration:  91%|█████████ | 1102/1212 [08:50<00:53,  2.05it/s]

Gradient norm: 70.10276761719426


Epoch 3 of 5 | Iteration:  91%|█████████ | 1103/1212 [08:51<00:50,  2.17it/s]

Gradient norm: 69.82952663965943


Epoch 3 of 5 | Iteration:  91%|█████████ | 1104/1212 [08:51<00:48,  2.24it/s]

Gradient norm: 69.86640888607754


Epoch 3 of 5 | Iteration:  91%|█████████ | 1105/1212 [08:52<00:46,  2.30it/s]

Gradient norm: 19.306523051858015


Epoch 3 of 5 | Iteration:  91%|█████████▏| 1106/1212 [08:52<00:48,  2.17it/s]

Gradient norm: 19.542617965772745


Epoch 3 of 5 | Iteration:  91%|█████████▏| 1107/1212 [08:53<00:51,  2.04it/s]

Gradient norm: 49.45406895400479


Epoch 3 of 5 | Iteration:  91%|█████████▏| 1108/1212 [08:53<00:47,  2.19it/s]

Gradient norm: 49.64603201160145


Epoch 3 of 5 | Iteration:  92%|█████████▏| 1109/1212 [08:53<00:47,  2.16it/s]

Gradient norm: 60.1282563174438


Epoch 3 of 5 | Iteration:  92%|█████████▏| 1110/1212 [08:54<00:50,  2.03it/s]

Gradient norm: 60.128745539526776


Epoch 3 of 5 | Iteration:  92%|█████████▏| 1111/1212 [08:54<00:46,  2.17it/s]

Gradient norm: 60.02152168239296


Epoch 3 of 5 | Iteration:  92%|█████████▏| 1112/1212 [08:55<00:46,  2.17it/s]

Gradient norm: 61.744623895698126


Epoch 3 of 5 | Iteration:  92%|█████████▏| 1113/1212 [08:55<00:47,  2.08it/s]

Gradient norm: 61.4977755704875


Epoch 3 of 5 | Iteration:  92%|█████████▏| 1114/1212 [08:56<00:44,  2.18it/s]

Gradient norm: 61.37054089738114


Epoch 3 of 5 | Iteration:  92%|█████████▏| 1115/1212 [08:56<00:45,  2.14it/s]

Gradient norm: 61.34966706629223


Epoch 3 of 5 | Iteration:  92%|█████████▏| 1116/1212 [08:57<00:50,  1.92it/s]

Gradient norm: 62.270206533383345


Epoch 3 of 5 | Iteration:  92%|█████████▏| 1117/1212 [08:57<00:45,  2.09it/s]

Gradient norm: 65.76097304597228


Epoch 3 of 5 | Iteration:  92%|█████████▏| 1118/1212 [08:58<00:42,  2.20it/s]

Gradient norm: 65.79592712343174


Epoch 3 of 5 | Iteration:  92%|█████████▏| 1119/1212 [08:58<00:40,  2.28it/s]

Gradient norm: 66.78709514359647


Epoch 3 of 5 | Iteration:  92%|█████████▏| 1120/1212 [08:59<00:47,  1.94it/s]

Gradient norm: 68.74956861427627


Epoch 3 of 5 | Iteration:  92%|█████████▏| 1121/1212 [08:59<00:46,  1.94it/s]

Gradient norm: 4.263886432833168


Epoch 3 of 5 | Iteration:  93%|█████████▎| 1122/1212 [09:00<00:46,  1.93it/s]

Gradient norm: 4.25104312476216


Epoch 3 of 5 | Iteration:  93%|█████████▎| 1123/1212 [09:00<00:48,  1.85it/s]

Gradient norm: 4.316658852362281


Epoch 3 of 5 | Iteration:  93%|█████████▎| 1124/1212 [09:01<00:45,  1.93it/s]

Gradient norm: 4.559859707357577


Epoch 3 of 5 | Iteration:  93%|█████████▎| 1125/1212 [09:01<00:46,  1.88it/s]

Gradient norm: 4.501681461629525


Epoch 3 of 5 | Iteration:  93%|█████████▎| 1126/1212 [09:02<00:46,  1.85it/s]

Gradient norm: 5.99814787036537


Epoch 3 of 5 | Iteration:  93%|█████████▎| 1127/1212 [09:02<00:42,  2.02it/s]

Gradient norm: 8.817018800571663


Epoch 3 of 5 | Iteration:  93%|█████████▎| 1128/1212 [09:03<00:41,  2.04it/s]

Gradient norm: 9.188915149559755


Epoch 3 of 5 | Iteration:  93%|█████████▎| 1129/1212 [09:03<00:42,  1.96it/s]

Gradient norm: 9.536960741496657


Epoch 3 of 5 | Iteration:  93%|█████████▎| 1130/1212 [09:04<00:39,  2.07it/s]

Gradient norm: 12.423020234494834


Epoch 3 of 5 | Iteration:  93%|█████████▎| 1131/1212 [09:04<00:36,  2.21it/s]

Gradient norm: 46.01330635657345


Epoch 3 of 5 | Iteration:  93%|█████████▎| 1132/1212 [09:05<00:37,  2.14it/s]

Gradient norm: 45.895978262313925


Epoch 3 of 5 | Iteration:  93%|█████████▎| 1133/1212 [09:05<00:34,  2.27it/s]

Gradient norm: 45.95237313678324


Epoch 3 of 5 | Iteration:  94%|█████████▎| 1134/1212 [09:06<00:33,  2.34it/s]

Gradient norm: 46.60782208407957


Epoch 3 of 5 | Iteration:  94%|█████████▎| 1135/1212 [09:06<00:32,  2.38it/s]

Gradient norm: 47.42597387657255


Epoch 3 of 5 | Iteration:  94%|█████████▎| 1136/1212 [09:06<00:32,  2.37it/s]

Gradient norm: 47.0822388722065


Epoch 3 of 5 | Iteration:  94%|█████████▍| 1137/1212 [09:07<00:30,  2.44it/s]

Gradient norm: 10.31451494100785


Epoch 3 of 5 | Iteration:  94%|█████████▍| 1138/1212 [09:07<00:30,  2.42it/s]

Gradient norm: 88.74672885635647


Epoch 3 of 5 | Iteration:  94%|█████████▍| 1139/1212 [09:08<00:33,  2.16it/s]

Gradient norm: 89.99254372162142


Epoch 3 of 5 | Iteration:  94%|█████████▍| 1140/1212 [09:08<00:32,  2.19it/s]

Gradient norm: 90.35099485481501


Epoch 3 of 5 | Iteration:  94%|█████████▍| 1141/1212 [09:09<00:31,  2.28it/s]

Gradient norm: 89.40585012027191


Epoch 3 of 5 | Iteration:  94%|█████████▍| 1142/1212 [09:09<00:29,  2.35it/s]

Gradient norm: 88.59626918776374


Epoch 3 of 5 | Iteration:  94%|█████████▍| 1143/1212 [09:09<00:29,  2.34it/s]

Gradient norm: 88.93668677575482


Epoch 3 of 5 | Iteration:  94%|█████████▍| 1144/1212 [09:10<00:28,  2.37it/s]

Gradient norm: 88.8928601589789


Epoch 3 of 5 | Iteration:  94%|█████████▍| 1145/1212 [09:10<00:27,  2.43it/s]

Gradient norm: 105.57818495989119


Epoch 3 of 5 | Iteration:  95%|█████████▍| 1146/1212 [09:11<00:26,  2.48it/s]

Gradient norm: 111.27280735956974


Epoch 3 of 5 | Iteration:  95%|█████████▍| 1147/1212 [09:11<00:26,  2.49it/s]

Gradient norm: 112.74379082280005


Epoch 3 of 5 | Iteration:  95%|█████████▍| 1148/1212 [09:11<00:25,  2.52it/s]

Gradient norm: 114.70666503528885


Epoch 3 of 5 | Iteration:  95%|█████████▍| 1149/1212 [09:12<00:24,  2.55it/s]

Gradient norm: 112.04167866738217


Epoch 3 of 5 | Iteration:  95%|█████████▍| 1150/1212 [09:12<00:25,  2.39it/s]

Gradient norm: 113.69084887244017


Epoch 3 of 5 | Iteration:  95%|█████████▍| 1151/1212 [09:13<00:27,  2.24it/s]

Gradient norm: 113.90723226655275


Epoch 3 of 5 | Iteration:  95%|█████████▌| 1152/1212 [09:13<00:28,  2.13it/s]

Gradient norm: 113.72861314196022


Epoch 3 of 5 | Iteration:  95%|█████████▌| 1153/1212 [09:14<00:29,  2.02it/s]

Gradient norm: 3.1367633534388255


Epoch 3 of 5 | Iteration:  95%|█████████▌| 1154/1212 [09:14<00:28,  2.01it/s]

Gradient norm: 12.933731016499719


Epoch 3 of 5 | Iteration:  95%|█████████▌| 1155/1212 [09:15<00:30,  1.88it/s]

Gradient norm: 14.139809065376628


Epoch 3 of 5 | Iteration:  95%|█████████▌| 1156/1212 [09:16<00:31,  1.80it/s]

Gradient norm: 30.77624943282884


Epoch 3 of 5 | Iteration:  95%|█████████▌| 1157/1212 [09:16<00:28,  1.93it/s]

Gradient norm: 39.32305930184842


Epoch 3 of 5 | Iteration:  96%|█████████▌| 1158/1212 [09:16<00:25,  2.08it/s]

Gradient norm: 38.729181200237456


Epoch 3 of 5 | Iteration:  96%|█████████▌| 1159/1212 [09:17<00:23,  2.22it/s]

Gradient norm: 38.78882487793075


Epoch 3 of 5 | Iteration:  96%|█████████▌| 1160/1212 [09:17<00:22,  2.33it/s]

Gradient norm: 38.911988765653135


Epoch 3 of 5 | Iteration:  96%|█████████▌| 1161/1212 [09:18<00:20,  2.43it/s]

Gradient norm: 39.38636769815948


Epoch 3 of 5 | Iteration:  96%|█████████▌| 1162/1212 [09:18<00:22,  2.26it/s]

Gradient norm: 39.46212511051783


Epoch 3 of 5 | Iteration:  96%|█████████▌| 1163/1212 [09:19<00:23,  2.08it/s]

Gradient norm: 60.94970445086748


Epoch 3 of 5 | Iteration:  96%|█████████▌| 1164/1212 [09:19<00:25,  1.91it/s]

Gradient norm: 60.944886472159716


Epoch 3 of 5 | Iteration:  96%|█████████▌| 1165/1212 [09:20<00:22,  2.06it/s]

Gradient norm: 60.98301890514128


Epoch 3 of 5 | Iteration:  96%|█████████▌| 1166/1212 [09:20<00:20,  2.20it/s]

Gradient norm: 62.138308937713674


Epoch 3 of 5 | Iteration:  96%|█████████▋| 1167/1212 [09:20<00:20,  2.25it/s]

Gradient norm: 62.06241090270184


Epoch 3 of 5 | Iteration:  96%|█████████▋| 1168/1212 [09:21<00:19,  2.21it/s]

Gradient norm: 83.09152896974646


Epoch 3 of 5 | Iteration:  96%|█████████▋| 1169/1212 [09:21<00:19,  2.19it/s]

Gradient norm: 12.34043415240347


Epoch 3 of 5 | Iteration:  97%|█████████▋| 1170/1212 [09:22<00:18,  2.27it/s]

Gradient norm: 24.69434529163288


Epoch 3 of 5 | Iteration:  97%|█████████▋| 1171/1212 [09:22<00:18,  2.21it/s]

Gradient norm: 35.64195407603722


Epoch 3 of 5 | Iteration:  97%|█████████▋| 1172/1212 [09:23<00:19,  2.06it/s]

Gradient norm: 35.32179079924521


Epoch 3 of 5 | Iteration:  97%|█████████▋| 1173/1212 [09:23<00:20,  1.91it/s]

Gradient norm: 35.238778647428056


Epoch 3 of 5 | Iteration:  97%|█████████▋| 1174/1212 [09:24<00:18,  2.03it/s]

Gradient norm: 34.83165056239599


Epoch 3 of 5 | Iteration:  97%|█████████▋| 1175/1212 [09:24<00:19,  1.86it/s]

Gradient norm: 34.588569304861636


Epoch 3 of 5 | Iteration:  97%|█████████▋| 1176/1212 [09:25<00:18,  1.95it/s]

Gradient norm: 34.573787369288624


Epoch 3 of 5 | Iteration:  97%|█████████▋| 1177/1212 [09:25<00:16,  2.10it/s]

Gradient norm: 41.59837403360311


Epoch 3 of 5 | Iteration:  97%|█████████▋| 1178/1212 [09:26<00:15,  2.23it/s]

Gradient norm: 80.64556318632837


Epoch 3 of 5 | Iteration:  97%|█████████▋| 1179/1212 [09:26<00:15,  2.10it/s]

Gradient norm: 80.28321749595689


Epoch 3 of 5 | Iteration:  97%|█████████▋| 1180/1212 [09:27<00:15,  2.01it/s]

Gradient norm: 80.00209552589695


Epoch 3 of 5 | Iteration:  97%|█████████▋| 1181/1212 [09:27<00:16,  1.90it/s]

Gradient norm: 84.90023309605122


Epoch 3 of 5 | Iteration:  98%|█████████▊| 1182/1212 [09:28<00:15,  1.92it/s]

Gradient norm: 84.5258824954704


Epoch 3 of 5 | Iteration:  98%|█████████▊| 1183/1212 [09:28<00:15,  1.88it/s]

Gradient norm: 84.46018269238299


Epoch 3 of 5 | Iteration:  98%|█████████▊| 1184/1212 [09:29<00:16,  1.74it/s]

Gradient norm: 83.83384103868072


Epoch 3 of 5 | Iteration:  98%|█████████▊| 1185/1212 [09:30<00:15,  1.69it/s]

Gradient norm: 265.15216412291164


Epoch 3 of 5 | Iteration:  98%|█████████▊| 1186/1212 [09:30<00:13,  1.88it/s]

Gradient norm: 265.4613699998056


Epoch 3 of 5 | Iteration:  98%|█████████▊| 1187/1212 [09:31<00:12,  1.94it/s]

Gradient norm: 265.5837614982117


Epoch 3 of 5 | Iteration:  98%|█████████▊| 1188/1212 [09:31<00:12,  1.94it/s]

Gradient norm: 266.09439098844535


Epoch 3 of 5 | Iteration:  98%|█████████▊| 1189/1212 [09:32<00:11,  2.01it/s]

Gradient norm: 266.08135428576236


Epoch 3 of 5 | Iteration:  98%|█████████▊| 1190/1212 [09:32<00:10,  2.15it/s]

Gradient norm: 265.7942412023911


Epoch 3 of 5 | Iteration:  98%|█████████▊| 1191/1212 [09:32<00:09,  2.26it/s]

Gradient norm: 267.21697107402406


Epoch 3 of 5 | Iteration:  98%|█████████▊| 1192/1212 [09:33<00:09,  2.13it/s]

Gradient norm: 266.5181822935684


Epoch 3 of 5 | Iteration:  98%|█████████▊| 1193/1212 [09:33<00:08,  2.26it/s]

Gradient norm: 264.3284524165271


Epoch 3 of 5 | Iteration:  99%|█████████▊| 1194/1212 [09:34<00:07,  2.31it/s]

Gradient norm: 264.22420654908865


Epoch 3 of 5 | Iteration:  99%|█████████▊| 1195/1212 [09:34<00:07,  2.41it/s]

Gradient norm: 265.2310416634849


Epoch 3 of 5 | Iteration:  99%|█████████▊| 1196/1212 [09:34<00:06,  2.48it/s]

Gradient norm: 265.61784766005854


Epoch 3 of 5 | Iteration:  99%|█████████▉| 1197/1212 [09:35<00:07,  2.03it/s]

Gradient norm: 266.1257581721736


Epoch 3 of 5 | Iteration:  99%|█████████▉| 1198/1212 [09:36<00:07,  1.84it/s]

Gradient norm: 265.6031248555695


Epoch 3 of 5 | Iteration:  99%|█████████▉| 1199/1212 [09:36<00:06,  2.02it/s]

Gradient norm: 265.4697203716083


Epoch 3 of 5 | Iteration:  99%|█████████▉| 1200/1212 [09:37<00:06,  1.96it/s]

Gradient norm: 266.50278868110405


Epoch 3 of 5 | Iteration:  99%|█████████▉| 1201/1212 [09:37<00:05,  2.11it/s]

Gradient norm: 13.568558034341331


Epoch 3 of 5 | Iteration:  99%|█████████▉| 1202/1212 [09:38<00:04,  2.16it/s]

Gradient norm: 15.866616505627803


Epoch 3 of 5 | Iteration:  99%|█████████▉| 1203/1212 [09:38<00:05,  1.80it/s]

Gradient norm: 16.098900297889948


Epoch 3 of 5 | Iteration:  99%|█████████▉| 1204/1212 [09:39<00:04,  1.99it/s]

Gradient norm: 21.143239295440118


Epoch 3 of 5 | Iteration:  99%|█████████▉| 1205/1212 [09:39<00:03,  2.12it/s]

Gradient norm: 20.90743244060843


Epoch 3 of 5 | Iteration: 100%|█████████▉| 1206/1212 [09:40<00:02,  2.06it/s]

Gradient norm: 20.87775419856162


Epoch 3 of 5 | Iteration: 100%|█████████▉| 1207/1212 [09:40<00:02,  1.97it/s]

Gradient norm: 159.93941541878036


Epoch 3 of 5 | Iteration: 100%|█████████▉| 1208/1212 [09:41<00:02,  1.95it/s]

Gradient norm: 160.771399376026


Epoch 3 of 5 | Iteration: 100%|█████████▉| 1209/1212 [09:41<00:01,  1.90it/s]

Gradient norm: 160.51841221703197


Epoch 3 of 5 | Iteration: 100%|█████████▉| 1210/1212 [09:42<00:01,  1.86it/s]

Gradient norm: 161.25716389497168


Epoch 3 of 5 | Iteration: 100%|█████████▉| 1211/1212 [09:42<00:00,  1.81it/s]

Gradient norm: 160.99551306497202


Epoch 3 of 5 | Iteration: 100%|██████████| 1212/1212 [09:43<00:00,  2.08it/s]


Gradient norm: 160.9116398522317


100%|██████████| 1212/1212 [05:16<00:00,  3.83it/s]


Epoch 3/5, Training Loss: 1.6763, Validation Loss: 1.6459
Validation top k acc: 0.8531
              precision    recall  f1-score   support

           0       0.92      0.78      0.84     10666
           1       0.33      0.62      0.43      1947

    accuracy                           0.75     12613
   macro avg       0.63      0.70      0.64     12613
weighted avg       0.83      0.75      0.78     12613



Epoch 4 of 5 | Iteration:   0%|          | 0/1212 [00:00<?, ?it/s]

Train ...


Epoch 4 of 5 | Iteration:   0%|          | 1/1212 [00:00<12:56,  1.56it/s]

Gradient norm: 160.7370337602968


Epoch 4 of 5 | Iteration:   0%|          | 2/1212 [00:01<11:58,  1.68it/s]

Gradient norm: 160.6661390279899


Epoch 4 of 5 | Iteration:   0%|          | 3/1212 [00:01<13:14,  1.52it/s]

Gradient norm: 160.00872646691963


Epoch 4 of 5 | Iteration:   0%|          | 4/1212 [00:02<13:03,  1.54it/s]

Gradient norm: 159.54768772186512


Epoch 4 of 5 | Iteration:   0%|          | 5/1212 [00:03<12:52,  1.56it/s]

Gradient norm: 161.13180864110694


Epoch 4 of 5 | Iteration:   0%|          | 6/1212 [00:03<11:10,  1.80it/s]

Gradient norm: 161.3669179633195


Epoch 4 of 5 | Iteration:   1%|          | 7/1212 [00:03<10:04,  1.99it/s]

Gradient norm: 161.64235534458876


Epoch 4 of 5 | Iteration:   1%|          | 8/1212 [00:04<09:24,  2.13it/s]

Gradient norm: 161.66512406548097


Epoch 4 of 5 | Iteration:   1%|          | 9/1212 [00:04<08:53,  2.25it/s]

Gradient norm: 161.83332308322406


Epoch 4 of 5 | Iteration:   1%|          | 10/1212 [00:05<08:38,  2.32it/s]

Gradient norm: 161.71986502652237


Epoch 4 of 5 | Iteration:   1%|          | 11/1212 [00:05<08:36,  2.33it/s]

Gradient norm: 161.53352004570715


Epoch 4 of 5 | Iteration:   1%|          | 12/1212 [00:06<08:33,  2.34it/s]

Gradient norm: 161.38717008788447


Epoch 4 of 5 | Iteration:   1%|          | 13/1212 [00:06<09:33,  2.09it/s]

Gradient norm: 163.37694690106713


Epoch 4 of 5 | Iteration:   1%|          | 14/1212 [00:06<08:56,  2.23it/s]

Gradient norm: 163.9397563809951


Epoch 4 of 5 | Iteration:   1%|          | 15/1212 [00:07<08:47,  2.27it/s]

Gradient norm: 163.9269658046819


Epoch 4 of 5 | Iteration:   1%|▏         | 16/1212 [00:07<08:59,  2.22it/s]

Gradient norm: 165.93010198504487


Epoch 4 of 5 | Iteration:   1%|▏         | 17/1212 [00:08<08:43,  2.28it/s]

Gradient norm: 19.71676521247529


Epoch 4 of 5 | Iteration:   1%|▏         | 18/1212 [00:08<08:18,  2.39it/s]

Gradient norm: 162.41039963990568


Epoch 4 of 5 | Iteration:   2%|▏         | 19/1212 [00:09<08:37,  2.30it/s]

Gradient norm: 162.542538034343


Epoch 4 of 5 | Iteration:   2%|▏         | 20/1212 [00:09<08:27,  2.35it/s]

Gradient norm: 162.36933632309223


Epoch 4 of 5 | Iteration:   2%|▏         | 21/1212 [00:09<08:11,  2.42it/s]

Gradient norm: 162.70453696914248


Epoch 4 of 5 | Iteration:   2%|▏         | 22/1212 [00:10<09:03,  2.19it/s]

Gradient norm: 162.68543348568335


Epoch 4 of 5 | Iteration:   2%|▏         | 23/1212 [00:10<08:40,  2.29it/s]

Gradient norm: 162.2057665195014


Epoch 4 of 5 | Iteration:   2%|▏         | 24/1212 [00:11<08:50,  2.24it/s]

Gradient norm: 163.3896958122793


Epoch 4 of 5 | Iteration:   2%|▏         | 25/1212 [00:11<08:38,  2.29it/s]

Gradient norm: 162.72091934792968


Epoch 4 of 5 | Iteration:   2%|▏         | 26/1212 [00:12<09:29,  2.08it/s]

Gradient norm: 167.6079033214145


Epoch 4 of 5 | Iteration:   2%|▏         | 27/1212 [00:12<09:37,  2.05it/s]

Gradient norm: 167.55369000294704


Epoch 4 of 5 | Iteration:   2%|▏         | 28/1212 [00:13<09:54,  1.99it/s]

Gradient norm: 168.48361651945447


Epoch 4 of 5 | Iteration:   2%|▏         | 29/1212 [00:13<10:05,  1.95it/s]

Gradient norm: 168.71048925862115


Epoch 4 of 5 | Iteration:   2%|▏         | 30/1212 [00:14<10:28,  1.88it/s]

Gradient norm: 172.463515938557


Epoch 4 of 5 | Iteration:   3%|▎         | 31/1212 [00:15<10:38,  1.85it/s]

Gradient norm: 172.40397398532602


Epoch 4 of 5 | Iteration:   3%|▎         | 32/1212 [00:15<11:54,  1.65it/s]

Gradient norm: 172.77086254745333


Epoch 4 of 5 | Iteration:   3%|▎         | 33/1212 [00:16<12:05,  1.63it/s]

Gradient norm: 1.9734927132485491


Epoch 4 of 5 | Iteration:   3%|▎         | 34/1212 [00:16<10:55,  1.80it/s]

Gradient norm: 3.4579670645993787


Epoch 4 of 5 | Iteration:   3%|▎         | 35/1212 [00:17<10:09,  1.93it/s]

Gradient norm: 4.512672171036038


Epoch 4 of 5 | Iteration:   3%|▎         | 36/1212 [00:17<09:23,  2.09it/s]

Gradient norm: 4.664992756028927


Epoch 4 of 5 | Iteration:   3%|▎         | 37/1212 [00:18<09:16,  2.11it/s]

Gradient norm: 7.500785879055792


Epoch 4 of 5 | Iteration:   3%|▎         | 38/1212 [00:18<08:52,  2.20it/s]

Gradient norm: 7.651369140451364


Epoch 4 of 5 | Iteration:   3%|▎         | 39/1212 [00:18<08:29,  2.30it/s]

Gradient norm: 7.752530983107572


Epoch 4 of 5 | Iteration:   3%|▎         | 40/1212 [00:19<08:10,  2.39it/s]

Gradient norm: 25.296087678818136


Epoch 4 of 5 | Iteration:   3%|▎         | 41/1212 [00:19<07:59,  2.44it/s]

Gradient norm: 25.575644654944025


Epoch 4 of 5 | Iteration:   3%|▎         | 42/1212 [00:20<08:01,  2.43it/s]

Gradient norm: 28.06873406645638


Epoch 4 of 5 | Iteration:   4%|▎         | 43/1212 [00:20<07:49,  2.49it/s]

Gradient norm: 30.528717398383957


Epoch 4 of 5 | Iteration:   4%|▎         | 44/1212 [00:21<09:06,  2.14it/s]

Gradient norm: 30.60145530514021


Epoch 4 of 5 | Iteration:   4%|▎         | 45/1212 [00:21<09:54,  1.96it/s]

Gradient norm: 30.39979877016271


Epoch 4 of 5 | Iteration:   4%|▍         | 46/1212 [00:22<10:44,  1.81it/s]

Gradient norm: 38.52041216073239


Epoch 4 of 5 | Iteration:   4%|▍         | 47/1212 [00:22<09:45,  1.99it/s]

Gradient norm: 38.95077824744341


Epoch 4 of 5 | Iteration:   4%|▍         | 48/1212 [00:23<09:33,  2.03it/s]

Gradient norm: 39.050954293381025


Epoch 4 of 5 | Iteration:   4%|▍         | 49/1212 [00:23<10:27,  1.85it/s]

Gradient norm: 62.867033340458455


Epoch 4 of 5 | Iteration:   4%|▍         | 50/1212 [00:24<09:38,  2.01it/s]

Gradient norm: 62.84770069423315


Epoch 4 of 5 | Iteration:   4%|▍         | 51/1212 [00:24<09:05,  2.13it/s]

Gradient norm: 81.28602048168823


Epoch 4 of 5 | Iteration:   4%|▍         | 52/1212 [00:25<08:39,  2.23it/s]

Gradient norm: 81.83198523979782


Epoch 4 of 5 | Iteration:   4%|▍         | 53/1212 [00:25<08:18,  2.33it/s]

Gradient norm: 95.12443255363544


Epoch 4 of 5 | Iteration:   4%|▍         | 54/1212 [00:25<08:03,  2.39it/s]

Gradient norm: 94.99919547446737


Epoch 4 of 5 | Iteration:   5%|▍         | 55/1212 [00:26<08:38,  2.23it/s]

Gradient norm: 95.21196386855779


Epoch 4 of 5 | Iteration:   5%|▍         | 56/1212 [00:27<09:34,  2.01it/s]

Gradient norm: 95.41523394104394


Epoch 4 of 5 | Iteration:   5%|▍         | 57/1212 [00:27<09:32,  2.02it/s]

Gradient norm: 94.5455006186117


Epoch 4 of 5 | Iteration:   5%|▍         | 58/1212 [00:28<10:35,  1.82it/s]

Gradient norm: 95.74734343374872


Epoch 4 of 5 | Iteration:   5%|▍         | 59/1212 [00:28<10:14,  1.88it/s]

Gradient norm: 96.45225746446532


Epoch 4 of 5 | Iteration:   5%|▍         | 60/1212 [00:29<10:33,  1.82it/s]

Gradient norm: 96.39891904428472


Epoch 4 of 5 | Iteration:   5%|▌         | 61/1212 [00:29<10:31,  1.82it/s]

Gradient norm: 96.31305422473102


Epoch 4 of 5 | Iteration:   5%|▌         | 62/1212 [00:30<10:22,  1.85it/s]

Gradient norm: 96.46776650466394


Epoch 4 of 5 | Iteration:   5%|▌         | 63/1212 [00:30<09:28,  2.02it/s]

Gradient norm: 155.53984139142997


Epoch 4 of 5 | Iteration:   5%|▌         | 64/1212 [00:31<09:23,  2.04it/s]

Gradient norm: 401.34753886833835


Epoch 4 of 5 | Iteration:   5%|▌         | 65/1212 [00:31<09:04,  2.11it/s]

Gradient norm: 6.326868666828214


Epoch 4 of 5 | Iteration:   5%|▌         | 66/1212 [00:32<08:34,  2.23it/s]

Gradient norm: 9.899851748205457


Epoch 4 of 5 | Iteration:   6%|▌         | 67/1212 [00:32<09:48,  1.95it/s]

Gradient norm: 11.22120711618215


Epoch 4 of 5 | Iteration:   6%|▌         | 68/1212 [00:33<09:30,  2.01it/s]

Gradient norm: 13.456213417194366


Epoch 4 of 5 | Iteration:   6%|▌         | 69/1212 [00:33<09:03,  2.10it/s]

Gradient norm: 13.72042431462101


Epoch 4 of 5 | Iteration:   6%|▌         | 70/1212 [00:33<08:30,  2.24it/s]

Gradient norm: 16.22727456425897


Epoch 4 of 5 | Iteration:   6%|▌         | 71/1212 [00:34<08:09,  2.33it/s]

Gradient norm: 16.480156498999946


Epoch 4 of 5 | Iteration:   6%|▌         | 72/1212 [00:34<08:18,  2.29it/s]

Gradient norm: 16.791355532813704


Epoch 4 of 5 | Iteration:   6%|▌         | 73/1212 [00:35<08:39,  2.19it/s]

Gradient norm: 16.52252002622139


Epoch 4 of 5 | Iteration:   6%|▌         | 74/1212 [00:35<08:11,  2.32it/s]

Gradient norm: 105.88775482179425


Epoch 4 of 5 | Iteration:   6%|▌         | 75/1212 [00:36<08:22,  2.26it/s]

Gradient norm: 105.72354481943022


Epoch 4 of 5 | Iteration:   6%|▋         | 76/1212 [00:36<08:05,  2.34it/s]

Gradient norm: 106.56295975658448


Epoch 4 of 5 | Iteration:   6%|▋         | 77/1212 [00:36<07:58,  2.37it/s]

Gradient norm: 106.209906689332


Epoch 4 of 5 | Iteration:   6%|▋         | 78/1212 [00:37<07:44,  2.44it/s]

Gradient norm: 651.3606010179087


Epoch 4 of 5 | Iteration:   7%|▋         | 79/1212 [00:37<07:39,  2.47it/s]

Gradient norm: 651.4262376267815


Epoch 4 of 5 | Iteration:   7%|▋         | 80/1212 [00:38<07:35,  2.48it/s]

Gradient norm: 651.1978897932005


Epoch 4 of 5 | Iteration:   7%|▋         | 81/1212 [00:38<07:33,  2.49it/s]

Gradient norm: 2.7771106289218235


Epoch 4 of 5 | Iteration:   7%|▋         | 82/1212 [00:38<07:28,  2.52it/s]

Gradient norm: 6.371227040257853


Epoch 4 of 5 | Iteration:   7%|▋         | 83/1212 [00:39<07:24,  2.54it/s]

Gradient norm: 7.2426110195845474


Epoch 4 of 5 | Iteration:   7%|▋         | 84/1212 [00:39<08:19,  2.26it/s]

Gradient norm: 64.78991019995276


Epoch 4 of 5 | Iteration:   7%|▋         | 85/1212 [00:40<08:28,  2.21it/s]

Gradient norm: 64.7124320899993


Epoch 4 of 5 | Iteration:   7%|▋         | 86/1212 [00:40<09:08,  2.05it/s]

Gradient norm: 64.18436473271036


Epoch 4 of 5 | Iteration:   7%|▋         | 87/1212 [00:41<09:19,  2.01it/s]

Gradient norm: 70.74887427541339


Epoch 4 of 5 | Iteration:   7%|▋         | 88/1212 [00:42<10:12,  1.83it/s]

Gradient norm: 70.76155156659576


Epoch 4 of 5 | Iteration:   7%|▋         | 89/1212 [00:42<09:58,  1.88it/s]

Gradient norm: 72.23516815379148


Epoch 4 of 5 | Iteration:   7%|▋         | 90/1212 [00:43<10:17,  1.82it/s]

Gradient norm: 74.06746252579832


Epoch 4 of 5 | Iteration:   8%|▊         | 91/1212 [00:43<10:45,  1.74it/s]

Gradient norm: 74.11943045077247


Epoch 4 of 5 | Iteration:   8%|▊         | 92/1212 [00:44<09:50,  1.90it/s]

Gradient norm: 74.32010984346019


Epoch 4 of 5 | Iteration:   8%|▊         | 93/1212 [00:44<09:02,  2.06it/s]

Gradient norm: 74.84018245628303


Epoch 4 of 5 | Iteration:   8%|▊         | 94/1212 [00:45<09:55,  1.88it/s]

Gradient norm: 76.36227173070641


Epoch 4 of 5 | Iteration:   8%|▊         | 95/1212 [00:45<09:18,  2.00it/s]

Gradient norm: 76.05723969535195


Epoch 4 of 5 | Iteration:   8%|▊         | 96/1212 [00:46<08:48,  2.11it/s]

Gradient norm: 75.70021312403924


Epoch 4 of 5 | Iteration:   8%|▊         | 97/1212 [00:46<08:56,  2.08it/s]

Gradient norm: 364.43294947616494


Epoch 4 of 5 | Iteration:   8%|▊         | 98/1212 [00:46<08:38,  2.15it/s]

Gradient norm: 362.98073241941245


Epoch 4 of 5 | Iteration:   8%|▊         | 99/1212 [00:47<08:18,  2.23it/s]

Gradient norm: 360.6320840406562


Epoch 4 of 5 | Iteration:   8%|▊         | 100/1212 [00:47<08:50,  2.10it/s]

Gradient norm: 361.35472400339654


Epoch 4 of 5 | Iteration:   8%|▊         | 101/1212 [00:48<08:33,  2.16it/s]

Gradient norm: 360.76156665820776


Epoch 4 of 5 | Iteration:   8%|▊         | 102/1212 [00:48<09:12,  2.01it/s]

Gradient norm: 359.6068937854826


Epoch 4 of 5 | Iteration:   8%|▊         | 103/1212 [00:49<08:41,  2.13it/s]

Gradient norm: 359.4090689939869


Epoch 4 of 5 | Iteration:   9%|▊         | 104/1212 [00:49<08:15,  2.23it/s]

Gradient norm: 359.2934009291029


Epoch 4 of 5 | Iteration:   9%|▊         | 105/1212 [00:50<08:10,  2.26it/s]

Gradient norm: 358.90541872106184


Epoch 4 of 5 | Iteration:   9%|▊         | 106/1212 [00:50<08:43,  2.11it/s]

Gradient norm: 382.6627742886224


Epoch 4 of 5 | Iteration:   9%|▉         | 107/1212 [00:51<08:18,  2.22it/s]

Gradient norm: 382.5948922778185


Epoch 4 of 5 | Iteration:   9%|▉         | 108/1212 [00:51<07:55,  2.32it/s]

Gradient norm: 382.7061303457124


Epoch 4 of 5 | Iteration:   9%|▉         | 109/1212 [00:52<08:26,  2.18it/s]

Gradient norm: 382.8365346586214


Epoch 4 of 5 | Iteration:   9%|▉         | 110/1212 [00:52<08:00,  2.29it/s]

Gradient norm: 384.6300839432846


Epoch 4 of 5 | Iteration:   9%|▉         | 111/1212 [00:53<09:20,  1.97it/s]

Gradient norm: 387.4021574383709


Epoch 4 of 5 | Iteration:   9%|▉         | 112/1212 [00:53<08:48,  2.08it/s]

Gradient norm: 388.03065088101795


Epoch 4 of 5 | Iteration:   9%|▉         | 113/1212 [00:53<08:10,  2.24it/s]

Gradient norm: 5.814725715412619


Epoch 4 of 5 | Iteration:   9%|▉         | 114/1212 [00:54<08:58,  2.04it/s]

Gradient norm: 6.751279471050344


Epoch 4 of 5 | Iteration:   9%|▉         | 115/1212 [00:55<09:28,  1.93it/s]

Gradient norm: 6.816950509784542


Epoch 4 of 5 | Iteration:  10%|▉         | 116/1212 [00:55<11:15,  1.62it/s]

Gradient norm: 10.089799480683642


Epoch 4 of 5 | Iteration:  10%|▉         | 117/1212 [00:56<11:08,  1.64it/s]

Gradient norm: 10.17420155230335


Epoch 4 of 5 | Iteration:  10%|▉         | 118/1212 [00:57<12:01,  1.52it/s]

Gradient norm: 11.028924422786703


Epoch 4 of 5 | Iteration:  10%|▉         | 119/1212 [00:57<11:13,  1.62it/s]

Gradient norm: 17.13368687570987


Epoch 4 of 5 | Iteration:  10%|▉         | 120/1212 [00:58<09:56,  1.83it/s]

Gradient norm: 17.32372777531222


Epoch 4 of 5 | Iteration:  10%|▉         | 121/1212 [00:58<09:08,  1.99it/s]

Gradient norm: 104.48154685213902


Epoch 4 of 5 | Iteration:  10%|█         | 122/1212 [00:58<08:38,  2.10it/s]

Gradient norm: 108.23950202216828


Epoch 4 of 5 | Iteration:  10%|█         | 123/1212 [00:59<08:57,  2.03it/s]

Gradient norm: 110.02629544003547


Epoch 4 of 5 | Iteration:  10%|█         | 124/1212 [01:00<09:41,  1.87it/s]

Gradient norm: 110.62865546079425


Epoch 4 of 5 | Iteration:  10%|█         | 125/1212 [01:00<08:59,  2.02it/s]

Gradient norm: 143.88648002499104


Epoch 4 of 5 | Iteration:  10%|█         | 126/1212 [01:00<08:20,  2.17it/s]

Gradient norm: 144.4380032373451


Epoch 4 of 5 | Iteration:  10%|█         | 127/1212 [01:01<08:02,  2.25it/s]

Gradient norm: 146.00480516632712


Epoch 4 of 5 | Iteration:  11%|█         | 128/1212 [01:01<08:04,  2.24it/s]

Gradient norm: 146.0687477954304


Epoch 4 of 5 | Iteration:  11%|█         | 129/1212 [01:02<08:11,  2.20it/s]

Gradient norm: 423.59876885496635


Epoch 4 of 5 | Iteration:  11%|█         | 130/1212 [01:02<08:34,  2.10it/s]

Gradient norm: 428.0700128142843


Epoch 4 of 5 | Iteration:  11%|█         | 131/1212 [01:03<08:06,  2.22it/s]

Gradient norm: 427.76708020021744


Epoch 4 of 5 | Iteration:  11%|█         | 132/1212 [01:03<07:48,  2.30it/s]

Gradient norm: 426.1954102183448


Epoch 4 of 5 | Iteration:  11%|█         | 133/1212 [01:04<08:03,  2.23it/s]

Gradient norm: 430.10922034393946


Epoch 4 of 5 | Iteration:  11%|█         | 134/1212 [01:04<09:08,  1.96it/s]

Gradient norm: 457.47804032822177


Epoch 4 of 5 | Iteration:  11%|█         | 135/1212 [01:05<08:34,  2.09it/s]

Gradient norm: 457.81257772689924


Epoch 4 of 5 | Iteration:  11%|█         | 136/1212 [01:05<08:18,  2.16it/s]

Gradient norm: 458.0884464657835


Epoch 4 of 5 | Iteration:  11%|█▏        | 137/1212 [01:05<08:01,  2.23it/s]

Gradient norm: 460.0436518191228


Epoch 4 of 5 | Iteration:  11%|█▏        | 138/1212 [01:06<08:44,  2.05it/s]

Gradient norm: 458.8556131077843


Epoch 4 of 5 | Iteration:  11%|█▏        | 139/1212 [01:06<08:13,  2.17it/s]

Gradient norm: 455.75007003461815


Epoch 4 of 5 | Iteration:  12%|█▏        | 140/1212 [01:07<08:00,  2.23it/s]

Gradient norm: 455.55314672581585


Epoch 4 of 5 | Iteration:  12%|█▏        | 141/1212 [01:07<08:53,  2.01it/s]

Gradient norm: 463.396329753801


Epoch 4 of 5 | Iteration:  12%|█▏        | 142/1212 [01:08<08:59,  1.98it/s]

Gradient norm: 462.41911901319446


Epoch 4 of 5 | Iteration:  12%|█▏        | 143/1212 [01:09<09:13,  1.93it/s]

Gradient norm: 462.65791969158715


Epoch 4 of 5 | Iteration:  12%|█▏        | 144/1212 [01:09<09:15,  1.92it/s]

Gradient norm: 459.2291868224244


Epoch 4 of 5 | Iteration:  12%|█▏        | 145/1212 [01:10<09:05,  1.96it/s]

Gradient norm: 2.5224585131506516


Epoch 4 of 5 | Iteration:  12%|█▏        | 146/1212 [01:10<09:23,  1.89it/s]

Gradient norm: 5.325078805609273


Epoch 4 of 5 | Iteration:  12%|█▏        | 147/1212 [01:11<09:59,  1.78it/s]

Gradient norm: 63.49375693289649


Epoch 4 of 5 | Iteration:  12%|█▏        | 148/1212 [01:11<09:07,  1.94it/s]

Gradient norm: 63.184920028239105


Epoch 4 of 5 | Iteration:  12%|█▏        | 149/1212 [01:12<09:01,  1.96it/s]

Gradient norm: 63.67376043143036


Epoch 4 of 5 | Iteration:  12%|█▏        | 150/1212 [01:12<08:21,  2.12it/s]

Gradient norm: 66.7532130453232


Epoch 4 of 5 | Iteration:  12%|█▏        | 151/1212 [01:13<08:20,  2.12it/s]

Gradient norm: 67.40455602706773


Epoch 4 of 5 | Iteration:  13%|█▎        | 152/1212 [01:13<07:49,  2.26it/s]

Gradient norm: 66.15927064023087


Epoch 4 of 5 | Iteration:  13%|█▎        | 153/1212 [01:13<08:22,  2.11it/s]

Gradient norm: 68.00056865890012


Epoch 4 of 5 | Iteration:  13%|█▎        | 154/1212 [01:14<08:02,  2.19it/s]

Gradient norm: 78.54232289468524


Epoch 4 of 5 | Iteration:  13%|█▎        | 155/1212 [01:14<07:36,  2.32it/s]

Gradient norm: 81.66025853321364


Epoch 4 of 5 | Iteration:  13%|█▎        | 156/1212 [01:15<08:33,  2.05it/s]

Gradient norm: 88.20508012659457


Epoch 4 of 5 | Iteration:  13%|█▎        | 157/1212 [01:15<08:31,  2.06it/s]

Gradient norm: 87.78205030113968


Epoch 4 of 5 | Iteration:  13%|█▎        | 158/1212 [01:16<08:16,  2.12it/s]

Gradient norm: 87.79908974602277


Epoch 4 of 5 | Iteration:  13%|█▎        | 159/1212 [01:16<07:55,  2.22it/s]

Gradient norm: 87.90739765002685


Epoch 4 of 5 | Iteration:  13%|█▎        | 160/1212 [01:17<08:04,  2.17it/s]

Gradient norm: 86.50249238500808


Epoch 4 of 5 | Iteration:  13%|█▎        | 161/1212 [01:17<07:48,  2.24it/s]

Gradient norm: 8.620772057632568


Epoch 4 of 5 | Iteration:  13%|█▎        | 162/1212 [01:17<07:33,  2.32it/s]

Gradient norm: 26.662032509861092


Epoch 4 of 5 | Iteration:  13%|█▎        | 163/1212 [01:18<08:03,  2.17it/s]

Gradient norm: 26.778510441384565


Epoch 4 of 5 | Iteration:  14%|█▎        | 164/1212 [01:18<07:49,  2.23it/s]

Gradient norm: 26.745486126786687


Epoch 4 of 5 | Iteration:  14%|█▎        | 165/1212 [01:19<07:35,  2.30it/s]

Gradient norm: 135.97526106214644


Epoch 4 of 5 | Iteration:  14%|█▎        | 166/1212 [01:19<08:27,  2.06it/s]

Gradient norm: 135.6289395851796


Epoch 4 of 5 | Iteration:  14%|█▍        | 167/1212 [01:20<08:15,  2.11it/s]

Gradient norm: 134.8334197338023


Epoch 4 of 5 | Iteration:  14%|█▍        | 168/1212 [01:20<07:57,  2.19it/s]

Gradient norm: 133.77482944558318


Epoch 4 of 5 | Iteration:  14%|█▍        | 169/1212 [01:21<07:52,  2.21it/s]

Gradient norm: 134.85104830026322


Epoch 4 of 5 | Iteration:  14%|█▍        | 170/1212 [01:21<08:22,  2.08it/s]

Gradient norm: 137.08697733735926


Epoch 4 of 5 | Iteration:  14%|█▍        | 171/1212 [01:22<08:42,  1.99it/s]

Gradient norm: 137.08370222253902


Epoch 4 of 5 | Iteration:  14%|█▍        | 172/1212 [01:23<09:44,  1.78it/s]

Gradient norm: 142.8343276740539


Epoch 4 of 5 | Iteration:  14%|█▍        | 173/1212 [01:23<09:28,  1.83it/s]

Gradient norm: 140.66468841194384


Epoch 4 of 5 | Iteration:  14%|█▍        | 174/1212 [01:24<09:19,  1.85it/s]

Gradient norm: 139.38721559783912


Epoch 4 of 5 | Iteration:  14%|█▍        | 175/1212 [01:24<09:41,  1.78it/s]

Gradient norm: 140.0089499272868


Epoch 4 of 5 | Iteration:  15%|█▍        | 176/1212 [01:25<09:54,  1.74it/s]

Gradient norm: 140.2762040648764


Epoch 4 of 5 | Iteration:  15%|█▍        | 177/1212 [01:26<10:57,  1.57it/s]

Gradient norm: 5.453402351181313


Epoch 4 of 5 | Iteration:  15%|█▍        | 178/1212 [01:26<09:48,  1.76it/s]

Gradient norm: 5.772424734203843


Epoch 4 of 5 | Iteration:  15%|█▍        | 179/1212 [01:26<09:06,  1.89it/s]

Gradient norm: 6.03492198299378


Epoch 4 of 5 | Iteration:  15%|█▍        | 180/1212 [01:27<08:28,  2.03it/s]

Gradient norm: 97.99793583059599


Epoch 4 of 5 | Iteration:  15%|█▍        | 181/1212 [01:27<08:30,  2.02it/s]

Gradient norm: 98.37384051364472


Epoch 4 of 5 | Iteration:  15%|█▌        | 182/1212 [01:28<07:56,  2.16it/s]

Gradient norm: 98.26693323990098


Epoch 4 of 5 | Iteration:  15%|█▌        | 183/1212 [01:28<07:39,  2.24it/s]

Gradient norm: 98.71434588593291


Epoch 4 of 5 | Iteration:  15%|█▌        | 184/1212 [01:29<07:27,  2.30it/s]

Gradient norm: 126.07548429524361


Epoch 4 of 5 | Iteration:  15%|█▌        | 185/1212 [01:29<07:19,  2.34it/s]

Gradient norm: 128.80773160397456


Epoch 4 of 5 | Iteration:  15%|█▌        | 186/1212 [01:29<08:02,  2.13it/s]

Gradient norm: 128.74737583831555


Epoch 4 of 5 | Iteration:  15%|█▌        | 187/1212 [01:30<07:38,  2.24it/s]

Gradient norm: 131.5381092498867


Epoch 4 of 5 | Iteration:  16%|█▌        | 188/1212 [01:30<08:02,  2.12it/s]

Gradient norm: 141.89568805551656


Epoch 4 of 5 | Iteration:  16%|█▌        | 189/1212 [01:31<07:33,  2.26it/s]

Gradient norm: 141.15413639348515


Epoch 4 of 5 | Iteration:  16%|█▌        | 190/1212 [01:31<07:55,  2.15it/s]

Gradient norm: 141.38356483454413


Epoch 4 of 5 | Iteration:  16%|█▌        | 191/1212 [01:32<07:53,  2.16it/s]

Gradient norm: 140.94842201709594


Epoch 4 of 5 | Iteration:  16%|█▌        | 192/1212 [01:32<07:51,  2.17it/s]

Gradient norm: 129.54034717230678


Epoch 4 of 5 | Iteration:  16%|█▌        | 193/1212 [01:33<08:20,  2.04it/s]

Gradient norm: 5.256741872363649


Epoch 4 of 5 | Iteration:  16%|█▌        | 194/1212 [01:33<07:58,  2.13it/s]

Gradient norm: 5.502888393361771


Epoch 4 of 5 | Iteration:  16%|█▌        | 195/1212 [01:34<07:33,  2.24it/s]

Gradient norm: 23.52090135559918


Epoch 4 of 5 | Iteration:  16%|█▌        | 196/1212 [01:34<07:13,  2.35it/s]

Gradient norm: 23.52090135559918


Epoch 4 of 5 | Iteration:  16%|█▋        | 197/1212 [01:34<07:24,  2.28it/s]

Gradient norm: 41.33808586480143


Epoch 4 of 5 | Iteration:  16%|█▋        | 198/1212 [01:35<07:36,  2.22it/s]

Gradient norm: 905.5699307051442


Epoch 4 of 5 | Iteration:  16%|█▋        | 199/1212 [01:35<08:06,  2.08it/s]

Gradient norm: 905.9430249940569


Epoch 4 of 5 | Iteration:  17%|█▋        | 200/1212 [01:36<08:43,  1.93it/s]

Gradient norm: 905.9114623820515


Epoch 4 of 5 | Iteration:  17%|█▋        | 201/1212 [01:37<09:12,  1.83it/s]

Gradient norm: 908.3020274302008


Epoch 4 of 5 | Iteration:  17%|█▋        | 202/1212 [01:37<09:57,  1.69it/s]

Gradient norm: 908.2103915347055


Epoch 4 of 5 | Iteration:  17%|█▋        | 203/1212 [01:38<10:02,  1.68it/s]

Gradient norm: 907.330754580634


Epoch 4 of 5 | Iteration:  17%|█▋        | 204/1212 [01:39<10:00,  1.68it/s]

Gradient norm: 906.738551757567


Epoch 4 of 5 | Iteration:  17%|█▋        | 205/1212 [01:39<08:52,  1.89it/s]

Gradient norm: 906.7193962976942


Epoch 4 of 5 | Iteration:  17%|█▋        | 206/1212 [01:39<08:16,  2.03it/s]

Gradient norm: 906.6478490910094


Epoch 4 of 5 | Iteration:  17%|█▋        | 207/1212 [01:40<07:51,  2.13it/s]

Gradient norm: 907.1631835670909


Epoch 4 of 5 | Iteration:  17%|█▋        | 208/1212 [01:40<07:55,  2.11it/s]

Gradient norm: 965.5187200484984


Epoch 4 of 5 | Iteration:  17%|█▋        | 209/1212 [01:41<07:59,  2.09it/s]

Gradient norm: 4.962368273124702


Epoch 4 of 5 | Iteration:  17%|█▋        | 210/1212 [01:41<08:00,  2.09it/s]

Gradient norm: 24.338433851445128


Epoch 4 of 5 | Iteration:  17%|█▋        | 211/1212 [01:42<07:46,  2.14it/s]

Gradient norm: 24.389456043127627


Epoch 4 of 5 | Iteration:  17%|█▋        | 212/1212 [01:42<07:21,  2.26it/s]

Gradient norm: 25.19635948745509


Epoch 4 of 5 | Iteration:  18%|█▊        | 213/1212 [01:43<07:59,  2.08it/s]

Gradient norm: 24.84771277888656


Epoch 4 of 5 | Iteration:  18%|█▊        | 214/1212 [01:43<08:23,  1.98it/s]

Gradient norm: 31.15853050224404


Epoch 4 of 5 | Iteration:  18%|█▊        | 215/1212 [01:44<07:49,  2.13it/s]

Gradient norm: 30.799582586710567


Epoch 4 of 5 | Iteration:  18%|█▊        | 216/1212 [01:44<07:31,  2.21it/s]

Gradient norm: 30.801622113408097


Epoch 4 of 5 | Iteration:  18%|█▊        | 217/1212 [01:44<07:14,  2.29it/s]

Gradient norm: 30.75538208878424


Epoch 4 of 5 | Iteration:  18%|█▊        | 218/1212 [01:45<06:57,  2.38it/s]

Gradient norm: 31.23675631310171


Epoch 4 of 5 | Iteration:  18%|█▊        | 219/1212 [01:45<06:53,  2.40it/s]

Gradient norm: 80.66597073972645


Epoch 4 of 5 | Iteration:  18%|█▊        | 220/1212 [01:46<06:37,  2.49it/s]

Gradient norm: 80.93296330737924


Epoch 4 of 5 | Iteration:  18%|█▊        | 221/1212 [01:46<06:40,  2.48it/s]

Gradient norm: 80.57156545491459


Epoch 4 of 5 | Iteration:  18%|█▊        | 222/1212 [01:46<06:44,  2.44it/s]

Gradient norm: 80.71691952203376


Epoch 4 of 5 | Iteration:  18%|█▊        | 223/1212 [01:47<06:50,  2.41it/s]

Gradient norm: 81.1560408565136


Epoch 4 of 5 | Iteration:  18%|█▊        | 224/1212 [01:47<06:47,  2.43it/s]

Gradient norm: 83.04380584158984


Epoch 4 of 5 | Iteration:  19%|█▊        | 225/1212 [01:48<07:06,  2.31it/s]

Gradient norm: 39.72368882005322


Epoch 4 of 5 | Iteration:  19%|█▊        | 226/1212 [01:48<06:52,  2.39it/s]

Gradient norm: 42.14172722450481


Epoch 4 of 5 | Iteration:  19%|█▊        | 227/1212 [01:49<07:09,  2.29it/s]

Gradient norm: 42.12590469725003


Epoch 4 of 5 | Iteration:  19%|█▉        | 228/1212 [01:49<07:40,  2.14it/s]

Gradient norm: 42.61556363175914


Epoch 4 of 5 | Iteration:  19%|█▉        | 229/1212 [01:50<08:03,  2.03it/s]

Gradient norm: 42.665140857848336


Epoch 4 of 5 | Iteration:  19%|█▉        | 230/1212 [01:50<08:49,  1.85it/s]

Gradient norm: 45.06659196915193


Epoch 4 of 5 | Iteration:  19%|█▉        | 231/1212 [01:51<08:54,  1.84it/s]

Gradient norm: 46.02358922348137


Epoch 4 of 5 | Iteration:  19%|█▉        | 232/1212 [01:51<09:10,  1.78it/s]

Gradient norm: 46.896339581996415


Epoch 4 of 5 | Iteration:  19%|█▉        | 233/1212 [01:52<08:58,  1.82it/s]

Gradient norm: 51.70017073290168


Epoch 4 of 5 | Iteration:  19%|█▉        | 234/1212 [01:53<09:10,  1.78it/s]

Gradient norm: 51.563211268528576


Epoch 4 of 5 | Iteration:  19%|█▉        | 235/1212 [01:53<09:00,  1.81it/s]

Gradient norm: 51.329173544510226


Epoch 4 of 5 | Iteration:  19%|█▉        | 236/1212 [01:53<08:08,  2.00it/s]

Gradient norm: 51.532431294604564


Epoch 4 of 5 | Iteration:  20%|█▉        | 237/1212 [01:54<07:57,  2.04it/s]

Gradient norm: 59.49702714369136


Epoch 4 of 5 | Iteration:  20%|█▉        | 238/1212 [01:55<08:41,  1.87it/s]

Gradient norm: 58.809777655073674


Epoch 4 of 5 | Iteration:  20%|█▉        | 239/1212 [01:55<07:58,  2.03it/s]

Gradient norm: 65.37373772342391


Epoch 4 of 5 | Iteration:  20%|█▉        | 240/1212 [01:56<08:52,  1.83it/s]

Gradient norm: 63.15012302654408


Epoch 4 of 5 | Iteration:  20%|█▉        | 241/1212 [01:56<08:33,  1.89it/s]

Gradient norm: 5.9081651306022955


Epoch 4 of 5 | Iteration:  20%|█▉        | 242/1212 [01:57<08:19,  1.94it/s]

Gradient norm: 6.748284885378668


Epoch 4 of 5 | Iteration:  20%|██        | 243/1212 [01:57<07:44,  2.09it/s]

Gradient norm: 13.624626932630369


Epoch 4 of 5 | Iteration:  20%|██        | 244/1212 [01:58<07:59,  2.02it/s]

Gradient norm: 13.844776010831474


Epoch 4 of 5 | Iteration:  20%|██        | 245/1212 [01:58<07:28,  2.16it/s]

Gradient norm: 13.881099706418974


Epoch 4 of 5 | Iteration:  20%|██        | 246/1212 [01:58<07:29,  2.15it/s]

Gradient norm: 125.15768183154172


Epoch 4 of 5 | Iteration:  20%|██        | 247/1212 [01:59<07:05,  2.27it/s]

Gradient norm: 130.24142390767335


Epoch 4 of 5 | Iteration:  20%|██        | 248/1212 [01:59<07:07,  2.25it/s]

Gradient norm: 130.03494463807567


Epoch 4 of 5 | Iteration:  21%|██        | 249/1212 [02:00<06:58,  2.30it/s]

Gradient norm: 130.00336893546364


Epoch 4 of 5 | Iteration:  21%|██        | 250/1212 [02:00<06:42,  2.39it/s]

Gradient norm: 131.49861564102716


Epoch 4 of 5 | Iteration:  21%|██        | 251/1212 [02:00<06:36,  2.43it/s]

Gradient norm: 131.6162991360821


Epoch 4 of 5 | Iteration:  21%|██        | 252/1212 [02:01<07:33,  2.12it/s]

Gradient norm: 132.47863991765212


Epoch 4 of 5 | Iteration:  21%|██        | 253/1212 [02:01<07:18,  2.19it/s]

Gradient norm: 132.5154327640103


Epoch 4 of 5 | Iteration:  21%|██        | 254/1212 [02:02<06:57,  2.29it/s]

Gradient norm: 165.60722242449097


Epoch 4 of 5 | Iteration:  21%|██        | 255/1212 [02:02<06:48,  2.34it/s]

Gradient norm: 165.75712047817225


Epoch 4 of 5 | Iteration:  21%|██        | 256/1212 [02:03<08:09,  1.95it/s]

Gradient norm: 165.8621887994794


Epoch 4 of 5 | Iteration:  21%|██        | 257/1212 [02:04<08:12,  1.94it/s]

Gradient norm: 24.39060067383846


Epoch 4 of 5 | Iteration:  21%|██▏       | 258/1212 [02:04<08:51,  1.80it/s]

Gradient norm: 24.385029075899542


Epoch 4 of 5 | Iteration:  21%|██▏       | 259/1212 [02:05<09:05,  1.75it/s]

Gradient norm: 25.379098092144798


Epoch 4 of 5 | Iteration:  21%|██▏       | 260/1212 [02:06<09:55,  1.60it/s]

Gradient norm: 26.398377197024967


Epoch 4 of 5 | Iteration:  22%|██▏       | 261/1212 [02:06<09:44,  1.63it/s]

Gradient norm: 25.664754927309748


Epoch 4 of 5 | Iteration:  22%|██▏       | 262/1212 [02:07<08:57,  1.77it/s]

Gradient norm: 24.93561568058931


Epoch 4 of 5 | Iteration:  22%|██▏       | 263/1212 [02:07<09:05,  1.74it/s]

Gradient norm: 25.218805255317903


Epoch 4 of 5 | Iteration:  22%|██▏       | 264/1212 [02:08<08:18,  1.90it/s]

Gradient norm: 26.150315972902945


Epoch 4 of 5 | Iteration:  22%|██▏       | 265/1212 [02:08<07:38,  2.06it/s]

Gradient norm: 26.426201495240512


Epoch 4 of 5 | Iteration:  22%|██▏       | 266/1212 [02:08<07:16,  2.17it/s]

Gradient norm: 53.80912561008495


Epoch 4 of 5 | Iteration:  22%|██▏       | 267/1212 [02:09<07:00,  2.25it/s]

Gradient norm: 54.30301363012207


Epoch 4 of 5 | Iteration:  22%|██▏       | 268/1212 [02:09<06:42,  2.34it/s]

Gradient norm: 51.35490951917273


Epoch 4 of 5 | Iteration:  22%|██▏       | 269/1212 [02:10<06:28,  2.43it/s]

Gradient norm: 51.424691416121036


Epoch 4 of 5 | Iteration:  22%|██▏       | 270/1212 [02:10<06:29,  2.42it/s]

Gradient norm: 56.07389476570634


Epoch 4 of 5 | Iteration:  22%|██▏       | 271/1212 [02:10<06:20,  2.47it/s]

Gradient norm: 60.69867133696854


Epoch 4 of 5 | Iteration:  22%|██▏       | 272/1212 [02:11<07:36,  2.06it/s]

Gradient norm: 62.64602970429982


Epoch 4 of 5 | Iteration:  23%|██▎       | 273/1212 [02:11<07:15,  2.16it/s]

Gradient norm: 4.397303123351415


Epoch 4 of 5 | Iteration:  23%|██▎       | 274/1212 [02:12<07:07,  2.20it/s]

Gradient norm: 15.919932675032285


Epoch 4 of 5 | Iteration:  23%|██▎       | 275/1212 [02:12<07:02,  2.22it/s]

Gradient norm: 15.994447666808066


Epoch 4 of 5 | Iteration:  23%|██▎       | 276/1212 [02:13<06:43,  2.32it/s]

Gradient norm: 17.700140125444896


Epoch 4 of 5 | Iteration:  23%|██▎       | 277/1212 [02:13<06:49,  2.28it/s]

Gradient norm: 17.354501273747342


Epoch 4 of 5 | Iteration:  23%|██▎       | 278/1212 [02:14<06:34,  2.37it/s]

Gradient norm: 20.33724870927026


Epoch 4 of 5 | Iteration:  23%|██▎       | 279/1212 [02:14<07:15,  2.14it/s]

Gradient norm: 69.38494781145884


Epoch 4 of 5 | Iteration:  23%|██▎       | 280/1212 [02:14<06:50,  2.27it/s]

Gradient norm: 101.98688797905062


Epoch 4 of 5 | Iteration:  23%|██▎       | 281/1212 [02:15<07:14,  2.14it/s]

Gradient norm: 104.84066853072439


Epoch 4 of 5 | Iteration:  23%|██▎       | 282/1212 [02:15<06:50,  2.27it/s]

Gradient norm: 105.11389238040887


Epoch 4 of 5 | Iteration:  23%|██▎       | 283/1212 [02:16<06:59,  2.22it/s]

Gradient norm: 105.18317151623282


Epoch 4 of 5 | Iteration:  23%|██▎       | 284/1212 [02:16<06:46,  2.28it/s]

Gradient norm: 104.67567445902529


Epoch 4 of 5 | Iteration:  24%|██▎       | 285/1212 [02:17<07:09,  2.16it/s]

Gradient norm: 104.27089343082045


Epoch 4 of 5 | Iteration:  24%|██▎       | 286/1212 [02:17<07:25,  2.08it/s]

Gradient norm: 104.82609859396085


Epoch 4 of 5 | Iteration:  24%|██▎       | 287/1212 [02:18<07:40,  2.01it/s]

Gradient norm: 110.64105225404812


Epoch 4 of 5 | Iteration:  24%|██▍       | 288/1212 [02:19<08:41,  1.77it/s]

Gradient norm: 110.9429650151651


Epoch 4 of 5 | Iteration:  24%|██▍       | 289/1212 [02:19<08:41,  1.77it/s]

Gradient norm: 8.091481278042346


Epoch 4 of 5 | Iteration:  24%|██▍       | 290/1212 [02:20<08:55,  1.72it/s]

Gradient norm: 8.25384631428763


Epoch 4 of 5 | Iteration:  24%|██▍       | 291/1212 [02:20<08:37,  1.78it/s]

Gradient norm: 8.718615405171246


Epoch 4 of 5 | Iteration:  24%|██▍       | 292/1212 [02:21<08:26,  1.82it/s]

Gradient norm: 9.230615304915055


Epoch 4 of 5 | Iteration:  24%|██▍       | 293/1212 [02:21<07:47,  1.96it/s]

Gradient norm: 10.18858319772355


Epoch 4 of 5 | Iteration:  24%|██▍       | 294/1212 [02:22<07:40,  1.99it/s]

Gradient norm: 10.583048140150389


Epoch 4 of 5 | Iteration:  24%|██▍       | 295/1212 [02:22<07:36,  2.01it/s]

Gradient norm: 13.497352836283888


Epoch 4 of 5 | Iteration:  24%|██▍       | 296/1212 [02:23<07:12,  2.12it/s]

Gradient norm: 13.497352836283888


Epoch 4 of 5 | Iteration:  25%|██▍       | 297/1212 [02:23<06:58,  2.19it/s]

Gradient norm: 13.438981330369296


Epoch 4 of 5 | Iteration:  25%|██▍       | 298/1212 [02:24<08:26,  1.80it/s]

Gradient norm: 13.447600787071057


Epoch 4 of 5 | Iteration:  25%|██▍       | 299/1212 [02:24<08:46,  1.73it/s]

Gradient norm: 18.73104930148228


Epoch 4 of 5 | Iteration:  25%|██▍       | 300/1212 [02:25<09:04,  1.67it/s]

Gradient norm: 29.07685001712497


Epoch 4 of 5 | Iteration:  25%|██▍       | 301/1212 [02:25<08:11,  1.85it/s]

Gradient norm: 29.541784526767486


Epoch 4 of 5 | Iteration:  25%|██▍       | 302/1212 [02:26<08:01,  1.89it/s]

Gradient norm: 29.748688317192187


Epoch 4 of 5 | Iteration:  25%|██▌       | 303/1212 [02:27<08:20,  1.82it/s]

Gradient norm: 30.146237549602827


Epoch 4 of 5 | Iteration:  25%|██▌       | 304/1212 [02:27<07:41,  1.97it/s]

Gradient norm: 30.0532417478095


Epoch 4 of 5 | Iteration:  25%|██▌       | 305/1212 [02:27<07:12,  2.10it/s]

Gradient norm: 74.2845201300583


Epoch 4 of 5 | Iteration:  25%|██▌       | 306/1212 [02:28<07:46,  1.94it/s]

Gradient norm: 74.71982562400231


Epoch 4 of 5 | Iteration:  25%|██▌       | 307/1212 [02:28<07:15,  2.08it/s]

Gradient norm: 75.23124100305587


Epoch 4 of 5 | Iteration:  25%|██▌       | 308/1212 [02:29<07:32,  2.00it/s]

Gradient norm: 75.21006656893968


Epoch 4 of 5 | Iteration:  25%|██▌       | 309/1212 [02:29<07:09,  2.10it/s]

Gradient norm: 75.70355202625339


Epoch 4 of 5 | Iteration:  26%|██▌       | 310/1212 [02:30<07:35,  1.98it/s]

Gradient norm: 75.55336302622473


Epoch 4 of 5 | Iteration:  26%|██▌       | 311/1212 [02:30<07:48,  1.92it/s]

Gradient norm: 78.62200932218757


Epoch 4 of 5 | Iteration:  26%|██▌       | 312/1212 [02:31<07:55,  1.89it/s]

Gradient norm: 78.78191181820316


Epoch 4 of 5 | Iteration:  26%|██▌       | 313/1212 [02:32<08:06,  1.85it/s]

Gradient norm: 78.72773515476553


Epoch 4 of 5 | Iteration:  26%|██▌       | 314/1212 [02:32<07:54,  1.89it/s]

Gradient norm: 79.01478096292841


Epoch 4 of 5 | Iteration:  26%|██▌       | 315/1212 [02:33<08:02,  1.86it/s]

Gradient norm: 77.80029831910463


Epoch 4 of 5 | Iteration:  26%|██▌       | 316/1212 [02:33<08:36,  1.74it/s]

Gradient norm: 78.76617040889933


Epoch 4 of 5 | Iteration:  26%|██▌       | 317/1212 [02:34<08:04,  1.85it/s]

Gradient norm: 79.71621199265417


Epoch 4 of 5 | Iteration:  26%|██▌       | 318/1212 [02:34<07:45,  1.92it/s]

Gradient norm: 79.81791484514922


Epoch 4 of 5 | Iteration:  26%|██▋       | 319/1212 [02:35<07:10,  2.07it/s]

Gradient norm: 79.92493722174657


Epoch 4 of 5 | Iteration:  26%|██▋       | 320/1212 [02:35<07:08,  2.08it/s]

Gradient norm: 79.65820839829897


Epoch 4 of 5 | Iteration:  26%|██▋       | 321/1212 [02:36<06:41,  2.22it/s]

Gradient norm: 3.9627419321785577


Epoch 4 of 5 | Iteration:  27%|██▋       | 322/1212 [02:36<06:27,  2.30it/s]

Gradient norm: 9.568771015263874


Epoch 4 of 5 | Iteration:  27%|██▋       | 323/1212 [02:36<06:47,  2.18it/s]

Gradient norm: 13.757515373275261


Epoch 4 of 5 | Iteration:  27%|██▋       | 324/1212 [02:37<06:28,  2.29it/s]

Gradient norm: 13.791099749607163


Epoch 4 of 5 | Iteration:  27%|██▋       | 325/1212 [02:37<07:02,  2.10it/s]

Gradient norm: 14.521752229592328


Epoch 4 of 5 | Iteration:  27%|██▋       | 326/1212 [02:38<06:40,  2.21it/s]

Gradient norm: 15.199255657483828


Epoch 4 of 5 | Iteration:  27%|██▋       | 327/1212 [02:38<06:27,  2.28it/s]

Gradient norm: 18.7812392374479


Epoch 4 of 5 | Iteration:  27%|██▋       | 328/1212 [02:39<06:21,  2.32it/s]

Gradient norm: 19.292477893936134


Epoch 4 of 5 | Iteration:  27%|██▋       | 329/1212 [02:39<06:12,  2.37it/s]

Gradient norm: 19.420743703676617


Epoch 4 of 5 | Iteration:  27%|██▋       | 330/1212 [02:39<06:07,  2.40it/s]

Gradient norm: 19.393472765335172


Epoch 4 of 5 | Iteration:  27%|██▋       | 331/1212 [02:40<06:08,  2.39it/s]

Gradient norm: 20.616433581642333


Epoch 4 of 5 | Iteration:  27%|██▋       | 332/1212 [02:40<06:15,  2.34it/s]

Gradient norm: 21.95625209578649


Epoch 4 of 5 | Iteration:  27%|██▋       | 333/1212 [02:41<06:01,  2.43it/s]

Gradient norm: 26.497181216418863


Epoch 4 of 5 | Iteration:  28%|██▊       | 334/1212 [02:41<06:16,  2.33it/s]

Gradient norm: 30.898005460930232


Epoch 4 of 5 | Iteration:  28%|██▊       | 335/1212 [02:42<06:15,  2.34it/s]

Gradient norm: 32.3227896189725


Epoch 4 of 5 | Iteration:  28%|██▊       | 336/1212 [02:42<06:07,  2.39it/s]

Gradient norm: 32.45757750083962


Epoch 4 of 5 | Iteration:  28%|██▊       | 337/1212 [02:42<06:27,  2.26it/s]

Gradient norm: 5.533262451728605


Epoch 4 of 5 | Iteration:  28%|██▊       | 338/1212 [02:43<06:12,  2.35it/s]

Gradient norm: 5.969820492557955


Epoch 4 of 5 | Iteration:  28%|██▊       | 339/1212 [02:43<05:59,  2.43it/s]

Gradient norm: 9.891626316461782


Epoch 4 of 5 | Iteration:  28%|██▊       | 340/1212 [02:44<06:31,  2.23it/s]

Gradient norm: 72.10715196688751


Epoch 4 of 5 | Iteration:  28%|██▊       | 341/1212 [02:44<07:02,  2.06it/s]

Gradient norm: 72.20733390406265


Epoch 4 of 5 | Iteration:  28%|██▊       | 342/1212 [02:45<07:09,  2.03it/s]

Gradient norm: 127.8679403016616


Epoch 4 of 5 | Iteration:  28%|██▊       | 343/1212 [02:46<08:15,  1.75it/s]

Gradient norm: 179.7319256081749


Epoch 4 of 5 | Iteration:  28%|██▊       | 344/1212 [02:46<07:55,  1.83it/s]

Gradient norm: 179.75792896921584


Epoch 4 of 5 | Iteration:  28%|██▊       | 345/1212 [02:47<08:30,  1.70it/s]

Gradient norm: 180.72790130674926


Epoch 4 of 5 | Iteration:  29%|██▊       | 346/1212 [02:47<08:06,  1.78it/s]

Gradient norm: 179.94724829227803


Epoch 4 of 5 | Iteration:  29%|██▊       | 347/1212 [02:48<07:39,  1.88it/s]

Gradient norm: 179.9567336494617


Epoch 4 of 5 | Iteration:  29%|██▊       | 348/1212 [02:48<07:02,  2.04it/s]

Gradient norm: 180.13999083688367


Epoch 4 of 5 | Iteration:  29%|██▉       | 349/1212 [02:49<06:46,  2.13it/s]

Gradient norm: 179.7308530530497


Epoch 4 of 5 | Iteration:  29%|██▉       | 350/1212 [02:49<06:23,  2.25it/s]

Gradient norm: 178.86724849456337


Epoch 4 of 5 | Iteration:  29%|██▉       | 351/1212 [02:49<06:16,  2.29it/s]

Gradient norm: 178.13615300190654


Epoch 4 of 5 | Iteration:  29%|██▉       | 352/1212 [02:50<06:14,  2.29it/s]

Gradient norm: 206.54154017286652


Epoch 4 of 5 | Iteration:  29%|██▉       | 353/1212 [02:50<06:19,  2.26it/s]

Gradient norm: 2.6440037528996996


Epoch 4 of 5 | Iteration:  29%|██▉       | 354/1212 [02:51<07:11,  1.99it/s]

Gradient norm: 5.997960375488457


Epoch 4 of 5 | Iteration:  29%|██▉       | 355/1212 [02:51<06:42,  2.13it/s]

Gradient norm: 7.939514036527527


Epoch 4 of 5 | Iteration:  29%|██▉       | 356/1212 [02:52<06:57,  2.05it/s]

Gradient norm: 8.110064415013206


Epoch 4 of 5 | Iteration:  29%|██▉       | 357/1212 [02:52<06:39,  2.14it/s]

Gradient norm: 8.364934223153046


Epoch 4 of 5 | Iteration:  30%|██▉       | 358/1212 [02:53<06:19,  2.25it/s]

Gradient norm: 19.41686376021638


Epoch 4 of 5 | Iteration:  30%|██▉       | 359/1212 [02:53<06:04,  2.34it/s]

Gradient norm: 21.04919905701315


Epoch 4 of 5 | Iteration:  30%|██▉       | 360/1212 [02:54<06:38,  2.14it/s]

Gradient norm: 21.43615543433113


Epoch 4 of 5 | Iteration:  30%|██▉       | 361/1212 [02:54<06:46,  2.09it/s]

Gradient norm: 126.2463111420103


Epoch 4 of 5 | Iteration:  30%|██▉       | 362/1212 [02:54<06:22,  2.22it/s]

Gradient norm: 126.63219108625886


Epoch 4 of 5 | Iteration:  30%|██▉       | 363/1212 [02:55<06:38,  2.13it/s]

Gradient norm: 126.8776294361997


Epoch 4 of 5 | Iteration:  30%|███       | 364/1212 [02:55<06:28,  2.18it/s]

Gradient norm: 126.87197523919471


Epoch 4 of 5 | Iteration:  30%|███       | 365/1212 [02:56<06:28,  2.18it/s]

Gradient norm: 127.4334595604687


Epoch 4 of 5 | Iteration:  30%|███       | 366/1212 [02:56<06:12,  2.27it/s]

Gradient norm: 127.89945595836281


Epoch 4 of 5 | Iteration:  30%|███       | 367/1212 [02:57<06:18,  2.23it/s]

Gradient norm: 127.88196066230665


Epoch 4 of 5 | Iteration:  30%|███       | 368/1212 [02:57<06:43,  2.09it/s]

Gradient norm: 136.03123646175314


Epoch 4 of 5 | Iteration:  30%|███       | 369/1212 [02:58<07:00,  2.01it/s]

Gradient norm: 1.9939118916533685


Epoch 4 of 5 | Iteration:  31%|███       | 370/1212 [02:58<06:59,  2.01it/s]

Gradient norm: 5.4939371028028825


Epoch 4 of 5 | Iteration:  31%|███       | 371/1212 [02:59<07:16,  1.93it/s]

Gradient norm: 6.156411170955891


Epoch 4 of 5 | Iteration:  31%|███       | 372/1212 [02:59<07:19,  1.91it/s]

Gradient norm: 1552.2260332475432


Epoch 4 of 5 | Iteration:  31%|███       | 373/1212 [03:00<07:20,  1.90it/s]

Gradient norm: 1552.184094456558


Epoch 4 of 5 | Iteration:  31%|███       | 374/1212 [03:01<07:38,  1.83it/s]

Gradient norm: 1552.1584190227843


Epoch 4 of 5 | Iteration:  31%|███       | 375/1212 [03:01<07:13,  1.93it/s]

Gradient norm: 1552.7125698584605


Epoch 4 of 5 | Iteration:  31%|███       | 376/1212 [03:01<06:38,  2.10it/s]

Gradient norm: 1553.103401600152


Epoch 4 of 5 | Iteration:  31%|███       | 377/1212 [03:02<07:11,  1.94it/s]

Gradient norm: 1552.6603693774039


Epoch 4 of 5 | Iteration:  31%|███       | 378/1212 [03:03<07:21,  1.89it/s]

Gradient norm: 1553.1112592869058


Epoch 4 of 5 | Iteration:  31%|███▏      | 379/1212 [03:03<06:44,  2.06it/s]

Gradient norm: 1553.1658965189295


Epoch 4 of 5 | Iteration:  31%|███▏      | 380/1212 [03:03<07:01,  1.97it/s]

Gradient norm: 1552.8519317844587


Epoch 4 of 5 | Iteration:  31%|███▏      | 381/1212 [03:04<06:40,  2.08it/s]

Gradient norm: 1552.7850501452633


Epoch 4 of 5 | Iteration:  32%|███▏      | 382/1212 [03:04<06:26,  2.15it/s]

Gradient norm: 1552.7057896444303


Epoch 4 of 5 | Iteration:  32%|███▏      | 383/1212 [03:05<06:11,  2.23it/s]

Gradient norm: 1552.6941016209962


Epoch 4 of 5 | Iteration:  32%|███▏      | 384/1212 [03:05<06:01,  2.29it/s]

Gradient norm: 1552.6922120101055


Epoch 4 of 5 | Iteration:  32%|███▏      | 385/1212 [03:06<06:50,  2.02it/s]

Gradient norm: 31.975727636113266


Epoch 4 of 5 | Iteration:  32%|███▏      | 386/1212 [03:06<06:20,  2.17it/s]

Gradient norm: 44.364072247287744


Epoch 4 of 5 | Iteration:  32%|███▏      | 387/1212 [03:07<07:05,  1.94it/s]

Gradient norm: 87.00552112841949


Epoch 4 of 5 | Iteration:  32%|███▏      | 388/1212 [03:07<06:32,  2.10it/s]

Gradient norm: 86.99558363142357


Epoch 4 of 5 | Iteration:  32%|███▏      | 389/1212 [03:08<06:49,  2.01it/s]

Gradient norm: 87.03766663450823


Epoch 4 of 5 | Iteration:  32%|███▏      | 390/1212 [03:08<06:21,  2.15it/s]

Gradient norm: 85.9252336982677


Epoch 4 of 5 | Iteration:  32%|███▏      | 391/1212 [03:09<06:09,  2.22it/s]

Gradient norm: 236.2764710792181


Epoch 4 of 5 | Iteration:  32%|███▏      | 392/1212 [03:09<05:56,  2.30it/s]

Gradient norm: 228.42420266938788


Epoch 4 of 5 | Iteration:  32%|███▏      | 393/1212 [03:09<05:42,  2.39it/s]

Gradient norm: 225.17257961936576


Epoch 4 of 5 | Iteration:  33%|███▎      | 394/1212 [03:10<05:39,  2.41it/s]

Gradient norm: 284.75251711029733


Epoch 4 of 5 | Iteration:  33%|███▎      | 395/1212 [03:10<05:44,  2.37it/s]

Gradient norm: 284.6697098076168


Epoch 4 of 5 | Iteration:  33%|███▎      | 396/1212 [03:11<06:48,  2.00it/s]

Gradient norm: 283.6195659514672


Epoch 4 of 5 | Iteration:  33%|███▎      | 397/1212 [03:11<07:17,  1.86it/s]

Gradient norm: 283.11038306936064


Epoch 4 of 5 | Iteration:  33%|███▎      | 398/1212 [03:12<06:53,  1.97it/s]

Gradient norm: 282.9641493879973


Epoch 4 of 5 | Iteration:  33%|███▎      | 399/1212 [03:12<07:05,  1.91it/s]

Gradient norm: 282.68533901471676


Epoch 4 of 5 | Iteration:  33%|███▎      | 400/1212 [03:13<06:58,  1.94it/s]

Gradient norm: 282.47430372878233


Epoch 4 of 5 | Iteration:  33%|███▎      | 401/1212 [03:13<06:59,  1.93it/s]

Gradient norm: 2.145997887303315


Epoch 4 of 5 | Iteration:  33%|███▎      | 402/1212 [03:14<07:30,  1.80it/s]

Gradient norm: 11.649350732765962


Epoch 4 of 5 | Iteration:  33%|███▎      | 403/1212 [03:15<07:46,  1.74it/s]

Gradient norm: 17.627881994815525


Epoch 4 of 5 | Iteration:  33%|███▎      | 404/1212 [03:15<07:08,  1.89it/s]

Gradient norm: 18.057362875739216


Epoch 4 of 5 | Iteration:  33%|███▎      | 405/1212 [03:16<06:43,  2.00it/s]

Gradient norm: 17.974796029723123


Epoch 4 of 5 | Iteration:  33%|███▎      | 406/1212 [03:16<06:34,  2.04it/s]

Gradient norm: 18.05442152150682


Epoch 4 of 5 | Iteration:  34%|███▎      | 407/1212 [03:16<06:17,  2.13it/s]

Gradient norm: 19.52486339155067


Epoch 4 of 5 | Iteration:  34%|███▎      | 408/1212 [03:17<05:54,  2.27it/s]

Gradient norm: 19.743989548046557


Epoch 4 of 5 | Iteration:  34%|███▎      | 409/1212 [03:17<06:21,  2.11it/s]

Gradient norm: 19.789474037013548


Epoch 4 of 5 | Iteration:  34%|███▍      | 410/1212 [03:18<06:00,  2.23it/s]

Gradient norm: 21.162905633543613


Epoch 4 of 5 | Iteration:  34%|███▍      | 411/1212 [03:18<05:51,  2.28it/s]

Gradient norm: 21.099998949965393


Epoch 4 of 5 | Iteration:  34%|███▍      | 412/1212 [03:19<06:35,  2.02it/s]

Gradient norm: 23.142607986221115


Epoch 4 of 5 | Iteration:  34%|███▍      | 413/1212 [03:19<06:14,  2.14it/s]

Gradient norm: 77.25443767526926


Epoch 4 of 5 | Iteration:  34%|███▍      | 414/1212 [03:20<05:48,  2.29it/s]

Gradient norm: 76.98547104633549


Epoch 4 of 5 | Iteration:  34%|███▍      | 415/1212 [03:20<05:37,  2.36it/s]

Gradient norm: 99.93008424941073


Epoch 4 of 5 | Iteration:  34%|███▍      | 416/1212 [03:20<05:37,  2.36it/s]

Gradient norm: 99.95962437102109


Epoch 4 of 5 | Iteration:  34%|███▍      | 417/1212 [03:21<05:31,  2.39it/s]

Gradient norm: 9.200925893646321


Epoch 4 of 5 | Iteration:  34%|███▍      | 418/1212 [03:21<05:43,  2.31it/s]

Gradient norm: 10.469989933821438


Epoch 4 of 5 | Iteration:  35%|███▍      | 419/1212 [03:22<05:31,  2.39it/s]

Gradient norm: 10.774547608879898


Epoch 4 of 5 | Iteration:  35%|███▍      | 420/1212 [03:22<05:21,  2.47it/s]

Gradient norm: 22.423615943842293


Epoch 4 of 5 | Iteration:  35%|███▍      | 421/1212 [03:23<05:47,  2.28it/s]

Gradient norm: 22.426028270572207


Epoch 4 of 5 | Iteration:  35%|███▍      | 422/1212 [03:23<05:39,  2.33it/s]

Gradient norm: 22.846289112676423


Epoch 4 of 5 | Iteration:  35%|███▍      | 423/1212 [03:23<05:46,  2.28it/s]

Gradient norm: 208.50193677496983


Epoch 4 of 5 | Iteration:  35%|███▍      | 424/1212 [03:24<06:13,  2.11it/s]

Gradient norm: 321.1045208859142


Epoch 4 of 5 | Iteration:  35%|███▌      | 425/1212 [03:25<06:28,  2.02it/s]

Gradient norm: 321.32410490490815


Epoch 4 of 5 | Iteration:  35%|███▌      | 426/1212 [03:25<06:40,  1.96it/s]

Gradient norm: 322.67695381216544


Epoch 4 of 5 | Iteration:  35%|███▌      | 427/1212 [03:26<06:56,  1.88it/s]

Gradient norm: 327.7033457445826


Epoch 4 of 5 | Iteration:  35%|███▌      | 428/1212 [03:26<07:02,  1.85it/s]

Gradient norm: 327.0784524774154


Epoch 4 of 5 | Iteration:  35%|███▌      | 429/1212 [03:27<07:06,  1.83it/s]

Gradient norm: 326.80683935186033


Epoch 4 of 5 | Iteration:  35%|███▌      | 430/1212 [03:27<07:34,  1.72it/s]

Gradient norm: 326.7389730808129


Epoch 4 of 5 | Iteration:  36%|███▌      | 431/1212 [03:28<07:00,  1.86it/s]

Gradient norm: 326.1070380890575


Epoch 4 of 5 | Iteration:  36%|███▌      | 432/1212 [03:28<06:40,  1.95it/s]

Gradient norm: 326.96201207629565


Epoch 4 of 5 | Iteration:  36%|███▌      | 433/1212 [03:29<06:41,  1.94it/s]

Gradient norm: 2.8823000396203566


Epoch 4 of 5 | Iteration:  36%|███▌      | 434/1212 [03:29<06:32,  1.98it/s]

Gradient norm: 9.65323658763318


Epoch 4 of 5 | Iteration:  36%|███▌      | 435/1212 [03:30<06:07,  2.11it/s]

Gradient norm: 124.12308396767807


Epoch 4 of 5 | Iteration:  36%|███▌      | 436/1212 [03:30<06:21,  2.03it/s]

Gradient norm: 124.0419334858809


Epoch 4 of 5 | Iteration:  36%|███▌      | 437/1212 [03:31<05:57,  2.17it/s]

Gradient norm: 124.20417398048677


Epoch 4 of 5 | Iteration:  36%|███▌      | 438/1212 [03:31<05:53,  2.19it/s]

Gradient norm: 124.22669152587217


Epoch 4 of 5 | Iteration:  36%|███▌      | 439/1212 [03:32<05:45,  2.24it/s]

Gradient norm: 124.48215715440853


Epoch 4 of 5 | Iteration:  36%|███▋      | 440/1212 [03:32<05:38,  2.28it/s]

Gradient norm: 124.53614803941943


Epoch 4 of 5 | Iteration:  36%|███▋      | 441/1212 [03:32<05:27,  2.36it/s]

Gradient norm: 124.42980335499378


Epoch 4 of 5 | Iteration:  36%|███▋      | 442/1212 [03:33<05:17,  2.42it/s]

Gradient norm: 554.9492654567894


Epoch 4 of 5 | Iteration:  37%|███▋      | 443/1212 [03:33<05:10,  2.47it/s]

Gradient norm: 547.5493885159763


Epoch 4 of 5 | Iteration:  37%|███▋      | 444/1212 [03:34<05:10,  2.47it/s]

Gradient norm: 547.9235413804138


Epoch 4 of 5 | Iteration:  37%|███▋      | 445/1212 [03:34<05:21,  2.38it/s]

Gradient norm: 547.5439512095924


Epoch 4 of 5 | Iteration:  37%|███▋      | 446/1212 [03:35<06:16,  2.03it/s]

Gradient norm: 547.2039438209217


Epoch 4 of 5 | Iteration:  37%|███▋      | 447/1212 [03:35<05:58,  2.14it/s]

Gradient norm: 548.8605578129224


Epoch 4 of 5 | Iteration:  37%|███▋      | 448/1212 [03:35<05:46,  2.20it/s]

Gradient norm: 542.2572655088686


Epoch 4 of 5 | Iteration:  37%|███▋      | 449/1212 [03:36<05:40,  2.24it/s]

Gradient norm: 2.194750792446849


Epoch 4 of 5 | Iteration:  37%|███▋      | 450/1212 [03:36<05:43,  2.22it/s]

Gradient norm: 50.940175768381


Epoch 4 of 5 | Iteration:  37%|███▋      | 451/1212 [03:37<05:42,  2.22it/s]

Gradient norm: 51.15306017380735


Epoch 4 of 5 | Iteration:  37%|███▋      | 452/1212 [03:37<05:35,  2.26it/s]

Gradient norm: 51.7252047125936


Epoch 4 of 5 | Iteration:  37%|███▋      | 453/1212 [03:38<05:49,  2.17it/s]

Gradient norm: 67.57576621319981


Epoch 4 of 5 | Iteration:  37%|███▋      | 454/1212 [03:38<06:03,  2.08it/s]

Gradient norm: 67.94631746546378


Epoch 4 of 5 | Iteration:  38%|███▊      | 455/1212 [03:39<07:00,  1.80it/s]

Gradient norm: 68.74335568243936


Epoch 4 of 5 | Iteration:  38%|███▊      | 456/1212 [03:40<07:10,  1.76it/s]

Gradient norm: 71.1169232052032


Epoch 4 of 5 | Iteration:  38%|███▊      | 457/1212 [03:40<07:30,  1.68it/s]

Gradient norm: 72.24244652762924


Epoch 4 of 5 | Iteration:  38%|███▊      | 458/1212 [03:41<07:17,  1.72it/s]

Gradient norm: 78.16147293069126


Epoch 4 of 5 | Iteration:  38%|███▊      | 459/1212 [03:41<07:01,  1.79it/s]

Gradient norm: 79.0320817312602


Epoch 4 of 5 | Iteration:  38%|███▊      | 460/1212 [03:42<06:37,  1.89it/s]

Gradient norm: 79.05243409631862


Epoch 4 of 5 | Iteration:  38%|███▊      | 461/1212 [03:42<06:07,  2.04it/s]

Gradient norm: 79.42640968606888


Epoch 4 of 5 | Iteration:  38%|███▊      | 462/1212 [03:43<05:47,  2.16it/s]

Gradient norm: 81.65935727258642


Epoch 4 of 5 | Iteration:  38%|███▊      | 463/1212 [03:43<06:55,  1.80it/s]

Gradient norm: 82.39320190328053


Epoch 4 of 5 | Iteration:  38%|███▊      | 464/1212 [03:44<07:32,  1.65it/s]

Gradient norm: 88.07831969079939


Epoch 4 of 5 | Iteration:  38%|███▊      | 465/1212 [03:45<06:58,  1.78it/s]

Gradient norm: 14.456236009308327


Epoch 4 of 5 | Iteration:  38%|███▊      | 466/1212 [03:45<06:19,  1.97it/s]

Gradient norm: 15.887509456242164


Epoch 4 of 5 | Iteration:  39%|███▊      | 467/1212 [03:45<06:19,  1.96it/s]

Gradient norm: 16.01302286351253


Epoch 4 of 5 | Iteration:  39%|███▊      | 468/1212 [03:46<05:47,  2.14it/s]

Gradient norm: 16.13792386278166


Epoch 4 of 5 | Iteration:  39%|███▊      | 469/1212 [03:46<05:29,  2.25it/s]

Gradient norm: 16.020903573978586


Epoch 4 of 5 | Iteration:  39%|███▉      | 470/1212 [03:47<05:32,  2.23it/s]

Gradient norm: 22.25307510946041


Epoch 4 of 5 | Iteration:  39%|███▉      | 471/1212 [03:47<05:14,  2.35it/s]

Gradient norm: 22.739541318423925


Epoch 4 of 5 | Iteration:  39%|███▉      | 472/1212 [03:48<05:42,  2.16it/s]

Gradient norm: 34.93554958478604


Epoch 4 of 5 | Iteration:  39%|███▉      | 473/1212 [03:48<05:29,  2.24it/s]

Gradient norm: 38.41615049105558


Epoch 4 of 5 | Iteration:  39%|███▉      | 474/1212 [03:48<05:14,  2.35it/s]

Gradient norm: 94.53691445939022


Epoch 4 of 5 | Iteration:  39%|███▉      | 475/1212 [03:49<05:15,  2.34it/s]

Gradient norm: 94.73461897397095


Epoch 4 of 5 | Iteration:  39%|███▉      | 476/1212 [03:49<05:06,  2.40it/s]

Gradient norm: 94.70040186941429


Epoch 4 of 5 | Iteration:  39%|███▉      | 477/1212 [03:50<05:59,  2.05it/s]

Gradient norm: 94.69294590659575


Epoch 4 of 5 | Iteration:  39%|███▉      | 478/1212 [03:50<06:08,  1.99it/s]

Gradient norm: 93.82041283058224


Epoch 4 of 5 | Iteration:  40%|███▉      | 479/1212 [03:51<05:45,  2.12it/s]

Gradient norm: 94.04991459424323


Epoch 4 of 5 | Iteration:  40%|███▉      | 480/1212 [03:51<06:06,  2.00it/s]

Gradient norm: 94.31726061678245


Epoch 4 of 5 | Iteration:  40%|███▉      | 481/1212 [03:52<06:47,  1.79it/s]

Gradient norm: 193.2689547880117


Epoch 4 of 5 | Iteration:  40%|███▉      | 482/1212 [03:53<06:44,  1.81it/s]

Gradient norm: 193.50375111777586


Epoch 4 of 5 | Iteration:  40%|███▉      | 483/1212 [03:53<06:36,  1.84it/s]

Gradient norm: 193.65457413784955


Epoch 4 of 5 | Iteration:  40%|███▉      | 484/1212 [03:54<06:46,  1.79it/s]

Gradient norm: 213.04424848753456


Epoch 4 of 5 | Iteration:  40%|████      | 485/1212 [03:54<06:38,  1.82it/s]

Gradient norm: 212.79460875477108


Epoch 4 of 5 | Iteration:  40%|████      | 486/1212 [03:55<07:24,  1.63it/s]

Gradient norm: 212.1689041348836


Epoch 4 of 5 | Iteration:  40%|████      | 487/1212 [03:55<06:32,  1.85it/s]

Gradient norm: 213.70883184105702


Epoch 4 of 5 | Iteration:  40%|████      | 488/1212 [03:56<05:57,  2.03it/s]

Gradient norm: 213.77961042565764


Epoch 4 of 5 | Iteration:  40%|████      | 489/1212 [03:56<05:51,  2.06it/s]

Gradient norm: 220.40046748900164


Epoch 4 of 5 | Iteration:  40%|████      | 490/1212 [03:57<05:29,  2.19it/s]

Gradient norm: 220.5223005859616


Epoch 4 of 5 | Iteration:  41%|████      | 491/1212 [03:57<05:10,  2.32it/s]

Gradient norm: 224.81892462122352


Epoch 4 of 5 | Iteration:  41%|████      | 492/1212 [03:57<05:16,  2.28it/s]

Gradient norm: 224.47557229226263


Epoch 4 of 5 | Iteration:  41%|████      | 493/1212 [03:58<05:40,  2.11it/s]

Gradient norm: 227.1511555005296


Epoch 4 of 5 | Iteration:  41%|████      | 494/1212 [03:58<05:18,  2.25it/s]

Gradient norm: 227.2913939428488


Epoch 4 of 5 | Iteration:  41%|████      | 495/1212 [03:59<05:42,  2.09it/s]

Gradient norm: 226.73705811587757


Epoch 4 of 5 | Iteration:  41%|████      | 496/1212 [03:59<05:39,  2.11it/s]

Gradient norm: 226.85380598456


Epoch 4 of 5 | Iteration:  41%|████      | 497/1212 [04:00<05:19,  2.24it/s]

Gradient norm: 4.3365630492042495


Epoch 4 of 5 | Iteration:  41%|████      | 498/1212 [04:00<05:12,  2.28it/s]

Gradient norm: 19.173695084531854


Epoch 4 of 5 | Iteration:  41%|████      | 499/1212 [04:01<05:00,  2.37it/s]

Gradient norm: 19.84803223363652


Epoch 4 of 5 | Iteration:  41%|████▏     | 500/1212 [04:01<05:11,  2.29it/s]

Gradient norm: 20.4250186936741


Epoch 4 of 5 | Iteration:  41%|████▏     | 501/1212 [04:01<04:54,  2.42it/s]

Gradient norm: 22.538787672469283


Epoch 4 of 5 | Iteration:  41%|████▏     | 502/1212 [04:02<04:49,  2.45it/s]

Gradient norm: 26.92215699095135


Epoch 4 of 5 | Iteration:  42%|████▏     | 503/1212 [04:02<04:43,  2.50it/s]

Gradient norm: 32.03921673523686


Epoch 4 of 5 | Iteration:  42%|████▏     | 504/1212 [04:03<05:36,  2.10it/s]

Gradient norm: 31.847180907708303


Epoch 4 of 5 | Iteration:  42%|████▏     | 505/1212 [04:03<05:18,  2.22it/s]

Gradient norm: 31.278988662404934


Epoch 4 of 5 | Iteration:  42%|████▏     | 506/1212 [04:04<05:03,  2.33it/s]

Gradient norm: 31.179739741320134


Epoch 4 of 5 | Iteration:  42%|████▏     | 507/1212 [04:04<05:00,  2.35it/s]

Gradient norm: 36.873074040142356


Epoch 4 of 5 | Iteration:  42%|████▏     | 508/1212 [04:04<04:53,  2.40it/s]

Gradient norm: 242.59304526593363


Epoch 4 of 5 | Iteration:  42%|████▏     | 509/1212 [04:05<05:03,  2.31it/s]

Gradient norm: 242.43758243199449


Epoch 4 of 5 | Iteration:  42%|████▏     | 510/1212 [04:05<05:18,  2.21it/s]

Gradient norm: 242.9141305537051


Epoch 4 of 5 | Iteration:  42%|████▏     | 511/1212 [04:06<05:23,  2.17it/s]

Gradient norm: 242.95734727167394


Epoch 4 of 5 | Iteration:  42%|████▏     | 512/1212 [04:06<05:44,  2.03it/s]

Gradient norm: 242.85221506355361


Epoch 4 of 5 | Iteration:  42%|████▏     | 513/1212 [04:07<05:52,  1.98it/s]

Gradient norm: 12.808512265408567


Epoch 4 of 5 | Iteration:  42%|████▏     | 514/1212 [04:07<06:00,  1.93it/s]

Gradient norm: 13.52542086212994


Epoch 4 of 5 | Iteration:  42%|████▏     | 515/1212 [04:08<06:13,  1.87it/s]

Gradient norm: 14.32595962670986


Epoch 4 of 5 | Iteration:  43%|████▎     | 516/1212 [04:08<05:49,  1.99it/s]

Gradient norm: 29.977401396616838


Epoch 4 of 5 | Iteration:  43%|████▎     | 517/1212 [04:09<06:01,  1.92it/s]

Gradient norm: 50.52668379623059


Epoch 4 of 5 | Iteration:  43%|████▎     | 518/1212 [04:09<05:31,  2.09it/s]

Gradient norm: 51.086842035326775


Epoch 4 of 5 | Iteration:  43%|████▎     | 519/1212 [04:10<05:16,  2.19it/s]

Gradient norm: 51.95216448230761


Epoch 4 of 5 | Iteration:  43%|████▎     | 520/1212 [04:10<05:04,  2.27it/s]

Gradient norm: 52.02691403999506


Epoch 4 of 5 | Iteration:  43%|████▎     | 521/1212 [04:11<05:08,  2.24it/s]

Gradient norm: 47.53470299970092


Epoch 4 of 5 | Iteration:  43%|████▎     | 522/1212 [04:11<05:01,  2.29it/s]

Gradient norm: 47.96949526678108


Epoch 4 of 5 | Iteration:  43%|████▎     | 523/1212 [04:11<04:48,  2.39it/s]

Gradient norm: 48.22186741629597


Epoch 4 of 5 | Iteration:  43%|████▎     | 524/1212 [04:12<04:40,  2.45it/s]

Gradient norm: 48.61492767196346


Epoch 4 of 5 | Iteration:  43%|████▎     | 525/1212 [04:12<04:51,  2.36it/s]

Gradient norm: 48.482872543665344


Epoch 4 of 5 | Iteration:  43%|████▎     | 526/1212 [04:13<05:14,  2.18it/s]

Gradient norm: 47.79027796153504


Epoch 4 of 5 | Iteration:  43%|████▎     | 527/1212 [04:13<05:02,  2.26it/s]

Gradient norm: 44.05609700847543


Epoch 4 of 5 | Iteration:  44%|████▎     | 528/1212 [04:14<04:54,  2.32it/s]

Gradient norm: 84.86002101345429


Epoch 4 of 5 | Iteration:  44%|████▎     | 529/1212 [04:14<04:45,  2.39it/s]

Gradient norm: 38.83364264369022


Epoch 4 of 5 | Iteration:  44%|████▎     | 530/1212 [04:14<04:39,  2.44it/s]

Gradient norm: 38.77492210874537


Epoch 4 of 5 | Iteration:  44%|████▍     | 531/1212 [04:15<04:33,  2.49it/s]

Gradient norm: 38.970622420731814


Epoch 4 of 5 | Iteration:  44%|████▍     | 532/1212 [04:15<04:33,  2.49it/s]

Gradient norm: 38.94056629156495


Epoch 4 of 5 | Iteration:  44%|████▍     | 533/1212 [04:16<04:47,  2.37it/s]

Gradient norm: 39.49278229745628


Epoch 4 of 5 | Iteration:  44%|████▍     | 534/1212 [04:16<04:47,  2.36it/s]

Gradient norm: 39.969591504217775


Epoch 4 of 5 | Iteration:  44%|████▍     | 535/1212 [04:17<04:38,  2.43it/s]

Gradient norm: 40.79903036778347


Epoch 4 of 5 | Iteration:  44%|████▍     | 536/1212 [04:17<04:29,  2.51it/s]

Gradient norm: 76.51479955322593


Epoch 4 of 5 | Iteration:  44%|████▍     | 537/1212 [04:17<04:33,  2.47it/s]

Gradient norm: 78.57560764853699


Epoch 4 of 5 | Iteration:  44%|████▍     | 538/1212 [04:18<04:34,  2.46it/s]

Gradient norm: 78.77504783280926


Epoch 4 of 5 | Iteration:  44%|████▍     | 539/1212 [04:18<04:26,  2.52it/s]

Gradient norm: 82.27824418797516


Epoch 4 of 5 | Iteration:  45%|████▍     | 540/1212 [04:19<04:56,  2.26it/s]

Gradient norm: 81.76470426111135


Epoch 4 of 5 | Iteration:  45%|████▍     | 541/1212 [04:19<05:36,  2.00it/s]

Gradient norm: 79.46518848926107


Epoch 4 of 5 | Iteration:  45%|████▍     | 542/1212 [04:20<06:21,  1.75it/s]

Gradient norm: 73.11212571330462


Epoch 4 of 5 | Iteration:  45%|████▍     | 543/1212 [04:21<06:29,  1.72it/s]

Gradient norm: 73.04686463654417


Epoch 4 of 5 | Iteration:  45%|████▍     | 544/1212 [04:21<06:33,  1.70it/s]

Gradient norm: 72.7863675444022


Epoch 4 of 5 | Iteration:  45%|████▍     | 545/1212 [04:22<06:32,  1.70it/s]

Gradient norm: 22.26507257648475


Epoch 4 of 5 | Iteration:  45%|████▌     | 546/1212 [04:22<05:46,  1.92it/s]

Gradient norm: 23.023486834116056


Epoch 4 of 5 | Iteration:  45%|████▌     | 547/1212 [04:23<05:27,  2.03it/s]

Gradient norm: 23.182301882563134


Epoch 4 of 5 | Iteration:  45%|████▌     | 548/1212 [04:23<05:03,  2.18it/s]

Gradient norm: 23.567799236507696


Epoch 4 of 5 | Iteration:  45%|████▌     | 549/1212 [04:23<04:45,  2.32it/s]

Gradient norm: 27.182528800016794


Epoch 4 of 5 | Iteration:  45%|████▌     | 550/1212 [04:24<05:13,  2.11it/s]

Gradient norm: 33.84746065093931


Epoch 4 of 5 | Iteration:  45%|████▌     | 551/1212 [04:24<05:02,  2.19it/s]

Gradient norm: 33.69199671127327


Epoch 4 of 5 | Iteration:  46%|████▌     | 552/1212 [04:25<04:49,  2.28it/s]

Gradient norm: 35.18223602862194


Epoch 4 of 5 | Iteration:  46%|████▌     | 553/1212 [04:26<05:58,  1.84it/s]

Gradient norm: 36.95254664660797


Epoch 4 of 5 | Iteration:  46%|████▌     | 554/1212 [04:26<05:23,  2.04it/s]

Gradient norm: 36.93659797948405


Epoch 4 of 5 | Iteration:  46%|████▌     | 555/1212 [04:26<04:59,  2.19it/s]

Gradient norm: 37.647978105458066


Epoch 4 of 5 | Iteration:  46%|████▌     | 556/1212 [04:27<04:48,  2.27it/s]

Gradient norm: 38.118362056377585


Epoch 4 of 5 | Iteration:  46%|████▌     | 557/1212 [04:27<04:43,  2.31it/s]

Gradient norm: 39.64960473899176


Epoch 4 of 5 | Iteration:  46%|████▌     | 558/1212 [04:27<04:31,  2.41it/s]

Gradient norm: 39.369249934828


Epoch 4 of 5 | Iteration:  46%|████▌     | 559/1212 [04:28<04:38,  2.35it/s]

Gradient norm: 155.46324991970874


Epoch 4 of 5 | Iteration:  46%|████▌     | 560/1212 [04:28<04:35,  2.37it/s]

Gradient norm: 160.26302834214223


Epoch 4 of 5 | Iteration:  46%|████▋     | 561/1212 [04:29<04:39,  2.33it/s]

Gradient norm: 3.6264167884497707


Epoch 4 of 5 | Iteration:  46%|████▋     | 562/1212 [04:29<04:31,  2.39it/s]

Gradient norm: 3.944438754223507


Epoch 4 of 5 | Iteration:  46%|████▋     | 563/1212 [04:30<04:52,  2.22it/s]

Gradient norm: 5.076061449416153


Epoch 4 of 5 | Iteration:  47%|████▋     | 564/1212 [04:30<04:39,  2.32it/s]

Gradient norm: 5.303061159419033


Epoch 4 of 5 | Iteration:  47%|████▋     | 565/1212 [04:30<04:26,  2.43it/s]

Gradient norm: 7.532569620242279


Epoch 4 of 5 | Iteration:  47%|████▋     | 566/1212 [04:31<04:24,  2.44it/s]

Gradient norm: 9.792038696217137


Epoch 4 of 5 | Iteration:  47%|████▋     | 567/1212 [04:31<05:09,  2.08it/s]

Gradient norm: 9.857250444951239


Epoch 4 of 5 | Iteration:  47%|████▋     | 568/1212 [04:32<05:09,  2.08it/s]

Gradient norm: 26.421933938897


Epoch 4 of 5 | Iteration:  47%|████▋     | 569/1212 [04:33<05:42,  1.88it/s]

Gradient norm: 32.82414878469148


Epoch 4 of 5 | Iteration:  47%|████▋     | 570/1212 [04:33<05:46,  1.86it/s]

Gradient norm: 44.34721308594299


Epoch 4 of 5 | Iteration:  47%|████▋     | 571/1212 [04:34<05:49,  1.84it/s]

Gradient norm: 44.464786170680824


Epoch 4 of 5 | Iteration:  47%|████▋     | 572/1212 [04:34<06:04,  1.76it/s]

Gradient norm: 45.31473173830085


Epoch 4 of 5 | Iteration:  47%|████▋     | 573/1212 [04:35<06:03,  1.76it/s]

Gradient norm: 45.12936286138227


Epoch 4 of 5 | Iteration:  47%|████▋     | 574/1212 [04:36<06:33,  1.62it/s]

Gradient norm: 45.44840012989092


Epoch 4 of 5 | Iteration:  47%|████▋     | 575/1212 [04:36<06:24,  1.66it/s]

Gradient norm: 56.56649065322376


Epoch 4 of 5 | Iteration:  48%|████▊     | 576/1212 [04:37<05:42,  1.86it/s]

Gradient norm: 55.76256910596223


Epoch 4 of 5 | Iteration:  48%|████▊     | 577/1212 [04:37<05:14,  2.02it/s]

Gradient norm: 1.4520945786588166


Epoch 4 of 5 | Iteration:  48%|████▊     | 578/1212 [04:37<04:52,  2.16it/s]

Gradient norm: 7.888905722493453


Epoch 4 of 5 | Iteration:  48%|████▊     | 579/1212 [04:38<04:41,  2.25it/s]

Gradient norm: 8.245016055569254


Epoch 4 of 5 | Iteration:  48%|████▊     | 580/1212 [04:38<04:45,  2.21it/s]

Gradient norm: 44.50170417826378


Epoch 4 of 5 | Iteration:  48%|████▊     | 581/1212 [04:39<04:33,  2.30it/s]

Gradient norm: 44.62314536991512


Epoch 4 of 5 | Iteration:  48%|████▊     | 582/1212 [04:39<04:25,  2.37it/s]

Gradient norm: 44.67357281131158


Epoch 4 of 5 | Iteration:  48%|████▊     | 583/1212 [04:39<04:23,  2.38it/s]

Gradient norm: 53.00622905095636


Epoch 4 of 5 | Iteration:  48%|████▊     | 584/1212 [04:40<04:19,  2.42it/s]

Gradient norm: 53.34494963700841


Epoch 4 of 5 | Iteration:  48%|████▊     | 585/1212 [04:40<04:30,  2.32it/s]

Gradient norm: 53.34433160061509


Epoch 4 of 5 | Iteration:  48%|████▊     | 586/1212 [04:41<04:24,  2.36it/s]

Gradient norm: 53.519553372098976


Epoch 4 of 5 | Iteration:  48%|████▊     | 587/1212 [04:41<04:21,  2.39it/s]

Gradient norm: 53.78542561123495


Epoch 4 of 5 | Iteration:  49%|████▊     | 588/1212 [04:42<04:16,  2.44it/s]

Gradient norm: 53.83668698069232


Epoch 4 of 5 | Iteration:  49%|████▊     | 589/1212 [04:42<04:39,  2.23it/s]

Gradient norm: 55.3244310679665


Epoch 4 of 5 | Iteration:  49%|████▊     | 590/1212 [04:42<04:27,  2.32it/s]

Gradient norm: 55.42909075948802


Epoch 4 of 5 | Iteration:  49%|████▉     | 591/1212 [04:43<04:18,  2.40it/s]

Gradient norm: 55.30968737620866


Epoch 4 of 5 | Iteration:  49%|████▉     | 592/1212 [04:43<04:39,  2.22it/s]

Gradient norm: 54.94482744759117


Epoch 4 of 5 | Iteration:  49%|████▉     | 593/1212 [04:44<04:45,  2.17it/s]

Gradient norm: 139.40559508792694


Epoch 4 of 5 | Iteration:  49%|████▉     | 594/1212 [04:44<04:33,  2.26it/s]

Gradient norm: 139.50803116450598


Epoch 4 of 5 | Iteration:  49%|████▉     | 595/1212 [04:45<04:57,  2.07it/s]

Gradient norm: 139.78147983062289


Epoch 4 of 5 | Iteration:  49%|████▉     | 596/1212 [04:45<05:16,  1.95it/s]

Gradient norm: 139.8149558141861


Epoch 4 of 5 | Iteration:  49%|████▉     | 597/1212 [04:46<05:13,  1.96it/s]

Gradient norm: 140.319840710152


Epoch 4 of 5 | Iteration:  49%|████▉     | 598/1212 [04:47<05:30,  1.86it/s]

Gradient norm: 150.33020223624436


Epoch 4 of 5 | Iteration:  49%|████▉     | 599/1212 [04:47<05:26,  1.88it/s]

Gradient norm: 150.00437075873168


Epoch 4 of 5 | Iteration:  50%|████▉     | 600/1212 [04:48<05:22,  1.90it/s]

Gradient norm: 149.60104672134966


Epoch 4 of 5 | Iteration:  50%|████▉     | 601/1212 [04:48<05:29,  1.86it/s]

Gradient norm: 149.51066184923647


Epoch 4 of 5 | Iteration:  50%|████▉     | 602/1212 [04:49<05:31,  1.84it/s]

Gradient norm: 149.53176324604786


Epoch 4 of 5 | Iteration:  50%|████▉     | 603/1212 [04:49<05:38,  1.80it/s]

Gradient norm: 149.35608350824288


Epoch 4 of 5 | Iteration:  50%|████▉     | 604/1212 [04:50<05:22,  1.89it/s]

Gradient norm: 150.53178433233091


Epoch 4 of 5 | Iteration:  50%|████▉     | 605/1212 [04:50<05:05,  1.99it/s]

Gradient norm: 150.5573138580744


Epoch 4 of 5 | Iteration:  50%|█████     | 606/1212 [04:51<04:48,  2.10it/s]

Gradient norm: 150.70928876024246


Epoch 4 of 5 | Iteration:  50%|█████     | 607/1212 [04:51<04:36,  2.19it/s]

Gradient norm: 150.63342032730742


Epoch 4 of 5 | Iteration:  50%|█████     | 608/1212 [04:52<05:10,  1.94it/s]

Gradient norm: 150.60919666747083


Epoch 4 of 5 | Iteration:  50%|█████     | 609/1212 [04:52<04:48,  2.09it/s]

Gradient norm: 352.15506472900506


Epoch 4 of 5 | Iteration:  50%|█████     | 610/1212 [04:52<04:30,  2.22it/s]

Gradient norm: 352.29516977511906


Epoch 4 of 5 | Iteration:  50%|█████     | 611/1212 [04:53<04:37,  2.17it/s]

Gradient norm: 351.66768951793495


Epoch 4 of 5 | Iteration:  50%|█████     | 612/1212 [04:53<04:21,  2.30it/s]

Gradient norm: 352.0322862296797


Epoch 4 of 5 | Iteration:  51%|█████     | 613/1212 [04:54<04:13,  2.36it/s]

Gradient norm: 352.12800918197394


Epoch 4 of 5 | Iteration:  51%|█████     | 614/1212 [04:54<04:12,  2.37it/s]

Gradient norm: 352.37767582811256


Epoch 4 of 5 | Iteration:  51%|█████     | 615/1212 [04:55<04:11,  2.38it/s]

Gradient norm: 352.3678746625356


Epoch 4 of 5 | Iteration:  51%|█████     | 616/1212 [04:55<04:23,  2.26it/s]

Gradient norm: 347.54188729393474


Epoch 4 of 5 | Iteration:  51%|█████     | 617/1212 [04:55<04:10,  2.38it/s]

Gradient norm: 347.78161025528397


Epoch 4 of 5 | Iteration:  51%|█████     | 618/1212 [04:56<04:33,  2.17it/s]

Gradient norm: 348.0262804854994


Epoch 4 of 5 | Iteration:  51%|█████     | 619/1212 [04:56<04:34,  2.16it/s]

Gradient norm: 346.87703986499025


Epoch 4 of 5 | Iteration:  51%|█████     | 620/1212 [04:57<04:23,  2.24it/s]

Gradient norm: 346.0908889885411


Epoch 4 of 5 | Iteration:  51%|█████     | 621/1212 [04:57<04:38,  2.13it/s]

Gradient norm: 346.6001968970613


Epoch 4 of 5 | Iteration:  51%|█████▏    | 622/1212 [04:58<04:32,  2.16it/s]

Gradient norm: 346.498173866621


Epoch 4 of 5 | Iteration:  51%|█████▏    | 623/1212 [04:58<04:28,  2.19it/s]

Gradient norm: 346.84969256471214


Epoch 4 of 5 | Iteration:  51%|█████▏    | 624/1212 [04:59<04:17,  2.28it/s]

Gradient norm: 346.80939603207094


Epoch 4 of 5 | Iteration:  52%|█████▏    | 625/1212 [04:59<04:10,  2.34it/s]

Gradient norm: 3.347768461301683


Epoch 4 of 5 | Iteration:  52%|█████▏    | 626/1212 [05:00<04:43,  2.07it/s]

Gradient norm: 21.607704931508966


Epoch 4 of 5 | Iteration:  52%|█████▏    | 627/1212 [05:00<05:01,  1.94it/s]

Gradient norm: 21.302572224677427


Epoch 4 of 5 | Iteration:  52%|█████▏    | 628/1212 [05:01<05:36,  1.73it/s]

Gradient norm: 22.974646225920303


Epoch 4 of 5 | Iteration:  52%|█████▏    | 629/1212 [05:01<05:14,  1.85it/s]

Gradient norm: 23.31743180806549


Epoch 4 of 5 | Iteration:  52%|█████▏    | 630/1212 [05:02<05:28,  1.77it/s]

Gradient norm: 44.22256058137223


Epoch 4 of 5 | Iteration:  52%|█████▏    | 631/1212 [05:03<05:35,  1.73it/s]

Gradient norm: 44.423210224384256


Epoch 4 of 5 | Iteration:  52%|█████▏    | 632/1212 [05:03<05:14,  1.85it/s]

Gradient norm: 44.67268066220282


Epoch 4 of 5 | Iteration:  52%|█████▏    | 633/1212 [05:04<05:13,  1.85it/s]

Gradient norm: 44.53212368566694


Epoch 4 of 5 | Iteration:  52%|█████▏    | 634/1212 [05:04<05:13,  1.84it/s]

Gradient norm: 46.15056884937961


Epoch 4 of 5 | Iteration:  52%|█████▏    | 635/1212 [05:05<05:18,  1.81it/s]

Gradient norm: 48.053488570235515


Epoch 4 of 5 | Iteration:  52%|█████▏    | 636/1212 [05:05<04:51,  1.98it/s]

Gradient norm: 886.8088987817201


Epoch 4 of 5 | Iteration:  53%|█████▎    | 637/1212 [05:06<04:28,  2.14it/s]

Gradient norm: 886.6956413857516


Epoch 4 of 5 | Iteration:  53%|█████▎    | 638/1212 [05:06<04:34,  2.09it/s]

Gradient norm: 887.2002322627857


Epoch 4 of 5 | Iteration:  53%|█████▎    | 639/1212 [05:06<04:19,  2.21it/s]

Gradient norm: 886.7194392084214


Epoch 4 of 5 | Iteration:  53%|█████▎    | 640/1212 [05:07<04:08,  2.30it/s]

Gradient norm: 886.7001170983639


Epoch 4 of 5 | Iteration:  53%|█████▎    | 641/1212 [05:07<04:05,  2.33it/s]

Gradient norm: 2.69493312285096


Epoch 4 of 5 | Iteration:  53%|█████▎    | 642/1212 [05:08<04:04,  2.33it/s]

Gradient norm: 3.721455999491517


Epoch 4 of 5 | Iteration:  53%|█████▎    | 643/1212 [05:08<03:57,  2.40it/s]

Gradient norm: 12.875555583185184


Epoch 4 of 5 | Iteration:  53%|█████▎    | 644/1212 [05:08<03:54,  2.42it/s]

Gradient norm: 12.819857681823597


Epoch 4 of 5 | Iteration:  53%|█████▎    | 645/1212 [05:09<04:37,  2.05it/s]

Gradient norm: 815.5583415410367


Epoch 4 of 5 | Iteration:  53%|█████▎    | 646/1212 [05:10<04:28,  2.11it/s]

Gradient norm: 816.3072759389645


Epoch 4 of 5 | Iteration:  53%|█████▎    | 647/1212 [05:10<04:11,  2.25it/s]

Gradient norm: 815.3787407306584


Epoch 4 of 5 | Iteration:  53%|█████▎    | 648/1212 [05:11<05:05,  1.85it/s]

Gradient norm: 815.0201042404094


Epoch 4 of 5 | Iteration:  54%|█████▎    | 649/1212 [05:11<04:58,  1.88it/s]

Gradient norm: 814.2164880535674


Epoch 4 of 5 | Iteration:  54%|█████▎    | 650/1212 [05:12<04:41,  2.00it/s]

Gradient norm: 814.1692838629697


Epoch 4 of 5 | Iteration:  54%|█████▎    | 651/1212 [05:12<04:19,  2.16it/s]

Gradient norm: 814.2492916752966


Epoch 4 of 5 | Iteration:  54%|█████▍    | 652/1212 [05:12<04:08,  2.25it/s]

Gradient norm: 818.6835952993098


Epoch 4 of 5 | Iteration:  54%|█████▍    | 653/1212 [05:13<04:08,  2.25it/s]

Gradient norm: 820.7242562239225


Epoch 4 of 5 | Iteration:  54%|█████▍    | 654/1212 [05:14<04:42,  1.98it/s]

Gradient norm: 823.6289780688544


Epoch 4 of 5 | Iteration:  54%|█████▍    | 655/1212 [05:14<04:54,  1.89it/s]

Gradient norm: 823.7156039960099


Epoch 4 of 5 | Iteration:  54%|█████▍    | 656/1212 [05:15<05:07,  1.81it/s]

Gradient norm: 821.5100978143303


Epoch 4 of 5 | Iteration:  54%|█████▍    | 657/1212 [05:15<05:31,  1.68it/s]

Gradient norm: 124.05465588364478


Epoch 4 of 5 | Iteration:  54%|█████▍    | 658/1212 [05:16<05:27,  1.69it/s]

Gradient norm: 124.16640627361663


Epoch 4 of 5 | Iteration:  54%|█████▍    | 659/1212 [05:16<05:13,  1.76it/s]

Gradient norm: 123.09799014969528


Epoch 4 of 5 | Iteration:  54%|█████▍    | 660/1212 [05:17<04:42,  1.95it/s]

Gradient norm: 123.34878855903585


Epoch 4 of 5 | Iteration:  55%|█████▍    | 661/1212 [05:17<04:21,  2.11it/s]

Gradient norm: 123.31731048323752


Epoch 4 of 5 | Iteration:  55%|█████▍    | 662/1212 [05:18<04:07,  2.22it/s]

Gradient norm: 123.56731717170203


Epoch 4 of 5 | Iteration:  55%|█████▍    | 663/1212 [05:18<04:28,  2.05it/s]

Gradient norm: 123.89152588388556


Epoch 4 of 5 | Iteration:  55%|█████▍    | 664/1212 [05:19<04:14,  2.16it/s]

Gradient norm: 123.825393871715


Epoch 4 of 5 | Iteration:  55%|█████▍    | 665/1212 [05:19<04:00,  2.28it/s]

Gradient norm: 123.79166478331021


Epoch 4 of 5 | Iteration:  55%|█████▍    | 666/1212 [05:19<03:51,  2.36it/s]

Gradient norm: 123.9276431189243


Epoch 4 of 5 | Iteration:  55%|█████▌    | 667/1212 [05:20<03:51,  2.36it/s]

Gradient norm: 126.8836487305214


Epoch 4 of 5 | Iteration:  55%|█████▌    | 668/1212 [05:20<04:13,  2.14it/s]

Gradient norm: 126.60771135427449


Epoch 4 of 5 | Iteration:  55%|█████▌    | 669/1212 [05:21<04:02,  2.24it/s]

Gradient norm: 132.35948042454814


Epoch 4 of 5 | Iteration:  55%|█████▌    | 670/1212 [05:21<04:05,  2.21it/s]

Gradient norm: 132.8523659255773


Epoch 4 of 5 | Iteration:  55%|█████▌    | 671/1212 [05:22<04:23,  2.06it/s]

Gradient norm: 132.83177965053068


Epoch 4 of 5 | Iteration:  55%|█████▌    | 672/1212 [05:22<04:12,  2.14it/s]

Gradient norm: 133.18459491636133


Epoch 4 of 5 | Iteration:  56%|█████▌    | 673/1212 [05:23<03:55,  2.29it/s]

Gradient norm: 6.7297323605145625


Epoch 4 of 5 | Iteration:  56%|█████▌    | 674/1212 [05:23<03:50,  2.33it/s]

Gradient norm: 10.510767446862044


Epoch 4 of 5 | Iteration:  56%|█████▌    | 675/1212 [05:24<03:59,  2.25it/s]

Gradient norm: 16.966230226585335


Epoch 4 of 5 | Iteration:  56%|█████▌    | 676/1212 [05:24<04:04,  2.19it/s]

Gradient norm: 18.821380580250246


Epoch 4 of 5 | Iteration:  56%|█████▌    | 677/1212 [05:25<04:22,  2.03it/s]

Gradient norm: 22.547005533197424


Epoch 4 of 5 | Iteration:  56%|█████▌    | 678/1212 [05:25<04:56,  1.80it/s]

Gradient norm: 22.87036664250457


Epoch 4 of 5 | Iteration:  56%|█████▌    | 679/1212 [05:26<04:26,  2.00it/s]

Gradient norm: 23.414842644870937


Epoch 4 of 5 | Iteration:  56%|█████▌    | 680/1212 [05:26<04:13,  2.10it/s]

Gradient norm: 23.329562241633173


Epoch 4 of 5 | Iteration:  56%|█████▌    | 681/1212 [05:27<04:12,  2.11it/s]

Gradient norm: 46.828448883362775


Epoch 4 of 5 | Iteration:  56%|█████▋    | 682/1212 [05:27<04:33,  1.93it/s]

Gradient norm: 46.31059975715355


Epoch 4 of 5 | Iteration:  56%|█████▋    | 683/1212 [05:28<04:23,  2.01it/s]

Gradient norm: 46.50241083639362


Epoch 4 of 5 | Iteration:  56%|█████▋    | 684/1212 [05:28<04:32,  1.93it/s]

Gradient norm: 47.57802366749121


Epoch 4 of 5 | Iteration:  57%|█████▋    | 685/1212 [05:29<04:43,  1.86it/s]

Gradient norm: 57.50540803188176


Epoch 4 of 5 | Iteration:  57%|█████▋    | 686/1212 [05:29<04:42,  1.86it/s]

Gradient norm: 58.07958375538136


Epoch 4 of 5 | Iteration:  57%|█████▋    | 687/1212 [05:30<04:47,  1.83it/s]

Gradient norm: 85.58261671604525


Epoch 4 of 5 | Iteration:  57%|█████▋    | 688/1212 [05:30<04:51,  1.80it/s]

Gradient norm: 86.14878107287551


Epoch 4 of 5 | Iteration:  57%|█████▋    | 689/1212 [05:31<04:24,  1.98it/s]

Gradient norm: 8.12507152784034


Epoch 4 of 5 | Iteration:  57%|█████▋    | 690/1212 [05:31<04:09,  2.09it/s]

Gradient norm: 21.823170000531377


Epoch 4 of 5 | Iteration:  57%|█████▋    | 691/1212 [05:32<03:56,  2.20it/s]

Gradient norm: 21.67900934567703


Epoch 4 of 5 | Iteration:  57%|█████▋    | 692/1212 [05:32<03:58,  2.18it/s]

Gradient norm: 22.891959543724255


Epoch 4 of 5 | Iteration:  57%|█████▋    | 693/1212 [05:32<03:44,  2.32it/s]

Gradient norm: 23.243440149435294


Epoch 4 of 5 | Iteration:  57%|█████▋    | 694/1212 [05:33<03:40,  2.34it/s]

Gradient norm: 23.344875843389538


Epoch 4 of 5 | Iteration:  57%|█████▋    | 695/1212 [05:33<03:32,  2.43it/s]

Gradient norm: 23.60808779322693


Epoch 4 of 5 | Iteration:  57%|█████▋    | 696/1212 [05:34<04:09,  2.07it/s]

Gradient norm: 62.6985100482702


Epoch 4 of 5 | Iteration:  58%|█████▊    | 697/1212 [05:34<03:55,  2.19it/s]

Gradient norm: 62.4163324005351


Epoch 4 of 5 | Iteration:  58%|█████▊    | 698/1212 [05:35<03:41,  2.32it/s]

Gradient norm: 62.33957477512793


Epoch 4 of 5 | Iteration:  58%|█████▊    | 699/1212 [05:35<03:33,  2.40it/s]

Gradient norm: 62.5233527728135


Epoch 4 of 5 | Iteration:  58%|█████▊    | 700/1212 [05:36<03:46,  2.26it/s]

Gradient norm: 63.520824855541676


Epoch 4 of 5 | Iteration:  58%|█████▊    | 701/1212 [05:36<04:03,  2.10it/s]

Gradient norm: 63.25195929860428


Epoch 4 of 5 | Iteration:  58%|█████▊    | 702/1212 [05:37<04:13,  2.01it/s]

Gradient norm: 65.12790582063744


Epoch 4 of 5 | Iteration:  58%|█████▊    | 703/1212 [05:37<03:59,  2.12it/s]

Gradient norm: 65.35569891908563


Epoch 4 of 5 | Iteration:  58%|█████▊    | 704/1212 [05:38<03:59,  2.12it/s]

Gradient norm: 68.27031251488968


Epoch 4 of 5 | Iteration:  58%|█████▊    | 705/1212 [05:38<03:56,  2.14it/s]

Gradient norm: 17.000732532882576


Epoch 4 of 5 | Iteration:  58%|█████▊    | 706/1212 [05:38<03:52,  2.18it/s]

Gradient norm: 19.042989742625583


Epoch 4 of 5 | Iteration:  58%|█████▊    | 707/1212 [05:39<03:38,  2.31it/s]

Gradient norm: 19.026387860720142


Epoch 4 of 5 | Iteration:  58%|█████▊    | 708/1212 [05:39<03:31,  2.38it/s]

Gradient norm: 19.319755454235114


Epoch 4 of 5 | Iteration:  58%|█████▊    | 709/1212 [05:40<03:34,  2.34it/s]

Gradient norm: 19.58751434594367


Epoch 4 of 5 | Iteration:  59%|█████▊    | 710/1212 [05:40<03:40,  2.28it/s]

Gradient norm: 19.964088438012123


Epoch 4 of 5 | Iteration:  59%|█████▊    | 711/1212 [05:41<03:54,  2.14it/s]

Gradient norm: 43.27252421937833


Epoch 4 of 5 | Iteration:  59%|█████▊    | 712/1212 [05:41<04:05,  2.03it/s]

Gradient norm: 43.15969534864504


Epoch 4 of 5 | Iteration:  59%|█████▉    | 713/1212 [05:42<04:15,  1.96it/s]

Gradient norm: 43.13704499924914


Epoch 4 of 5 | Iteration:  59%|█████▉    | 714/1212 [05:42<04:46,  1.74it/s]

Gradient norm: 43.274198992562205


Epoch 4 of 5 | Iteration:  59%|█████▉    | 715/1212 [05:43<04:47,  1.73it/s]

Gradient norm: 43.32204488082398


Epoch 4 of 5 | Iteration:  59%|█████▉    | 716/1212 [05:44<04:39,  1.78it/s]

Gradient norm: 43.29161339323754


Epoch 4 of 5 | Iteration:  59%|█████▉    | 717/1212 [05:44<04:11,  1.97it/s]

Gradient norm: 43.39252395784093


Epoch 4 of 5 | Iteration:  59%|█████▉    | 718/1212 [05:45<04:37,  1.78it/s]

Gradient norm: 43.631727240132946


Epoch 4 of 5 | Iteration:  59%|█████▉    | 719/1212 [05:45<04:14,  1.94it/s]

Gradient norm: 43.59970045626443


Epoch 4 of 5 | Iteration:  59%|█████▉    | 720/1212 [05:45<03:53,  2.11it/s]

Gradient norm: 57.573277403372344


Epoch 4 of 5 | Iteration:  59%|█████▉    | 721/1212 [05:46<03:40,  2.23it/s]

Gradient norm: 2.1653949538029185


Epoch 4 of 5 | Iteration:  60%|█████▉    | 722/1212 [05:46<03:42,  2.21it/s]

Gradient norm: 9.443284406734039


Epoch 4 of 5 | Iteration:  60%|█████▉    | 723/1212 [05:47<03:31,  2.32it/s]

Gradient norm: 9.432350182239835


Epoch 4 of 5 | Iteration:  60%|█████▉    | 724/1212 [05:47<03:37,  2.24it/s]

Gradient norm: 10.217867673777556


Epoch 4 of 5 | Iteration:  60%|█████▉    | 725/1212 [05:48<03:26,  2.36it/s]

Gradient norm: 46.14684518635632


Epoch 4 of 5 | Iteration:  60%|█████▉    | 726/1212 [05:48<03:23,  2.39it/s]

Gradient norm: 46.24818580722101


Epoch 4 of 5 | Iteration:  60%|█████▉    | 727/1212 [05:48<03:22,  2.39it/s]

Gradient norm: 46.36667826775787


Epoch 4 of 5 | Iteration:  60%|██████    | 728/1212 [05:49<03:21,  2.40it/s]

Gradient norm: 46.52903265697952


Epoch 4 of 5 | Iteration:  60%|██████    | 729/1212 [05:49<03:16,  2.46it/s]

Gradient norm: 46.59284593270259


Epoch 4 of 5 | Iteration:  60%|██████    | 730/1212 [05:50<03:37,  2.22it/s]

Gradient norm: 47.26566044615982


Epoch 4 of 5 | Iteration:  60%|██████    | 731/1212 [05:50<03:55,  2.05it/s]

Gradient norm: 47.26244692234103


Epoch 4 of 5 | Iteration:  60%|██████    | 732/1212 [05:51<03:57,  2.02it/s]

Gradient norm: 47.463745466059294


Epoch 4 of 5 | Iteration:  60%|██████    | 733/1212 [05:51<03:55,  2.03it/s]

Gradient norm: 47.48531238059292


Epoch 4 of 5 | Iteration:  61%|██████    | 734/1212 [05:52<03:38,  2.19it/s]

Gradient norm: 46.739279534267155


Epoch 4 of 5 | Iteration:  61%|██████    | 735/1212 [05:52<03:29,  2.27it/s]

Gradient norm: 47.080868893465535


Epoch 4 of 5 | Iteration:  61%|██████    | 736/1212 [05:53<03:35,  2.21it/s]

Gradient norm: 52.161263737745536


Epoch 4 of 5 | Iteration:  61%|██████    | 737/1212 [05:53<03:25,  2.31it/s]

Gradient norm: 5.980483210817023


Epoch 4 of 5 | Iteration:  61%|██████    | 738/1212 [05:53<03:20,  2.36it/s]

Gradient norm: 23.385954840970527


Epoch 4 of 5 | Iteration:  61%|██████    | 739/1212 [05:54<03:37,  2.18it/s]

Gradient norm: 30.32462236629631


Epoch 4 of 5 | Iteration:  61%|██████    | 740/1212 [05:54<03:49,  2.05it/s]

Gradient norm: 30.260588795922086


Epoch 4 of 5 | Iteration:  61%|██████    | 741/1212 [05:55<03:56,  1.99it/s]

Gradient norm: 57.30523641788985


Epoch 4 of 5 | Iteration:  61%|██████    | 742/1212 [05:55<03:58,  1.97it/s]

Gradient norm: 57.60228686899738


Epoch 4 of 5 | Iteration:  61%|██████▏   | 743/1212 [05:56<04:05,  1.91it/s]

Gradient norm: 228.09214836322863


Epoch 4 of 5 | Iteration:  61%|██████▏   | 744/1212 [05:57<04:14,  1.84it/s]

Gradient norm: 228.42906134898885


Epoch 4 of 5 | Iteration:  61%|██████▏   | 745/1212 [05:57<04:02,  1.92it/s]

Gradient norm: 228.41926061485788


Epoch 4 of 5 | Iteration:  62%|██████▏   | 746/1212 [05:57<03:45,  2.07it/s]

Gradient norm: 228.04489989845047


Epoch 4 of 5 | Iteration:  62%|██████▏   | 747/1212 [05:58<03:28,  2.23it/s]

Gradient norm: 228.36716436632696


Epoch 4 of 5 | Iteration:  62%|██████▏   | 748/1212 [05:58<03:26,  2.25it/s]

Gradient norm: 229.68936960717193


Epoch 4 of 5 | Iteration:  62%|██████▏   | 749/1212 [05:59<03:16,  2.36it/s]

Gradient norm: 229.99840251936766


Epoch 4 of 5 | Iteration:  62%|██████▏   | 750/1212 [05:59<03:15,  2.36it/s]

Gradient norm: 1656.2278483987873


Epoch 4 of 5 | Iteration:  62%|██████▏   | 751/1212 [05:59<03:07,  2.46it/s]

Gradient norm: 1656.8840216538579


Epoch 4 of 5 | Iteration:  62%|██████▏   | 752/1212 [06:00<03:06,  2.47it/s]

Gradient norm: 1657.0326918939943


Epoch 4 of 5 | Iteration:  62%|██████▏   | 753/1212 [06:00<03:02,  2.51it/s]

Gradient norm: 4.558965511060579


Epoch 4 of 5 | Iteration:  62%|██████▏   | 754/1212 [06:01<03:01,  2.52it/s]

Gradient norm: 39.752159187051056


Epoch 4 of 5 | Iteration:  62%|██████▏   | 755/1212 [06:01<03:05,  2.47it/s]

Gradient norm: 40.048608205081344


Epoch 4 of 5 | Iteration:  62%|██████▏   | 756/1212 [06:02<03:27,  2.20it/s]

Gradient norm: 40.432948531975875


Epoch 4 of 5 | Iteration:  62%|██████▏   | 757/1212 [06:02<03:41,  2.06it/s]

Gradient norm: 40.42444819704478


Epoch 4 of 5 | Iteration:  63%|██████▎   | 758/1212 [06:03<03:32,  2.14it/s]

Gradient norm: 40.44756621962683


Epoch 4 of 5 | Iteration:  63%|██████▎   | 759/1212 [06:03<03:18,  2.28it/s]

Gradient norm: 68.49612976936102


Epoch 4 of 5 | Iteration:  63%|██████▎   | 760/1212 [06:03<03:12,  2.35it/s]

Gradient norm: 68.4523037785261


Epoch 4 of 5 | Iteration:  63%|██████▎   | 761/1212 [06:04<03:04,  2.45it/s]

Gradient norm: 68.64358567874196


Epoch 4 of 5 | Iteration:  63%|██████▎   | 762/1212 [06:04<03:17,  2.28it/s]

Gradient norm: 68.73020060451248


Epoch 4 of 5 | Iteration:  63%|██████▎   | 763/1212 [06:05<03:30,  2.14it/s]

Gradient norm: 69.0185163335267


Epoch 4 of 5 | Iteration:  63%|██████▎   | 764/1212 [06:05<03:41,  2.02it/s]

Gradient norm: 69.29047911068197


Epoch 4 of 5 | Iteration:  63%|██████▎   | 765/1212 [06:06<03:46,  1.98it/s]

Gradient norm: 70.90060787789616


Epoch 4 of 5 | Iteration:  63%|██████▎   | 766/1212 [06:06<03:30,  2.12it/s]

Gradient norm: 70.3284419538993


Epoch 4 of 5 | Iteration:  63%|██████▎   | 767/1212 [06:07<03:28,  2.14it/s]

Gradient norm: 71.24611919295236


Epoch 4 of 5 | Iteration:  63%|██████▎   | 768/1212 [06:07<03:41,  2.01it/s]

Gradient norm: 70.87688139600905


Epoch 4 of 5 | Iteration:  63%|██████▎   | 769/1212 [06:08<03:37,  2.04it/s]

Gradient norm: 2.833020298441485


Epoch 4 of 5 | Iteration:  64%|██████▎   | 770/1212 [06:08<03:36,  2.04it/s]

Gradient norm: 4.030409365686843


Epoch 4 of 5 | Iteration:  64%|██████▎   | 771/1212 [06:09<03:41,  1.99it/s]

Gradient norm: 31.91988851794975


Epoch 4 of 5 | Iteration:  64%|██████▎   | 772/1212 [06:10<04:10,  1.75it/s]

Gradient norm: 30.94286939048071


Epoch 4 of 5 | Iteration:  64%|██████▍   | 773/1212 [06:10<04:09,  1.76it/s]

Gradient norm: 31.483089311617835


Epoch 4 of 5 | Iteration:  64%|██████▍   | 774/1212 [06:11<04:12,  1.74it/s]

Gradient norm: 31.39963461624282


Epoch 4 of 5 | Iteration:  64%|██████▍   | 775/1212 [06:11<03:45,  1.94it/s]

Gradient norm: 33.27261961172688


Epoch 4 of 5 | Iteration:  64%|██████▍   | 776/1212 [06:12<03:39,  1.99it/s]

Gradient norm: 33.67466114744393


Epoch 4 of 5 | Iteration:  64%|██████▍   | 777/1212 [06:12<03:33,  2.04it/s]

Gradient norm: 34.188019195937


Epoch 4 of 5 | Iteration:  64%|██████▍   | 778/1212 [06:12<03:29,  2.07it/s]

Gradient norm: 33.85529435310718


Epoch 4 of 5 | Iteration:  64%|██████▍   | 779/1212 [06:13<03:17,  2.19it/s]

Gradient norm: 33.91258697014447


Epoch 4 of 5 | Iteration:  64%|██████▍   | 780/1212 [06:13<03:25,  2.10it/s]

Gradient norm: 34.98269596107494


Epoch 4 of 5 | Iteration:  64%|██████▍   | 781/1212 [06:14<03:36,  1.99it/s]

Gradient norm: 44.30872230564773


Epoch 4 of 5 | Iteration:  65%|██████▍   | 782/1212 [06:14<03:24,  2.10it/s]

Gradient norm: 44.25653153499799


Epoch 4 of 5 | Iteration:  65%|██████▍   | 783/1212 [06:15<03:15,  2.19it/s]

Gradient norm: 47.61931250563501


Epoch 4 of 5 | Iteration:  65%|██████▍   | 784/1212 [06:15<03:30,  2.04it/s]

Gradient norm: 47.44709405329502


Epoch 4 of 5 | Iteration:  65%|██████▍   | 785/1212 [06:16<03:16,  2.17it/s]

Gradient norm: 26.923085975877964


Epoch 4 of 5 | Iteration:  65%|██████▍   | 786/1212 [06:16<03:11,  2.23it/s]

Gradient norm: 31.298564786740076


Epoch 4 of 5 | Iteration:  65%|██████▍   | 787/1212 [06:17<03:05,  2.29it/s]

Gradient norm: 31.293200803823648


Epoch 4 of 5 | Iteration:  65%|██████▌   | 788/1212 [06:17<03:00,  2.35it/s]

Gradient norm: 31.469674897322896


Epoch 4 of 5 | Iteration:  65%|██████▌   | 789/1212 [06:17<03:10,  2.22it/s]

Gradient norm: 31.64539072890265


Epoch 4 of 5 | Iteration:  65%|██████▌   | 790/1212 [06:18<03:11,  2.20it/s]

Gradient norm: 30.93721796977495


Epoch 4 of 5 | Iteration:  65%|██████▌   | 791/1212 [06:18<03:14,  2.17it/s]

Gradient norm: 30.79440258044798


Epoch 4 of 5 | Iteration:  65%|██████▌   | 792/1212 [06:19<03:26,  2.04it/s]

Gradient norm: 32.0063982740229


Epoch 4 of 5 | Iteration:  65%|██████▌   | 793/1212 [06:19<03:32,  1.98it/s]

Gradient norm: 30.020335698917304


Epoch 4 of 5 | Iteration:  66%|██████▌   | 794/1212 [06:20<03:22,  2.06it/s]

Gradient norm: 30.308007445133285


Epoch 4 of 5 | Iteration:  66%|██████▌   | 795/1212 [06:20<03:13,  2.15it/s]

Gradient norm: 165.62390022215942


Epoch 4 of 5 | Iteration:  66%|██████▌   | 796/1212 [06:21<03:23,  2.04it/s]

Gradient norm: 165.49974074384104


Epoch 4 of 5 | Iteration:  66%|██████▌   | 797/1212 [06:21<03:24,  2.03it/s]

Gradient norm: 165.26127918699046


Epoch 4 of 5 | Iteration:  66%|██████▌   | 798/1212 [06:22<03:35,  1.93it/s]

Gradient norm: 165.2984150717647


Epoch 4 of 5 | Iteration:  66%|██████▌   | 799/1212 [06:23<03:44,  1.84it/s]

Gradient norm: 165.4230462860155


Epoch 4 of 5 | Iteration:  66%|██████▌   | 800/1212 [06:23<03:43,  1.84it/s]

Gradient norm: 165.6561389618408


Epoch 4 of 5 | Iteration:  66%|██████▌   | 801/1212 [06:24<04:03,  1.69it/s]

Gradient norm: 7.194011349414788


Epoch 4 of 5 | Iteration:  66%|██████▌   | 802/1212 [06:24<03:48,  1.79it/s]

Gradient norm: 7.274487131644322


Epoch 4 of 5 | Iteration:  66%|██████▋   | 803/1212 [06:25<03:26,  1.98it/s]

Gradient norm: 24.811488727368857


Epoch 4 of 5 | Iteration:  66%|██████▋   | 804/1212 [06:25<03:11,  2.14it/s]

Gradient norm: 45.15432511079987


Epoch 4 of 5 | Iteration:  66%|██████▋   | 805/1212 [06:26<03:09,  2.14it/s]

Gradient norm: 45.2605535747911


Epoch 4 of 5 | Iteration:  67%|██████▋   | 806/1212 [06:26<03:23,  2.00it/s]

Gradient norm: 443.8926546447758


Epoch 4 of 5 | Iteration:  67%|██████▋   | 807/1212 [06:27<03:18,  2.04it/s]

Gradient norm: 442.9382380696788


Epoch 4 of 5 | Iteration:  67%|██████▋   | 808/1212 [06:27<03:06,  2.16it/s]

Gradient norm: 425.16457492157076


Epoch 4 of 5 | Iteration:  67%|██████▋   | 809/1212 [06:27<03:00,  2.23it/s]

Gradient norm: 394.0673981398673


Epoch 4 of 5 | Iteration:  67%|██████▋   | 810/1212 [06:28<02:51,  2.34it/s]

Gradient norm: 393.68875613123856


Epoch 4 of 5 | Iteration:  67%|██████▋   | 811/1212 [06:28<02:49,  2.36it/s]

Gradient norm: 392.66314588182973


Epoch 4 of 5 | Iteration:  67%|██████▋   | 812/1212 [06:29<03:31,  1.89it/s]

Gradient norm: 391.9278700627413


Epoch 4 of 5 | Iteration:  67%|██████▋   | 813/1212 [06:30<03:35,  1.85it/s]

Gradient norm: 391.49780929052514


Epoch 4 of 5 | Iteration:  67%|██████▋   | 814/1212 [06:30<03:16,  2.02it/s]

Gradient norm: 393.5331041600373


Epoch 4 of 5 | Iteration:  67%|██████▋   | 815/1212 [06:30<03:05,  2.14it/s]

Gradient norm: 392.80087342509376


Epoch 4 of 5 | Iteration:  67%|██████▋   | 816/1212 [06:31<03:16,  2.02it/s]

Gradient norm: 400.3500345905822


Epoch 4 of 5 | Iteration:  67%|██████▋   | 817/1212 [06:31<03:11,  2.06it/s]

Gradient norm: 1.8930838795989804


Epoch 4 of 5 | Iteration:  67%|██████▋   | 818/1212 [06:32<03:20,  1.97it/s]

Gradient norm: 13.187634756522934


Epoch 4 of 5 | Iteration:  68%|██████▊   | 819/1212 [06:32<03:06,  2.10it/s]

Gradient norm: 15.094018330064415


Epoch 4 of 5 | Iteration:  68%|██████▊   | 820/1212 [06:33<03:00,  2.17it/s]

Gradient norm: 15.55454884517185


Epoch 4 of 5 | Iteration:  68%|██████▊   | 821/1212 [06:33<03:13,  2.02it/s]

Gradient norm: 28.247897169949074


Epoch 4 of 5 | Iteration:  68%|██████▊   | 822/1212 [06:34<03:31,  1.84it/s]

Gradient norm: 29.085788064052306


Epoch 4 of 5 | Iteration:  68%|██████▊   | 823/1212 [06:35<03:35,  1.81it/s]

Gradient norm: 32.16254206442246


Epoch 4 of 5 | Iteration:  68%|██████▊   | 824/1212 [06:35<03:27,  1.87it/s]

Gradient norm: 32.27597201132961


Epoch 4 of 5 | Iteration:  68%|██████▊   | 825/1212 [06:36<03:34,  1.80it/s]

Gradient norm: 32.38001789873088


Epoch 4 of 5 | Iteration:  68%|██████▊   | 826/1212 [06:36<03:28,  1.85it/s]

Gradient norm: 32.23073864705543


Epoch 4 of 5 | Iteration:  68%|██████▊   | 827/1212 [06:37<03:42,  1.73it/s]

Gradient norm: 31.77737277148965


Epoch 4 of 5 | Iteration:  68%|██████▊   | 828/1212 [06:37<03:39,  1.75it/s]

Gradient norm: 34.157826076170736


Epoch 4 of 5 | Iteration:  68%|██████▊   | 829/1212 [06:38<03:35,  1.77it/s]

Gradient norm: 34.22438853622687


Epoch 4 of 5 | Iteration:  68%|██████▊   | 830/1212 [06:38<03:34,  1.78it/s]

Gradient norm: 33.98515824299469


Epoch 4 of 5 | Iteration:  69%|██████▊   | 831/1212 [06:39<03:44,  1.70it/s]

Gradient norm: 33.90091145054202


Epoch 4 of 5 | Iteration:  69%|██████▊   | 832/1212 [06:40<03:31,  1.79it/s]

Gradient norm: 65.75313370810832


Epoch 4 of 5 | Iteration:  69%|██████▊   | 833/1212 [06:40<03:13,  1.96it/s]

Gradient norm: 0.8164871213982076


Epoch 4 of 5 | Iteration:  69%|██████▉   | 834/1212 [06:40<02:56,  2.14it/s]

Gradient norm: 104.12743240919839


Epoch 4 of 5 | Iteration:  69%|██████▉   | 835/1212 [06:41<02:47,  2.25it/s]

Gradient norm: 103.8005768891595


Epoch 4 of 5 | Iteration:  69%|██████▉   | 836/1212 [06:41<03:10,  1.98it/s]

Gradient norm: 106.23723013130953


Epoch 4 of 5 | Iteration:  69%|██████▉   | 837/1212 [06:42<02:57,  2.11it/s]

Gradient norm: 107.13732028237935


Epoch 4 of 5 | Iteration:  69%|██████▉   | 838/1212 [06:42<02:51,  2.18it/s]

Gradient norm: 107.39106231278444


Epoch 4 of 5 | Iteration:  69%|██████▉   | 839/1212 [06:43<03:01,  2.06it/s]

Gradient norm: 107.52019446342248


Epoch 4 of 5 | Iteration:  69%|██████▉   | 840/1212 [06:43<02:48,  2.20it/s]

Gradient norm: 109.04071778569315


Epoch 4 of 5 | Iteration:  69%|██████▉   | 841/1212 [06:44<02:39,  2.32it/s]

Gradient norm: 109.06057263446958


Epoch 4 of 5 | Iteration:  69%|██████▉   | 842/1212 [06:44<02:34,  2.40it/s]

Gradient norm: 108.9566647210287


Epoch 4 of 5 | Iteration:  70%|██████▉   | 843/1212 [06:44<02:32,  2.42it/s]

Gradient norm: 108.4338776771355


Epoch 4 of 5 | Iteration:  70%|██████▉   | 844/1212 [06:45<02:27,  2.49it/s]

Gradient norm: 107.24205424469758


Epoch 4 of 5 | Iteration:  70%|██████▉   | 845/1212 [06:45<02:32,  2.41it/s]

Gradient norm: 110.4636877285422


Epoch 4 of 5 | Iteration:  70%|██████▉   | 846/1212 [06:46<02:29,  2.45it/s]

Gradient norm: 110.35556208771438


Epoch 4 of 5 | Iteration:  70%|██████▉   | 847/1212 [06:46<02:39,  2.28it/s]

Gradient norm: 111.41514224422924


Epoch 4 of 5 | Iteration:  70%|██████▉   | 848/1212 [06:46<02:42,  2.24it/s]

Gradient norm: 111.37800136013047


Epoch 4 of 5 | Iteration:  70%|███████   | 849/1212 [06:47<02:49,  2.14it/s]

Gradient norm: 1.481940575935656


Epoch 4 of 5 | Iteration:  70%|███████   | 850/1212 [06:47<02:48,  2.14it/s]

Gradient norm: 2.853010705554352


Epoch 4 of 5 | Iteration:  70%|███████   | 851/1212 [06:48<03:06,  1.93it/s]

Gradient norm: 15.22976379791442


Epoch 4 of 5 | Iteration:  70%|███████   | 852/1212 [06:49<03:03,  1.96it/s]

Gradient norm: 15.77110191327446


Epoch 4 of 5 | Iteration:  70%|███████   | 853/1212 [06:49<03:03,  1.95it/s]

Gradient norm: 15.89861285555976


Epoch 4 of 5 | Iteration:  70%|███████   | 854/1212 [06:50<03:21,  1.78it/s]

Gradient norm: 16.55533056340156


Epoch 4 of 5 | Iteration:  71%|███████   | 855/1212 [06:51<03:36,  1.65it/s]

Gradient norm: 16.782608038775457


Epoch 4 of 5 | Iteration:  71%|███████   | 856/1212 [06:51<03:38,  1.63it/s]

Gradient norm: 17.569232263833506


Epoch 4 of 5 | Iteration:  71%|███████   | 857/1212 [06:52<03:27,  1.71it/s]

Gradient norm: 17.786247518494203


Epoch 4 of 5 | Iteration:  71%|███████   | 858/1212 [06:52<03:30,  1.68it/s]

Gradient norm: 18.16415686321357


Epoch 4 of 5 | Iteration:  71%|███████   | 859/1212 [06:53<03:07,  1.89it/s]

Gradient norm: 18.16810455614807


Epoch 4 of 5 | Iteration:  71%|███████   | 860/1212 [06:53<02:51,  2.05it/s]

Gradient norm: 18.35626393443217


Epoch 4 of 5 | Iteration:  71%|███████   | 861/1212 [06:54<02:56,  1.99it/s]

Gradient norm: 19.027693187517986


Epoch 4 of 5 | Iteration:  71%|███████   | 862/1212 [06:54<02:43,  2.14it/s]

Gradient norm: 20.06633519608693


Epoch 4 of 5 | Iteration:  71%|███████   | 863/1212 [06:54<02:35,  2.25it/s]

Gradient norm: 22.808951473687607


Epoch 4 of 5 | Iteration:  71%|███████▏  | 864/1212 [06:55<02:31,  2.30it/s]

Gradient norm: 22.918402353035724


Epoch 4 of 5 | Iteration:  71%|███████▏  | 865/1212 [06:55<02:54,  1.99it/s]

Gradient norm: 5.989411580465674


Epoch 4 of 5 | Iteration:  71%|███████▏  | 866/1212 [06:56<02:57,  1.95it/s]

Gradient norm: 16.320326820824263


Epoch 4 of 5 | Iteration:  72%|███████▏  | 867/1212 [06:56<02:43,  2.11it/s]

Gradient norm: 20.145954747424042


Epoch 4 of 5 | Iteration:  72%|███████▏  | 868/1212 [06:57<02:33,  2.24it/s]

Gradient norm: 20.184845077344843


Epoch 4 of 5 | Iteration:  72%|███████▏  | 869/1212 [06:57<02:27,  2.33it/s]

Gradient norm: 22.71664751229813


Epoch 4 of 5 | Iteration:  72%|███████▏  | 870/1212 [06:58<02:27,  2.32it/s]

Gradient norm: 430.198358529007


Epoch 4 of 5 | Iteration:  72%|███████▏  | 871/1212 [06:58<02:21,  2.42it/s]

Gradient norm: 429.1720094904546


Epoch 4 of 5 | Iteration:  72%|███████▏  | 872/1212 [06:58<02:23,  2.37it/s]

Gradient norm: 429.4311853987627


Epoch 4 of 5 | Iteration:  72%|███████▏  | 873/1212 [06:59<02:22,  2.38it/s]

Gradient norm: 417.29722965511496


Epoch 4 of 5 | Iteration:  72%|███████▏  | 874/1212 [06:59<02:19,  2.42it/s]

Gradient norm: 417.3519931722015


Epoch 4 of 5 | Iteration:  72%|███████▏  | 875/1212 [07:00<02:17,  2.45it/s]

Gradient norm: 416.8829445721521


Epoch 4 of 5 | Iteration:  72%|███████▏  | 876/1212 [07:00<02:20,  2.39it/s]

Gradient norm: 418.66795244590423


Epoch 4 of 5 | Iteration:  72%|███████▏  | 877/1212 [07:01<02:34,  2.17it/s]

Gradient norm: 418.03447936391035


Epoch 4 of 5 | Iteration:  72%|███████▏  | 878/1212 [07:01<02:32,  2.18it/s]

Gradient norm: 417.6967793976016


Epoch 4 of 5 | Iteration:  73%|███████▎  | 879/1212 [07:02<02:32,  2.18it/s]

Gradient norm: 417.39984046846484


Epoch 4 of 5 | Iteration:  73%|███████▎  | 880/1212 [07:02<02:41,  2.06it/s]

Gradient norm: 417.7894061213918


Epoch 4 of 5 | Iteration:  73%|███████▎  | 881/1212 [07:03<02:43,  2.02it/s]

Gradient norm: 3.1013127018859743


Epoch 4 of 5 | Iteration:  73%|███████▎  | 882/1212 [07:03<02:57,  1.85it/s]

Gradient norm: 11.298410630505455


Epoch 4 of 5 | Iteration:  73%|███████▎  | 883/1212 [07:04<02:57,  1.86it/s]

Gradient norm: 23.664787231323984


Epoch 4 of 5 | Iteration:  73%|███████▎  | 884/1212 [07:04<03:05,  1.76it/s]

Gradient norm: 25.772059416423712


Epoch 4 of 5 | Iteration:  73%|███████▎  | 885/1212 [07:05<03:04,  1.78it/s]

Gradient norm: 27.822215756697773


Epoch 4 of 5 | Iteration:  73%|███████▎  | 886/1212 [07:05<02:47,  1.95it/s]

Gradient norm: 27.95914954417673


Epoch 4 of 5 | Iteration:  73%|███████▎  | 887/1212 [07:06<02:41,  2.01it/s]

Gradient norm: 56.17097381631457


Epoch 4 of 5 | Iteration:  73%|███████▎  | 888/1212 [07:06<02:29,  2.17it/s]

Gradient norm: 55.00327437802969


Epoch 4 of 5 | Iteration:  73%|███████▎  | 889/1212 [07:07<02:21,  2.29it/s]

Gradient norm: 54.763889352291955


Epoch 4 of 5 | Iteration:  73%|███████▎  | 890/1212 [07:07<02:18,  2.33it/s]

Gradient norm: 56.03384431886739


Epoch 4 of 5 | Iteration:  74%|███████▎  | 891/1212 [07:07<02:17,  2.33it/s]

Gradient norm: 56.57638555309287


Epoch 4 of 5 | Iteration:  74%|███████▎  | 892/1212 [07:08<02:20,  2.28it/s]

Gradient norm: 59.114841302727214


Epoch 4 of 5 | Iteration:  74%|███████▎  | 893/1212 [07:08<02:16,  2.34it/s]

Gradient norm: 59.623317115429934


Epoch 4 of 5 | Iteration:  74%|███████▍  | 894/1212 [07:09<02:10,  2.43it/s]

Gradient norm: 62.074257829651955


Epoch 4 of 5 | Iteration:  74%|███████▍  | 895/1212 [07:09<02:20,  2.25it/s]

Gradient norm: 63.98759286665438


Epoch 4 of 5 | Iteration:  74%|███████▍  | 896/1212 [07:10<02:19,  2.26it/s]

Gradient norm: 62.783936377563634


Epoch 4 of 5 | Iteration:  74%|███████▍  | 897/1212 [07:10<02:14,  2.35it/s]

Gradient norm: 2.11625247888153


Epoch 4 of 5 | Iteration:  74%|███████▍  | 898/1212 [07:10<02:17,  2.28it/s]

Gradient norm: 19.830016297917933


Epoch 4 of 5 | Iteration:  74%|███████▍  | 899/1212 [07:11<02:17,  2.28it/s]

Gradient norm: 26.614777667402162


Epoch 4 of 5 | Iteration:  74%|███████▍  | 900/1212 [07:11<02:11,  2.37it/s]

Gradient norm: 2136.7064219751264


Epoch 4 of 5 | Iteration:  74%|███████▍  | 901/1212 [07:12<02:10,  2.39it/s]

Gradient norm: 2137.048347621236


Epoch 4 of 5 | Iteration:  74%|███████▍  | 902/1212 [07:12<02:18,  2.24it/s]

Gradient norm: 2137.421172278615


Epoch 4 of 5 | Iteration:  75%|███████▍  | 903/1212 [07:13<02:24,  2.14it/s]

Gradient norm: 2137.0182717143084


Epoch 4 of 5 | Iteration:  75%|███████▍  | 904/1212 [07:13<02:38,  1.94it/s]

Gradient norm: 2137.9479282871607


Epoch 4 of 5 | Iteration:  75%|███████▍  | 905/1212 [07:14<02:24,  2.13it/s]

Gradient norm: 2138.992598576569


Epoch 4 of 5 | Iteration:  75%|███████▍  | 906/1212 [07:14<02:28,  2.06it/s]

Gradient norm: 2139.1313159176125


Epoch 4 of 5 | Iteration:  75%|███████▍  | 907/1212 [07:15<02:18,  2.21it/s]

Gradient norm: 2139.14015173533


Epoch 4 of 5 | Iteration:  75%|███████▍  | 908/1212 [07:15<02:18,  2.19it/s]

Gradient norm: 2139.0773043145623


Epoch 4 of 5 | Iteration:  75%|███████▌  | 909/1212 [07:16<02:38,  1.91it/s]

Gradient norm: 2139.175073509126


Epoch 4 of 5 | Iteration:  75%|███████▌  | 910/1212 [07:16<02:38,  1.90it/s]

Gradient norm: 2141.2211305601045


Epoch 4 of 5 | Iteration:  75%|███████▌  | 911/1212 [07:17<02:39,  1.89it/s]

Gradient norm: 2141.0802430713475


Epoch 4 of 5 | Iteration:  75%|███████▌  | 912/1212 [07:17<02:37,  1.91it/s]

Gradient norm: 2140.9280096548737


Epoch 4 of 5 | Iteration:  75%|███████▌  | 913/1212 [07:18<02:44,  1.82it/s]

Gradient norm: 173.21151226954348


Epoch 4 of 5 | Iteration:  75%|███████▌  | 914/1212 [07:18<02:46,  1.79it/s]

Gradient norm: 173.5799959254462


Epoch 4 of 5 | Iteration:  75%|███████▌  | 915/1212 [07:19<02:35,  1.91it/s]

Gradient norm: 173.56571949712097


Epoch 4 of 5 | Iteration:  76%|███████▌  | 916/1212 [07:19<02:24,  2.05it/s]

Gradient norm: 173.46502146305605


Epoch 4 of 5 | Iteration:  76%|███████▌  | 917/1212 [07:20<02:28,  1.98it/s]

Gradient norm: 169.84669828367066


Epoch 4 of 5 | Iteration:  76%|███████▌  | 918/1212 [07:20<02:19,  2.11it/s]

Gradient norm: 169.63356735395172


Epoch 4 of 5 | Iteration:  76%|███████▌  | 919/1212 [07:21<02:11,  2.22it/s]

Gradient norm: 169.8229093347897


Epoch 4 of 5 | Iteration:  76%|███████▌  | 920/1212 [07:21<02:06,  2.31it/s]

Gradient norm: 169.7550823233065


Epoch 4 of 5 | Iteration:  76%|███████▌  | 921/1212 [07:21<02:04,  2.34it/s]

Gradient norm: 169.79651891498736


Epoch 4 of 5 | Iteration:  76%|███████▌  | 922/1212 [07:22<02:23,  2.02it/s]

Gradient norm: 170.85465914885182


Epoch 4 of 5 | Iteration:  76%|███████▌  | 923/1212 [07:23<02:17,  2.09it/s]

Gradient norm: 170.92403474235445


Epoch 4 of 5 | Iteration:  76%|███████▌  | 924/1212 [07:23<02:10,  2.21it/s]

Gradient norm: 170.9486822031177


Epoch 4 of 5 | Iteration:  76%|███████▋  | 925/1212 [07:23<02:05,  2.29it/s]

Gradient norm: 171.98661644308362


Epoch 4 of 5 | Iteration:  76%|███████▋  | 926/1212 [07:24<02:06,  2.25it/s]

Gradient norm: 172.05260707623248


Epoch 4 of 5 | Iteration:  76%|███████▋  | 927/1212 [07:24<02:07,  2.23it/s]

Gradient norm: 172.05380873904087


Epoch 4 of 5 | Iteration:  77%|███████▋  | 928/1212 [07:25<02:10,  2.18it/s]

Gradient norm: 171.9323317548594


Epoch 4 of 5 | Iteration:  77%|███████▋  | 929/1212 [07:25<02:05,  2.25it/s]

Gradient norm: 4.480969544448573


Epoch 4 of 5 | Iteration:  77%|███████▋  | 930/1212 [07:26<02:02,  2.30it/s]

Gradient norm: 5.194995950644629


Epoch 4 of 5 | Iteration:  77%|███████▋  | 931/1212 [07:26<01:57,  2.39it/s]

Gradient norm: 15.205601720028541


Epoch 4 of 5 | Iteration:  77%|███████▋  | 932/1212 [07:26<01:55,  2.43it/s]

Gradient norm: 14.41517902847404


Epoch 4 of 5 | Iteration:  77%|███████▋  | 933/1212 [07:27<01:54,  2.44it/s]

Gradient norm: 19.590275092471106


Epoch 4 of 5 | Iteration:  77%|███████▋  | 934/1212 [07:27<01:52,  2.47it/s]

Gradient norm: 19.867278441043748


Epoch 4 of 5 | Iteration:  77%|███████▋  | 935/1212 [07:28<01:58,  2.33it/s]

Gradient norm: 21.866374571032402


Epoch 4 of 5 | Iteration:  77%|███████▋  | 936/1212 [07:28<02:02,  2.24it/s]

Gradient norm: 28.59021195201358


Epoch 4 of 5 | Iteration:  77%|███████▋  | 937/1212 [07:29<02:02,  2.24it/s]

Gradient norm: 28.509589256504167


Epoch 4 of 5 | Iteration:  77%|███████▋  | 938/1212 [07:29<02:08,  2.13it/s]

Gradient norm: 28.273370350068216


Epoch 4 of 5 | Iteration:  77%|███████▋  | 939/1212 [07:30<02:11,  2.07it/s]

Gradient norm: 28.43081857678311


Epoch 4 of 5 | Iteration:  78%|███████▊  | 940/1212 [07:30<02:18,  1.96it/s]

Gradient norm: 28.579135234076038


Epoch 4 of 5 | Iteration:  78%|███████▊  | 941/1212 [07:31<02:21,  1.91it/s]

Gradient norm: 111.15860345060655


Epoch 4 of 5 | Iteration:  78%|███████▊  | 942/1212 [07:31<02:37,  1.71it/s]

Gradient norm: 112.26237166183505


Epoch 4 of 5 | Iteration:  78%|███████▊  | 943/1212 [07:32<02:36,  1.72it/s]

Gradient norm: 112.15921313502751


Epoch 4 of 5 | Iteration:  78%|███████▊  | 944/1212 [07:33<02:26,  1.83it/s]

Gradient norm: 112.22079572693956


Epoch 4 of 5 | Iteration:  78%|███████▊  | 945/1212 [07:33<02:13,  2.00it/s]

Gradient norm: 0.6021684755087414


Epoch 4 of 5 | Iteration:  78%|███████▊  | 946/1212 [07:33<02:19,  1.91it/s]

Gradient norm: 6.014971763993009


Epoch 4 of 5 | Iteration:  78%|███████▊  | 947/1212 [07:34<02:21,  1.87it/s]

Gradient norm: 6.556302200777312


Epoch 4 of 5 | Iteration:  78%|███████▊  | 948/1212 [07:35<02:19,  1.89it/s]

Gradient norm: 14.670413924532106


Epoch 4 of 5 | Iteration:  78%|███████▊  | 949/1212 [07:35<02:07,  2.07it/s]

Gradient norm: 14.794001936461116


Epoch 4 of 5 | Iteration:  78%|███████▊  | 950/1212 [07:35<01:58,  2.22it/s]

Gradient norm: 19.95222439343456


Epoch 4 of 5 | Iteration:  78%|███████▊  | 951/1212 [07:36<01:53,  2.30it/s]

Gradient norm: 49.995202539068416


Epoch 4 of 5 | Iteration:  79%|███████▊  | 952/1212 [07:36<02:03,  2.10it/s]

Gradient norm: 49.17992098460855


Epoch 4 of 5 | Iteration:  79%|███████▊  | 953/1212 [07:37<01:55,  2.24it/s]

Gradient norm: 55.550051597571134


Epoch 4 of 5 | Iteration:  79%|███████▊  | 954/1212 [07:37<01:50,  2.33it/s]

Gradient norm: 77.01783397025908


Epoch 4 of 5 | Iteration:  79%|███████▉  | 955/1212 [07:37<01:48,  2.36it/s]

Gradient norm: 76.50288482027486


Epoch 4 of 5 | Iteration:  79%|███████▉  | 956/1212 [07:38<01:45,  2.43it/s]

Gradient norm: 74.40058371239505


Epoch 4 of 5 | Iteration:  79%|███████▉  | 957/1212 [07:38<01:44,  2.43it/s]

Gradient norm: 75.04523212742517


Epoch 4 of 5 | Iteration:  79%|███████▉  | 958/1212 [07:39<02:07,  1.99it/s]

Gradient norm: 75.0212684354382


Epoch 4 of 5 | Iteration:  79%|███████▉  | 959/1212 [07:40<02:09,  1.95it/s]

Gradient norm: 74.48629354808062


Epoch 4 of 5 | Iteration:  79%|███████▉  | 960/1212 [07:40<02:12,  1.90it/s]

Gradient norm: 74.21122639028427


Epoch 4 of 5 | Iteration:  79%|███████▉  | 961/1212 [07:41<02:12,  1.89it/s]

Gradient norm: 7.367971066106359


Epoch 4 of 5 | Iteration:  79%|███████▉  | 962/1212 [07:41<02:00,  2.07it/s]

Gradient norm: 7.300825459315779


Epoch 4 of 5 | Iteration:  79%|███████▉  | 963/1212 [07:41<02:02,  2.03it/s]

Gradient norm: 7.541540239138039


Epoch 4 of 5 | Iteration:  80%|███████▉  | 964/1212 [07:42<01:53,  2.19it/s]

Gradient norm: 17.310880659864427


Epoch 4 of 5 | Iteration:  80%|███████▉  | 965/1212 [07:42<01:55,  2.14it/s]

Gradient norm: 18.14498581531643


Epoch 4 of 5 | Iteration:  80%|███████▉  | 966/1212 [07:43<01:59,  2.07it/s]

Gradient norm: 18.216990805576284


Epoch 4 of 5 | Iteration:  80%|███████▉  | 967/1212 [07:43<02:07,  1.93it/s]

Gradient norm: 18.21147556610618


Epoch 4 of 5 | Iteration:  80%|███████▉  | 968/1212 [07:44<02:06,  1.93it/s]

Gradient norm: 18.140374856673215


Epoch 4 of 5 | Iteration:  80%|███████▉  | 969/1212 [07:45<02:05,  1.93it/s]

Gradient norm: 24.975257559746684


Epoch 4 of 5 | Iteration:  80%|████████  | 970/1212 [07:45<02:10,  1.86it/s]

Gradient norm: 23.747703097961264


Epoch 4 of 5 | Iteration:  80%|████████  | 971/1212 [07:46<02:11,  1.83it/s]

Gradient norm: 23.669999050398637


Epoch 4 of 5 | Iteration:  80%|████████  | 972/1212 [07:46<02:09,  1.86it/s]

Gradient norm: 23.78550967742917


Epoch 4 of 5 | Iteration:  80%|████████  | 973/1212 [07:47<01:58,  2.02it/s]

Gradient norm: 23.948187069444735


Epoch 4 of 5 | Iteration:  80%|████████  | 974/1212 [07:47<02:04,  1.92it/s]

Gradient norm: 23.704478248801113


Epoch 4 of 5 | Iteration:  80%|████████  | 975/1212 [07:48<02:05,  1.89it/s]

Gradient norm: 29.298118339745606


Epoch 4 of 5 | Iteration:  81%|████████  | 976/1212 [07:48<02:02,  1.92it/s]

Gradient norm: 29.22287956732616


Epoch 4 of 5 | Iteration:  81%|████████  | 977/1212 [07:49<01:52,  2.09it/s]

Gradient norm: 15.982739246259948


Epoch 4 of 5 | Iteration:  81%|████████  | 978/1212 [07:49<01:45,  2.21it/s]

Gradient norm: 16.782342213117694


Epoch 4 of 5 | Iteration:  81%|████████  | 979/1212 [07:49<01:40,  2.31it/s]

Gradient norm: 20.29039458301558


Epoch 4 of 5 | Iteration:  81%|████████  | 980/1212 [07:50<01:59,  1.95it/s]

Gradient norm: 21.79393396237771


Epoch 4 of 5 | Iteration:  81%|████████  | 981/1212 [07:50<01:51,  2.07it/s]

Gradient norm: 22.92397093820238


Epoch 4 of 5 | Iteration:  81%|████████  | 982/1212 [07:51<01:43,  2.21it/s]

Gradient norm: 23.285523216113862


Epoch 4 of 5 | Iteration:  81%|████████  | 983/1212 [07:51<01:44,  2.20it/s]

Gradient norm: 25.04000213039802


Epoch 4 of 5 | Iteration:  81%|████████  | 984/1212 [07:52<01:40,  2.27it/s]

Gradient norm: 25.152272364129143


Epoch 4 of 5 | Iteration:  81%|████████▏ | 985/1212 [07:52<01:36,  2.36it/s]

Gradient norm: 36.66759453502637


Epoch 4 of 5 | Iteration:  81%|████████▏ | 986/1212 [07:53<01:34,  2.39it/s]

Gradient norm: 203.64401991802154


Epoch 4 of 5 | Iteration:  81%|████████▏ | 987/1212 [07:53<01:41,  2.22it/s]

Gradient norm: 203.99961673692354


Epoch 4 of 5 | Iteration:  82%|████████▏ | 988/1212 [07:54<02:02,  1.83it/s]

Gradient norm: 204.42693638125158


Epoch 4 of 5 | Iteration:  82%|████████▏ | 989/1212 [07:54<01:51,  2.01it/s]

Gradient norm: 212.9191673210927


Epoch 4 of 5 | Iteration:  82%|████████▏ | 990/1212 [07:55<01:43,  2.14it/s]

Gradient norm: 213.136467107436


Epoch 4 of 5 | Iteration:  82%|████████▏ | 991/1212 [07:55<01:46,  2.08it/s]

Gradient norm: 210.4682354905816


Epoch 4 of 5 | Iteration:  82%|████████▏ | 992/1212 [07:56<01:41,  2.16it/s]

Gradient norm: 210.14578880625052


Epoch 4 of 5 | Iteration:  82%|████████▏ | 993/1212 [07:56<01:44,  2.10it/s]

Gradient norm: 3.7141427031134144


Epoch 4 of 5 | Iteration:  82%|████████▏ | 994/1212 [07:57<01:47,  2.03it/s]

Gradient norm: 5.643154326814273


Epoch 4 of 5 | Iteration:  82%|████████▏ | 995/1212 [07:57<01:47,  2.02it/s]

Gradient norm: 26.849565939746277


Epoch 4 of 5 | Iteration:  82%|████████▏ | 996/1212 [07:58<01:54,  1.89it/s]

Gradient norm: 41.112369562319905


Epoch 4 of 5 | Iteration:  82%|████████▏ | 997/1212 [07:58<01:53,  1.89it/s]

Gradient norm: 41.47875626040475


Epoch 4 of 5 | Iteration:  82%|████████▏ | 998/1212 [07:59<01:57,  1.82it/s]

Gradient norm: 41.2834519471651


Epoch 4 of 5 | Iteration:  82%|████████▏ | 999/1212 [07:59<01:53,  1.87it/s]

Gradient norm: 40.478789629425776


Epoch 4 of 5 | Iteration:  83%|████████▎ | 1000/1212 [08:00<01:49,  1.94it/s]

Gradient norm: 40.73132164095693


Epoch 4 of 5 | Iteration:  83%|████████▎ | 1001/1212 [08:00<01:52,  1.87it/s]

Gradient norm: 91.97190971975706


Epoch 4 of 5 | Iteration:  83%|████████▎ | 1002/1212 [08:01<01:46,  1.97it/s]

Gradient norm: 451.6942463308893


Epoch 4 of 5 | Iteration:  83%|████████▎ | 1003/1212 [08:01<01:38,  2.13it/s]

Gradient norm: 451.7432753363097


Epoch 4 of 5 | Iteration:  83%|████████▎ | 1004/1212 [08:02<01:35,  2.17it/s]

Gradient norm: 453.97796669626484


Epoch 4 of 5 | Iteration:  83%|████████▎ | 1005/1212 [08:02<01:30,  2.30it/s]

Gradient norm: 454.6329148562367


Epoch 4 of 5 | Iteration:  83%|████████▎ | 1006/1212 [08:02<01:33,  2.21it/s]

Gradient norm: 457.3773175944951


Epoch 4 of 5 | Iteration:  83%|████████▎ | 1007/1212 [08:03<01:29,  2.29it/s]

Gradient norm: 456.58740623038693


Epoch 4 of 5 | Iteration:  83%|████████▎ | 1008/1212 [08:03<01:32,  2.21it/s]

Gradient norm: 457.9915109402135


Epoch 4 of 5 | Iteration:  83%|████████▎ | 1009/1212 [08:04<01:30,  2.25it/s]

Gradient norm: 11.372788624888159


Epoch 4 of 5 | Iteration:  83%|████████▎ | 1010/1212 [08:04<01:27,  2.31it/s]

Gradient norm: 34.839035550335105


Epoch 4 of 5 | Iteration:  83%|████████▎ | 1011/1212 [08:05<01:33,  2.15it/s]

Gradient norm: 38.105206930381975


Epoch 4 of 5 | Iteration:  83%|████████▎ | 1012/1212 [08:05<01:44,  1.92it/s]

Gradient norm: 40.73898748245371


Epoch 4 of 5 | Iteration:  84%|████████▎ | 1013/1212 [08:06<01:36,  2.07it/s]

Gradient norm: 40.374768169628645


Epoch 4 of 5 | Iteration:  84%|████████▎ | 1014/1212 [08:06<01:40,  1.96it/s]

Gradient norm: 40.99514633372959


Epoch 4 of 5 | Iteration:  84%|████████▎ | 1015/1212 [08:07<01:32,  2.12it/s]

Gradient norm: 41.83275829401494


Epoch 4 of 5 | Iteration:  84%|████████▍ | 1016/1212 [08:07<01:31,  2.14it/s]

Gradient norm: 41.77774220520099


Epoch 4 of 5 | Iteration:  84%|████████▍ | 1017/1212 [08:08<01:39,  1.96it/s]

Gradient norm: 41.047987416615996


Epoch 4 of 5 | Iteration:  84%|████████▍ | 1018/1212 [08:08<01:40,  1.94it/s]

Gradient norm: 97.26150976331712


Epoch 4 of 5 | Iteration:  84%|████████▍ | 1019/1212 [08:09<01:34,  2.04it/s]

Gradient norm: 240.9047422660447


Epoch 4 of 5 | Iteration:  84%|████████▍ | 1020/1212 [08:09<01:35,  2.01it/s]

Gradient norm: 238.83813977733865


Epoch 4 of 5 | Iteration:  84%|████████▍ | 1021/1212 [08:10<01:47,  1.78it/s]

Gradient norm: 238.69332637040696


Epoch 4 of 5 | Iteration:  84%|████████▍ | 1022/1212 [08:11<01:44,  1.82it/s]

Gradient norm: 238.73207261762255


Epoch 4 of 5 | Iteration:  84%|████████▍ | 1023/1212 [08:11<01:42,  1.85it/s]

Gradient norm: 238.84715033537736


Epoch 4 of 5 | Iteration:  84%|████████▍ | 1024/1212 [08:12<01:51,  1.68it/s]

Gradient norm: 237.77108785424744


Epoch 4 of 5 | Iteration:  85%|████████▍ | 1025/1212 [08:12<01:48,  1.72it/s]

Gradient norm: 4.360299941360814


Epoch 4 of 5 | Iteration:  85%|████████▍ | 1026/1212 [08:13<01:48,  1.72it/s]

Gradient norm: 18.98052948331741


Epoch 4 of 5 | Iteration:  85%|████████▍ | 1027/1212 [08:13<01:40,  1.85it/s]

Gradient norm: 38.105188916216456


Epoch 4 of 5 | Iteration:  85%|████████▍ | 1028/1212 [08:14<01:45,  1.74it/s]

Gradient norm: 38.295678264026115


Epoch 4 of 5 | Iteration:  85%|████████▍ | 1029/1212 [08:14<01:40,  1.83it/s]

Gradient norm: 37.7328798471186


Epoch 4 of 5 | Iteration:  85%|████████▍ | 1030/1212 [08:15<01:29,  2.03it/s]

Gradient norm: 40.58961295307573


Epoch 4 of 5 | Iteration:  85%|████████▌ | 1031/1212 [08:15<01:35,  1.89it/s]

Gradient norm: 40.63964621742653


Epoch 4 of 5 | Iteration:  85%|████████▌ | 1032/1212 [08:16<01:27,  2.07it/s]

Gradient norm: 41.12550576458538


Epoch 4 of 5 | Iteration:  85%|████████▌ | 1033/1212 [08:16<01:20,  2.21it/s]

Gradient norm: 43.154989763674116


Epoch 4 of 5 | Iteration:  85%|████████▌ | 1034/1212 [08:17<01:18,  2.27it/s]

Gradient norm: 43.19746539507401


Epoch 4 of 5 | Iteration:  85%|████████▌ | 1035/1212 [08:17<01:14,  2.38it/s]

Gradient norm: 68.38136026554882


Epoch 4 of 5 | Iteration:  85%|████████▌ | 1036/1212 [08:18<01:18,  2.25it/s]

Gradient norm: 68.51992711907718


Epoch 4 of 5 | Iteration:  86%|████████▌ | 1037/1212 [08:18<01:14,  2.34it/s]

Gradient norm: 562.1632094219204


Epoch 4 of 5 | Iteration:  86%|████████▌ | 1038/1212 [08:18<01:12,  2.41it/s]

Gradient norm: 558.0997386485338


Epoch 4 of 5 | Iteration:  86%|████████▌ | 1039/1212 [08:19<01:12,  2.38it/s]

Gradient norm: 557.9113408735708


Epoch 4 of 5 | Iteration:  86%|████████▌ | 1040/1212 [08:19<01:21,  2.12it/s]

Gradient norm: 559.7816844725039


Epoch 4 of 5 | Iteration:  86%|████████▌ | 1041/1212 [08:20<01:22,  2.07it/s]

Gradient norm: 4.124078922264034


Epoch 4 of 5 | Iteration:  86%|████████▌ | 1042/1212 [08:20<01:16,  2.23it/s]

Gradient norm: 4.817650198425137


Epoch 4 of 5 | Iteration:  86%|████████▌ | 1043/1212 [08:21<01:19,  2.13it/s]

Gradient norm: 5.258346502773366


Epoch 4 of 5 | Iteration:  86%|████████▌ | 1044/1212 [08:21<01:14,  2.26it/s]

Gradient norm: 6.860216698699909


Epoch 4 of 5 | Iteration:  86%|████████▌ | 1045/1212 [08:21<01:10,  2.37it/s]

Gradient norm: 7.050136744019274


Epoch 4 of 5 | Iteration:  86%|████████▋ | 1046/1212 [08:22<01:10,  2.34it/s]

Gradient norm: 109.31495427926406


Epoch 4 of 5 | Iteration:  86%|████████▋ | 1047/1212 [08:22<01:08,  2.40it/s]

Gradient norm: 112.66324643493358


Epoch 4 of 5 | Iteration:  86%|████████▋ | 1048/1212 [08:23<01:11,  2.29it/s]

Gradient norm: 112.90794096643776


Epoch 4 of 5 | Iteration:  87%|████████▋ | 1049/1212 [08:23<01:16,  2.14it/s]

Gradient norm: 114.37807874444482


Epoch 4 of 5 | Iteration:  87%|████████▋ | 1050/1212 [08:24<01:20,  2.02it/s]

Gradient norm: 114.23258372366735


Epoch 4 of 5 | Iteration:  87%|████████▋ | 1051/1212 [08:25<01:29,  1.80it/s]

Gradient norm: 113.9227876232237


Epoch 4 of 5 | Iteration:  87%|████████▋ | 1052/1212 [08:25<01:26,  1.85it/s]

Gradient norm: 115.87023092682446


Epoch 4 of 5 | Iteration:  87%|████████▋ | 1053/1212 [08:26<01:25,  1.85it/s]

Gradient norm: 117.51792701633076


Epoch 4 of 5 | Iteration:  87%|████████▋ | 1054/1212 [08:26<01:25,  1.84it/s]

Gradient norm: 117.96160131155503


Epoch 4 of 5 | Iteration:  87%|████████▋ | 1055/1212 [08:27<01:18,  2.01it/s]

Gradient norm: 119.43105499315652


Epoch 4 of 5 | Iteration:  87%|████████▋ | 1056/1212 [08:27<01:18,  2.00it/s]

Gradient norm: 171.65047554978824


Epoch 4 of 5 | Iteration:  87%|████████▋ | 1057/1212 [08:28<01:24,  1.83it/s]

Gradient norm: 36.526963546728375


Epoch 4 of 5 | Iteration:  87%|████████▋ | 1058/1212 [08:28<01:17,  1.98it/s]

Gradient norm: 36.50835146546056


Epoch 4 of 5 | Iteration:  87%|████████▋ | 1059/1212 [08:29<01:12,  2.11it/s]

Gradient norm: 39.47028242580298


Epoch 4 of 5 | Iteration:  87%|████████▋ | 1060/1212 [08:29<01:08,  2.21it/s]

Gradient norm: 39.84492674337294


Epoch 4 of 5 | Iteration:  88%|████████▊ | 1061/1212 [08:29<01:05,  2.29it/s]

Gradient norm: 39.87853204919224


Epoch 4 of 5 | Iteration:  88%|████████▊ | 1062/1212 [08:30<01:03,  2.37it/s]

Gradient norm: 169.81008456359174


Epoch 4 of 5 | Iteration:  88%|████████▊ | 1063/1212 [08:30<01:01,  2.43it/s]

Gradient norm: 169.6797309543393


Epoch 4 of 5 | Iteration:  88%|████████▊ | 1064/1212 [08:31<01:03,  2.31it/s]

Gradient norm: 169.56913681997958


Epoch 4 of 5 | Iteration:  88%|████████▊ | 1065/1212 [08:31<01:03,  2.32it/s]

Gradient norm: 2995.024801040361


Epoch 4 of 5 | Iteration:  88%|████████▊ | 1066/1212 [08:32<01:12,  2.01it/s]

Gradient norm: 2996.432836736801


Epoch 4 of 5 | Iteration:  88%|████████▊ | 1067/1212 [08:32<01:14,  1.95it/s]

Gradient norm: 2996.912296832823


Epoch 4 of 5 | Iteration:  88%|████████▊ | 1068/1212 [08:33<01:13,  1.97it/s]

Gradient norm: 2996.5034548158


Epoch 4 of 5 | Iteration:  88%|████████▊ | 1069/1212 [08:33<01:18,  1.81it/s]

Gradient norm: 2996.0915443906506


Epoch 4 of 5 | Iteration:  88%|████████▊ | 1070/1212 [08:34<01:19,  1.79it/s]

Gradient norm: 2997.680807982517


Epoch 4 of 5 | Iteration:  88%|████████▊ | 1071/1212 [08:34<01:11,  1.97it/s]

Gradient norm: 2980.7338318266407


Epoch 4 of 5 | Iteration:  88%|████████▊ | 1072/1212 [08:35<01:06,  2.11it/s]

Gradient norm: 2980.8110225445685


Epoch 4 of 5 | Iteration:  89%|████████▊ | 1073/1212 [08:35<01:02,  2.22it/s]

Gradient norm: 2.0784319345285702


Epoch 4 of 5 | Iteration:  89%|████████▊ | 1074/1212 [08:36<01:00,  2.30it/s]

Gradient norm: 20.790906060336727


Epoch 4 of 5 | Iteration:  89%|████████▊ | 1075/1212 [08:36<01:03,  2.17it/s]

Gradient norm: 21.746762608278587


Epoch 4 of 5 | Iteration:  89%|████████▉ | 1076/1212 [08:37<01:13,  1.84it/s]

Gradient norm: 22.921870750495522


Epoch 4 of 5 | Iteration:  89%|████████▉ | 1077/1212 [08:37<01:12,  1.87it/s]

Gradient norm: 25.101195339750365


Epoch 4 of 5 | Iteration:  89%|████████▉ | 1078/1212 [08:38<01:10,  1.89it/s]

Gradient norm: 22.292031815976245


Epoch 4 of 5 | Iteration:  89%|████████▉ | 1079/1212 [08:38<01:08,  1.95it/s]

Gradient norm: 22.550083233170195


Epoch 4 of 5 | Iteration:  89%|████████▉ | 1080/1212 [08:39<01:08,  1.93it/s]

Gradient norm: 22.649343653624605


Epoch 4 of 5 | Iteration:  89%|████████▉ | 1081/1212 [08:39<01:12,  1.80it/s]

Gradient norm: 22.79257323871636


Epoch 4 of 5 | Iteration:  89%|████████▉ | 1082/1212 [08:40<01:15,  1.72it/s]

Gradient norm: 23.904331253827642


Epoch 4 of 5 | Iteration:  89%|████████▉ | 1083/1212 [08:41<01:09,  1.86it/s]

Gradient norm: 42.38989387668264


Epoch 4 of 5 | Iteration:  89%|████████▉ | 1084/1212 [08:41<01:03,  2.02it/s]

Gradient norm: 42.47663519304737


Epoch 4 of 5 | Iteration:  90%|████████▉ | 1085/1212 [08:41<00:58,  2.18it/s]

Gradient norm: 42.51057667707474


Epoch 4 of 5 | Iteration:  90%|████████▉ | 1086/1212 [08:42<00:59,  2.13it/s]

Gradient norm: 43.34128007772971


Epoch 4 of 5 | Iteration:  90%|████████▉ | 1087/1212 [08:42<00:57,  2.19it/s]

Gradient norm: 44.96477105396032


Epoch 4 of 5 | Iteration:  90%|████████▉ | 1088/1212 [08:43<00:54,  2.26it/s]

Gradient norm: 45.01279247606881


Epoch 4 of 5 | Iteration:  90%|████████▉ | 1089/1212 [08:43<00:52,  2.35it/s]

Gradient norm: 30.28790645179706


Epoch 4 of 5 | Iteration:  90%|████████▉ | 1090/1212 [08:43<00:53,  2.29it/s]

Gradient norm: 31.88347485024607


Epoch 4 of 5 | Iteration:  90%|█████████ | 1091/1212 [08:44<00:57,  2.12it/s]

Gradient norm: 32.37332148861425


Epoch 4 of 5 | Iteration:  90%|█████████ | 1092/1212 [08:44<00:53,  2.25it/s]

Gradient norm: 31.43226891788421


Epoch 4 of 5 | Iteration:  90%|█████████ | 1093/1212 [08:45<00:53,  2.21it/s]

Gradient norm: 32.619201361910164


Epoch 4 of 5 | Iteration:  90%|█████████ | 1094/1212 [08:45<00:52,  2.26it/s]

Gradient norm: 32.41662107377614


Epoch 4 of 5 | Iteration:  90%|█████████ | 1095/1212 [08:46<00:54,  2.14it/s]

Gradient norm: 87.75140397323244


Epoch 4 of 5 | Iteration:  90%|█████████ | 1096/1212 [08:46<00:50,  2.28it/s]

Gradient norm: 88.06402907282211


Epoch 4 of 5 | Iteration:  91%|█████████ | 1097/1212 [08:47<00:53,  2.13it/s]

Gradient norm: 113.84379496333536


Epoch 4 of 5 | Iteration:  91%|█████████ | 1098/1212 [08:47<00:50,  2.25it/s]

Gradient norm: 116.7482994875408


Epoch 4 of 5 | Iteration:  91%|█████████ | 1099/1212 [08:48<00:57,  1.97it/s]

Gradient norm: 121.8806444037024


Epoch 4 of 5 | Iteration:  91%|█████████ | 1100/1212 [08:48<00:58,  1.92it/s]

Gradient norm: 121.66905091278842


Epoch 4 of 5 | Iteration:  91%|█████████ | 1101/1212 [08:49<00:56,  1.98it/s]

Gradient norm: 414.6718759675252


Epoch 4 of 5 | Iteration:  91%|█████████ | 1102/1212 [08:49<00:51,  2.13it/s]

Gradient norm: 413.77869675844175


Epoch 4 of 5 | Iteration:  91%|█████████ | 1103/1212 [08:50<00:58,  1.85it/s]

Gradient norm: 413.8320322346167


Epoch 4 of 5 | Iteration:  91%|█████████ | 1104/1212 [08:50<00:57,  1.87it/s]

Gradient norm: 413.31659392959807


Epoch 4 of 5 | Iteration:  91%|█████████ | 1105/1212 [08:51<00:56,  1.91it/s]

Gradient norm: 10.43423259130501


Epoch 4 of 5 | Iteration:  91%|█████████▏| 1106/1212 [08:52<01:01,  1.74it/s]

Gradient norm: 13.29242634075728


Epoch 4 of 5 | Iteration:  91%|█████████▏| 1107/1212 [08:52<00:58,  1.79it/s]

Gradient norm: 14.595445190574953


Epoch 4 of 5 | Iteration:  91%|█████████▏| 1108/1212 [08:53<01:02,  1.67it/s]

Gradient norm: 14.667225111268543


Epoch 4 of 5 | Iteration:  92%|█████████▏| 1109/1212 [08:53<01:02,  1.64it/s]

Gradient norm: 14.747427024519231


Epoch 4 of 5 | Iteration:  92%|█████████▏| 1110/1212 [08:54<00:56,  1.81it/s]

Gradient norm: 14.802424836330285


Epoch 4 of 5 | Iteration:  92%|█████████▏| 1111/1212 [08:54<00:50,  2.00it/s]

Gradient norm: 14.761892638458294


Epoch 4 of 5 | Iteration:  92%|█████████▏| 1112/1212 [08:55<00:51,  1.94it/s]

Gradient norm: 23.303646686594117


Epoch 4 of 5 | Iteration:  92%|█████████▏| 1113/1212 [08:55<00:49,  2.00it/s]

Gradient norm: 35.324867144041875


Epoch 4 of 5 | Iteration:  92%|█████████▏| 1114/1212 [08:56<00:46,  2.10it/s]

Gradient norm: 35.22571650776084


Epoch 4 of 5 | Iteration:  92%|█████████▏| 1115/1212 [08:56<00:47,  2.04it/s]

Gradient norm: 36.48932705456275


Epoch 4 of 5 | Iteration:  92%|█████████▏| 1116/1212 [08:57<00:52,  1.84it/s]

Gradient norm: 36.73332277356898


Epoch 4 of 5 | Iteration:  92%|█████████▏| 1117/1212 [08:57<00:49,  1.90it/s]

Gradient norm: 38.94241039176224


Epoch 4 of 5 | Iteration:  92%|█████████▏| 1118/1212 [08:58<00:53,  1.77it/s]

Gradient norm: 38.90823150901706


Epoch 4 of 5 | Iteration:  92%|█████████▏| 1119/1212 [08:58<00:47,  1.95it/s]

Gradient norm: 41.71590370950114


Epoch 4 of 5 | Iteration:  92%|█████████▏| 1120/1212 [08:59<00:46,  1.96it/s]

Gradient norm: 46.55928627284701


Epoch 4 of 5 | Iteration:  92%|█████████▏| 1121/1212 [08:59<00:42,  2.12it/s]

Gradient norm: 21.555653762347138


Epoch 4 of 5 | Iteration:  93%|█████████▎| 1122/1212 [09:00<00:39,  2.25it/s]

Gradient norm: 130.1100550719336


Epoch 4 of 5 | Iteration:  93%|█████████▎| 1123/1212 [09:00<00:40,  2.20it/s]

Gradient norm: 131.4966676345199


Epoch 4 of 5 | Iteration:  93%|█████████▎| 1124/1212 [09:01<00:41,  2.12it/s]

Gradient norm: 131.78126913285416


Epoch 4 of 5 | Iteration:  93%|█████████▎| 1125/1212 [09:01<00:40,  2.17it/s]

Gradient norm: 131.99505958748995


Epoch 4 of 5 | Iteration:  93%|█████████▎| 1126/1212 [09:01<00:37,  2.28it/s]

Gradient norm: 133.5778608666339


Epoch 4 of 5 | Iteration:  93%|█████████▎| 1127/1212 [09:02<00:36,  2.35it/s]

Gradient norm: 133.35668347652378


Epoch 4 of 5 | Iteration:  93%|█████████▎| 1128/1212 [09:02<00:36,  2.32it/s]

Gradient norm: 134.8944274352058


Epoch 4 of 5 | Iteration:  93%|█████████▎| 1129/1212 [09:03<00:38,  2.16it/s]

Gradient norm: 134.73751521011317


Epoch 4 of 5 | Iteration:  93%|█████████▎| 1130/1212 [09:04<00:42,  1.92it/s]

Gradient norm: 134.6363714837295


Epoch 4 of 5 | Iteration:  93%|█████████▎| 1131/1212 [09:04<00:41,  1.94it/s]

Gradient norm: 137.24072840142276


Epoch 4 of 5 | Iteration:  93%|█████████▎| 1132/1212 [09:05<00:42,  1.90it/s]

Gradient norm: 137.57395859585


Epoch 4 of 5 | Iteration:  93%|█████████▎| 1133/1212 [09:05<00:42,  1.85it/s]

Gradient norm: 137.51535579864287


Epoch 4 of 5 | Iteration:  94%|█████████▎| 1134/1212 [09:06<00:45,  1.73it/s]

Gradient norm: 137.57489096371947


Epoch 4 of 5 | Iteration:  94%|█████████▎| 1135/1212 [09:06<00:45,  1.70it/s]

Gradient norm: 140.49728943427633


Epoch 4 of 5 | Iteration:  94%|█████████▎| 1136/1212 [09:07<00:43,  1.74it/s]

Gradient norm: 140.23711227740782


Epoch 4 of 5 | Iteration:  94%|█████████▍| 1137/1212 [09:07<00:39,  1.91it/s]

Gradient norm: 0.8633279917838494


Epoch 4 of 5 | Iteration:  94%|█████████▍| 1138/1212 [09:08<00:39,  1.89it/s]

Gradient norm: 19.338946334431437


Epoch 4 of 5 | Iteration:  94%|█████████▍| 1139/1212 [09:08<00:35,  2.06it/s]

Gradient norm: 24.913723942537665


Epoch 4 of 5 | Iteration:  94%|█████████▍| 1140/1212 [09:09<00:33,  2.18it/s]

Gradient norm: 36.50875532105049


Epoch 4 of 5 | Iteration:  94%|█████████▍| 1141/1212 [09:09<00:31,  2.23it/s]

Gradient norm: 36.45058367033798


Epoch 4 of 5 | Iteration:  94%|█████████▍| 1142/1212 [09:10<00:30,  2.33it/s]

Gradient norm: 36.40724367407044


Epoch 4 of 5 | Iteration:  94%|█████████▍| 1143/1212 [09:10<00:33,  2.07it/s]

Gradient norm: 36.59517747108462


Epoch 4 of 5 | Iteration:  94%|█████████▍| 1144/1212 [09:11<00:31,  2.14it/s]

Gradient norm: 36.68846603177934


Epoch 4 of 5 | Iteration:  94%|█████████▍| 1145/1212 [09:11<00:31,  2.14it/s]

Gradient norm: 38.154099541384355


Epoch 4 of 5 | Iteration:  95%|█████████▍| 1146/1212 [09:11<00:29,  2.27it/s]

Gradient norm: 39.05268571848573


Epoch 4 of 5 | Iteration:  95%|█████████▍| 1147/1212 [09:12<00:32,  2.03it/s]

Gradient norm: 38.73282890794649


Epoch 4 of 5 | Iteration:  95%|█████████▍| 1148/1212 [09:12<00:29,  2.17it/s]

Gradient norm: 39.104337040721134


Epoch 4 of 5 | Iteration:  95%|█████████▍| 1149/1212 [09:13<00:27,  2.31it/s]

Gradient norm: 39.22207680302481


Epoch 4 of 5 | Iteration:  95%|█████████▍| 1150/1212 [09:13<00:29,  2.13it/s]

Gradient norm: 44.185113657312236


Epoch 4 of 5 | Iteration:  95%|█████████▍| 1151/1212 [09:14<00:27,  2.25it/s]

Gradient norm: 43.929021855366194


Epoch 4 of 5 | Iteration:  95%|█████████▌| 1152/1212 [09:14<00:28,  2.14it/s]

Gradient norm: 43.36709794324696


Epoch 4 of 5 | Iteration:  95%|█████████▌| 1153/1212 [09:15<00:28,  2.06it/s]

Gradient norm: 3.7544233012719634


Epoch 4 of 5 | Iteration:  95%|█████████▌| 1154/1212 [09:15<00:26,  2.21it/s]

Gradient norm: 20.764538778669774


Epoch 4 of 5 | Iteration:  95%|█████████▌| 1155/1212 [09:15<00:24,  2.35it/s]

Gradient norm: 21.32374305266265


Epoch 4 of 5 | Iteration:  95%|█████████▌| 1156/1212 [09:16<00:28,  1.98it/s]

Gradient norm: 23.778318967424642


Epoch 4 of 5 | Iteration:  95%|█████████▌| 1157/1212 [09:17<00:28,  1.93it/s]

Gradient norm: 38.653939129794146


Epoch 4 of 5 | Iteration:  96%|█████████▌| 1158/1212 [09:17<00:27,  1.94it/s]

Gradient norm: 38.87956902919379


Epoch 4 of 5 | Iteration:  96%|█████████▌| 1159/1212 [09:18<00:27,  1.95it/s]

Gradient norm: 38.97733435621546


Epoch 4 of 5 | Iteration:  96%|█████████▌| 1160/1212 [09:18<00:28,  1.86it/s]

Gradient norm: 39.14183354367388


Epoch 4 of 5 | Iteration:  96%|█████████▌| 1161/1212 [09:19<00:28,  1.77it/s]

Gradient norm: 272.27092549417347


Epoch 4 of 5 | Iteration:  96%|█████████▌| 1162/1212 [09:19<00:26,  1.87it/s]

Gradient norm: 272.62689983815926


Epoch 4 of 5 | Iteration:  96%|█████████▌| 1163/1212 [09:20<00:26,  1.88it/s]

Gradient norm: 272.65089358849366


Epoch 4 of 5 | Iteration:  96%|█████████▌| 1164/1212 [09:21<00:25,  1.85it/s]

Gradient norm: 272.7801430411121


Epoch 4 of 5 | Iteration:  96%|█████████▌| 1165/1212 [09:21<00:23,  2.00it/s]

Gradient norm: 273.58914370678224


Epoch 4 of 5 | Iteration:  96%|█████████▌| 1166/1212 [09:21<00:22,  2.04it/s]

Gradient norm: 272.0457352801331


Epoch 4 of 5 | Iteration:  96%|█████████▋| 1167/1212 [09:22<00:21,  2.11it/s]

Gradient norm: 271.5431318871622


Epoch 4 of 5 | Iteration:  96%|█████████▋| 1168/1212 [09:22<00:21,  2.03it/s]

Gradient norm: 273.5314249954238


Epoch 4 of 5 | Iteration:  96%|█████████▋| 1169/1212 [09:23<00:20,  2.07it/s]

Gradient norm: 1.590485755797725


Epoch 4 of 5 | Iteration:  97%|█████████▋| 1170/1212 [09:23<00:19,  2.14it/s]

Gradient norm: 1.8825568262053598


Epoch 4 of 5 | Iteration:  97%|█████████▋| 1171/1212 [09:24<00:18,  2.23it/s]

Gradient norm: 3.063625529043935


Epoch 4 of 5 | Iteration:  97%|█████████▋| 1172/1212 [09:24<00:18,  2.21it/s]

Gradient norm: 5.047711224863764


Epoch 4 of 5 | Iteration:  97%|█████████▋| 1173/1212 [09:25<00:16,  2.31it/s]

Gradient norm: 6.537487304681821


Epoch 4 of 5 | Iteration:  97%|█████████▋| 1174/1212 [09:25<00:16,  2.35it/s]

Gradient norm: 143.03279053355465


Epoch 4 of 5 | Iteration:  97%|█████████▋| 1175/1212 [09:25<00:15,  2.37it/s]

Gradient norm: 142.41303113588106


Epoch 4 of 5 | Iteration:  97%|█████████▋| 1176/1212 [09:26<00:14,  2.43it/s]

Gradient norm: 142.60096257794154


Epoch 4 of 5 | Iteration:  97%|█████████▋| 1177/1212 [09:26<00:15,  2.25it/s]

Gradient norm: 142.29529523374524


Epoch 4 of 5 | Iteration:  97%|█████████▋| 1178/1212 [09:27<00:14,  2.32it/s]

Gradient norm: 144.00226104596547


Epoch 4 of 5 | Iteration:  97%|█████████▋| 1179/1212 [09:27<00:14,  2.27it/s]

Gradient norm: 143.97964512640817


Epoch 4 of 5 | Iteration:  97%|█████████▋| 1180/1212 [09:28<00:14,  2.21it/s]

Gradient norm: 144.51420342439704


Epoch 4 of 5 | Iteration:  97%|█████████▋| 1181/1212 [09:28<00:13,  2.30it/s]

Gradient norm: 144.6173793757619


Epoch 4 of 5 | Iteration:  98%|█████████▊| 1182/1212 [09:29<00:14,  2.08it/s]

Gradient norm: 144.67755594318766


Epoch 4 of 5 | Iteration:  98%|█████████▊| 1183/1212 [09:29<00:14,  1.98it/s]

Gradient norm: 144.8610852851725


Epoch 4 of 5 | Iteration:  98%|█████████▊| 1184/1212 [09:30<00:13,  2.00it/s]

Gradient norm: 146.52148583395092


Epoch 4 of 5 | Iteration:  98%|█████████▊| 1185/1212 [09:30<00:14,  1.87it/s]

Gradient norm: 4.5903191436357424


Epoch 4 of 5 | Iteration:  98%|█████████▊| 1186/1212 [09:31<00:13,  1.97it/s]

Gradient norm: 4.943410476768416


Epoch 4 of 5 | Iteration:  98%|█████████▊| 1187/1212 [09:31<00:13,  1.89it/s]

Gradient norm: 6.596059518750649


Epoch 4 of 5 | Iteration:  98%|█████████▊| 1188/1212 [09:32<00:13,  1.81it/s]

Gradient norm: 21.227556055981864


Epoch 4 of 5 | Iteration:  98%|█████████▊| 1189/1212 [09:32<00:12,  1.84it/s]

Gradient norm: 20.54313747138978


Epoch 4 of 5 | Iteration:  98%|█████████▊| 1190/1212 [09:33<00:11,  1.88it/s]

Gradient norm: 47.577337700926485


Epoch 4 of 5 | Iteration:  98%|█████████▊| 1191/1212 [09:34<00:12,  1.73it/s]

Gradient norm: 47.26203777201522


Epoch 4 of 5 | Iteration:  98%|█████████▊| 1192/1212 [09:34<00:11,  1.75it/s]

Gradient norm: 47.68304751876749


Epoch 4 of 5 | Iteration:  98%|█████████▊| 1193/1212 [09:35<00:09,  1.90it/s]

Gradient norm: 47.863674219655245


Epoch 4 of 5 | Iteration:  99%|█████████▊| 1194/1212 [09:35<00:08,  2.09it/s]

Gradient norm: 47.856557898634605


Epoch 4 of 5 | Iteration:  99%|█████████▊| 1195/1212 [09:35<00:08,  2.11it/s]

Gradient norm: 48.319674678511355


Epoch 4 of 5 | Iteration:  99%|█████████▊| 1196/1212 [09:36<00:07,  2.14it/s]

Gradient norm: 48.239619118088996


Epoch 4 of 5 | Iteration:  99%|█████████▉| 1197/1212 [09:36<00:06,  2.22it/s]

Gradient norm: 48.21811435626155


Epoch 4 of 5 | Iteration:  99%|█████████▉| 1198/1212 [09:37<00:06,  2.31it/s]

Gradient norm: 48.363050760315886


Epoch 4 of 5 | Iteration:  99%|█████████▉| 1199/1212 [09:37<00:05,  2.43it/s]

Gradient norm: 48.326402025384326


Epoch 4 of 5 | Iteration:  99%|█████████▉| 1200/1212 [09:37<00:05,  2.33it/s]

Gradient norm: 48.30420839639509


Epoch 4 of 5 | Iteration:  99%|█████████▉| 1201/1212 [09:38<00:04,  2.43it/s]

Gradient norm: 4.666034479892777


Epoch 4 of 5 | Iteration:  99%|█████████▉| 1202/1212 [09:38<00:04,  2.45it/s]

Gradient norm: 9.21467446687169


Epoch 4 of 5 | Iteration:  99%|█████████▉| 1203/1212 [09:39<00:03,  2.27it/s]

Gradient norm: 19.425877896566696


Epoch 4 of 5 | Iteration:  99%|█████████▉| 1204/1212 [09:39<00:03,  2.33it/s]

Gradient norm: 97.14296861347154


Epoch 4 of 5 | Iteration:  99%|█████████▉| 1205/1212 [09:40<00:02,  2.38it/s]

Gradient norm: 100.15873173943693


Epoch 4 of 5 | Iteration: 100%|█████████▉| 1206/1212 [09:40<00:02,  2.43it/s]

Gradient norm: 100.91854847375457


Epoch 4 of 5 | Iteration: 100%|█████████▉| 1207/1212 [09:40<00:02,  2.42it/s]

Gradient norm: 100.20278112194364


Epoch 4 of 5 | Iteration: 100%|█████████▉| 1208/1212 [09:41<00:01,  2.19it/s]

Gradient norm: 100.13778593686364


Epoch 4 of 5 | Iteration: 100%|█████████▉| 1209/1212 [09:41<00:01,  2.07it/s]

Gradient norm: 99.78090103486176


Epoch 4 of 5 | Iteration: 100%|█████████▉| 1210/1212 [09:42<00:00,  2.23it/s]

Gradient norm: 99.56981574081055


Epoch 4 of 5 | Iteration: 100%|█████████▉| 1211/1212 [09:42<00:00,  2.17it/s]

Gradient norm: 99.2893071475566


Epoch 4 of 5 | Iteration: 100%|██████████| 1212/1212 [09:43<00:00,  2.08it/s]


Gradient norm: 99.21770765257162


100%|██████████| 1212/1212 [05:18<00:00,  3.81it/s]


Epoch 4/5, Training Loss: 1.6306, Validation Loss: 1.5550
Validation top k acc: 0.8663
              precision    recall  f1-score   support

           0       0.92      0.78      0.84     10666
           1       0.34      0.62      0.44      1947

    accuracy                           0.75     12613
   macro avg       0.63      0.70      0.64     12613
weighted avg       0.83      0.75      0.78     12613



Epoch 5 of 5 | Iteration:   0%|          | 0/1212 [00:00<?, ?it/s]

Train ...


Epoch 5 of 5 | Iteration:   0%|          | 1/1212 [00:00<09:05,  2.22it/s]

Gradient norm: 103.93071847203312


Epoch 5 of 5 | Iteration:   0%|          | 2/1212 [00:00<09:50,  2.05it/s]

Gradient norm: 102.96595288553884


Epoch 5 of 5 | Iteration:   0%|          | 3/1212 [00:01<10:11,  1.98it/s]

Gradient norm: 103.20722323174074


Epoch 5 of 5 | Iteration:   0%|          | 4/1212 [00:02<11:15,  1.79it/s]

Gradient norm: 105.97467314683746


Epoch 5 of 5 | Iteration:   0%|          | 5/1212 [00:02<11:28,  1.75it/s]

Gradient norm: 108.70190851614299


Epoch 5 of 5 | Iteration:   0%|          | 6/1212 [00:03<10:58,  1.83it/s]

Gradient norm: 108.9868827559082


Epoch 5 of 5 | Iteration:   1%|          | 7/1212 [00:03<11:19,  1.77it/s]

Gradient norm: 108.972415978977


Epoch 5 of 5 | Iteration:   1%|          | 8/1212 [00:04<11:35,  1.73it/s]

Gradient norm: 109.04256581770395


Epoch 5 of 5 | Iteration:   1%|          | 9/1212 [00:04<10:34,  1.90it/s]

Gradient norm: 109.61953041468853


Epoch 5 of 5 | Iteration:   1%|          | 10/1212 [00:05<10:50,  1.85it/s]

Gradient norm: 108.9902367184438


Epoch 5 of 5 | Iteration:   1%|          | 11/1212 [00:05<09:55,  2.02it/s]

Gradient norm: 108.90762940265937


Epoch 5 of 5 | Iteration:   1%|          | 12/1212 [00:06<09:48,  2.04it/s]

Gradient norm: 108.89966310091869


Epoch 5 of 5 | Iteration:   1%|          | 13/1212 [00:06<09:41,  2.06it/s]

Gradient norm: 110.0131235132441


Epoch 5 of 5 | Iteration:   1%|          | 14/1212 [00:07<09:31,  2.10it/s]

Gradient norm: 110.253529580481


Epoch 5 of 5 | Iteration:   1%|          | 15/1212 [00:07<09:59,  2.00it/s]

Gradient norm: 114.80010301670308


Epoch 5 of 5 | Iteration:   1%|▏         | 16/1212 [00:08<09:30,  2.10it/s]

Gradient norm: 115.36404993583513


Epoch 5 of 5 | Iteration:   1%|▏         | 17/1212 [00:08<10:05,  1.97it/s]

Gradient norm: 7.63492874938507


Epoch 5 of 5 | Iteration:   1%|▏         | 18/1212 [00:09<09:28,  2.10it/s]

Gradient norm: 8.0558859854597


Epoch 5 of 5 | Iteration:   2%|▏         | 19/1212 [00:09<08:59,  2.21it/s]

Gradient norm: 19.158033970050916


Epoch 5 of 5 | Iteration:   2%|▏         | 20/1212 [00:09<08:42,  2.28it/s]

Gradient norm: 21.773200365801923


Epoch 5 of 5 | Iteration:   2%|▏         | 21/1212 [00:10<08:57,  2.22it/s]

Gradient norm: 83.8289595765358


Epoch 5 of 5 | Iteration:   2%|▏         | 22/1212 [00:10<08:39,  2.29it/s]

Gradient norm: 83.85502515311848


Epoch 5 of 5 | Iteration:   2%|▏         | 23/1212 [00:11<09:09,  2.17it/s]

Gradient norm: 84.75121403598251


Epoch 5 of 5 | Iteration:   2%|▏         | 24/1212 [00:11<09:07,  2.17it/s]

Gradient norm: 84.63273616255745


Epoch 5 of 5 | Iteration:   2%|▏         | 25/1212 [00:12<09:03,  2.18it/s]

Gradient norm: 85.29238127920341


Epoch 5 of 5 | Iteration:   2%|▏         | 26/1212 [00:12<09:43,  2.03it/s]

Gradient norm: 120.38993500702473


Epoch 5 of 5 | Iteration:   2%|▏         | 27/1212 [00:13<10:05,  1.96it/s]

Gradient norm: 124.05509242410858


Epoch 5 of 5 | Iteration:   2%|▏         | 28/1212 [00:13<09:34,  2.06it/s]

Gradient norm: 124.12188653587143


Epoch 5 of 5 | Iteration:   2%|▏         | 29/1212 [00:14<08:56,  2.20it/s]

Gradient norm: 123.8035338850181


Epoch 5 of 5 | Iteration:   2%|▏         | 30/1212 [00:14<09:13,  2.14it/s]

Gradient norm: 124.7090009542356


Epoch 5 of 5 | Iteration:   3%|▎         | 31/1212 [00:15<09:26,  2.08it/s]

Gradient norm: 125.15724429609375


Epoch 5 of 5 | Iteration:   3%|▎         | 32/1212 [00:15<10:21,  1.90it/s]

Gradient norm: 124.8540921217256


Epoch 5 of 5 | Iteration:   3%|▎         | 33/1212 [00:16<10:18,  1.91it/s]

Gradient norm: 2.7058286996498886


Epoch 5 of 5 | Iteration:   3%|▎         | 34/1212 [00:17<12:16,  1.60it/s]

Gradient norm: 3.3290076747644037


Epoch 5 of 5 | Iteration:   3%|▎         | 35/1212 [00:17<11:50,  1.66it/s]

Gradient norm: 265.69354595607285


Epoch 5 of 5 | Iteration:   3%|▎         | 36/1212 [00:18<11:20,  1.73it/s]

Gradient norm: 265.9879111088639


Epoch 5 of 5 | Iteration:   3%|▎         | 37/1212 [00:18<10:47,  1.82it/s]

Gradient norm: 267.67972408627264


Epoch 5 of 5 | Iteration:   3%|▎         | 38/1212 [00:19<11:31,  1.70it/s]

Gradient norm: 267.59956023673845


Epoch 5 of 5 | Iteration:   3%|▎         | 39/1212 [00:19<10:16,  1.90it/s]

Gradient norm: 267.6455581562448


Epoch 5 of 5 | Iteration:   3%|▎         | 40/1212 [00:20<09:27,  2.07it/s]

Gradient norm: 267.6495269454801


Epoch 5 of 5 | Iteration:   3%|▎         | 41/1212 [00:20<10:07,  1.93it/s]

Gradient norm: 267.30387099186345


Epoch 5 of 5 | Iteration:   3%|▎         | 42/1212 [00:21<09:24,  2.07it/s]

Gradient norm: 267.53783534719975


Epoch 5 of 5 | Iteration:   4%|▎         | 43/1212 [00:21<10:21,  1.88it/s]

Gradient norm: 267.75091530584933


Epoch 5 of 5 | Iteration:   4%|▎         | 44/1212 [00:22<11:07,  1.75it/s]

Gradient norm: 267.5265203524193


Epoch 5 of 5 | Iteration:   4%|▎         | 45/1212 [00:23<10:53,  1.79it/s]

Gradient norm: 267.0092183978021


Epoch 5 of 5 | Iteration:   4%|▍         | 46/1212 [00:23<10:46,  1.80it/s]

Gradient norm: 266.580018574164


Epoch 5 of 5 | Iteration:   4%|▍         | 47/1212 [00:24<10:33,  1.84it/s]

Gradient norm: 268.89909020590807


Epoch 5 of 5 | Iteration:   4%|▍         | 48/1212 [00:24<09:43,  2.00it/s]

Gradient norm: 266.89519644518106


Epoch 5 of 5 | Iteration:   4%|▍         | 49/1212 [00:24<09:04,  2.14it/s]

Gradient norm: 1.5481058155229306


Epoch 5 of 5 | Iteration:   4%|▍         | 50/1212 [00:25<09:39,  2.01it/s]

Gradient norm: 8.638226857240342


Epoch 5 of 5 | Iteration:   4%|▍         | 51/1212 [00:25<09:11,  2.10it/s]

Gradient norm: 8.676857235528752


Epoch 5 of 5 | Iteration:   4%|▍         | 52/1212 [00:26<08:51,  2.18it/s]

Gradient norm: 11.151965250323713


Epoch 5 of 5 | Iteration:   4%|▍         | 53/1212 [00:26<08:31,  2.27it/s]

Gradient norm: 11.904299837500554


Epoch 5 of 5 | Iteration:   4%|▍         | 54/1212 [00:27<08:20,  2.32it/s]

Gradient norm: 12.813520948898386


Epoch 5 of 5 | Iteration:   5%|▍         | 55/1212 [00:27<08:49,  2.19it/s]

Gradient norm: 12.44549134577414


Epoch 5 of 5 | Iteration:   5%|▍         | 56/1212 [00:28<09:39,  1.99it/s]

Gradient norm: 14.995955939493559


Epoch 5 of 5 | Iteration:   5%|▍         | 57/1212 [00:28<10:18,  1.87it/s]

Gradient norm: 22.82300108234065


Epoch 5 of 5 | Iteration:   5%|▍         | 58/1212 [00:29<10:34,  1.82it/s]

Gradient norm: 22.88364084920328


Epoch 5 of 5 | Iteration:   5%|▍         | 59/1212 [00:30<10:41,  1.80it/s]

Gradient norm: 31.127369265027852


Epoch 5 of 5 | Iteration:   5%|▍         | 60/1212 [00:30<10:52,  1.76it/s]

Gradient norm: 31.101795966802015


Epoch 5 of 5 | Iteration:   5%|▌         | 61/1212 [00:31<11:21,  1.69it/s]

Gradient norm: 35.54184859938255


Epoch 5 of 5 | Iteration:   5%|▌         | 62/1212 [00:31<11:00,  1.74it/s]

Gradient norm: 36.20935450689428


Epoch 5 of 5 | Iteration:   5%|▌         | 63/1212 [00:32<11:44,  1.63it/s]

Gradient norm: 36.280999888610516


Epoch 5 of 5 | Iteration:   5%|▌         | 64/1212 [00:32<10:40,  1.79it/s]

Gradient norm: 36.525371138636686


Epoch 5 of 5 | Iteration:   5%|▌         | 65/1212 [00:33<09:39,  1.98it/s]

Gradient norm: 5.144156633967173


Epoch 5 of 5 | Iteration:   5%|▌         | 66/1212 [00:33<09:06,  2.10it/s]

Gradient norm: 6.352333039988666


Epoch 5 of 5 | Iteration:   6%|▌         | 67/1212 [00:34<08:34,  2.22it/s]

Gradient norm: 7.186662781320962


Epoch 5 of 5 | Iteration:   6%|▌         | 68/1212 [00:34<09:44,  1.96it/s]

Gradient norm: 10.269306828351283


Epoch 5 of 5 | Iteration:   6%|▌         | 69/1212 [00:35<09:02,  2.11it/s]

Gradient norm: 10.422164733983728


Epoch 5 of 5 | Iteration:   6%|▌         | 70/1212 [00:35<08:33,  2.23it/s]

Gradient norm: 10.68453107595387


Epoch 5 of 5 | Iteration:   6%|▌         | 71/1212 [00:36<09:44,  1.95it/s]

Gradient norm: 10.54846168793255


Epoch 5 of 5 | Iteration:   6%|▌         | 72/1212 [00:36<09:27,  2.01it/s]

Gradient norm: 10.639459866038411


Epoch 5 of 5 | Iteration:   6%|▌         | 73/1212 [00:37<10:09,  1.87it/s]

Gradient norm: 26.157383669431272


Epoch 5 of 5 | Iteration:   6%|▌         | 74/1212 [00:37<10:21,  1.83it/s]

Gradient norm: 38.11438540324899


Epoch 5 of 5 | Iteration:   6%|▌         | 75/1212 [00:38<09:39,  1.96it/s]

Gradient norm: 41.18532548445512


Epoch 5 of 5 | Iteration:   6%|▋         | 76/1212 [00:38<09:26,  2.00it/s]

Gradient norm: 40.86432822533699


Epoch 5 of 5 | Iteration:   6%|▋         | 77/1212 [00:39<08:53,  2.13it/s]

Gradient norm: 40.89721182153584


Epoch 5 of 5 | Iteration:   6%|▋         | 78/1212 [00:39<08:57,  2.11it/s]

Gradient norm: 40.69819362443041


Epoch 5 of 5 | Iteration:   7%|▋         | 79/1212 [00:40<08:45,  2.15it/s]

Gradient norm: 47.25779904736712


Epoch 5 of 5 | Iteration:   7%|▋         | 80/1212 [00:40<08:33,  2.21it/s]

Gradient norm: 52.108849768558805


Epoch 5 of 5 | Iteration:   7%|▋         | 81/1212 [00:41<09:28,  1.99it/s]

Gradient norm: 9.665202828318257


Epoch 5 of 5 | Iteration:   7%|▋         | 82/1212 [00:41<08:50,  2.13it/s]

Gradient norm: 48.22352652131982


Epoch 5 of 5 | Iteration:   7%|▋         | 83/1212 [00:42<09:10,  2.05it/s]

Gradient norm: 48.8511524826364


Epoch 5 of 5 | Iteration:   7%|▋         | 84/1212 [00:42<09:17,  2.02it/s]

Gradient norm: 67.29824407153768


Epoch 5 of 5 | Iteration:   7%|▋         | 85/1212 [00:43<09:15,  2.03it/s]

Gradient norm: 68.6181035676384


Epoch 5 of 5 | Iteration:   7%|▋         | 86/1212 [00:43<09:24,  2.00it/s]

Gradient norm: 69.14881536253512


Epoch 5 of 5 | Iteration:   7%|▋         | 87/1212 [00:44<09:35,  1.95it/s]

Gradient norm: 81.2352077685648


Epoch 5 of 5 | Iteration:   7%|▋         | 88/1212 [00:44<09:40,  1.93it/s]

Gradient norm: 86.0418083171133


Epoch 5 of 5 | Iteration:   7%|▋         | 89/1212 [00:45<10:01,  1.87it/s]

Gradient norm: 86.13132858093628


Epoch 5 of 5 | Iteration:   7%|▋         | 90/1212 [00:45<09:46,  1.91it/s]

Gradient norm: 87.25832966953016


Epoch 5 of 5 | Iteration:   8%|▊         | 91/1212 [00:46<09:15,  2.02it/s]

Gradient norm: 88.47130117763065


Epoch 5 of 5 | Iteration:   8%|▊         | 92/1212 [00:46<09:05,  2.05it/s]

Gradient norm: 92.55944627941034


Epoch 5 of 5 | Iteration:   8%|▊         | 93/1212 [00:47<08:41,  2.15it/s]

Gradient norm: 92.94776279632252


Epoch 5 of 5 | Iteration:   8%|▊         | 94/1212 [00:47<08:16,  2.25it/s]

Gradient norm: 93.59060087147202


Epoch 5 of 5 | Iteration:   8%|▊         | 95/1212 [00:48<09:27,  1.97it/s]

Gradient norm: 94.08281830085586


Epoch 5 of 5 | Iteration:   8%|▊         | 96/1212 [00:48<08:56,  2.08it/s]

Gradient norm: 94.93541251891583


Epoch 5 of 5 | Iteration:   8%|▊         | 97/1212 [00:48<08:22,  2.22it/s]

Gradient norm: 2.776521460458399


Epoch 5 of 5 | Iteration:   8%|▊         | 98/1212 [00:49<08:46,  2.11it/s]

Gradient norm: 7.679284976238406


Epoch 5 of 5 | Iteration:   8%|▊         | 99/1212 [00:49<08:12,  2.26it/s]

Gradient norm: 8.239328089074474


Epoch 5 of 5 | Iteration:   8%|▊         | 100/1212 [00:50<07:58,  2.32it/s]

Gradient norm: 309.1513155231802


Epoch 5 of 5 | Iteration:   8%|▊         | 101/1212 [00:50<07:38,  2.42it/s]

Gradient norm: 312.4925478360793


Epoch 5 of 5 | Iteration:   8%|▊         | 102/1212 [00:50<07:39,  2.42it/s]

Gradient norm: 311.47346359100567


Epoch 5 of 5 | Iteration:   8%|▊         | 103/1212 [00:51<09:02,  2.04it/s]

Gradient norm: 316.0010920258287


Epoch 5 of 5 | Iteration:   9%|▊         | 104/1212 [00:52<09:12,  2.00it/s]

Gradient norm: 315.9588235627131


Epoch 5 of 5 | Iteration:   9%|▊         | 105/1212 [00:52<08:48,  2.09it/s]

Gradient norm: 314.45855800675525


Epoch 5 of 5 | Iteration:   9%|▊         | 106/1212 [00:53<08:41,  2.12it/s]

Gradient norm: 314.42689700304754


Epoch 5 of 5 | Iteration:   9%|▉         | 107/1212 [00:53<09:30,  1.94it/s]

Gradient norm: 314.45199659092896


Epoch 5 of 5 | Iteration:   9%|▉         | 108/1212 [00:54<10:54,  1.69it/s]

Gradient norm: 314.72123777887697


Epoch 5 of 5 | Iteration:   9%|▉         | 109/1212 [00:54<09:58,  1.84it/s]

Gradient norm: 313.5103701175954


Epoch 5 of 5 | Iteration:   9%|▉         | 110/1212 [00:55<09:05,  2.02it/s]

Gradient norm: 315.47225682410703


Epoch 5 of 5 | Iteration:   9%|▉         | 111/1212 [00:55<09:15,  1.98it/s]

Gradient norm: 315.6705441520727


Epoch 5 of 5 | Iteration:   9%|▉         | 112/1212 [00:56<09:37,  1.90it/s]

Gradient norm: 315.95795195055535


Epoch 5 of 5 | Iteration:   9%|▉         | 113/1212 [00:56<09:57,  1.84it/s]

In [None]:
test_dataset_raw = Dataset.from_sql(dataset_query.format(set_type='test'), con=DB_URL)
test_dataset = DefinitionDataset(test_dataset_raw, tokenizer, mode='train', model='evidence_selection')
test_dataloader = DataLoader(test_dataset, shuffle=True, collate_fn=test_dataset.collate_fn, batch_size=10)

test_loss = evaluate(model, test_dataloader)
print(test_loss)

In [10]:
print(trace_train)
print(len(train_dataloader))

[]
581
