# Kaggle competition

New Data available, in different format so we need to repreprocess everything

# 0 - Imports

In [1]:
import pandas as pd
import numpy as np
import torch

# I - Data Preprocessing

In [2]:
# df_train = pd.read_csv('Lexical_juggling_train.csv')
df_train = pd.read_csv('Lexical_juggling_train.csv')
df_train.dropna(subset=['Label'], inplace=True)
labels_with_multiple_rows = df_train['Label'].value_counts()
labels_to_keep = labels_with_multiple_rows[labels_with_multiple_rows > 1].index
df_train = df_train[df_train['Label'].isin(labels_to_keep)]
df_train['Text'] = df_train['Text'].astype(str)
df_train['Label'] = df_train['Label'].astype(str)
df_train.head(2)

Unnamed: 0,ID,Usage,Text,Label
0,136,Public,Finalment Atena le recibe en l'acropoli d'Ate...,arg
1,62,Public,Jane Laffort fille de Joseph Laffort et d' Ang...,lat


In [3]:
df_test = pd.read_csv('test_without_labels.csv')
df_test.head(2)

Unnamed: 0,ID,Usage,Text
0,55,Private,Ponovo dobija riječni oblik do Drežnice.
1,71,Private,Se formaron aproximadamente hace apenas unos 1...


In [4]:
print("Train Shape = ",df_train.shape)
print("Test shape = ",df_test.shape)
print("List labels length = ", len(df_train['Label'].unique()))

Train Shape =  (77900, 4)
Test shape =  (38827, 3)
List labels length =  389


Ok - So we have 38K sentences in different languages, to classify in 390 categories. If the class is balanced, this would represent a 100:1 ratio, so ok to train without generating new sentences I assume. So first baseline algo will just train an NLP classifier on train dataset, and then use the test dataset to see how good it actually is.

In [5]:
df_lab = pd.DataFrame(df_train)

label_counts = df_lab["Label"].value_counts().reset_index()
label_counts.columns = ["Label", "Count"]
print(label_counts)

    Label  Count
0     tgk    300
1     arg    200
2     san    200
3     kon    200
4     wal    200
..    ...    ...
384   hus    200
385   sun    200
386   mlg    200
387   kir    200
388   toi    200

[389 rows x 2 columns]


In [6]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_scheduler
from datasets import Dataset
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score

train_df, test_df = train_test_split(df_train, test_size=0.2, stratify=df_train["Label"], random_state=42)
labels = sorted(train_df["Label"].unique())
label2id = {label: i for i, label in enumerate(labels)} # Mapping int → label

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
def map_labels(example):
    example["labels"] = label2id[example["Label"]]
    return example

train_dataset = train_dataset.map(map_labels)
test_dataset = test_dataset.map(map_labels)

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(example):
    return tokenizer(example["Text"], truncation=True, padding="max_length", max_length=128)

train_dataset = train_dataset.map(tokenize_function, batched=True, num_proc=1)
test_dataset = test_dataset.map(tokenize_function, batched=True, num_proc=1)

  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 62320/62320 [00:06<00:00, 9744.03 examples/s] 
Map: 100%|██████████| 15580/15580 [00:01<00:00, 8654.29 examples/s]
Map: 100%|██████████| 62320/62320 [00:08<00:00, 7139.05 examples/s]
Map: 100%|██████████| 15580/15580 [00:02<00:00, 7546.06 examples/s]


In [7]:
columns = ["input_ids", "attention_mask", "labels"]
train_dataset.set_format(type="torch", columns=columns)
test_dataset.set_format(type="torch", columns=columns)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16)
num_labels = len(label2id)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
for param in model.bert.parameters():
    param.requires_grad = False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    step = 0
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        total_loss += loss.item()
        step += 1
        progress_bar.update(1)
        if step % 10 == 0:
            print(f"Epoch {epoch+1} Step {step} Loss {loss.item():.4f}")
    avg_loss = total_loss / step
    print(f"Epoch {epoch+1} Average Training Loss: {avg_loss:.4f}")
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in test_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            all_preds.extend(predictions.cpu().numpy())
            all_labels.extend(batch["labels"].cpu().numpy())
    acc = accuracy_score(all_labels, all_preds)
    print(f"Epoch {epoch+1} Test Accuracy: {acc:.4f}")


  attn_output = torch.nn.functional.scaled_dot_product_attention(
  0%|          | 10/11685 [00:03<58:50,  3.31it/s] 

Epoch 1 Step 10 Loss 6.0729


  0%|          | 20/11685 [00:06<55:17,  3.52it/s]

Epoch 1 Step 20 Loss 6.0159


  0%|          | 30/11685 [00:09<54:58,  3.53it/s]

Epoch 1 Step 30 Loss 5.9296


  0%|          | 40/11685 [00:12<55:15,  3.51it/s]

Epoch 1 Step 40 Loss 6.1275


  0%|          | 50/11685 [00:15<55:18,  3.51it/s]

Epoch 1 Step 50 Loss 6.1868


  1%|          | 60/11685 [00:18<55:40,  3.48it/s]

Epoch 1 Step 60 Loss 5.8557


  1%|          | 70/11685 [00:20<54:31,  3.55it/s]

Epoch 1 Step 70 Loss 6.0231


  1%|          | 80/11685 [00:23<55:22,  3.49it/s]

Epoch 1 Step 80 Loss 6.1068


  1%|          | 90/11685 [00:26<54:43,  3.53it/s]

Epoch 1 Step 90 Loss 6.1925


  1%|          | 100/11685 [00:29<55:06,  3.50it/s]

Epoch 1 Step 100 Loss 6.0187


  1%|          | 110/11685 [00:32<55:19,  3.49it/s]

Epoch 1 Step 110 Loss 6.1559


  1%|          | 120/11685 [00:35<55:17,  3.49it/s]

Epoch 1 Step 120 Loss 6.0455


  1%|          | 130/11685 [00:38<55:27,  3.47it/s]

Epoch 1 Step 130 Loss 6.0078


  1%|          | 140/11685 [00:40<54:51,  3.51it/s]

Epoch 1 Step 140 Loss 5.9808


  1%|▏         | 150/11685 [00:43<54:44,  3.51it/s]

Epoch 1 Step 150 Loss 6.0922


  1%|▏         | 160/11685 [00:46<54:11,  3.54it/s]

Epoch 1 Step 160 Loss 6.1399


  1%|▏         | 170/11685 [00:49<54:26,  3.53it/s]

Epoch 1 Step 170 Loss 6.0221


  2%|▏         | 180/11685 [00:52<54:47,  3.50it/s]

Epoch 1 Step 180 Loss 5.9883


  2%|▏         | 190/11685 [00:55<55:07,  3.48it/s]

Epoch 1 Step 190 Loss 6.0809


  2%|▏         | 200/11685 [00:58<54:32,  3.51it/s]

Epoch 1 Step 200 Loss 5.9708


  2%|▏         | 210/11685 [01:00<54:41,  3.50it/s]

Epoch 1 Step 210 Loss 5.9672


  2%|▏         | 220/11685 [01:03<54:10,  3.53it/s]

Epoch 1 Step 220 Loss 5.9430


  2%|▏         | 230/11685 [01:06<54:08,  3.53it/s]

Epoch 1 Step 230 Loss 6.0705


  2%|▏         | 240/11685 [01:09<54:25,  3.50it/s]

Epoch 1 Step 240 Loss 5.9447


  2%|▏         | 250/11685 [01:12<54:15,  3.51it/s]

Epoch 1 Step 250 Loss 6.0010


  2%|▏         | 260/11685 [01:15<54:17,  3.51it/s]

Epoch 1 Step 260 Loss 6.0860


  2%|▏         | 270/11685 [01:17<53:58,  3.53it/s]

Epoch 1 Step 270 Loss 6.0581


  2%|▏         | 280/11685 [01:20<54:25,  3.49it/s]

Epoch 1 Step 280 Loss 5.9993


  2%|▏         | 290/11685 [01:23<53:40,  3.54it/s]

Epoch 1 Step 290 Loss 6.2057


  3%|▎         | 300/11685 [01:26<54:22,  3.49it/s]

Epoch 1 Step 300 Loss 5.9849


  3%|▎         | 310/11685 [01:29<53:37,  3.54it/s]

Epoch 1 Step 310 Loss 5.9422


  3%|▎         | 320/11685 [01:32<53:37,  3.53it/s]

Epoch 1 Step 320 Loss 6.1110


  3%|▎         | 330/11685 [01:35<53:50,  3.51it/s]

Epoch 1 Step 330 Loss 5.9842


  3%|▎         | 340/11685 [01:37<54:12,  3.49it/s]

Epoch 1 Step 340 Loss 6.0045


  3%|▎         | 350/11685 [01:40<53:52,  3.51it/s]

Epoch 1 Step 350 Loss 5.9985


  3%|▎         | 360/11685 [01:43<53:55,  3.50it/s]

Epoch 1 Step 360 Loss 5.9445


  3%|▎         | 370/11685 [01:46<54:02,  3.49it/s]

Epoch 1 Step 370 Loss 5.8562


  3%|▎         | 380/11685 [01:49<53:38,  3.51it/s]

Epoch 1 Step 380 Loss 6.0860


  3%|▎         | 390/11685 [01:52<53:42,  3.50it/s]

Epoch 1 Step 390 Loss 6.0322


  3%|▎         | 400/11685 [01:54<53:11,  3.54it/s]

Epoch 1 Step 400 Loss 6.0056


  4%|▎         | 410/11685 [01:57<53:33,  3.51it/s]

Epoch 1 Step 410 Loss 6.0537


  4%|▎         | 420/11685 [02:00<53:12,  3.53it/s]

Epoch 1 Step 420 Loss 6.0092


  4%|▎         | 430/11685 [02:03<53:02,  3.54it/s]

Epoch 1 Step 430 Loss 6.1155


  4%|▍         | 440/11685 [02:06<53:53,  3.48it/s]

Epoch 1 Step 440 Loss 5.9169


  4%|▍         | 450/11685 [02:09<52:58,  3.53it/s]

Epoch 1 Step 450 Loss 5.9960


  4%|▍         | 460/11685 [02:12<53:36,  3.49it/s]

Epoch 1 Step 460 Loss 6.0486


  4%|▍         | 470/11685 [02:14<52:48,  3.54it/s]

Epoch 1 Step 470 Loss 5.9736


  4%|▍         | 480/11685 [02:17<52:46,  3.54it/s]

Epoch 1 Step 480 Loss 6.1285


  4%|▍         | 490/11685 [02:20<52:47,  3.53it/s]

Epoch 1 Step 490 Loss 6.0634


  4%|▍         | 500/11685 [02:23<53:02,  3.51it/s]

Epoch 1 Step 500 Loss 6.0109


  4%|▍         | 510/11685 [02:26<53:18,  3.49it/s]

Epoch 1 Step 510 Loss 5.8832


  4%|▍         | 520/11685 [02:29<52:36,  3.54it/s]

Epoch 1 Step 520 Loss 5.8924


  5%|▍         | 530/11685 [02:31<52:43,  3.53it/s]

Epoch 1 Step 530 Loss 5.9409


  5%|▍         | 540/11685 [02:34<52:56,  3.51it/s]

Epoch 1 Step 540 Loss 5.9032


  5%|▍         | 550/11685 [02:37<52:27,  3.54it/s]

Epoch 1 Step 550 Loss 5.9558


  5%|▍         | 560/11685 [02:40<52:31,  3.53it/s]

Epoch 1 Step 560 Loss 5.9512


  5%|▍         | 570/11685 [02:43<53:02,  3.49it/s]

Epoch 1 Step 570 Loss 5.9300


  5%|▍         | 580/11685 [02:46<53:17,  3.47it/s]

Epoch 1 Step 580 Loss 5.9011


  5%|▌         | 590/11685 [02:48<52:30,  3.52it/s]

Epoch 1 Step 590 Loss 5.9693


  5%|▌         | 600/11685 [02:51<52:06,  3.55it/s]

Epoch 1 Step 600 Loss 6.0018


  5%|▌         | 610/11685 [02:54<52:47,  3.50it/s]

Epoch 1 Step 610 Loss 6.0505


  5%|▌         | 620/11685 [02:57<52:06,  3.54it/s]

Epoch 1 Step 620 Loss 5.9443


  5%|▌         | 630/11685 [03:00<52:18,  3.52it/s]

Epoch 1 Step 630 Loss 5.9110


  5%|▌         | 640/11685 [03:03<51:58,  3.54it/s]

Epoch 1 Step 640 Loss 5.8187


  6%|▌         | 650/11685 [03:06<52:07,  3.53it/s]

Epoch 1 Step 650 Loss 5.9204


  6%|▌         | 660/11685 [03:08<52:17,  3.51it/s]

Epoch 1 Step 660 Loss 5.9132


  6%|▌         | 670/11685 [03:11<51:52,  3.54it/s]

Epoch 1 Step 670 Loss 5.8124


  6%|▌         | 680/11685 [03:14<52:39,  3.48it/s]

Epoch 1 Step 680 Loss 5.9011


  6%|▌         | 690/11685 [03:17<51:58,  3.53it/s]

Epoch 1 Step 690 Loss 5.8819


  6%|▌         | 700/11685 [03:20<51:38,  3.55it/s]

Epoch 1 Step 700 Loss 5.9952


  6%|▌         | 710/11685 [03:23<52:09,  3.51it/s]

Epoch 1 Step 710 Loss 5.9120


  6%|▌         | 720/11685 [03:25<51:38,  3.54it/s]

Epoch 1 Step 720 Loss 5.8084


  6%|▌         | 730/11685 [03:28<52:17,  3.49it/s]

Epoch 1 Step 730 Loss 6.1408


  6%|▋         | 740/11685 [03:31<51:45,  3.52it/s]

Epoch 1 Step 740 Loss 5.8552


  6%|▋         | 750/11685 [03:34<51:46,  3.52it/s]

Epoch 1 Step 750 Loss 5.9908


  7%|▋         | 760/11685 [03:37<52:18,  3.48it/s]

Epoch 1 Step 760 Loss 6.0569


  7%|▋         | 770/11685 [03:40<51:33,  3.53it/s]

Epoch 1 Step 770 Loss 5.8035


  7%|▋         | 780/11685 [03:42<51:22,  3.54it/s]

Epoch 1 Step 780 Loss 5.8862


  7%|▋         | 790/11685 [03:45<51:22,  3.53it/s]

Epoch 1 Step 790 Loss 5.9945


  7%|▋         | 800/11685 [03:48<51:23,  3.53it/s]

Epoch 1 Step 800 Loss 5.9813


  7%|▋         | 810/11685 [03:51<52:15,  3.47it/s]

Epoch 1 Step 810 Loss 5.8852


  7%|▋         | 820/11685 [03:54<51:11,  3.54it/s]

Epoch 1 Step 820 Loss 6.1394


  7%|▋         | 830/11685 [03:57<51:39,  3.50it/s]

Epoch 1 Step 830 Loss 5.9921


  7%|▋         | 840/11685 [04:00<51:42,  3.50it/s]

Epoch 1 Step 840 Loss 5.9154


  7%|▋         | 850/11685 [04:02<51:01,  3.54it/s]

Epoch 1 Step 850 Loss 6.0988


  7%|▋         | 860/11685 [04:05<51:23,  3.51it/s]

Epoch 1 Step 860 Loss 6.0074


  7%|▋         | 870/11685 [04:08<51:12,  3.52it/s]

Epoch 1 Step 870 Loss 5.9739


  8%|▊         | 880/11685 [04:11<51:04,  3.53it/s]

Epoch 1 Step 880 Loss 5.9407


  8%|▊         | 890/11685 [04:14<51:13,  3.51it/s]

Epoch 1 Step 890 Loss 5.9295


  8%|▊         | 900/11685 [04:17<51:36,  3.48it/s]

Epoch 1 Step 900 Loss 5.9186


  8%|▊         | 910/11685 [04:19<51:06,  3.51it/s]

Epoch 1 Step 910 Loss 5.9970


  8%|▊         | 920/11685 [04:22<50:57,  3.52it/s]

Epoch 1 Step 920 Loss 5.9678


  8%|▊         | 930/11685 [04:25<50:43,  3.53it/s]

Epoch 1 Step 930 Loss 5.9147


  8%|▊         | 940/11685 [04:28<50:47,  3.53it/s]

Epoch 1 Step 940 Loss 5.9426


  8%|▊         | 950/11685 [04:31<50:38,  3.53it/s]

Epoch 1 Step 950 Loss 6.0100


  8%|▊         | 960/11685 [04:34<51:28,  3.47it/s]

Epoch 1 Step 960 Loss 5.9771


  8%|▊         | 970/11685 [04:36<50:25,  3.54it/s]

Epoch 1 Step 970 Loss 5.9538


  8%|▊         | 980/11685 [04:39<50:24,  3.54it/s]

Epoch 1 Step 980 Loss 5.9736


  8%|▊         | 990/11685 [04:42<50:25,  3.53it/s]

Epoch 1 Step 990 Loss 6.0311


  9%|▊         | 1000/11685 [04:45<50:35,  3.52it/s]

Epoch 1 Step 1000 Loss 5.9857


  9%|▊         | 1010/11685 [04:48<50:15,  3.54it/s]

Epoch 1 Step 1010 Loss 5.9210


  9%|▊         | 1020/11685 [04:51<50:48,  3.50it/s]

Epoch 1 Step 1020 Loss 5.9786


  9%|▉         | 1030/11685 [04:54<50:14,  3.53it/s]

Epoch 1 Step 1030 Loss 5.9725


  9%|▉         | 1040/11685 [04:56<50:40,  3.50it/s]

Epoch 1 Step 1040 Loss 5.9502


  9%|▉         | 1050/11685 [04:59<50:37,  3.50it/s]

Epoch 1 Step 1050 Loss 5.8870


  9%|▉         | 1060/11685 [05:02<50:27,  3.51it/s]

Epoch 1 Step 1060 Loss 6.1076


  9%|▉         | 1070/11685 [05:05<50:48,  3.48it/s]

Epoch 1 Step 1070 Loss 5.9891


  9%|▉         | 1080/11685 [05:08<50:23,  3.51it/s]

Epoch 1 Step 1080 Loss 6.0086


  9%|▉         | 1090/11685 [05:11<49:52,  3.54it/s]

Epoch 1 Step 1090 Loss 5.9322


  9%|▉         | 1100/11685 [05:13<49:52,  3.54it/s]

Epoch 1 Step 1100 Loss 5.8796


  9%|▉         | 1110/11685 [05:16<50:04,  3.52it/s]

Epoch 1 Step 1110 Loss 5.9855


 10%|▉         | 1120/11685 [05:19<50:01,  3.52it/s]

Epoch 1 Step 1120 Loss 6.0131


 10%|▉         | 1130/11685 [05:22<50:12,  3.50it/s]

Epoch 1 Step 1130 Loss 5.9296


 10%|▉         | 1140/11685 [05:25<49:43,  3.53it/s]

Epoch 1 Step 1140 Loss 5.8298


 10%|▉         | 1150/11685 [05:28<50:38,  3.47it/s]

Epoch 1 Step 1150 Loss 5.8744


 10%|▉         | 1160/11685 [05:30<49:39,  3.53it/s]

Epoch 1 Step 1160 Loss 5.9187


 10%|█         | 1170/11685 [05:33<49:39,  3.53it/s]

Epoch 1 Step 1170 Loss 5.9775


 10%|█         | 1180/11685 [05:36<50:07,  3.49it/s]

Epoch 1 Step 1180 Loss 5.9995


 10%|█         | 1190/11685 [05:39<49:38,  3.52it/s]

Epoch 1 Step 1190 Loss 5.8277


 10%|█         | 1200/11685 [05:42<49:24,  3.54it/s]

Epoch 1 Step 1200 Loss 5.9937


 10%|█         | 1210/11685 [05:45<49:59,  3.49it/s]

Epoch 1 Step 1210 Loss 5.9135


 10%|█         | 1220/11685 [05:48<49:47,  3.50it/s]

Epoch 1 Step 1220 Loss 5.9540


 11%|█         | 1230/11685 [05:50<49:29,  3.52it/s]

Epoch 1 Step 1230 Loss 5.8756


 11%|█         | 1240/11685 [05:53<49:16,  3.53it/s]

Epoch 1 Step 1240 Loss 5.8924


 11%|█         | 1250/11685 [05:56<49:18,  3.53it/s]

Epoch 1 Step 1250 Loss 5.8064


 11%|█         | 1260/11685 [05:59<49:09,  3.53it/s]

Epoch 1 Step 1260 Loss 5.9409


 11%|█         | 1270/11685 [06:02<49:42,  3.49it/s]

Epoch 1 Step 1270 Loss 5.9447


 11%|█         | 1280/11685 [06:05<49:10,  3.53it/s]

Epoch 1 Step 1280 Loss 5.9206


 11%|█         | 1290/11685 [06:07<49:21,  3.51it/s]

Epoch 1 Step 1290 Loss 5.9546


 11%|█         | 1300/11685 [06:10<48:58,  3.53it/s]

Epoch 1 Step 1300 Loss 5.9931


 11%|█         | 1310/11685 [06:13<49:03,  3.53it/s]

Epoch 1 Step 1310 Loss 6.0455


 11%|█▏        | 1320/11685 [06:16<49:09,  3.51it/s]

Epoch 1 Step 1320 Loss 6.0022


 11%|█▏        | 1330/11685 [06:19<49:32,  3.48it/s]

Epoch 1 Step 1330 Loss 5.9984


 11%|█▏        | 1340/11685 [06:22<49:35,  3.48it/s]

Epoch 1 Step 1340 Loss 5.9715


 12%|█▏        | 1350/11685 [06:24<49:19,  3.49it/s]

Epoch 1 Step 1350 Loss 5.9181


 12%|█▏        | 1360/11685 [06:27<48:37,  3.54it/s]

Epoch 1 Step 1360 Loss 5.9849


 12%|█▏        | 1370/11685 [06:30<49:05,  3.50it/s]

Epoch 1 Step 1370 Loss 6.0406


 12%|█▏        | 1380/11685 [06:33<49:05,  3.50it/s]

Epoch 1 Step 1380 Loss 6.0783


 12%|█▏        | 1390/11685 [06:36<48:28,  3.54it/s]

Epoch 1 Step 1390 Loss 5.8634


 12%|█▏        | 1400/11685 [06:39<48:36,  3.53it/s]

Epoch 1 Step 1400 Loss 5.8874


 12%|█▏        | 1410/11685 [06:42<48:21,  3.54it/s]

Epoch 1 Step 1410 Loss 5.9970


 12%|█▏        | 1420/11685 [06:44<48:40,  3.51it/s]

Epoch 1 Step 1420 Loss 5.8810


 12%|█▏        | 1430/11685 [06:47<48:20,  3.54it/s]

Epoch 1 Step 1430 Loss 5.9926


 12%|█▏        | 1440/11685 [06:50<48:09,  3.55it/s]

Epoch 1 Step 1440 Loss 5.9195


 12%|█▏        | 1450/11685 [06:53<48:08,  3.54it/s]

Epoch 1 Step 1450 Loss 5.8881


 12%|█▏        | 1460/11685 [06:56<48:15,  3.53it/s]

Epoch 1 Step 1460 Loss 5.9841


 13%|█▎        | 1470/11685 [06:59<48:12,  3.53it/s]

Epoch 1 Step 1470 Loss 5.8812


 13%|█▎        | 1480/11685 [07:01<48:23,  3.51it/s]

Epoch 1 Step 1480 Loss 5.9381


 13%|█▎        | 1490/11685 [07:04<48:53,  3.47it/s]

Epoch 1 Step 1490 Loss 5.8430


 13%|█▎        | 1500/11685 [07:07<48:45,  3.48it/s]

Epoch 1 Step 1500 Loss 5.9952


 13%|█▎        | 1510/11685 [07:10<48:27,  3.50it/s]

Epoch 1 Step 1510 Loss 6.0276


 13%|█▎        | 1520/11685 [07:13<47:59,  3.53it/s]

Epoch 1 Step 1520 Loss 6.0038


 13%|█▎        | 1530/11685 [07:16<47:55,  3.53it/s]

Epoch 1 Step 1530 Loss 5.7660


 13%|█▎        | 1540/11685 [07:19<48:19,  3.50it/s]

Epoch 1 Step 1540 Loss 5.9733


 13%|█▎        | 1550/11685 [07:21<47:54,  3.53it/s]

Epoch 1 Step 1550 Loss 5.9947


 13%|█▎        | 1560/11685 [07:24<47:45,  3.53it/s]

Epoch 1 Step 1560 Loss 5.7935


 13%|█▎        | 1570/11685 [07:27<48:26,  3.48it/s]

Epoch 1 Step 1570 Loss 5.8998


 14%|█▎        | 1580/11685 [07:30<48:12,  3.49it/s]

Epoch 1 Step 1580 Loss 5.9119


 14%|█▎        | 1590/11685 [07:33<48:03,  3.50it/s]

Epoch 1 Step 1590 Loss 5.9018


 14%|█▎        | 1600/11685 [07:36<47:54,  3.51it/s]

Epoch 1 Step 1600 Loss 5.9951


 14%|█▍        | 1610/11685 [07:38<47:50,  3.51it/s]

Epoch 1 Step 1610 Loss 6.0741


 14%|█▍        | 1620/11685 [07:41<47:44,  3.51it/s]

Epoch 1 Step 1620 Loss 5.9283


 14%|█▍        | 1630/11685 [07:44<47:41,  3.51it/s]

Epoch 1 Step 1630 Loss 6.1458


 14%|█▍        | 1640/11685 [07:47<47:41,  3.51it/s]

Epoch 1 Step 1640 Loss 5.9175


 14%|█▍        | 1650/11685 [07:50<47:27,  3.52it/s]

Epoch 1 Step 1650 Loss 5.9071


 14%|█▍        | 1660/11685 [07:53<47:35,  3.51it/s]

Epoch 1 Step 1660 Loss 6.0018


 14%|█▍        | 1670/11685 [07:55<47:34,  3.51it/s]

Epoch 1 Step 1670 Loss 5.8908


 14%|█▍        | 1680/11685 [07:58<47:15,  3.53it/s]

Epoch 1 Step 1680 Loss 5.7466


 14%|█▍        | 1690/11685 [08:01<47:15,  3.53it/s]

Epoch 1 Step 1690 Loss 5.9398


 15%|█▍        | 1700/11685 [08:04<47:46,  3.48it/s]

Epoch 1 Step 1700 Loss 5.9906


 15%|█▍        | 1710/11685 [08:07<47:10,  3.52it/s]

Epoch 1 Step 1710 Loss 5.8670


 15%|█▍        | 1720/11685 [08:10<46:55,  3.54it/s]

Epoch 1 Step 1720 Loss 6.0257


 15%|█▍        | 1730/11685 [08:12<47:16,  3.51it/s]

Epoch 1 Step 1730 Loss 5.8444


 15%|█▍        | 1740/11685 [08:15<47:02,  3.52it/s]

Epoch 1 Step 1740 Loss 5.9344


 15%|█▍        | 1750/11685 [08:18<46:57,  3.53it/s]

Epoch 1 Step 1750 Loss 6.0375


 15%|█▌        | 1760/11685 [08:21<46:59,  3.52it/s]

Epoch 1 Step 1760 Loss 5.8925


 15%|█▌        | 1770/11685 [08:24<46:45,  3.53it/s]

Epoch 1 Step 1770 Loss 6.0391


 15%|█▌        | 1780/11685 [08:27<47:08,  3.50it/s]

Epoch 1 Step 1780 Loss 5.9751


 15%|█▌        | 1790/11685 [08:30<46:57,  3.51it/s]

Epoch 1 Step 1790 Loss 5.9589


 15%|█▌        | 1800/11685 [08:32<46:35,  3.54it/s]

Epoch 1 Step 1800 Loss 5.9823


 15%|█▌        | 1810/11685 [08:35<46:23,  3.55it/s]

Epoch 1 Step 1810 Loss 5.8272


 16%|█▌        | 1820/11685 [08:38<46:54,  3.50it/s]

Epoch 1 Step 1820 Loss 5.9107


 16%|█▌        | 1830/11685 [08:41<46:28,  3.53it/s]

Epoch 1 Step 1830 Loss 5.8978


 16%|█▌        | 1840/11685 [08:44<46:52,  3.50it/s]

Epoch 1 Step 1840 Loss 5.9850


 16%|█▌        | 1850/11685 [08:47<46:35,  3.52it/s]

Epoch 1 Step 1850 Loss 5.9329


 16%|█▌        | 1860/11685 [08:49<46:19,  3.54it/s]

Epoch 1 Step 1860 Loss 5.9390


 16%|█▌        | 1870/11685 [08:52<46:14,  3.54it/s]

Epoch 1 Step 1870 Loss 5.9551


 16%|█▌        | 1880/11685 [08:55<46:29,  3.52it/s]

Epoch 1 Step 1880 Loss 5.8678


 16%|█▌        | 1890/11685 [08:58<45:39,  3.57it/s]

Epoch 1 Step 1890 Loss 5.8825


 16%|█▋        | 1900/11685 [09:01<46:06,  3.54it/s]

Epoch 1 Step 1900 Loss 5.9729


 16%|█▋        | 1910/11685 [09:04<46:40,  3.49it/s]

Epoch 1 Step 1910 Loss 6.0257


 16%|█▋        | 1920/11685 [09:06<45:49,  3.55it/s]

Epoch 1 Step 1920 Loss 5.8824


 17%|█▋        | 1930/11685 [09:09<45:56,  3.54it/s]

Epoch 1 Step 1930 Loss 6.0366


 17%|█▋        | 1940/11685 [09:12<46:06,  3.52it/s]

Epoch 1 Step 1940 Loss 5.8592


 17%|█▋        | 1950/11685 [09:15<45:57,  3.53it/s]

Epoch 1 Step 1950 Loss 5.8872


 17%|█▋        | 1960/11685 [09:18<46:06,  3.52it/s]

Epoch 1 Step 1960 Loss 5.9917


 17%|█▋        | 1970/11685 [09:21<46:00,  3.52it/s]

Epoch 1 Step 1970 Loss 5.7987


 17%|█▋        | 1980/11685 [09:23<45:19,  3.57it/s]

Epoch 1 Step 1980 Loss 5.9698


 17%|█▋        | 1990/11685 [09:26<45:54,  3.52it/s]

Epoch 1 Step 1990 Loss 5.8847


 17%|█▋        | 2000/11685 [09:29<45:41,  3.53it/s]

Epoch 1 Step 2000 Loss 5.8695


 17%|█▋        | 2010/11685 [09:32<45:34,  3.54it/s]

Epoch 1 Step 2010 Loss 5.9305


 17%|█▋        | 2020/11685 [09:35<45:44,  3.52it/s]

Epoch 1 Step 2020 Loss 5.8426


 17%|█▋        | 2030/11685 [09:38<45:28,  3.54it/s]

Epoch 1 Step 2030 Loss 5.8452


 17%|█▋        | 2040/11685 [09:40<45:32,  3.53it/s]

Epoch 1 Step 2040 Loss 5.9459


 18%|█▊        | 2050/11685 [09:43<45:58,  3.49it/s]

Epoch 1 Step 2050 Loss 5.8910


 18%|█▊        | 2060/11685 [09:46<45:22,  3.54it/s]

Epoch 1 Step 2060 Loss 5.9455


 18%|█▊        | 2070/11685 [09:49<45:19,  3.54it/s]

Epoch 1 Step 2070 Loss 5.9307


 18%|█▊        | 2080/11685 [09:52<45:41,  3.50it/s]

Epoch 1 Step 2080 Loss 5.8645


 18%|█▊        | 2090/11685 [09:55<44:57,  3.56it/s]

Epoch 1 Step 2090 Loss 5.9689


 18%|█▊        | 2100/11685 [09:58<45:21,  3.52it/s]

Epoch 1 Step 2100 Loss 5.9486


 18%|█▊        | 2110/11685 [10:00<45:02,  3.54it/s]

Epoch 1 Step 2110 Loss 5.9096


 18%|█▊        | 2120/11685 [10:03<45:10,  3.53it/s]

Epoch 1 Step 2120 Loss 5.9600


 18%|█▊        | 2130/11685 [10:06<45:13,  3.52it/s]

Epoch 1 Step 2130 Loss 5.7790


 18%|█▊        | 2140/11685 [10:09<45:09,  3.52it/s]

Epoch 1 Step 2140 Loss 5.9717


 18%|█▊        | 2150/11685 [10:12<45:02,  3.53it/s]

Epoch 1 Step 2150 Loss 5.9686


 18%|█▊        | 2160/11685 [10:15<45:24,  3.50it/s]

Epoch 1 Step 2160 Loss 5.8544


 19%|█▊        | 2170/11685 [10:17<45:03,  3.52it/s]

Epoch 1 Step 2170 Loss 5.8701


 19%|█▊        | 2180/11685 [10:20<44:32,  3.56it/s]

Epoch 1 Step 2180 Loss 5.8398


 19%|█▊        | 2190/11685 [10:23<44:59,  3.52it/s]

Epoch 1 Step 2190 Loss 5.8270


 19%|█▉        | 2200/11685 [10:26<44:58,  3.52it/s]

Epoch 1 Step 2200 Loss 5.9595


 19%|█▉        | 2210/11685 [10:29<44:58,  3.51it/s]

Epoch 1 Step 2210 Loss 5.9501


 19%|█▉        | 2220/11685 [10:32<44:39,  3.53it/s]

Epoch 1 Step 2220 Loss 5.9521


 19%|█▉        | 2230/11685 [10:34<44:44,  3.52it/s]

Epoch 1 Step 2230 Loss 6.0322


 19%|█▉        | 2240/11685 [10:37<44:29,  3.54it/s]

Epoch 1 Step 2240 Loss 5.8450


 19%|█▉        | 2250/11685 [10:40<44:34,  3.53it/s]

Epoch 1 Step 2250 Loss 5.9025


 19%|█▉        | 2260/11685 [10:43<44:44,  3.51it/s]

Epoch 1 Step 2260 Loss 5.9819


 19%|█▉        | 2270/11685 [10:46<45:04,  3.48it/s]

Epoch 1 Step 2270 Loss 6.0024


 20%|█▉        | 2280/11685 [10:49<44:58,  3.48it/s]

Epoch 1 Step 2280 Loss 5.9388


 20%|█▉        | 2290/11685 [10:51<44:32,  3.52it/s]

Epoch 1 Step 2290 Loss 5.8664


 20%|█▉        | 2300/11685 [10:54<44:29,  3.52it/s]

Epoch 1 Step 2300 Loss 5.8950


 20%|█▉        | 2310/11685 [10:57<44:09,  3.54it/s]

Epoch 1 Step 2310 Loss 5.9923


 20%|█▉        | 2320/11685 [11:00<44:11,  3.53it/s]

Epoch 1 Step 2320 Loss 5.8039


 20%|█▉        | 2330/11685 [11:03<44:16,  3.52it/s]

Epoch 1 Step 2330 Loss 5.8946


 20%|██        | 2340/11685 [11:06<44:13,  3.52it/s]

Epoch 1 Step 2340 Loss 5.9298


 20%|██        | 2350/11685 [11:08<43:58,  3.54it/s]

Epoch 1 Step 2350 Loss 5.7874


 20%|██        | 2360/11685 [11:11<43:59,  3.53it/s]

Epoch 1 Step 2360 Loss 5.9251


 20%|██        | 2370/11685 [11:14<44:13,  3.51it/s]

Epoch 1 Step 2370 Loss 5.9122


 20%|██        | 2380/11685 [11:17<44:00,  3.52it/s]

Epoch 1 Step 2380 Loss 5.9493


 20%|██        | 2390/11685 [11:20<43:58,  3.52it/s]

Epoch 1 Step 2390 Loss 5.9511


 21%|██        | 2400/11685 [11:23<43:48,  3.53it/s]

Epoch 1 Step 2400 Loss 5.8956


 21%|██        | 2410/11685 [11:26<44:15,  3.49it/s]

Epoch 1 Step 2410 Loss 5.7487


 21%|██        | 2420/11685 [11:28<43:49,  3.52it/s]

Epoch 1 Step 2420 Loss 5.9394


 21%|██        | 2430/11685 [11:31<44:21,  3.48it/s]

Epoch 1 Step 2430 Loss 5.9686


 21%|██        | 2440/11685 [11:34<43:53,  3.51it/s]

Epoch 1 Step 2440 Loss 5.9397


 21%|██        | 2450/11685 [11:37<43:43,  3.52it/s]

Epoch 1 Step 2450 Loss 5.8045


 21%|██        | 2460/11685 [11:40<43:26,  3.54it/s]

Epoch 1 Step 2460 Loss 5.8391


 21%|██        | 2470/11685 [11:43<43:54,  3.50it/s]

Epoch 1 Step 2470 Loss 5.9540


 21%|██        | 2480/11685 [11:45<43:35,  3.52it/s]

Epoch 1 Step 2480 Loss 5.9241


 21%|██▏       | 2490/11685 [11:48<43:31,  3.52it/s]

Epoch 1 Step 2490 Loss 5.9591


 21%|██▏       | 2500/11685 [11:51<43:21,  3.53it/s]

Epoch 1 Step 2500 Loss 6.0118


 21%|██▏       | 2510/11685 [11:54<43:17,  3.53it/s]

Epoch 1 Step 2510 Loss 5.9051


 22%|██▏       | 2520/11685 [11:57<43:02,  3.55it/s]

Epoch 1 Step 2520 Loss 5.8295


 22%|██▏       | 2530/11685 [12:00<43:05,  3.54it/s]

Epoch 1 Step 2530 Loss 5.9515


 22%|██▏       | 2540/11685 [12:02<43:50,  3.48it/s]

Epoch 1 Step 2540 Loss 5.8276


 22%|██▏       | 2550/11685 [12:05<42:57,  3.54it/s]

Epoch 1 Step 2550 Loss 5.8721


 22%|██▏       | 2560/11685 [12:08<43:24,  3.50it/s]

Epoch 1 Step 2560 Loss 6.0977


 22%|██▏       | 2570/11685 [12:11<43:10,  3.52it/s]

Epoch 1 Step 2570 Loss 5.9903


 22%|██▏       | 2580/11685 [12:14<42:57,  3.53it/s]

Epoch 1 Step 2580 Loss 5.8726


 22%|██▏       | 2590/11685 [12:17<43:48,  3.46it/s]

Epoch 1 Step 2590 Loss 5.7714


 22%|██▏       | 2600/11685 [12:20<43:10,  3.51it/s]

Epoch 1 Step 2600 Loss 5.8147


 22%|██▏       | 2610/11685 [12:22<43:04,  3.51it/s]

Epoch 1 Step 2610 Loss 5.7952


 22%|██▏       | 2620/11685 [12:25<43:01,  3.51it/s]

Epoch 1 Step 2620 Loss 5.9990


 23%|██▎       | 2630/11685 [12:28<42:47,  3.53it/s]

Epoch 1 Step 2630 Loss 6.0768


 23%|██▎       | 2640/11685 [12:31<43:31,  3.46it/s]

Epoch 1 Step 2640 Loss 5.9391


 23%|██▎       | 2650/11685 [12:34<44:21,  3.40it/s]

Epoch 1 Step 2650 Loss 5.9041


 23%|██▎       | 2660/11685 [12:37<42:52,  3.51it/s]

Epoch 1 Step 2660 Loss 5.9496


 23%|██▎       | 2670/11685 [12:40<42:41,  3.52it/s]

Epoch 1 Step 2670 Loss 5.9241


 23%|██▎       | 2680/11685 [12:42<42:34,  3.53it/s]

Epoch 1 Step 2680 Loss 5.9821


 23%|██▎       | 2690/11685 [12:45<42:51,  3.50it/s]

Epoch 1 Step 2690 Loss 5.9998


 23%|██▎       | 2700/11685 [12:48<42:43,  3.50it/s]

Epoch 1 Step 2700 Loss 5.9814


 23%|██▎       | 2710/11685 [12:51<43:09,  3.47it/s]

Epoch 1 Step 2710 Loss 6.0103


 23%|██▎       | 2720/11685 [12:54<42:19,  3.53it/s]

Epoch 1 Step 2720 Loss 5.8708


 23%|██▎       | 2730/11685 [12:57<42:34,  3.51it/s]

Epoch 1 Step 2730 Loss 5.8141


 23%|██▎       | 2740/11685 [13:00<42:31,  3.51it/s]

Epoch 1 Step 2740 Loss 5.9894


 24%|██▎       | 2750/11685 [13:02<41:52,  3.56it/s]

Epoch 1 Step 2750 Loss 5.9483


 24%|██▎       | 2760/11685 [13:05<42:26,  3.50it/s]

Epoch 1 Step 2760 Loss 5.9549


 24%|██▎       | 2770/11685 [13:08<42:29,  3.50it/s]

Epoch 1 Step 2770 Loss 5.9711


 24%|██▍       | 2780/11685 [13:11<41:47,  3.55it/s]

Epoch 1 Step 2780 Loss 5.9098


 24%|██▍       | 2790/11685 [13:14<42:20,  3.50it/s]

Epoch 1 Step 2790 Loss 5.8011


 24%|██▍       | 2800/11685 [13:17<41:56,  3.53it/s]

Epoch 1 Step 2800 Loss 5.9227


 24%|██▍       | 2810/11685 [13:19<42:03,  3.52it/s]

Epoch 1 Step 2810 Loss 5.9059


 24%|██▍       | 2820/11685 [13:22<42:21,  3.49it/s]

Epoch 1 Step 2820 Loss 5.8245


 24%|██▍       | 2830/11685 [13:25<43:04,  3.43it/s]

Epoch 1 Step 2830 Loss 5.9433


 24%|██▍       | 2840/11685 [13:28<41:43,  3.53it/s]

Epoch 1 Step 2840 Loss 5.8688


 24%|██▍       | 2850/11685 [13:31<41:53,  3.52it/s]

Epoch 1 Step 2850 Loss 5.9450


 24%|██▍       | 2860/11685 [13:34<41:50,  3.51it/s]

Epoch 1 Step 2860 Loss 5.9005


 25%|██▍       | 2870/11685 [13:37<41:48,  3.51it/s]

Epoch 1 Step 2870 Loss 5.8657


 25%|██▍       | 2880/11685 [13:39<41:56,  3.50it/s]

Epoch 1 Step 2880 Loss 5.9078


 25%|██▍       | 2890/11685 [13:42<41:16,  3.55it/s]

Epoch 1 Step 2890 Loss 5.9731


 25%|██▍       | 2900/11685 [13:45<41:35,  3.52it/s]

Epoch 1 Step 2900 Loss 5.9445


 25%|██▍       | 2910/11685 [13:48<41:41,  3.51it/s]

Epoch 1 Step 2910 Loss 5.9032


 25%|██▍       | 2920/11685 [13:51<42:23,  3.45it/s]

Epoch 1 Step 2920 Loss 5.8648


 25%|██▌       | 2930/11685 [13:54<41:27,  3.52it/s]

Epoch 1 Step 2930 Loss 5.8991


 25%|██▌       | 2940/11685 [13:57<41:33,  3.51it/s]

Epoch 1 Step 2940 Loss 5.9802


 25%|██▌       | 2950/11685 [13:59<41:33,  3.50it/s]

Epoch 1 Step 2950 Loss 5.7831


 25%|██▌       | 2960/11685 [14:02<41:34,  3.50it/s]

Epoch 1 Step 2960 Loss 5.8659


 25%|██▌       | 2970/11685 [14:05<41:26,  3.50it/s]

Epoch 1 Step 2970 Loss 5.9625


 26%|██▌       | 2980/11685 [14:08<41:18,  3.51it/s]

Epoch 1 Step 2980 Loss 5.9493


 26%|██▌       | 2990/11685 [14:11<41:08,  3.52it/s]

Epoch 1 Step 2990 Loss 5.8642


 26%|██▌       | 3000/11685 [14:14<41:19,  3.50it/s]

Epoch 1 Step 3000 Loss 6.0120


 26%|██▌       | 3010/11685 [14:16<41:03,  3.52it/s]

Epoch 1 Step 3010 Loss 5.9681


 26%|██▌       | 3020/11685 [14:19<41:16,  3.50it/s]

Epoch 1 Step 3020 Loss 5.8469


 26%|██▌       | 3030/11685 [14:22<41:08,  3.51it/s]

Epoch 1 Step 3030 Loss 5.8709


 26%|██▌       | 3040/11685 [14:25<41:08,  3.50it/s]

Epoch 1 Step 3040 Loss 5.9584


 26%|██▌       | 3050/11685 [14:28<41:07,  3.50it/s]

Epoch 1 Step 3050 Loss 5.8362


 26%|██▌       | 3060/11685 [14:31<40:58,  3.51it/s]

Epoch 1 Step 3060 Loss 5.9212


 26%|██▋       | 3070/11685 [14:34<40:57,  3.51it/s]

Epoch 1 Step 3070 Loss 6.0127


 26%|██▋       | 3080/11685 [14:36<40:49,  3.51it/s]

Epoch 1 Step 3080 Loss 5.9009


 26%|██▋       | 3090/11685 [14:39<40:42,  3.52it/s]

Epoch 1 Step 3090 Loss 5.8526


 27%|██▋       | 3100/11685 [14:42<40:35,  3.52it/s]

Epoch 1 Step 3100 Loss 6.0136


 27%|██▋       | 3110/11685 [14:45<40:39,  3.51it/s]

Epoch 1 Step 3110 Loss 5.8749


 27%|██▋       | 3120/11685 [14:48<40:45,  3.50it/s]

Epoch 1 Step 3120 Loss 5.9100


 27%|██▋       | 3130/11685 [14:51<40:27,  3.52it/s]

Epoch 1 Step 3130 Loss 5.8923


 27%|██▋       | 3140/11685 [14:54<40:27,  3.52it/s]

Epoch 1 Step 3140 Loss 5.8566


 27%|██▋       | 3150/11685 [14:56<40:24,  3.52it/s]

Epoch 1 Step 3150 Loss 5.9123


 27%|██▋       | 3160/11685 [14:59<40:32,  3.50it/s]

Epoch 1 Step 3160 Loss 5.8805


 27%|██▋       | 3170/11685 [15:02<40:27,  3.51it/s]

Epoch 1 Step 3170 Loss 5.8489


 27%|██▋       | 3180/11685 [15:05<40:46,  3.48it/s]

Epoch 1 Step 3180 Loss 5.9247


 27%|██▋       | 3190/11685 [15:08<40:12,  3.52it/s]

Epoch 1 Step 3190 Loss 5.9357


 27%|██▋       | 3200/11685 [15:11<40:19,  3.51it/s]

Epoch 1 Step 3200 Loss 5.7451


 27%|██▋       | 3210/11685 [15:13<40:14,  3.51it/s]

Epoch 1 Step 3210 Loss 5.8874


 28%|██▊       | 3220/11685 [15:16<40:07,  3.52it/s]

Epoch 1 Step 3220 Loss 5.9319


 28%|██▊       | 3230/11685 [15:19<39:51,  3.54it/s]

Epoch 1 Step 3230 Loss 5.8748


 28%|██▊       | 3240/11685 [15:22<40:02,  3.52it/s]

Epoch 1 Step 3240 Loss 5.9493


 28%|██▊       | 3250/11685 [15:25<40:08,  3.50it/s]

Epoch 1 Step 3250 Loss 5.8784


 28%|██▊       | 3260/11685 [15:28<39:59,  3.51it/s]

Epoch 1 Step 3260 Loss 5.8898


 28%|██▊       | 3270/11685 [15:30<39:36,  3.54it/s]

Epoch 1 Step 3270 Loss 5.8516


 28%|██▊       | 3280/11685 [15:33<39:59,  3.50it/s]

Epoch 1 Step 3280 Loss 5.9815


 28%|██▊       | 3290/11685 [15:36<39:41,  3.52it/s]

Epoch 1 Step 3290 Loss 5.7771


 28%|██▊       | 3300/11685 [15:39<39:49,  3.51it/s]

Epoch 1 Step 3300 Loss 5.7971


 28%|██▊       | 3310/11685 [15:42<39:56,  3.49it/s]

Epoch 1 Step 3310 Loss 5.9032


 28%|██▊       | 3320/11685 [15:45<39:27,  3.53it/s]

Epoch 1 Step 3320 Loss 5.7698


 28%|██▊       | 3330/11685 [15:48<39:32,  3.52it/s]

Epoch 1 Step 3330 Loss 5.8045


 29%|██▊       | 3340/11685 [15:50<39:36,  3.51it/s]

Epoch 1 Step 3340 Loss 5.8454


 29%|██▊       | 3350/11685 [15:53<39:46,  3.49it/s]

Epoch 1 Step 3350 Loss 5.8547


 29%|██▉       | 3360/11685 [15:56<39:13,  3.54it/s]

Epoch 1 Step 3360 Loss 5.9166


 29%|██▉       | 3370/11685 [15:59<39:58,  3.47it/s]

Epoch 1 Step 3370 Loss 5.8352


 29%|██▉       | 3380/11685 [16:02<39:12,  3.53it/s]

Epoch 1 Step 3380 Loss 5.9749


 29%|██▉       | 3390/11685 [16:05<39:31,  3.50it/s]

Epoch 1 Step 3390 Loss 5.9390


 29%|██▉       | 3400/11685 [16:08<39:10,  3.52it/s]

Epoch 1 Step 3400 Loss 5.8809


 29%|██▉       | 3410/11685 [16:10<39:24,  3.50it/s]

Epoch 1 Step 3410 Loss 5.9446


 29%|██▉       | 3420/11685 [16:13<39:10,  3.52it/s]

Epoch 1 Step 3420 Loss 5.8637


 29%|██▉       | 3430/11685 [16:16<39:10,  3.51it/s]

Epoch 1 Step 3430 Loss 5.9081


 29%|██▉       | 3440/11685 [16:19<39:22,  3.49it/s]

Epoch 1 Step 3440 Loss 5.9107


 30%|██▉       | 3450/11685 [16:22<39:02,  3.52it/s]

Epoch 1 Step 3450 Loss 5.9012


 30%|██▉       | 3460/11685 [16:25<39:11,  3.50it/s]

Epoch 1 Step 3460 Loss 5.9858


 30%|██▉       | 3470/11685 [16:28<39:04,  3.50it/s]

Epoch 1 Step 3470 Loss 5.9872


 30%|██▉       | 3480/11685 [16:30<38:51,  3.52it/s]

Epoch 1 Step 3480 Loss 5.9128


 30%|██▉       | 3490/11685 [16:33<38:55,  3.51it/s]

Epoch 1 Step 3490 Loss 5.9522


 30%|██▉       | 3500/11685 [16:36<38:53,  3.51it/s]

Epoch 1 Step 3500 Loss 5.9918


 30%|███       | 3510/11685 [16:46<5:11:40,  2.29s/it]

Epoch 1 Step 3510 Loss 5.9787


 30%|███       | 3520/11685 [16:49<46:52,  2.90it/s]  

Epoch 1 Step 3520 Loss 5.9305


 30%|███       | 3530/11685 [16:52<39:04,  3.48it/s]

Epoch 1 Step 3530 Loss 5.9071


 30%|███       | 3540/11685 [16:55<38:28,  3.53it/s]

Epoch 1 Step 3540 Loss 5.7465


 30%|███       | 3550/11685 [16:57<38:38,  3.51it/s]

Epoch 1 Step 3550 Loss 5.9167


 30%|███       | 3560/11685 [17:00<38:44,  3.50it/s]

Epoch 1 Step 3560 Loss 5.7738


 31%|███       | 3570/11685 [17:03<38:35,  3.50it/s]

Epoch 1 Step 3570 Loss 5.8111


 31%|███       | 3580/11685 [17:06<38:27,  3.51it/s]

Epoch 1 Step 3580 Loss 5.8580


 31%|███       | 3590/11685 [17:09<38:28,  3.51it/s]

Epoch 1 Step 3590 Loss 5.9836


 31%|███       | 3600/11685 [17:12<38:30,  3.50it/s]

Epoch 1 Step 3600 Loss 5.8675


 31%|███       | 3610/11685 [17:15<38:34,  3.49it/s]

Epoch 1 Step 3610 Loss 5.8535


 31%|███       | 3620/11685 [17:17<38:14,  3.51it/s]

Epoch 1 Step 3620 Loss 5.8221


 31%|███       | 3630/11685 [17:20<38:11,  3.52it/s]

Epoch 1 Step 3630 Loss 5.9187


 31%|███       | 3640/11685 [17:23<38:13,  3.51it/s]

Epoch 1 Step 3640 Loss 5.8550


 31%|███       | 3650/11685 [17:26<38:22,  3.49it/s]

Epoch 1 Step 3650 Loss 5.8042


 31%|███▏      | 3660/11685 [17:29<37:57,  3.52it/s]

Epoch 1 Step 3660 Loss 5.8830


 31%|███▏      | 3670/11685 [17:32<38:00,  3.51it/s]

Epoch 1 Step 3670 Loss 5.7940


 31%|███▏      | 3680/11685 [17:35<38:03,  3.51it/s]

Epoch 1 Step 3680 Loss 5.7099


 32%|███▏      | 3690/11685 [17:37<37:50,  3.52it/s]

Epoch 1 Step 3690 Loss 5.8861


 32%|███▏      | 3700/11685 [17:40<38:02,  3.50it/s]

Epoch 1 Step 3700 Loss 5.9472


 32%|███▏      | 3710/11685 [17:43<38:05,  3.49it/s]

Epoch 1 Step 3710 Loss 5.8171


 32%|███▏      | 3720/11685 [17:46<37:51,  3.51it/s]

Epoch 1 Step 3720 Loss 5.8669


 32%|███▏      | 3730/11685 [17:49<37:44,  3.51it/s]

Epoch 1 Step 3730 Loss 5.8681


 32%|███▏      | 3740/11685 [17:52<37:39,  3.52it/s]

Epoch 1 Step 3740 Loss 5.8440


 32%|███▏      | 3750/11685 [17:54<37:39,  3.51it/s]

Epoch 1 Step 3750 Loss 5.9033


 32%|███▏      | 3760/11685 [17:57<37:39,  3.51it/s]

Epoch 1 Step 3760 Loss 5.8859


 32%|███▏      | 3770/11685 [18:00<37:30,  3.52it/s]

Epoch 1 Step 3770 Loss 5.8547


 32%|███▏      | 3780/11685 [18:03<37:39,  3.50it/s]

Epoch 1 Step 3780 Loss 5.7949


 32%|███▏      | 3790/11685 [18:06<37:30,  3.51it/s]

Epoch 1 Step 3790 Loss 5.9486


 33%|███▎      | 3800/11685 [18:09<37:19,  3.52it/s]

Epoch 1 Step 3800 Loss 5.7947


 33%|███▎      | 3810/11685 [18:12<37:24,  3.51it/s]

Epoch 1 Step 3810 Loss 5.9160


 33%|███▎      | 3820/11685 [18:14<37:14,  3.52it/s]

Epoch 1 Step 3820 Loss 6.0177


 33%|███▎      | 3830/11685 [18:17<37:13,  3.52it/s]

Epoch 1 Step 3830 Loss 5.8498


 33%|███▎      | 3840/11685 [18:20<37:24,  3.49it/s]

Epoch 1 Step 3840 Loss 6.1034


 33%|███▎      | 3850/11685 [18:23<37:05,  3.52it/s]

Epoch 1 Step 3850 Loss 5.8289


 33%|███▎      | 3860/11685 [18:26<37:04,  3.52it/s]

Epoch 1 Step 3860 Loss 5.9204


 33%|███▎      | 3870/11685 [18:29<37:03,  3.52it/s]

Epoch 1 Step 3870 Loss 5.9104


 33%|███▎      | 3880/11685 [18:31<37:13,  3.49it/s]

Epoch 1 Step 3880 Loss 5.9073


 33%|███▎      | 3890/11685 [18:34<37:01,  3.51it/s]

Epoch 1 Step 3890 Loss 5.8070


 33%|███▎      | 3895/11685 [18:36<37:03,  3.50it/s]

Epoch 1 Average Training Loss: 5.9363
Epoch 1 Test Accuracy: 0.0142


 33%|███▎      | 3905/11685 [23:18<7:56:10,  3.67s/it]  

Epoch 2 Step 10 Loss 5.8346


 34%|███▎      | 3915/11685 [23:21<49:20,  2.62it/s]  

Epoch 2 Step 20 Loss 5.9952


 34%|███▎      | 3925/11685 [23:24<37:09,  3.48it/s]

Epoch 2 Step 30 Loss 5.9094


 34%|███▎      | 3935/11685 [23:27<36:52,  3.50it/s]

Epoch 2 Step 40 Loss 5.6302


 34%|███▍      | 3945/11685 [23:30<36:53,  3.50it/s]

Epoch 2 Step 50 Loss 5.8115


 34%|███▍      | 3955/11685 [23:33<36:40,  3.51it/s]

Epoch 2 Step 60 Loss 5.8850


 34%|███▍      | 3965/11685 [23:36<36:32,  3.52it/s]

Epoch 2 Step 70 Loss 5.9476


 34%|███▍      | 3975/11685 [23:38<36:42,  3.50it/s]

Epoch 2 Step 80 Loss 5.8430


 34%|███▍      | 3985/11685 [23:41<36:26,  3.52it/s]

Epoch 2 Step 90 Loss 5.8940


 34%|███▍      | 3995/11685 [23:44<36:42,  3.49it/s]

Epoch 2 Step 100 Loss 5.8838


 34%|███▍      | 4005/11685 [23:47<36:27,  3.51it/s]

Epoch 2 Step 110 Loss 5.8135


 34%|███▍      | 4015/11685 [23:50<36:26,  3.51it/s]

Epoch 2 Step 120 Loss 5.9731


 34%|███▍      | 4025/11685 [23:53<36:28,  3.50it/s]

Epoch 2 Step 130 Loss 5.8741


 35%|███▍      | 4035/11685 [23:55<36:11,  3.52it/s]

Epoch 2 Step 140 Loss 5.8526


 35%|███▍      | 4045/11685 [23:58<36:12,  3.52it/s]

Epoch 2 Step 150 Loss 5.8380


 35%|███▍      | 4055/11685 [24:01<36:09,  3.52it/s]

Epoch 2 Step 160 Loss 5.9457


 35%|███▍      | 4065/11685 [24:04<36:12,  3.51it/s]

Epoch 2 Step 170 Loss 6.0203


 35%|███▍      | 4075/11685 [24:07<36:05,  3.51it/s]

Epoch 2 Step 180 Loss 5.8844


 35%|███▍      | 4085/11685 [24:10<36:03,  3.51it/s]

Epoch 2 Step 190 Loss 5.7353


 35%|███▌      | 4095/11685 [24:13<36:03,  3.51it/s]

Epoch 2 Step 200 Loss 5.8368


 35%|███▌      | 4105/11685 [24:15<35:56,  3.51it/s]

Epoch 2 Step 210 Loss 5.8505


 35%|███▌      | 4115/11685 [24:18<36:00,  3.50it/s]

Epoch 2 Step 220 Loss 5.8765


 35%|███▌      | 4125/11685 [24:21<35:44,  3.53it/s]

Epoch 2 Step 230 Loss 5.9314


 35%|███▌      | 4135/11685 [24:24<35:46,  3.52it/s]

Epoch 2 Step 240 Loss 5.9152


 35%|███▌      | 4145/11685 [24:27<35:47,  3.51it/s]

Epoch 2 Step 250 Loss 5.8848


 36%|███▌      | 4155/11685 [24:30<35:37,  3.52it/s]

Epoch 2 Step 260 Loss 5.8298


 36%|███▌      | 4165/11685 [24:32<35:50,  3.50it/s]

Epoch 2 Step 270 Loss 5.8955


 36%|███▌      | 4175/11685 [24:35<35:42,  3.51it/s]

Epoch 2 Step 280 Loss 5.8107


 36%|███▌      | 4185/11685 [24:38<35:41,  3.50it/s]

Epoch 2 Step 290 Loss 5.8879


 36%|███▌      | 4195/11685 [24:41<35:40,  3.50it/s]

Epoch 2 Step 300 Loss 5.8344


 36%|███▌      | 4205/11685 [24:44<35:30,  3.51it/s]

Epoch 2 Step 310 Loss 5.7758


 36%|███▌      | 4215/11685 [24:47<35:28,  3.51it/s]

Epoch 2 Step 320 Loss 5.7955


 36%|███▌      | 4225/11685 [24:50<35:28,  3.50it/s]

Epoch 2 Step 330 Loss 5.9501


 36%|███▌      | 4235/11685 [24:52<35:24,  3.51it/s]

Epoch 2 Step 340 Loss 5.9173


 36%|███▋      | 4245/11685 [24:55<35:17,  3.51it/s]

Epoch 2 Step 350 Loss 6.0112


 36%|███▋      | 4255/11685 [24:58<35:20,  3.50it/s]

Epoch 2 Step 360 Loss 5.8855


 36%|███▋      | 4265/11685 [25:01<35:05,  3.52it/s]

Epoch 2 Step 370 Loss 5.9139


 37%|███▋      | 4275/11685 [25:04<35:08,  3.51it/s]

Epoch 2 Step 380 Loss 5.8372


 37%|███▋      | 4285/11685 [25:07<35:08,  3.51it/s]

Epoch 2 Step 390 Loss 5.7609


 37%|███▋      | 4295/11685 [25:10<35:08,  3.50it/s]

Epoch 2 Step 400 Loss 5.8963


 37%|███▋      | 4305/11685 [25:12<35:22,  3.48it/s]

Epoch 2 Step 410 Loss 5.9841


 37%|███▋      | 4315/11685 [25:15<34:57,  3.51it/s]

Epoch 2 Step 420 Loss 5.9977


 37%|███▋      | 4325/11685 [25:18<34:56,  3.51it/s]

Epoch 2 Step 430 Loss 5.9596


 37%|███▋      | 4335/11685 [25:21<34:52,  3.51it/s]

Epoch 2 Step 440 Loss 5.9387


 37%|███▋      | 4345/11685 [25:24<34:47,  3.52it/s]

Epoch 2 Step 450 Loss 5.9108


 37%|███▋      | 4355/11685 [25:27<34:42,  3.52it/s]

Epoch 2 Step 460 Loss 5.7489


 37%|███▋      | 4365/11685 [25:29<34:40,  3.52it/s]

Epoch 2 Step 470 Loss 5.7906


 37%|███▋      | 4375/11685 [25:32<34:41,  3.51it/s]

Epoch 2 Step 480 Loss 5.8705


 38%|███▊      | 4385/11685 [25:35<34:49,  3.49it/s]

Epoch 2 Step 490 Loss 5.6825


 38%|███▊      | 4395/11685 [25:38<34:40,  3.50it/s]

Epoch 2 Step 500 Loss 6.0053


 38%|███▊      | 4405/11685 [25:41<34:36,  3.51it/s]

Epoch 2 Step 510 Loss 5.7797


 38%|███▊      | 4415/11685 [25:44<34:31,  3.51it/s]

Epoch 2 Step 520 Loss 5.8678


 38%|███▊      | 4425/11685 [25:47<34:31,  3.51it/s]

Epoch 2 Step 530 Loss 5.9228


 38%|███▊      | 4435/11685 [25:49<34:25,  3.51it/s]

Epoch 2 Step 540 Loss 5.8020


 38%|███▊      | 4445/11685 [25:52<34:20,  3.51it/s]

Epoch 2 Step 550 Loss 5.8026


 38%|███▊      | 4455/11685 [25:55<34:13,  3.52it/s]

Epoch 2 Step 560 Loss 5.9128


 38%|███▊      | 4465/11685 [25:58<34:14,  3.51it/s]

Epoch 2 Step 570 Loss 5.8391


 38%|███▊      | 4475/11685 [26:01<34:16,  3.51it/s]

Epoch 2 Step 580 Loss 5.8219


 38%|███▊      | 4485/11685 [26:04<34:12,  3.51it/s]

Epoch 2 Step 590 Loss 5.7505


 38%|███▊      | 4495/11685 [26:07<34:03,  3.52it/s]

Epoch 2 Step 600 Loss 5.7745


 39%|███▊      | 4505/11685 [26:09<34:01,  3.52it/s]

Epoch 2 Step 610 Loss 5.8986


 39%|███▊      | 4515/11685 [26:12<33:56,  3.52it/s]

Epoch 2 Step 620 Loss 5.8690


 39%|███▊      | 4525/11685 [26:15<33:58,  3.51it/s]

Epoch 2 Step 630 Loss 5.7741


 39%|███▉      | 4535/11685 [26:18<33:55,  3.51it/s]

Epoch 2 Step 640 Loss 5.8792


 39%|███▉      | 4545/11685 [26:21<33:51,  3.51it/s]

Epoch 2 Step 650 Loss 5.9395


 39%|███▉      | 4555/11685 [26:24<33:56,  3.50it/s]

Epoch 2 Step 660 Loss 5.8821


 39%|███▉      | 4565/11685 [26:26<33:49,  3.51it/s]

Epoch 2 Step 670 Loss 5.9673


 39%|███▉      | 4575/11685 [26:29<33:43,  3.51it/s]

Epoch 2 Step 680 Loss 6.0003


 39%|███▉      | 4585/11685 [26:32<33:39,  3.52it/s]

Epoch 2 Step 690 Loss 5.6802


 39%|███▉      | 4595/11685 [26:35<33:32,  3.52it/s]

Epoch 2 Step 700 Loss 5.7645


 39%|███▉      | 4605/11685 [26:38<33:35,  3.51it/s]

Epoch 2 Step 710 Loss 5.9639


 39%|███▉      | 4615/11685 [26:41<33:39,  3.50it/s]

Epoch 2 Step 720 Loss 5.8895


 40%|███▉      | 4625/11685 [26:44<33:29,  3.51it/s]

Epoch 2 Step 730 Loss 5.8480


 40%|███▉      | 4635/11685 [26:46<33:38,  3.49it/s]

Epoch 2 Step 740 Loss 5.7660


 40%|███▉      | 4645/11685 [26:49<33:24,  3.51it/s]

Epoch 2 Step 750 Loss 5.7086


 40%|███▉      | 4655/11685 [26:52<33:24,  3.51it/s]

Epoch 2 Step 760 Loss 5.8970


 40%|███▉      | 4665/11685 [26:55<33:14,  3.52it/s]

Epoch 2 Step 770 Loss 5.7286


 40%|████      | 4675/11685 [26:58<33:08,  3.53it/s]

Epoch 2 Step 780 Loss 5.8833


 40%|████      | 4685/11685 [27:01<33:14,  3.51it/s]

Epoch 2 Step 790 Loss 5.8593


 40%|████      | 4695/11685 [27:03<33:08,  3.51it/s]

Epoch 2 Step 800 Loss 5.8216


 40%|████      | 4705/11685 [27:06<32:57,  3.53it/s]

Epoch 2 Step 810 Loss 5.8765


 40%|████      | 4715/11685 [27:09<33:06,  3.51it/s]

Epoch 2 Step 820 Loss 5.8350


 40%|████      | 4725/11685 [27:12<33:00,  3.51it/s]

Epoch 2 Step 830 Loss 5.7264


 41%|████      | 4735/11685 [27:15<33:02,  3.51it/s]

Epoch 2 Step 840 Loss 5.9014


 41%|████      | 4745/11685 [27:18<33:00,  3.50it/s]

Epoch 2 Step 850 Loss 5.9385


 41%|████      | 4755/11685 [27:21<32:58,  3.50it/s]

Epoch 2 Step 860 Loss 5.9572


 41%|████      | 4765/11685 [27:23<32:52,  3.51it/s]

Epoch 2 Step 870 Loss 5.7916


 41%|████      | 4775/11685 [27:26<32:42,  3.52it/s]

Epoch 2 Step 880 Loss 5.7933


 41%|████      | 4785/11685 [27:29<32:45,  3.51it/s]

Epoch 2 Step 890 Loss 5.8766


 41%|████      | 4795/11685 [27:32<32:28,  3.54it/s]

Epoch 2 Step 900 Loss 5.8981


 41%|████      | 4805/11685 [27:35<32:32,  3.52it/s]

Epoch 2 Step 910 Loss 5.8023


 41%|████      | 4815/11685 [27:38<32:39,  3.51it/s]

Epoch 2 Step 920 Loss 5.8469


 41%|████▏     | 4825/11685 [27:40<32:37,  3.50it/s]

Epoch 2 Step 930 Loss 5.8436


 41%|████▏     | 4835/11685 [27:43<32:32,  3.51it/s]

Epoch 2 Step 940 Loss 5.9329


 41%|████▏     | 4845/11685 [27:46<32:15,  3.53it/s]

Epoch 2 Step 950 Loss 5.8973


 42%|████▏     | 4855/11685 [27:49<32:24,  3.51it/s]

Epoch 2 Step 960 Loss 5.8524


 42%|████▏     | 4865/11685 [27:52<32:20,  3.51it/s]

Epoch 2 Step 970 Loss 5.8508


 42%|████▏     | 4875/11685 [27:55<32:12,  3.52it/s]

Epoch 2 Step 980 Loss 5.7836


 42%|████▏     | 4885/11685 [27:58<32:08,  3.53it/s]

Epoch 2 Step 990 Loss 5.8438


 42%|████▏     | 4895/11685 [28:00<32:23,  3.49it/s]

Epoch 2 Step 1000 Loss 5.9551


 42%|████▏     | 4905/11685 [28:03<32:13,  3.51it/s]

Epoch 2 Step 1010 Loss 5.9023


 42%|████▏     | 4915/11685 [28:06<31:59,  3.53it/s]

Epoch 2 Step 1020 Loss 5.9175


 42%|████▏     | 4925/11685 [28:09<32:09,  3.50it/s]

Epoch 2 Step 1030 Loss 5.8424


 42%|████▏     | 4935/11685 [28:12<32:05,  3.51it/s]

Epoch 2 Step 1040 Loss 5.9186


 42%|████▏     | 4945/11685 [28:15<31:50,  3.53it/s]

Epoch 2 Step 1050 Loss 5.7519


 42%|████▏     | 4955/11685 [28:17<31:58,  3.51it/s]

Epoch 2 Step 1060 Loss 5.8147


 42%|████▏     | 4965/11685 [28:20<31:56,  3.51it/s]

Epoch 2 Step 1070 Loss 5.6233


 43%|████▎     | 4975/11685 [28:23<31:42,  3.53it/s]

Epoch 2 Step 1080 Loss 5.9481


 43%|████▎     | 4985/11685 [28:26<31:40,  3.53it/s]

Epoch 2 Step 1090 Loss 5.9102


 43%|████▎     | 4995/11685 [28:29<31:41,  3.52it/s]

Epoch 2 Step 1100 Loss 5.9067


 43%|████▎     | 5005/11685 [28:32<31:35,  3.53it/s]

Epoch 2 Step 1110 Loss 5.8466


 43%|████▎     | 5015/11685 [28:35<31:33,  3.52it/s]

Epoch 2 Step 1120 Loss 5.8134


 43%|████▎     | 5025/11685 [28:37<31:32,  3.52it/s]

Epoch 2 Step 1130 Loss 5.9530


 43%|████▎     | 5035/11685 [28:40<31:24,  3.53it/s]

Epoch 2 Step 1140 Loss 5.9103


 43%|████▎     | 5045/11685 [28:43<31:31,  3.51it/s]

Epoch 2 Step 1150 Loss 5.8283


 43%|████▎     | 5055/11685 [28:46<31:24,  3.52it/s]

Epoch 2 Step 1160 Loss 5.9664


 43%|████▎     | 5065/11685 [28:49<31:25,  3.51it/s]

Epoch 2 Step 1170 Loss 5.8359


 43%|████▎     | 5075/11685 [28:52<31:22,  3.51it/s]

Epoch 2 Step 1180 Loss 5.9087


 44%|████▎     | 5085/11685 [28:54<31:15,  3.52it/s]

Epoch 2 Step 1190 Loss 5.7810


 44%|████▎     | 5095/11685 [28:57<31:17,  3.51it/s]

Epoch 2 Step 1200 Loss 5.7349


 44%|████▎     | 5105/11685 [29:00<31:07,  3.52it/s]

Epoch 2 Step 1210 Loss 5.7338


 44%|████▍     | 5115/11685 [29:03<31:08,  3.52it/s]

Epoch 2 Step 1220 Loss 5.7842


 44%|████▍     | 5125/11685 [29:06<31:12,  3.50it/s]

Epoch 2 Step 1230 Loss 5.9032


 44%|████▍     | 5135/11685 [29:09<31:11,  3.50it/s]

Epoch 2 Step 1240 Loss 5.9171


 44%|████▍     | 5145/11685 [29:12<30:52,  3.53it/s]

Epoch 2 Step 1250 Loss 5.8103


 44%|████▍     | 5155/11685 [29:14<30:56,  3.52it/s]

Epoch 2 Step 1260 Loss 5.6707


 44%|████▍     | 5165/11685 [29:17<31:03,  3.50it/s]

Epoch 2 Step 1270 Loss 5.7866


 44%|████▍     | 5175/11685 [29:20<30:46,  3.53it/s]

Epoch 2 Step 1280 Loss 5.9027


 44%|████▍     | 5185/11685 [29:23<30:51,  3.51it/s]

Epoch 2 Step 1290 Loss 5.9420


 44%|████▍     | 5195/11685 [29:26<30:53,  3.50it/s]

Epoch 2 Step 1300 Loss 5.9578


 45%|████▍     | 5205/11685 [29:29<30:35,  3.53it/s]

Epoch 2 Step 1310 Loss 5.7577


 45%|████▍     | 5215/11685 [29:31<30:38,  3.52it/s]

Epoch 2 Step 1320 Loss 5.7464


 45%|████▍     | 5225/11685 [29:34<30:33,  3.52it/s]

Epoch 2 Step 1330 Loss 5.8692


 45%|████▍     | 5235/11685 [29:37<30:37,  3.51it/s]

Epoch 2 Step 1340 Loss 5.8098


 45%|████▍     | 5245/11685 [29:40<30:31,  3.52it/s]

Epoch 2 Step 1350 Loss 5.7826


 45%|████▍     | 5255/11685 [29:43<30:25,  3.52it/s]

Epoch 2 Step 1360 Loss 5.8521


 45%|████▌     | 5265/11685 [29:46<30:26,  3.51it/s]

Epoch 2 Step 1370 Loss 5.9423


 45%|████▌     | 5275/11685 [29:48<30:22,  3.52it/s]

Epoch 2 Step 1380 Loss 5.9352


 45%|████▌     | 5285/11685 [29:51<30:25,  3.51it/s]

Epoch 2 Step 1390 Loss 5.8741


 45%|████▌     | 5295/11685 [29:54<30:17,  3.52it/s]

Epoch 2 Step 1400 Loss 5.8940


 45%|████▌     | 5305/11685 [29:57<30:22,  3.50it/s]

Epoch 2 Step 1410 Loss 5.8357


 45%|████▌     | 5315/11685 [30:00<30:14,  3.51it/s]

Epoch 2 Step 1420 Loss 5.8071


 46%|████▌     | 5325/11685 [30:03<30:08,  3.52it/s]

Epoch 2 Step 1430 Loss 5.7183


 46%|████▌     | 5335/11685 [30:06<30:06,  3.52it/s]

Epoch 2 Step 1440 Loss 5.8750


 46%|████▌     | 5345/11685 [30:08<29:58,  3.52it/s]

Epoch 2 Step 1450 Loss 5.8510


 46%|████▌     | 5355/11685 [30:11<29:59,  3.52it/s]

Epoch 2 Step 1460 Loss 6.0627


 46%|████▌     | 5365/11685 [30:14<29:56,  3.52it/s]

Epoch 2 Step 1470 Loss 5.7729


 46%|████▌     | 5375/11685 [30:17<29:54,  3.52it/s]

Epoch 2 Step 1480 Loss 5.8921


 46%|████▌     | 5385/11685 [30:20<29:50,  3.52it/s]

Epoch 2 Step 1490 Loss 5.7259


 46%|████▌     | 5395/11685 [30:23<29:55,  3.50it/s]

Epoch 2 Step 1500 Loss 5.9189


 46%|████▋     | 5405/11685 [30:25<29:45,  3.52it/s]

Epoch 2 Step 1510 Loss 6.0068


 46%|████▋     | 5415/11685 [30:28<29:42,  3.52it/s]

Epoch 2 Step 1520 Loss 5.8561


 46%|████▋     | 5425/11685 [30:31<29:41,  3.51it/s]

Epoch 2 Step 1530 Loss 5.8427


 47%|████▋     | 5435/11685 [30:34<29:36,  3.52it/s]

Epoch 2 Step 1540 Loss 5.8875


 47%|████▋     | 5445/11685 [30:37<29:31,  3.52it/s]

Epoch 2 Step 1550 Loss 5.7171


 47%|████▋     | 5455/11685 [30:40<29:36,  3.51it/s]

Epoch 2 Step 1560 Loss 5.8723


 47%|████▋     | 5465/11685 [30:43<29:28,  3.52it/s]

Epoch 2 Step 1570 Loss 5.5651


 47%|████▋     | 5475/11685 [30:45<29:23,  3.52it/s]

Epoch 2 Step 1580 Loss 5.7027


 47%|████▋     | 5485/11685 [30:48<29:17,  3.53it/s]

Epoch 2 Step 1590 Loss 5.8443


 47%|████▋     | 5495/11685 [30:51<29:21,  3.51it/s]

Epoch 2 Step 1600 Loss 5.8590


 47%|████▋     | 5505/11685 [30:54<29:19,  3.51it/s]

Epoch 2 Step 1610 Loss 5.6685


 47%|████▋     | 5515/11685 [30:57<29:07,  3.53it/s]

Epoch 2 Step 1620 Loss 5.5734


 47%|████▋     | 5525/11685 [31:00<29:08,  3.52it/s]

Epoch 2 Step 1630 Loss 5.8197


 47%|████▋     | 5535/11685 [31:02<29:13,  3.51it/s]

Epoch 2 Step 1640 Loss 5.9243


 47%|████▋     | 5545/11685 [31:05<29:03,  3.52it/s]

Epoch 2 Step 1650 Loss 5.9457


 48%|████▊     | 5555/11685 [31:08<28:53,  3.54it/s]

Epoch 2 Step 1660 Loss 5.9839


 48%|████▊     | 5565/11685 [31:11<28:58,  3.52it/s]

Epoch 2 Step 1670 Loss 5.7432


 48%|████▊     | 5575/11685 [31:14<28:56,  3.52it/s]

Epoch 2 Step 1680 Loss 5.9281


 48%|████▊     | 5585/11685 [31:17<29:00,  3.51it/s]

Epoch 2 Step 1690 Loss 5.8432


 48%|████▊     | 5595/11685 [31:20<28:55,  3.51it/s]

Epoch 2 Step 1700 Loss 6.0046


 48%|████▊     | 5605/11685 [31:22<28:41,  3.53it/s]

Epoch 2 Step 1710 Loss 5.9199


 48%|████▊     | 5615/11685 [31:25<28:44,  3.52it/s]

Epoch 2 Step 1720 Loss 5.7624


 48%|████▊     | 5625/11685 [31:28<28:38,  3.53it/s]

Epoch 2 Step 1730 Loss 6.0314


 48%|████▊     | 5635/11685 [31:31<28:46,  3.51it/s]

Epoch 2 Step 1740 Loss 5.7926


 48%|████▊     | 5645/11685 [31:34<28:48,  3.49it/s]

Epoch 2 Step 1750 Loss 5.8109


 48%|████▊     | 5655/11685 [31:37<28:36,  3.51it/s]

Epoch 2 Step 1760 Loss 6.1086


 48%|████▊     | 5665/11685 [31:39<28:36,  3.51it/s]

Epoch 2 Step 1770 Loss 5.8189


 49%|████▊     | 5675/11685 [31:42<28:27,  3.52it/s]

Epoch 2 Step 1780 Loss 5.8962


 49%|████▊     | 5685/11685 [31:45<28:21,  3.53it/s]

Epoch 2 Step 1790 Loss 5.9174


 49%|████▊     | 5695/11685 [31:48<28:22,  3.52it/s]

Epoch 2 Step 1800 Loss 5.8686


 49%|████▉     | 5705/11685 [31:51<28:22,  3.51it/s]

Epoch 2 Step 1810 Loss 5.8509


 49%|████▉     | 5715/11685 [31:54<28:20,  3.51it/s]

Epoch 2 Step 1820 Loss 5.6735


 49%|████▉     | 5725/11685 [31:57<28:15,  3.52it/s]

Epoch 2 Step 1830 Loss 5.9289


 49%|████▉     | 5735/11685 [31:59<28:09,  3.52it/s]

Epoch 2 Step 1840 Loss 5.7844


 49%|████▉     | 5745/11685 [32:02<28:12,  3.51it/s]

Epoch 2 Step 1850 Loss 5.8300


 49%|████▉     | 5755/11685 [32:05<28:02,  3.52it/s]

Epoch 2 Step 1860 Loss 5.7148


 49%|████▉     | 5765/11685 [32:08<28:04,  3.51it/s]

Epoch 2 Step 1870 Loss 5.8133


 49%|████▉     | 5775/11685 [32:11<27:56,  3.52it/s]

Epoch 2 Step 1880 Loss 5.7904


 50%|████▉     | 5785/11685 [32:14<28:00,  3.51it/s]

Epoch 2 Step 1890 Loss 5.8213


 50%|████▉     | 5795/11685 [32:16<27:52,  3.52it/s]

Epoch 2 Step 1900 Loss 5.8801


 50%|████▉     | 5805/11685 [32:19<27:54,  3.51it/s]

Epoch 2 Step 1910 Loss 6.0148


 50%|████▉     | 5815/11685 [32:22<27:48,  3.52it/s]

Epoch 2 Step 1920 Loss 5.6894


 50%|████▉     | 5825/11685 [32:25<27:47,  3.51it/s]

Epoch 2 Step 1930 Loss 5.8774


 50%|████▉     | 5835/11685 [32:28<27:38,  3.53it/s]

Epoch 2 Step 1940 Loss 5.7836


 50%|█████     | 5845/11685 [32:31<27:42,  3.51it/s]

Epoch 2 Step 1950 Loss 5.8807


 50%|█████     | 5855/11685 [32:33<27:36,  3.52it/s]

Epoch 2 Step 1960 Loss 5.8195


 50%|█████     | 5865/11685 [32:36<27:33,  3.52it/s]

Epoch 2 Step 1970 Loss 5.7950


 50%|█████     | 5875/11685 [32:39<27:35,  3.51it/s]

Epoch 2 Step 1980 Loss 5.9470


 50%|█████     | 5885/11685 [32:42<27:29,  3.52it/s]

Epoch 2 Step 1990 Loss 5.9612


 50%|█████     | 5895/11685 [32:45<27:28,  3.51it/s]

Epoch 2 Step 2000 Loss 5.8167


 51%|█████     | 5905/11685 [32:48<27:27,  3.51it/s]

Epoch 2 Step 2010 Loss 5.7064


 51%|█████     | 5915/11685 [32:51<27:24,  3.51it/s]

Epoch 2 Step 2020 Loss 5.9879


 51%|█████     | 5925/11685 [32:53<27:16,  3.52it/s]

Epoch 2 Step 2030 Loss 5.7528


 51%|█████     | 5935/11685 [32:56<27:12,  3.52it/s]

Epoch 2 Step 2040 Loss 5.6943


 51%|█████     | 5945/11685 [32:59<27:13,  3.51it/s]

Epoch 2 Step 2050 Loss 5.8407


 51%|█████     | 5955/11685 [33:02<27:11,  3.51it/s]

Epoch 2 Step 2060 Loss 5.8190


 51%|█████     | 5965/11685 [33:05<26:57,  3.54it/s]

Epoch 2 Step 2070 Loss 5.8638


 51%|█████     | 5975/11685 [33:08<27:07,  3.51it/s]

Epoch 2 Step 2080 Loss 6.0742


 51%|█████     | 5985/11685 [33:10<27:00,  3.52it/s]

Epoch 2 Step 2090 Loss 5.8571


 51%|█████▏    | 5995/11685 [33:13<26:57,  3.52it/s]

Epoch 2 Step 2100 Loss 6.0207


 51%|█████▏    | 6005/11685 [33:16<26:56,  3.51it/s]

Epoch 2 Step 2110 Loss 5.8623


 51%|█████▏    | 6015/11685 [33:19<26:53,  3.51it/s]

Epoch 2 Step 2120 Loss 5.8679


 52%|█████▏    | 6025/11685 [33:22<26:51,  3.51it/s]

Epoch 2 Step 2130 Loss 5.9000


 52%|█████▏    | 6035/11685 [33:25<26:46,  3.52it/s]

Epoch 2 Step 2140 Loss 6.0096


 52%|█████▏    | 6045/11685 [33:28<26:45,  3.51it/s]

Epoch 2 Step 2150 Loss 5.9486


 52%|█████▏    | 6055/11685 [33:30<26:45,  3.51it/s]

Epoch 2 Step 2160 Loss 5.8512


 52%|█████▏    | 6065/11685 [33:33<26:35,  3.52it/s]

Epoch 2 Step 2170 Loss 5.7567


 52%|█████▏    | 6075/11685 [33:36<26:28,  3.53it/s]

Epoch 2 Step 2180 Loss 5.8997


 52%|█████▏    | 6085/11685 [33:39<26:30,  3.52it/s]

Epoch 2 Step 2190 Loss 5.9791


 52%|█████▏    | 6095/11685 [33:42<26:33,  3.51it/s]

Epoch 2 Step 2200 Loss 5.9680


 52%|█████▏    | 6105/11685 [33:45<26:26,  3.52it/s]

Epoch 2 Step 2210 Loss 5.7388


 52%|█████▏    | 6115/11685 [33:47<26:30,  3.50it/s]

Epoch 2 Step 2220 Loss 5.8721


 52%|█████▏    | 6125/11685 [33:50<26:16,  3.53it/s]

Epoch 2 Step 2230 Loss 5.8898


 53%|█████▎    | 6135/11685 [33:53<26:12,  3.53it/s]

Epoch 2 Step 2240 Loss 5.9429


 53%|█████▎    | 6145/11685 [33:56<26:18,  3.51it/s]

Epoch 2 Step 2250 Loss 5.8180


 53%|█████▎    | 6155/11685 [33:59<26:06,  3.53it/s]

Epoch 2 Step 2260 Loss 5.8249


 53%|█████▎    | 6165/11685 [34:02<26:07,  3.52it/s]

Epoch 2 Step 2270 Loss 5.7157


 53%|█████▎    | 6175/11685 [34:05<26:13,  3.50it/s]

Epoch 2 Step 2280 Loss 5.9060


 53%|█████▎    | 6185/11685 [34:07<26:05,  3.51it/s]

Epoch 2 Step 2290 Loss 5.8916


 53%|█████▎    | 6195/11685 [34:10<26:03,  3.51it/s]

Epoch 2 Step 2300 Loss 5.7742


 53%|█████▎    | 6205/11685 [34:13<25:56,  3.52it/s]

Epoch 2 Step 2310 Loss 5.6462


 53%|█████▎    | 6215/11685 [34:16<25:54,  3.52it/s]

Epoch 2 Step 2320 Loss 5.9925


 53%|█████▎    | 6225/11685 [34:19<25:49,  3.52it/s]

Epoch 2 Step 2330 Loss 5.7399


 53%|█████▎    | 6235/11685 [34:22<25:53,  3.51it/s]

Epoch 2 Step 2340 Loss 5.7871


 53%|█████▎    | 6245/11685 [34:24<26:12,  3.46it/s]

Epoch 2 Step 2350 Loss 5.7678


 54%|█████▎    | 6255/11685 [34:27<25:43,  3.52it/s]

Epoch 2 Step 2360 Loss 5.8957


 54%|█████▎    | 6265/11685 [34:30<25:58,  3.48it/s]

Epoch 2 Step 2370 Loss 5.8320


 54%|█████▎    | 6275/11685 [34:33<25:41,  3.51it/s]

Epoch 2 Step 2380 Loss 5.9526


 54%|█████▍    | 6285/11685 [34:36<25:31,  3.53it/s]

Epoch 2 Step 2390 Loss 5.8796


 54%|█████▍    | 6295/11685 [34:39<25:58,  3.46it/s]

Epoch 2 Step 2400 Loss 5.7672


 54%|█████▍    | 6305/11685 [34:42<25:41,  3.49it/s]

Epoch 2 Step 2410 Loss 5.8831


 54%|█████▍    | 6315/11685 [34:44<25:38,  3.49it/s]

Epoch 2 Step 2420 Loss 5.8231


 54%|█████▍    | 6325/11685 [34:47<25:28,  3.51it/s]

Epoch 2 Step 2430 Loss 5.8971


 54%|█████▍    | 6335/11685 [34:50<25:36,  3.48it/s]

Epoch 2 Step 2440 Loss 5.7952


 54%|█████▍    | 6345/11685 [34:53<25:20,  3.51it/s]

Epoch 2 Step 2450 Loss 5.9220


 54%|█████▍    | 6355/11685 [34:56<25:27,  3.49it/s]

Epoch 2 Step 2460 Loss 5.7958


 54%|█████▍    | 6365/11685 [34:59<25:17,  3.51it/s]

Epoch 2 Step 2470 Loss 5.7195


 55%|█████▍    | 6375/11685 [35:02<25:25,  3.48it/s]

Epoch 2 Step 2480 Loss 5.8252


 55%|█████▍    | 6385/11685 [35:05<25:15,  3.50it/s]

Epoch 2 Step 2490 Loss 5.7868


 55%|█████▍    | 6395/11685 [35:07<25:10,  3.50it/s]

Epoch 2 Step 2500 Loss 5.9232


 55%|█████▍    | 6405/11685 [35:10<25:05,  3.51it/s]

Epoch 2 Step 2510 Loss 5.8701


 55%|█████▍    | 6415/11685 [35:13<25:08,  3.49it/s]

Epoch 2 Step 2520 Loss 5.8409


 55%|█████▍    | 6425/11685 [35:16<25:04,  3.50it/s]

Epoch 2 Step 2530 Loss 5.9266


 55%|█████▌    | 6435/11685 [35:19<25:04,  3.49it/s]

Epoch 2 Step 2540 Loss 5.7160


 55%|█████▌    | 6445/11685 [35:22<24:48,  3.52it/s]

Epoch 2 Step 2550 Loss 5.8289


 55%|█████▌    | 6455/11685 [35:25<24:51,  3.51it/s]

Epoch 2 Step 2560 Loss 5.9836


 55%|█████▌    | 6465/11685 [35:27<24:55,  3.49it/s]

Epoch 2 Step 2570 Loss 5.7042


 55%|█████▌    | 6475/11685 [35:30<24:40,  3.52it/s]

Epoch 2 Step 2580 Loss 5.8806


 55%|█████▌    | 6485/11685 [35:33<25:06,  3.45it/s]

Epoch 2 Step 2590 Loss 5.7246


 56%|█████▌    | 6495/11685 [35:36<25:03,  3.45it/s]

Epoch 2 Step 2600 Loss 5.8841


 56%|█████▌    | 6505/11685 [35:39<24:46,  3.48it/s]

Epoch 2 Step 2610 Loss 5.8050


 56%|█████▌    | 6515/11685 [35:42<24:35,  3.50it/s]

Epoch 2 Step 2620 Loss 5.8726


 56%|█████▌    | 6525/11685 [35:45<24:36,  3.49it/s]

Epoch 2 Step 2630 Loss 5.9260


 56%|█████▌    | 6535/11685 [35:48<24:25,  3.51it/s]

Epoch 2 Step 2640 Loss 5.9542


 56%|█████▌    | 6545/11685 [35:50<24:15,  3.53it/s]

Epoch 2 Step 2650 Loss 5.8246


 56%|█████▌    | 6555/11685 [35:53<24:14,  3.53it/s]

Epoch 2 Step 2660 Loss 5.8289


 56%|█████▌    | 6565/11685 [35:56<24:28,  3.49it/s]

Epoch 2 Step 2670 Loss 5.6586


 56%|█████▋    | 6575/11685 [35:59<24:19,  3.50it/s]

Epoch 2 Step 2680 Loss 5.9434


 56%|█████▋    | 6585/11685 [36:02<24:15,  3.50it/s]

Epoch 2 Step 2690 Loss 5.8376


 56%|█████▋    | 6595/11685 [36:05<24:09,  3.51it/s]

Epoch 2 Step 2700 Loss 5.8735


 57%|█████▋    | 6605/11685 [36:07<24:06,  3.51it/s]

Epoch 2 Step 2710 Loss 5.8960


 57%|█████▋    | 6615/11685 [36:10<24:12,  3.49it/s]

Epoch 2 Step 2720 Loss 5.9889


 57%|█████▋    | 6625/11685 [36:13<23:51,  3.54it/s]

Epoch 2 Step 2730 Loss 5.7111


 57%|█████▋    | 6635/11685 [36:16<23:50,  3.53it/s]

Epoch 2 Step 2740 Loss 5.8697


 57%|█████▋    | 6645/11685 [36:19<23:39,  3.55it/s]

Epoch 2 Step 2750 Loss 5.8928


 57%|█████▋    | 6655/11685 [36:22<23:48,  3.52it/s]

Epoch 2 Step 2760 Loss 5.8260


 57%|█████▋    | 6665/11685 [36:25<23:40,  3.53it/s]

Epoch 2 Step 2770 Loss 5.8249


 57%|█████▋    | 6675/11685 [36:27<23:43,  3.52it/s]

Epoch 2 Step 2780 Loss 5.8858


 57%|█████▋    | 6685/11685 [36:30<23:58,  3.48it/s]

Epoch 2 Step 2790 Loss 5.8219


 57%|█████▋    | 6695/11685 [36:33<23:32,  3.53it/s]

Epoch 2 Step 2800 Loss 5.7841


 57%|█████▋    | 6705/11685 [36:36<23:52,  3.48it/s]

Epoch 2 Step 2810 Loss 5.9155


 57%|█████▋    | 6715/11685 [36:39<23:30,  3.52it/s]

Epoch 2 Step 2820 Loss 5.9063


 58%|█████▊    | 6725/11685 [36:42<23:39,  3.49it/s]

Epoch 2 Step 2830 Loss 5.9050


 58%|█████▊    | 6735/11685 [36:45<23:29,  3.51it/s]

Epoch 2 Step 2840 Loss 5.7660


 58%|█████▊    | 6745/11685 [36:47<23:38,  3.48it/s]

Epoch 2 Step 2850 Loss 5.7266


 58%|█████▊    | 6755/11685 [36:50<23:17,  3.53it/s]

Epoch 2 Step 2860 Loss 5.8539


 58%|█████▊    | 6765/11685 [36:53<23:09,  3.54it/s]

Epoch 2 Step 2870 Loss 5.8453


 58%|█████▊    | 6775/11685 [36:56<23:04,  3.55it/s]

Epoch 2 Step 2880 Loss 5.8868


 58%|█████▊    | 6785/11685 [36:59<23:15,  3.51it/s]

Epoch 2 Step 2890 Loss 5.9105


 58%|█████▊    | 6795/11685 [37:02<23:06,  3.53it/s]

Epoch 2 Step 2900 Loss 5.8340


 58%|█████▊    | 6805/11685 [37:04<23:07,  3.52it/s]

Epoch 2 Step 2910 Loss 5.9041


 58%|█████▊    | 6815/11685 [37:07<23:03,  3.52it/s]

Epoch 2 Step 2920 Loss 5.8776


 58%|█████▊    | 6825/11685 [37:10<23:13,  3.49it/s]

Epoch 2 Step 2930 Loss 5.9034


 58%|█████▊    | 6835/11685 [37:13<22:55,  3.53it/s]

Epoch 2 Step 2940 Loss 5.8430


 59%|█████▊    | 6845/11685 [37:16<22:50,  3.53it/s]

Epoch 2 Step 2950 Loss 5.9055


 59%|█████▊    | 6855/11685 [37:19<22:48,  3.53it/s]

Epoch 2 Step 2960 Loss 5.7816


 59%|█████▉    | 6865/11685 [37:22<22:50,  3.52it/s]

Epoch 2 Step 2970 Loss 5.8382


 59%|█████▉    | 6875/11685 [37:24<22:43,  3.53it/s]

Epoch 2 Step 2980 Loss 5.8641


 59%|█████▉    | 6885/11685 [37:27<22:53,  3.50it/s]

Epoch 2 Step 2990 Loss 5.8363


 59%|█████▉    | 6895/11685 [37:30<22:40,  3.52it/s]

Epoch 2 Step 3000 Loss 5.9628


 59%|█████▉    | 6905/11685 [37:33<22:46,  3.50it/s]

Epoch 2 Step 3010 Loss 5.8682


 59%|█████▉    | 6915/11685 [37:36<22:50,  3.48it/s]

Epoch 2 Step 3020 Loss 5.8803


 59%|█████▉    | 6925/11685 [37:39<22:28,  3.53it/s]

Epoch 2 Step 3030 Loss 5.9835


 59%|█████▉    | 6935/11685 [37:41<22:23,  3.54it/s]

Epoch 2 Step 3040 Loss 5.8426


 59%|█████▉    | 6945/11685 [37:44<22:29,  3.51it/s]

Epoch 2 Step 3050 Loss 5.8110


 60%|█████▉    | 6955/11685 [37:47<22:32,  3.50it/s]

Epoch 2 Step 3060 Loss 5.8737


 60%|█████▉    | 6965/11685 [37:50<22:14,  3.54it/s]

Epoch 2 Step 3070 Loss 5.8074


 60%|█████▉    | 6975/11685 [37:53<22:32,  3.48it/s]

Epoch 2 Step 3080 Loss 5.8484


 60%|█████▉    | 6985/11685 [37:56<22:16,  3.52it/s]

Epoch 2 Step 3090 Loss 5.7491


 60%|█████▉    | 6995/11685 [37:58<22:09,  3.53it/s]

Epoch 2 Step 3100 Loss 5.9043


 60%|█████▉    | 7005/11685 [38:01<22:18,  3.50it/s]

Epoch 2 Step 3110 Loss 5.7152


 60%|██████    | 7015/11685 [38:04<22:09,  3.51it/s]

Epoch 2 Step 3120 Loss 5.9067


 60%|██████    | 7025/11685 [38:07<22:11,  3.50it/s]

Epoch 2 Step 3130 Loss 5.9191


 60%|██████    | 7035/11685 [38:10<22:04,  3.51it/s]

Epoch 2 Step 3140 Loss 5.5911


 60%|██████    | 7045/11685 [38:13<21:54,  3.53it/s]

Epoch 2 Step 3150 Loss 5.7709


 60%|██████    | 7055/11685 [38:16<21:50,  3.53it/s]

Epoch 2 Step 3160 Loss 5.8164


 60%|██████    | 7065/11685 [38:18<21:57,  3.51it/s]

Epoch 2 Step 3170 Loss 5.7993


 61%|██████    | 7075/11685 [38:21<21:49,  3.52it/s]

Epoch 2 Step 3180 Loss 5.8584


 61%|██████    | 7085/11685 [38:24<21:57,  3.49it/s]

Epoch 2 Step 3190 Loss 5.8053


 61%|██████    | 7095/11685 [38:27<21:39,  3.53it/s]

Epoch 2 Step 3200 Loss 5.7391


 61%|██████    | 7105/11685 [38:30<21:35,  3.53it/s]

Epoch 2 Step 3210 Loss 5.9067


 61%|██████    | 7115/11685 [38:33<21:31,  3.54it/s]

Epoch 2 Step 3220 Loss 5.8562


 61%|██████    | 7125/11685 [38:35<21:26,  3.54it/s]

Epoch 2 Step 3230 Loss 5.8380


 61%|██████    | 7135/11685 [38:38<21:47,  3.48it/s]

Epoch 2 Step 3240 Loss 5.9273


 61%|██████    | 7145/11685 [38:41<21:28,  3.52it/s]

Epoch 2 Step 3250 Loss 5.8371


 61%|██████    | 7155/11685 [38:44<21:20,  3.54it/s]

Epoch 2 Step 3260 Loss 5.8836


 61%|██████▏   | 7165/11685 [38:47<21:39,  3.48it/s]

Epoch 2 Step 3270 Loss 5.9032


 61%|██████▏   | 7175/11685 [38:50<21:11,  3.55it/s]

Epoch 2 Step 3280 Loss 5.9304


 61%|██████▏   | 7185/11685 [38:53<21:35,  3.47it/s]

Epoch 2 Step 3290 Loss 5.9152


 62%|██████▏   | 7195/11685 [38:55<21:18,  3.51it/s]

Epoch 2 Step 3300 Loss 5.8612


 62%|██████▏   | 7205/11685 [38:58<21:10,  3.53it/s]

Epoch 2 Step 3310 Loss 5.7026


 62%|██████▏   | 7215/11685 [39:01<21:08,  3.52it/s]

Epoch 2 Step 3320 Loss 5.9525


 62%|██████▏   | 7225/11685 [39:04<21:09,  3.51it/s]

Epoch 2 Step 3330 Loss 5.9400


 62%|██████▏   | 7235/11685 [39:07<21:08,  3.51it/s]

Epoch 2 Step 3340 Loss 5.9196


 62%|██████▏   | 7245/11685 [39:10<20:53,  3.54it/s]

Epoch 2 Step 3350 Loss 5.6646


 62%|██████▏   | 7255/11685 [39:12<21:13,  3.48it/s]

Epoch 2 Step 3360 Loss 5.8589


 62%|██████▏   | 7265/11685 [39:15<20:53,  3.53it/s]

Epoch 2 Step 3370 Loss 6.0329


 62%|██████▏   | 7275/11685 [39:18<20:51,  3.52it/s]

Epoch 2 Step 3380 Loss 5.8521


 62%|██████▏   | 7285/11685 [39:21<20:42,  3.54it/s]

Epoch 2 Step 3390 Loss 5.8400


 62%|██████▏   | 7295/11685 [39:24<20:54,  3.50it/s]

Epoch 2 Step 3400 Loss 5.7041


 63%|██████▎   | 7305/11685 [39:27<20:38,  3.54it/s]

Epoch 2 Step 3410 Loss 5.8445


 63%|██████▎   | 7315/11685 [39:29<20:38,  3.53it/s]

Epoch 2 Step 3420 Loss 5.7855


 63%|██████▎   | 7325/11685 [39:32<20:43,  3.51it/s]

Epoch 2 Step 3430 Loss 5.9166


 63%|██████▎   | 7335/11685 [39:35<20:29,  3.54it/s]

Epoch 2 Step 3440 Loss 5.9594


 63%|██████▎   | 7345/11685 [39:38<20:26,  3.54it/s]

Epoch 2 Step 3450 Loss 5.8234


 63%|██████▎   | 7355/11685 [39:41<20:43,  3.48it/s]

Epoch 2 Step 3460 Loss 5.8640


 63%|██████▎   | 7365/11685 [39:44<20:20,  3.54it/s]

Epoch 2 Step 3470 Loss 5.8315


 63%|██████▎   | 7375/11685 [39:47<20:37,  3.48it/s]

Epoch 2 Step 3480 Loss 5.9015


 63%|██████▎   | 7385/11685 [39:49<20:21,  3.52it/s]

Epoch 2 Step 3490 Loss 5.8900


 63%|██████▎   | 7395/11685 [39:52<20:12,  3.54it/s]

Epoch 2 Step 3500 Loss 5.9028


 63%|██████▎   | 7405/11685 [39:55<20:25,  3.49it/s]

Epoch 2 Step 3510 Loss 5.7818


 63%|██████▎   | 7415/11685 [39:58<20:05,  3.54it/s]

Epoch 2 Step 3520 Loss 5.8364


 64%|██████▎   | 7425/11685 [40:01<20:05,  3.54it/s]

Epoch 2 Step 3530 Loss 5.9370


 64%|██████▎   | 7435/11685 [40:04<20:19,  3.48it/s]

Epoch 2 Step 3540 Loss 5.7474


 64%|██████▎   | 7445/11685 [40:06<19:58,  3.54it/s]

Epoch 2 Step 3550 Loss 5.7927


 64%|██████▍   | 7455/11685 [40:09<19:54,  3.54it/s]

Epoch 2 Step 3560 Loss 5.9310


 64%|██████▍   | 7465/11685 [40:12<20:05,  3.50it/s]

Epoch 2 Step 3570 Loss 5.8834


 64%|██████▍   | 7475/11685 [40:15<20:13,  3.47it/s]

Epoch 2 Step 3580 Loss 5.8479


 64%|██████▍   | 7485/11685 [40:18<20:02,  3.49it/s]

Epoch 2 Step 3590 Loss 5.8186


 64%|██████▍   | 7495/11685 [40:21<19:44,  3.54it/s]

Epoch 2 Step 3600 Loss 5.9772


 64%|██████▍   | 7505/11685 [40:24<19:54,  3.50it/s]

Epoch 2 Step 3610 Loss 5.9073


 64%|██████▍   | 7515/11685 [40:26<19:45,  3.52it/s]

Epoch 2 Step 3620 Loss 5.7416


 64%|██████▍   | 7525/11685 [40:29<19:47,  3.50it/s]

Epoch 2 Step 3630 Loss 5.8517


 64%|██████▍   | 7535/11685 [40:32<19:34,  3.53it/s]

Epoch 2 Step 3640 Loss 5.7914


 65%|██████▍   | 7545/11685 [40:35<19:37,  3.52it/s]

Epoch 2 Step 3650 Loss 5.9147


 65%|██████▍   | 7555/11685 [40:38<19:34,  3.52it/s]

Epoch 2 Step 3660 Loss 5.7814


 65%|██████▍   | 7565/11685 [40:41<19:39,  3.49it/s]

Epoch 2 Step 3670 Loss 5.8714


 65%|██████▍   | 7575/11685 [40:43<19:24,  3.53it/s]

Epoch 2 Step 3680 Loss 5.7685


 65%|██████▍   | 7585/11685 [40:46<19:34,  3.49it/s]

Epoch 2 Step 3690 Loss 5.6435


 65%|██████▍   | 7595/11685 [40:49<19:28,  3.50it/s]

Epoch 2 Step 3700 Loss 5.8492


 65%|██████▌   | 7605/11685 [40:52<19:14,  3.53it/s]

Epoch 2 Step 3710 Loss 5.8686


 65%|██████▌   | 7615/11685 [40:55<19:26,  3.49it/s]

Epoch 2 Step 3720 Loss 5.9668


 65%|██████▌   | 7625/11685 [40:58<19:13,  3.52it/s]

Epoch 2 Step 3730 Loss 5.7760


 65%|██████▌   | 7635/11685 [41:01<19:05,  3.53it/s]

Epoch 2 Step 3740 Loss 5.7103


 65%|██████▌   | 7645/11685 [41:03<19:08,  3.52it/s]

Epoch 2 Step 3750 Loss 5.7133


 66%|██████▌   | 7655/11685 [41:06<19:17,  3.48it/s]

Epoch 2 Step 3760 Loss 5.8233


 66%|██████▌   | 7665/11685 [41:09<19:06,  3.50it/s]

Epoch 2 Step 3770 Loss 5.7368


 66%|██████▌   | 7675/11685 [41:12<19:05,  3.50it/s]

Epoch 2 Step 3780 Loss 5.8372


 66%|██████▌   | 7685/11685 [41:15<18:52,  3.53it/s]

Epoch 2 Step 3790 Loss 5.8904


 66%|██████▌   | 7695/11685 [41:18<18:51,  3.53it/s]

Epoch 2 Step 3800 Loss 5.8237


 66%|██████▌   | 7705/11685 [41:21<18:57,  3.50it/s]

Epoch 2 Step 3810 Loss 5.7238


 66%|██████▌   | 7715/11685 [41:23<19:03,  3.47it/s]

Epoch 2 Step 3820 Loss 5.7121


 66%|██████▌   | 7725/11685 [41:26<19:00,  3.47it/s]

Epoch 2 Step 3830 Loss 5.7694


 66%|██████▌   | 7735/11685 [41:29<18:44,  3.51it/s]

Epoch 2 Step 3840 Loss 5.7445


 66%|██████▋   | 7745/11685 [41:32<18:45,  3.50it/s]

Epoch 2 Step 3850 Loss 5.7114


 66%|██████▋   | 7755/11685 [41:35<18:51,  3.47it/s]

Epoch 2 Step 3860 Loss 6.0136


 66%|██████▋   | 7765/11685 [41:38<18:38,  3.51it/s]

Epoch 2 Step 3870 Loss 5.7422


 67%|██████▋   | 7775/11685 [41:41<18:33,  3.51it/s]

Epoch 2 Step 3880 Loss 5.9841


 67%|██████▋   | 7785/11685 [41:43<18:28,  3.52it/s]

Epoch 2 Step 3890 Loss 5.7699


 67%|██████▋   | 7790/11685 [41:45<18:26,  3.52it/s]

Epoch 2 Average Training Loss: 5.8517
Epoch 2 Test Accuracy: 0.0206


 67%|██████▋   | 7800/11685 [46:19<3:51:20,  3.57s/it] 

Epoch 3 Step 10 Loss 5.9109


 67%|██████▋   | 7810/11685 [46:22<24:36,  2.62it/s]  

Epoch 3 Step 20 Loss 5.7703


 67%|██████▋   | 7820/11685 [46:25<18:26,  3.49it/s]

Epoch 3 Step 30 Loss 5.9868


 67%|██████▋   | 7830/11685 [46:28<18:13,  3.52it/s]

Epoch 3 Step 40 Loss 5.8106


 67%|██████▋   | 7840/11685 [46:30<18:18,  3.50it/s]

Epoch 3 Step 50 Loss 5.7639


 67%|██████▋   | 7850/11685 [46:33<18:18,  3.49it/s]

Epoch 3 Step 60 Loss 5.7453


 67%|██████▋   | 7860/11685 [46:36<18:08,  3.51it/s]

Epoch 3 Step 70 Loss 5.6803


 67%|██████▋   | 7870/11685 [46:39<18:16,  3.48it/s]

Epoch 3 Step 80 Loss 5.7769


 67%|██████▋   | 7880/11685 [46:42<18:01,  3.52it/s]

Epoch 3 Step 90 Loss 5.8478


 68%|██████▊   | 7890/11685 [46:45<18:01,  3.51it/s]

Epoch 3 Step 100 Loss 5.9206


 68%|██████▊   | 7900/11685 [46:47<17:56,  3.52it/s]

Epoch 3 Step 110 Loss 5.8595


 68%|██████▊   | 7910/11685 [46:50<17:59,  3.50it/s]

Epoch 3 Step 120 Loss 5.6360


 68%|██████▊   | 7920/11685 [46:53<17:51,  3.51it/s]

Epoch 3 Step 130 Loss 5.8347


 68%|██████▊   | 7930/11685 [46:56<17:49,  3.51it/s]

Epoch 3 Step 140 Loss 5.8297


 68%|██████▊   | 7940/11685 [46:59<17:56,  3.48it/s]

Epoch 3 Step 150 Loss 5.9022


 68%|██████▊   | 7950/11685 [47:02<17:51,  3.49it/s]

Epoch 3 Step 160 Loss 5.8598


 68%|██████▊   | 7960/11685 [47:05<17:37,  3.52it/s]

Epoch 3 Step 170 Loss 5.7445


 68%|██████▊   | 7970/11685 [47:07<17:39,  3.51it/s]

Epoch 3 Step 180 Loss 5.9058


 68%|██████▊   | 7980/11685 [47:10<17:39,  3.50it/s]

Epoch 3 Step 190 Loss 5.8512


 68%|██████▊   | 7990/11685 [47:13<17:38,  3.49it/s]

Epoch 3 Step 200 Loss 5.7515


 68%|██████▊   | 8000/11685 [47:16<17:31,  3.51it/s]

Epoch 3 Step 210 Loss 5.6644


 69%|██████▊   | 8010/11685 [47:19<17:27,  3.51it/s]

Epoch 3 Step 220 Loss 5.7377


 69%|██████▊   | 8020/11685 [47:22<17:20,  3.52it/s]

Epoch 3 Step 230 Loss 5.8708


 69%|██████▊   | 8030/11685 [47:25<17:14,  3.53it/s]

Epoch 3 Step 240 Loss 5.9775


 69%|██████▉   | 8040/11685 [47:27<17:11,  3.53it/s]

Epoch 3 Step 250 Loss 5.7280


 69%|██████▉   | 8050/11685 [47:30<17:11,  3.52it/s]

Epoch 3 Step 260 Loss 5.9327


 69%|██████▉   | 8060/11685 [47:33<17:13,  3.51it/s]

Epoch 3 Step 270 Loss 5.8728


 69%|██████▉   | 8070/11685 [47:36<17:14,  3.50it/s]

Epoch 3 Step 280 Loss 5.8788


 69%|██████▉   | 8080/11685 [47:39<17:15,  3.48it/s]

Epoch 3 Step 290 Loss 5.6826


 69%|██████▉   | 8090/11685 [47:42<16:59,  3.53it/s]

Epoch 3 Step 300 Loss 5.7682


 69%|██████▉   | 8100/11685 [47:45<16:53,  3.54it/s]

Epoch 3 Step 310 Loss 5.7691


 69%|██████▉   | 8110/11685 [47:47<16:56,  3.52it/s]

Epoch 3 Step 320 Loss 5.7874


 69%|██████▉   | 8120/11685 [47:50<16:53,  3.52it/s]

Epoch 3 Step 330 Loss 5.9069


 70%|██████▉   | 8130/11685 [47:53<16:57,  3.50it/s]

Epoch 3 Step 340 Loss 5.8105


 70%|██████▉   | 8140/11685 [47:56<16:42,  3.54it/s]

Epoch 3 Step 350 Loss 5.6984


 70%|██████▉   | 8150/11685 [47:59<16:44,  3.52it/s]

Epoch 3 Step 360 Loss 5.8193


 70%|██████▉   | 8160/11685 [48:02<16:38,  3.53it/s]

Epoch 3 Step 370 Loss 5.8483


 70%|██████▉   | 8170/11685 [48:04<16:42,  3.51it/s]

Epoch 3 Step 380 Loss 5.9701


 70%|███████   | 8180/11685 [48:07<16:30,  3.54it/s]

Epoch 3 Step 390 Loss 5.8799


 70%|███████   | 8190/11685 [48:10<16:28,  3.53it/s]

Epoch 3 Step 400 Loss 5.7601


 70%|███████   | 8200/11685 [48:13<16:30,  3.52it/s]

Epoch 3 Step 410 Loss 5.8412


 70%|███████   | 8210/11685 [48:16<16:41,  3.47it/s]

Epoch 3 Step 420 Loss 5.6483


 70%|███████   | 8220/11685 [48:19<16:21,  3.53it/s]

Epoch 3 Step 430 Loss 5.8182


 70%|███████   | 8230/11685 [48:22<16:19,  3.53it/s]

Epoch 3 Step 440 Loss 5.7763


 71%|███████   | 8240/11685 [48:24<16:14,  3.53it/s]

Epoch 3 Step 450 Loss 5.7727


 71%|███████   | 8250/11685 [48:27<16:20,  3.50it/s]

Epoch 3 Step 460 Loss 5.7251


 71%|███████   | 8260/11685 [48:30<16:08,  3.54it/s]

Epoch 3 Step 470 Loss 5.9149


 71%|███████   | 8270/11685 [48:33<16:23,  3.47it/s]

Epoch 3 Step 480 Loss 5.9311


 71%|███████   | 8280/11685 [48:36<16:09,  3.51it/s]

Epoch 3 Step 490 Loss 5.8263


 71%|███████   | 8290/11685 [48:39<16:03,  3.52it/s]

Epoch 3 Step 500 Loss 5.8343


 71%|███████   | 8300/11685 [48:42<16:16,  3.47it/s]

Epoch 3 Step 510 Loss 5.7770


 71%|███████   | 8310/11685 [48:44<16:04,  3.50it/s]

Epoch 3 Step 520 Loss 5.8041


 71%|███████   | 8320/11685 [48:47<15:54,  3.52it/s]

Epoch 3 Step 530 Loss 5.8420


 71%|███████▏  | 8330/11685 [48:50<15:43,  3.56it/s]

Epoch 3 Step 540 Loss 5.5455


 71%|███████▏  | 8340/11685 [48:53<15:47,  3.53it/s]

Epoch 3 Step 550 Loss 5.7318


 71%|███████▏  | 8350/11685 [48:56<15:43,  3.53it/s]

Epoch 3 Step 560 Loss 5.8666


 72%|███████▏  | 8360/11685 [48:59<15:40,  3.54it/s]

Epoch 3 Step 570 Loss 5.9323


 72%|███████▏  | 8370/11685 [49:01<15:38,  3.53it/s]

Epoch 3 Step 580 Loss 5.9133


 72%|███████▏  | 8380/11685 [49:04<15:36,  3.53it/s]

Epoch 3 Step 590 Loss 5.9131


 72%|███████▏  | 8390/11685 [49:07<15:32,  3.53it/s]

Epoch 3 Step 600 Loss 5.7616


 72%|███████▏  | 8400/11685 [49:10<15:30,  3.53it/s]

Epoch 3 Step 610 Loss 5.8308


 72%|███████▏  | 8410/11685 [49:13<15:25,  3.54it/s]

Epoch 3 Step 620 Loss 5.8382


 72%|███████▏  | 8420/11685 [49:16<15:22,  3.54it/s]

Epoch 3 Step 630 Loss 5.7724


 72%|███████▏  | 8430/11685 [49:18<15:22,  3.53it/s]

Epoch 3 Step 640 Loss 5.8541


 72%|███████▏  | 8440/11685 [49:21<15:18,  3.53it/s]

Epoch 3 Step 650 Loss 5.7239


 72%|███████▏  | 8450/11685 [49:24<15:26,  3.49it/s]

Epoch 3 Step 660 Loss 5.6952


 72%|███████▏  | 8460/11685 [49:27<15:10,  3.54it/s]

Epoch 3 Step 670 Loss 5.7989


 72%|███████▏  | 8470/11685 [49:30<15:16,  3.51it/s]

Epoch 3 Step 680 Loss 5.6249


 73%|███████▎  | 8480/11685 [49:33<15:09,  3.52it/s]

Epoch 3 Step 690 Loss 5.8653


 73%|███████▎  | 8490/11685 [49:36<15:06,  3.52it/s]

Epoch 3 Step 700 Loss 5.6257


 73%|███████▎  | 8500/11685 [49:38<15:00,  3.54it/s]

Epoch 3 Step 710 Loss 5.7596


 73%|███████▎  | 8510/11685 [49:41<15:03,  3.52it/s]

Epoch 3 Step 720 Loss 5.8443


 73%|███████▎  | 8520/11685 [49:44<14:55,  3.53it/s]

Epoch 3 Step 730 Loss 5.8013


 73%|███████▎  | 8530/11685 [49:47<14:52,  3.54it/s]

Epoch 3 Step 740 Loss 5.8090


 73%|███████▎  | 8540/11685 [49:50<14:58,  3.50it/s]

Epoch 3 Step 750 Loss 5.7488


 73%|███████▎  | 8550/11685 [49:53<15:00,  3.48it/s]

Epoch 3 Step 760 Loss 5.6816


 73%|███████▎  | 8560/11685 [49:55<14:48,  3.52it/s]

Epoch 3 Step 770 Loss 5.9200


 73%|███████▎  | 8570/11685 [49:58<14:39,  3.54it/s]

Epoch 3 Step 780 Loss 5.8819


 73%|███████▎  | 8580/11685 [50:01<14:42,  3.52it/s]

Epoch 3 Step 790 Loss 5.8158


 74%|███████▎  | 8590/11685 [50:04<14:42,  3.51it/s]

Epoch 3 Step 800 Loss 5.8559


 74%|███████▎  | 8600/11685 [50:07<14:46,  3.48it/s]

Epoch 3 Step 810 Loss 6.0636


 74%|███████▎  | 8610/11685 [50:10<14:34,  3.52it/s]

Epoch 3 Step 820 Loss 5.8994


 74%|███████▍  | 8620/11685 [50:13<14:31,  3.52it/s]

Epoch 3 Step 830 Loss 5.7840


 74%|███████▍  | 8630/11685 [50:15<14:24,  3.54it/s]

Epoch 3 Step 840 Loss 5.8882


 74%|███████▍  | 8640/11685 [50:18<14:27,  3.51it/s]

Epoch 3 Step 850 Loss 5.8080


 74%|███████▍  | 8650/11685 [50:21<14:34,  3.47it/s]

Epoch 3 Step 860 Loss 5.6810


 74%|███████▍  | 8660/11685 [50:24<14:21,  3.51it/s]

Epoch 3 Step 870 Loss 5.7954


 74%|███████▍  | 8670/11685 [50:27<14:14,  3.53it/s]

Epoch 3 Step 880 Loss 5.9047


 74%|███████▍  | 8680/11685 [50:30<14:10,  3.53it/s]

Epoch 3 Step 890 Loss 5.8109


 74%|███████▍  | 8690/11685 [50:32<14:20,  3.48it/s]

Epoch 3 Step 900 Loss 5.8003


 74%|███████▍  | 8700/11685 [50:35<14:06,  3.53it/s]

Epoch 3 Step 910 Loss 5.8961


 75%|███████▍  | 8710/11685 [50:38<14:05,  3.52it/s]

Epoch 3 Step 920 Loss 5.8550


 75%|███████▍  | 8720/11685 [50:41<13:59,  3.53it/s]

Epoch 3 Step 930 Loss 5.6403


 75%|███████▍  | 8730/11685 [50:44<14:08,  3.48it/s]

Epoch 3 Step 940 Loss 5.8946


 75%|███████▍  | 8740/11685 [50:47<13:51,  3.54it/s]

Epoch 3 Step 950 Loss 5.8050


 75%|███████▍  | 8750/11685 [50:49<13:49,  3.54it/s]

Epoch 3 Step 960 Loss 5.9505


 75%|███████▍  | 8760/11685 [50:52<13:46,  3.54it/s]

Epoch 3 Step 970 Loss 5.8639


 75%|███████▌  | 8770/11685 [50:55<13:53,  3.50it/s]

Epoch 3 Step 980 Loss 5.8661


 75%|███████▌  | 8780/11685 [50:58<13:45,  3.52it/s]

Epoch 3 Step 990 Loss 5.8598


 75%|███████▌  | 8790/11685 [51:01<13:41,  3.53it/s]

Epoch 3 Step 1000 Loss 5.8270


 75%|███████▌  | 8800/11685 [51:04<13:45,  3.50it/s]

Epoch 3 Step 1010 Loss 5.8355


 75%|███████▌  | 8810/11685 [51:07<13:45,  3.48it/s]

Epoch 3 Step 1020 Loss 5.7648


 75%|███████▌  | 8820/11685 [51:09<13:41,  3.49it/s]

Epoch 3 Step 1030 Loss 5.7716


 76%|███████▌  | 8830/11685 [51:12<13:26,  3.54it/s]

Epoch 3 Step 1040 Loss 5.8021


 76%|███████▌  | 8840/11685 [51:15<13:26,  3.53it/s]

Epoch 3 Step 1050 Loss 5.8360


 76%|███████▌  | 8850/11685 [51:18<13:20,  3.54it/s]

Epoch 3 Step 1060 Loss 5.7780


 76%|███████▌  | 8860/11685 [51:21<13:18,  3.54it/s]

Epoch 3 Step 1070 Loss 5.7532


 76%|███████▌  | 8870/11685 [51:24<13:18,  3.53it/s]

Epoch 3 Step 1080 Loss 5.8677


 76%|███████▌  | 8880/11685 [51:26<13:14,  3.53it/s]

Epoch 3 Step 1090 Loss 5.7807


 76%|███████▌  | 8890/11685 [51:29<13:09,  3.54it/s]

Epoch 3 Step 1100 Loss 5.7993


 76%|███████▌  | 8900/11685 [51:32<13:13,  3.51it/s]

Epoch 3 Step 1110 Loss 5.7906


 76%|███████▋  | 8910/11685 [51:35<13:05,  3.53it/s]

Epoch 3 Step 1120 Loss 5.7592


 76%|███████▋  | 8920/11685 [51:38<13:05,  3.52it/s]

Epoch 3 Step 1130 Loss 5.9561


 76%|███████▋  | 8930/11685 [51:41<13:01,  3.53it/s]

Epoch 3 Step 1140 Loss 5.8496


 77%|███████▋  | 8940/11685 [51:43<12:56,  3.54it/s]

Epoch 3 Step 1150 Loss 5.7893


 77%|███████▋  | 8950/11685 [51:46<12:55,  3.52it/s]

Epoch 3 Step 1160 Loss 5.8822


 77%|███████▋  | 8960/11685 [51:49<12:49,  3.54it/s]

Epoch 3 Step 1170 Loss 5.8477


 77%|███████▋  | 8970/11685 [51:52<12:47,  3.54it/s]

Epoch 3 Step 1180 Loss 5.8243


 77%|███████▋  | 8980/11685 [51:55<12:52,  3.50it/s]

Epoch 3 Step 1190 Loss 5.8703


 77%|███████▋  | 8990/11685 [51:58<12:44,  3.53it/s]

Epoch 3 Step 1200 Loss 5.8086


 77%|███████▋  | 9000/11685 [52:00<12:38,  3.54it/s]

Epoch 3 Step 1210 Loss 5.8677


 77%|███████▋  | 9010/11685 [52:03<12:36,  3.54it/s]

Epoch 3 Step 1220 Loss 5.8796


 77%|███████▋  | 9020/11685 [52:06<12:43,  3.49it/s]

Epoch 3 Step 1230 Loss 5.7561


 77%|███████▋  | 9030/11685 [52:09<12:31,  3.53it/s]

Epoch 3 Step 1240 Loss 5.8442


 77%|███████▋  | 9040/11685 [52:12<12:27,  3.54it/s]

Epoch 3 Step 1250 Loss 5.8202


 77%|███████▋  | 9050/11685 [52:15<12:27,  3.52it/s]

Epoch 3 Step 1260 Loss 5.7920


 78%|███████▊  | 9060/11685 [52:17<12:22,  3.53it/s]

Epoch 3 Step 1270 Loss 5.9578


 78%|███████▊  | 9070/11685 [52:20<12:27,  3.50it/s]

Epoch 3 Step 1280 Loss 5.8023


 78%|███████▊  | 9080/11685 [52:23<12:18,  3.53it/s]

Epoch 3 Step 1290 Loss 5.7724


 78%|███████▊  | 9090/11685 [52:26<12:16,  3.52it/s]

Epoch 3 Step 1300 Loss 5.7750


 78%|███████▊  | 9100/11685 [52:29<12:11,  3.53it/s]

Epoch 3 Step 1310 Loss 5.9178


 78%|███████▊  | 9110/11685 [52:32<12:08,  3.53it/s]

Epoch 3 Step 1320 Loss 5.8097


 78%|███████▊  | 9120/11685 [52:35<12:09,  3.51it/s]

Epoch 3 Step 1330 Loss 5.6892


 78%|███████▊  | 9130/11685 [52:37<12:05,  3.52it/s]

Epoch 3 Step 1340 Loss 5.8519


 78%|███████▊  | 9140/11685 [52:40<12:00,  3.53it/s]

Epoch 3 Step 1350 Loss 5.5752


 78%|███████▊  | 9150/11685 [52:43<12:05,  3.50it/s]

Epoch 3 Step 1360 Loss 5.6791


 78%|███████▊  | 9160/11685 [52:46<11:56,  3.52it/s]

Epoch 3 Step 1370 Loss 5.7159


 78%|███████▊  | 9170/11685 [52:49<11:51,  3.53it/s]

Epoch 3 Step 1380 Loss 5.8090


 79%|███████▊  | 9180/11685 [52:52<11:58,  3.48it/s]

Epoch 3 Step 1390 Loss 5.8111


 79%|███████▊  | 9190/11685 [52:54<11:52,  3.50it/s]

Epoch 3 Step 1400 Loss 5.8864


 79%|███████▊  | 9200/11685 [52:57<11:44,  3.53it/s]

Epoch 3 Step 1410 Loss 5.7375


 79%|███████▉  | 9210/11685 [53:00<11:40,  3.53it/s]

Epoch 3 Step 1420 Loss 5.7273


 79%|███████▉  | 9220/11685 [53:03<11:37,  3.53it/s]

Epoch 3 Step 1430 Loss 5.7457


 79%|███████▉  | 9230/11685 [53:06<11:44,  3.48it/s]

Epoch 3 Step 1440 Loss 5.8878


 79%|███████▉  | 9240/11685 [53:09<11:34,  3.52it/s]

Epoch 3 Step 1450 Loss 5.7781


 79%|███████▉  | 9250/11685 [53:11<11:36,  3.49it/s]

Epoch 3 Step 1460 Loss 5.7508


 79%|███████▉  | 9260/11685 [53:14<11:32,  3.50it/s]

Epoch 3 Step 1470 Loss 5.7792


 79%|███████▉  | 9270/11685 [53:17<11:24,  3.53it/s]

Epoch 3 Step 1480 Loss 5.8504


 79%|███████▉  | 9280/11685 [53:20<11:28,  3.49it/s]

Epoch 3 Step 1490 Loss 5.7050


 80%|███████▉  | 9290/11685 [53:23<11:27,  3.48it/s]

Epoch 3 Step 1500 Loss 5.7192


 80%|███████▉  | 9300/11685 [53:26<11:14,  3.54it/s]

Epoch 3 Step 1510 Loss 5.7981


 80%|███████▉  | 9310/11685 [53:29<11:11,  3.54it/s]

Epoch 3 Step 1520 Loss 5.8781


 80%|███████▉  | 9320/11685 [53:31<11:08,  3.54it/s]

Epoch 3 Step 1530 Loss 5.9024


 80%|███████▉  | 9330/11685 [53:34<11:05,  3.54it/s]

Epoch 3 Step 1540 Loss 5.9063


 80%|███████▉  | 9340/11685 [53:37<11:05,  3.52it/s]

Epoch 3 Step 1550 Loss 5.9355


 80%|████████  | 9350/11685 [53:40<10:59,  3.54it/s]

Epoch 3 Step 1560 Loss 5.9182


 80%|████████  | 9360/11685 [53:43<10:58,  3.53it/s]

Epoch 3 Step 1570 Loss 5.8671


 80%|████████  | 9370/11685 [53:46<10:54,  3.54it/s]

Epoch 3 Step 1580 Loss 5.6093


 80%|████████  | 9380/11685 [53:48<10:53,  3.53it/s]

Epoch 3 Step 1590 Loss 5.7511


 80%|████████  | 9390/11685 [53:51<10:55,  3.50it/s]

Epoch 3 Step 1600 Loss 5.7659


 80%|████████  | 9400/11685 [53:54<10:53,  3.50it/s]

Epoch 3 Step 1610 Loss 5.8381


 81%|████████  | 9410/11685 [53:57<10:45,  3.53it/s]

Epoch 3 Step 1620 Loss 5.6736


 81%|████████  | 9420/11685 [54:00<10:41,  3.53it/s]

Epoch 3 Step 1630 Loss 5.7911


 81%|████████  | 9430/11685 [54:03<10:37,  3.53it/s]

Epoch 3 Step 1640 Loss 5.8544


 81%|████████  | 9440/11685 [54:05<10:34,  3.54it/s]

Epoch 3 Step 1650 Loss 5.7177


 81%|████████  | 9450/11685 [54:08<10:38,  3.50it/s]

Epoch 3 Step 1660 Loss 5.7693


 81%|████████  | 9460/11685 [54:11<10:37,  3.49it/s]

Epoch 3 Step 1670 Loss 5.8187


 81%|████████  | 9470/11685 [54:14<10:32,  3.50it/s]

Epoch 3 Step 1680 Loss 5.7609


 81%|████████  | 9480/11685 [54:17<10:27,  3.51it/s]

Epoch 3 Step 1690 Loss 5.6137


 81%|████████  | 9490/11685 [54:20<10:19,  3.54it/s]

Epoch 3 Step 1700 Loss 5.8010


 81%|████████▏ | 9500/11685 [54:22<10:19,  3.53it/s]

Epoch 3 Step 1710 Loss 5.8352


 81%|████████▏ | 9510/11685 [54:25<10:14,  3.54it/s]

Epoch 3 Step 1720 Loss 5.9238


 81%|████████▏ | 9520/11685 [54:28<10:11,  3.54it/s]

Epoch 3 Step 1730 Loss 5.7215


 82%|████████▏ | 9530/11685 [54:31<10:12,  3.52it/s]

Epoch 3 Step 1740 Loss 5.7064


 82%|████████▏ | 9540/11685 [54:34<10:11,  3.51it/s]

Epoch 3 Step 1750 Loss 5.8630


 82%|████████▏ | 9550/11685 [54:37<10:05,  3.53it/s]

Epoch 3 Step 1760 Loss 6.0018


 82%|████████▏ | 9560/11685 [54:40<10:01,  3.53it/s]

Epoch 3 Step 1770 Loss 5.6896


 82%|████████▏ | 9570/11685 [54:42<09:58,  3.53it/s]

Epoch 3 Step 1780 Loss 5.9168


 82%|████████▏ | 9580/11685 [54:45<09:55,  3.53it/s]

Epoch 3 Step 1790 Loss 5.8795


 82%|████████▏ | 9590/11685 [54:48<10:02,  3.47it/s]

Epoch 3 Step 1800 Loss 5.7193


 82%|████████▏ | 9600/11685 [54:51<09:54,  3.51it/s]

Epoch 3 Step 1810 Loss 5.8202


 82%|████████▏ | 9610/11685 [54:54<09:50,  3.51it/s]

Epoch 3 Step 1820 Loss 5.9208


 82%|████████▏ | 9620/11685 [54:57<09:44,  3.53it/s]

Epoch 3 Step 1830 Loss 5.8252


 82%|████████▏ | 9630/11685 [54:59<09:41,  3.54it/s]

Epoch 3 Step 1840 Loss 5.8108


 82%|████████▏ | 9640/11685 [55:02<09:44,  3.50it/s]

Epoch 3 Step 1850 Loss 5.6849


 83%|████████▎ | 9650/11685 [55:05<09:34,  3.54it/s]

Epoch 3 Step 1860 Loss 5.9243


 83%|████████▎ | 9660/11685 [55:08<09:39,  3.50it/s]

Epoch 3 Step 1870 Loss 5.8584


 83%|████████▎ | 9670/11685 [55:11<09:31,  3.52it/s]

Epoch 3 Step 1880 Loss 5.7791


 83%|████████▎ | 9680/11685 [55:14<09:27,  3.53it/s]

Epoch 3 Step 1890 Loss 5.8433


 83%|████████▎ | 9690/11685 [55:17<09:30,  3.50it/s]

Epoch 3 Step 1900 Loss 5.9076


 83%|████████▎ | 9700/11685 [55:19<09:26,  3.50it/s]

Epoch 3 Step 1910 Loss 5.8030


 83%|████████▎ | 9710/11685 [55:22<09:19,  3.53it/s]

Epoch 3 Step 1920 Loss 5.8036


 83%|████████▎ | 9720/11685 [55:25<09:19,  3.51it/s]

Epoch 3 Step 1930 Loss 5.6722


 83%|████████▎ | 9730/11685 [55:28<09:16,  3.51it/s]

Epoch 3 Step 1940 Loss 5.7074


 83%|████████▎ | 9740/11685 [55:31<09:11,  3.53it/s]

Epoch 3 Step 1950 Loss 5.9225


 83%|████████▎ | 9750/11685 [55:34<09:09,  3.52it/s]

Epoch 3 Step 1960 Loss 5.8264


 84%|████████▎ | 9760/11685 [55:36<09:06,  3.52it/s]

Epoch 3 Step 1970 Loss 5.7141


 84%|████████▎ | 9770/11685 [55:39<09:01,  3.54it/s]

Epoch 3 Step 1980 Loss 5.8515


 84%|████████▎ | 9780/11685 [55:42<08:59,  3.53it/s]

Epoch 3 Step 1990 Loss 5.8348


 84%|████████▍ | 9790/11685 [55:45<09:01,  3.50it/s]

Epoch 3 Step 2000 Loss 5.7580


 84%|████████▍ | 9800/11685 [55:48<08:58,  3.50it/s]

Epoch 3 Step 2010 Loss 5.6726


 84%|████████▍ | 9810/11685 [55:51<08:49,  3.54it/s]

Epoch 3 Step 2020 Loss 5.6941


 84%|████████▍ | 9820/11685 [55:53<08:46,  3.54it/s]

Epoch 3 Step 2030 Loss 5.7652


 84%|████████▍ | 9830/11685 [55:56<08:46,  3.52it/s]

Epoch 3 Step 2040 Loss 5.7829


 84%|████████▍ | 9840/11685 [55:59<08:42,  3.53it/s]

Epoch 3 Step 2050 Loss 5.7861


 84%|████████▍ | 9850/11685 [56:02<08:38,  3.54it/s]

Epoch 3 Step 2060 Loss 5.8409


 84%|████████▍ | 9860/11685 [56:05<08:35,  3.54it/s]

Epoch 3 Step 2070 Loss 5.8526


 84%|████████▍ | 9870/11685 [56:08<08:40,  3.48it/s]

Epoch 3 Step 2080 Loss 5.9613


 85%|████████▍ | 9880/11685 [56:10<08:33,  3.52it/s]

Epoch 3 Step 2090 Loss 5.8939


 85%|████████▍ | 9890/11685 [56:13<08:26,  3.54it/s]

Epoch 3 Step 2100 Loss 5.8994


 85%|████████▍ | 9900/11685 [56:16<08:31,  3.49it/s]

Epoch 3 Step 2110 Loss 5.8545


 85%|████████▍ | 9910/11685 [56:19<08:29,  3.48it/s]

Epoch 3 Step 2120 Loss 5.6328


 85%|████████▍ | 9920/11685 [56:22<08:18,  3.54it/s]

Epoch 3 Step 2130 Loss 5.8377


 85%|████████▍ | 9930/11685 [56:25<08:16,  3.54it/s]

Epoch 3 Step 2140 Loss 5.7691


 85%|████████▌ | 9940/11685 [56:28<08:21,  3.48it/s]

Epoch 3 Step 2150 Loss 5.9668


 85%|████████▌ | 9950/11685 [56:30<08:10,  3.54it/s]

Epoch 3 Step 2160 Loss 5.7591


 85%|████████▌ | 9960/11685 [56:33<08:13,  3.50it/s]

Epoch 3 Step 2170 Loss 5.9074


 85%|████████▌ | 9970/11685 [56:36<08:04,  3.54it/s]

Epoch 3 Step 2180 Loss 5.7097


 85%|████████▌ | 9980/11685 [56:39<08:06,  3.50it/s]

Epoch 3 Step 2190 Loss 5.7890


 85%|████████▌ | 9990/11685 [56:42<07:59,  3.53it/s]

Epoch 3 Step 2200 Loss 5.7661


 86%|████████▌ | 10000/11685 [56:45<07:57,  3.53it/s]

Epoch 3 Step 2210 Loss 5.7770


 86%|████████▌ | 10010/11685 [56:47<07:52,  3.54it/s]

Epoch 3 Step 2220 Loss 5.7407


 86%|████████▌ | 10020/11685 [56:50<07:54,  3.51it/s]

Epoch 3 Step 2230 Loss 5.8291


 86%|████████▌ | 10030/11685 [56:53<07:47,  3.54it/s]

Epoch 3 Step 2240 Loss 5.8482


 86%|████████▌ | 10040/11685 [56:56<07:44,  3.54it/s]

Epoch 3 Step 2250 Loss 5.8453


 86%|████████▌ | 10050/11685 [56:59<07:42,  3.54it/s]

Epoch 3 Step 2260 Loss 5.7616


 86%|████████▌ | 10060/11685 [57:02<07:35,  3.57it/s]

Epoch 3 Step 2270 Loss 5.8527


 86%|████████▌ | 10070/11685 [57:04<07:36,  3.54it/s]

Epoch 3 Step 2280 Loss 5.8155


 86%|████████▋ | 10080/11685 [57:07<07:41,  3.47it/s]

Epoch 3 Step 2290 Loss 5.8399


 86%|████████▋ | 10090/11685 [57:10<07:30,  3.54it/s]

Epoch 3 Step 2300 Loss 5.7324


 86%|████████▋ | 10100/11685 [57:13<07:27,  3.54it/s]

Epoch 3 Step 2310 Loss 5.8323


 87%|████████▋ | 10110/11685 [57:16<07:27,  3.52it/s]

Epoch 3 Step 2320 Loss 5.7746


 87%|████████▋ | 10120/11685 [57:19<07:23,  3.53it/s]

Epoch 3 Step 2330 Loss 5.8631


 87%|████████▋ | 10130/11685 [57:21<07:25,  3.49it/s]

Epoch 3 Step 2340 Loss 5.9121


 87%|████████▋ | 10140/11685 [57:24<07:16,  3.54it/s]

Epoch 3 Step 2350 Loss 5.8650


 87%|████████▋ | 10150/11685 [57:27<07:15,  3.53it/s]

Epoch 3 Step 2360 Loss 5.7386


 87%|████████▋ | 10160/11685 [57:30<07:11,  3.53it/s]

Epoch 3 Step 2370 Loss 5.8652


 87%|████████▋ | 10170/11685 [57:33<07:15,  3.48it/s]

Epoch 3 Step 2380 Loss 5.8226


 87%|████████▋ | 10180/11685 [57:36<07:12,  3.48it/s]

Epoch 3 Step 2390 Loss 5.7572


 87%|████████▋ | 10190/11685 [57:38<07:04,  3.52it/s]

Epoch 3 Step 2400 Loss 5.8590


 87%|████████▋ | 10200/11685 [57:41<07:00,  3.53it/s]

Epoch 3 Step 2410 Loss 5.8974


 87%|████████▋ | 10210/11685 [57:44<06:58,  3.53it/s]

Epoch 3 Step 2420 Loss 5.7911


 87%|████████▋ | 10220/11685 [57:47<06:56,  3.52it/s]

Epoch 3 Step 2430 Loss 5.8574


 88%|████████▊ | 10230/11685 [57:50<06:48,  3.56it/s]

Epoch 3 Step 2440 Loss 5.8987


 88%|████████▊ | 10240/11685 [57:53<06:48,  3.54it/s]

Epoch 3 Step 2450 Loss 5.7980


 88%|████████▊ | 10250/11685 [57:56<06:46,  3.53it/s]

Epoch 3 Step 2460 Loss 5.8089


 88%|████████▊ | 10260/11685 [57:58<06:43,  3.53it/s]

Epoch 3 Step 2470 Loss 5.6591


 88%|████████▊ | 10270/11685 [58:01<06:44,  3.50it/s]

Epoch 3 Step 2480 Loss 5.7838


 88%|████████▊ | 10280/11685 [58:04<06:41,  3.50it/s]

Epoch 3 Step 2490 Loss 5.6467


 88%|████████▊ | 10290/11685 [58:07<06:37,  3.51it/s]

Epoch 3 Step 2500 Loss 5.8655


 88%|████████▊ | 10300/11685 [58:10<06:30,  3.55it/s]

Epoch 3 Step 2510 Loss 5.8727


 88%|████████▊ | 10310/11685 [58:13<06:28,  3.54it/s]

Epoch 3 Step 2520 Loss 5.8387


 88%|████████▊ | 10320/11685 [58:15<06:27,  3.52it/s]

Epoch 3 Step 2530 Loss 5.7548


 88%|████████▊ | 10330/11685 [58:18<06:23,  3.54it/s]

Epoch 3 Step 2540 Loss 5.7378


 88%|████████▊ | 10340/11685 [58:21<06:20,  3.54it/s]

Epoch 3 Step 2550 Loss 5.8154


 89%|████████▊ | 10350/11685 [58:24<06:19,  3.52it/s]

Epoch 3 Step 2560 Loss 5.6948


 89%|████████▊ | 10360/11685 [58:27<06:19,  3.49it/s]

Epoch 3 Step 2570 Loss 5.8118


 89%|████████▊ | 10370/11685 [58:30<06:15,  3.50it/s]

Epoch 3 Step 2580 Loss 5.9607


 89%|████████▉ | 10380/11685 [58:32<06:08,  3.54it/s]

Epoch 3 Step 2590 Loss 5.9319


 89%|████████▉ | 10390/11685 [58:35<06:08,  3.51it/s]

Epoch 3 Step 2600 Loss 5.8377


 89%|████████▉ | 10400/11685 [58:38<06:04,  3.53it/s]

Epoch 3 Step 2610 Loss 5.7063


 89%|████████▉ | 10410/11685 [58:41<06:04,  3.50it/s]

Epoch 3 Step 2620 Loss 5.7139


 89%|████████▉ | 10420/11685 [58:44<06:01,  3.50it/s]

Epoch 3 Step 2630 Loss 5.9264


 89%|████████▉ | 10430/11685 [58:47<05:53,  3.55it/s]

Epoch 3 Step 2640 Loss 5.6977


 89%|████████▉ | 10440/11685 [58:49<05:51,  3.54it/s]

Epoch 3 Step 2650 Loss 5.6907


 89%|████████▉ | 10450/11685 [58:52<05:49,  3.54it/s]

Epoch 3 Step 2660 Loss 5.8413


 90%|████████▉ | 10460/11685 [58:55<05:47,  3.52it/s]

Epoch 3 Step 2670 Loss 6.0170


 90%|████████▉ | 10470/11685 [58:58<05:43,  3.54it/s]

Epoch 3 Step 2680 Loss 5.9309


 90%|████████▉ | 10480/11685 [59:01<05:40,  3.54it/s]

Epoch 3 Step 2690 Loss 5.7922


 90%|████████▉ | 10490/11685 [59:04<05:37,  3.54it/s]

Epoch 3 Step 2700 Loss 5.8989


 90%|████████▉ | 10500/11685 [59:06<05:35,  3.54it/s]

Epoch 3 Step 2710 Loss 5.8815


 90%|████████▉ | 10510/11685 [59:09<05:35,  3.50it/s]

Epoch 3 Step 2720 Loss 5.7931


 90%|█████████ | 10520/11685 [59:12<05:34,  3.48it/s]

Epoch 3 Step 2730 Loss 5.8099


 90%|█████████ | 10530/11685 [59:15<05:26,  3.54it/s]

Epoch 3 Step 2740 Loss 5.6991


 90%|█████████ | 10540/11685 [59:18<05:28,  3.49it/s]

Epoch 3 Step 2750 Loss 5.8656


 90%|█████████ | 10550/11685 [59:21<05:22,  3.52it/s]

Epoch 3 Step 2760 Loss 5.8739


 90%|█████████ | 10560/11685 [59:24<05:18,  3.53it/s]

Epoch 3 Step 2770 Loss 5.8620


 90%|█████████ | 10570/11685 [59:26<05:14,  3.55it/s]

Epoch 3 Step 2780 Loss 5.8084


 91%|█████████ | 10580/11685 [59:29<05:12,  3.54it/s]

Epoch 3 Step 2790 Loss 6.0406


 91%|█████████ | 10590/11685 [59:32<05:13,  3.49it/s]

Epoch 3 Step 2800 Loss 5.8038


 91%|█████████ | 10600/11685 [59:35<05:06,  3.54it/s]

Epoch 3 Step 2810 Loss 5.7669


 91%|█████████ | 10610/11685 [59:38<05:03,  3.54it/s]

Epoch 3 Step 2820 Loss 5.7993


 91%|█████████ | 10620/11685 [59:41<05:01,  3.54it/s]

Epoch 3 Step 2830 Loss 5.7762


 91%|█████████ | 10630/11685 [59:43<04:59,  3.52it/s]

Epoch 3 Step 2840 Loss 5.8825


 91%|█████████ | 10640/11685 [59:46<04:53,  3.56it/s]

Epoch 3 Step 2850 Loss 5.9209


 91%|█████████ | 10650/11685 [59:49<04:56,  3.49it/s]

Epoch 3 Step 2860 Loss 5.8683


 91%|█████████ | 10660/11685 [59:52<04:51,  3.51it/s]

Epoch 3 Step 2870 Loss 5.7664


 91%|█████████▏| 10670/11685 [59:55<04:47,  3.53it/s]

Epoch 3 Step 2880 Loss 5.7684


 91%|█████████▏| 10680/11685 [59:58<04:44,  3.53it/s]

Epoch 3 Step 2890 Loss 5.7543


 91%|█████████▏| 10690/11685 [1:00:00<04:45,  3.48it/s]

Epoch 3 Step 2900 Loss 5.7810


 92%|█████████▏| 10700/11685 [1:00:03<04:41,  3.50it/s]

Epoch 3 Step 2910 Loss 5.7898


 92%|█████████▏| 10710/11685 [1:00:06<04:37,  3.51it/s]

Epoch 3 Step 2920 Loss 5.8982


 92%|█████████▏| 10720/11685 [1:00:09<04:34,  3.52it/s]

Epoch 3 Step 2930 Loss 5.7319


 92%|█████████▏| 10730/11685 [1:00:12<04:30,  3.53it/s]

Epoch 3 Step 2940 Loss 5.7098


 92%|█████████▏| 10740/11685 [1:00:15<04:27,  3.53it/s]

Epoch 3 Step 2950 Loss 5.8447


 92%|█████████▏| 10750/11685 [1:00:17<04:24,  3.54it/s]

Epoch 3 Step 2960 Loss 5.8637


 92%|█████████▏| 10760/11685 [1:00:20<04:24,  3.50it/s]

Epoch 3 Step 2970 Loss 5.9018


 92%|█████████▏| 10770/11685 [1:00:23<04:19,  3.53it/s]

Epoch 3 Step 2980 Loss 5.9787


 92%|█████████▏| 10780/11685 [1:00:26<04:17,  3.52it/s]

Epoch 3 Step 2990 Loss 5.6973


 92%|█████████▏| 10790/11685 [1:00:29<04:13,  3.53it/s]

Epoch 3 Step 3000 Loss 5.7639


 92%|█████████▏| 10800/11685 [1:00:32<04:11,  3.52it/s]

Epoch 3 Step 3010 Loss 5.8169


 93%|█████████▎| 10810/11685 [1:00:35<04:07,  3.53it/s]

Epoch 3 Step 3020 Loss 5.8339


 93%|█████████▎| 10820/11685 [1:00:37<04:04,  3.53it/s]

Epoch 3 Step 3030 Loss 5.7718


 93%|█████████▎| 10830/11685 [1:00:40<04:01,  3.54it/s]

Epoch 3 Step 3040 Loss 5.7745


 93%|█████████▎| 10840/11685 [1:00:43<03:58,  3.54it/s]

Epoch 3 Step 3050 Loss 5.6848


 93%|█████████▎| 10850/11685 [1:00:46<03:59,  3.48it/s]

Epoch 3 Step 3060 Loss 5.7728


 93%|█████████▎| 10860/11685 [1:00:49<03:52,  3.55it/s]

Epoch 3 Step 3070 Loss 5.9446


 93%|█████████▎| 10870/11685 [1:00:52<03:50,  3.54it/s]

Epoch 3 Step 3080 Loss 5.9462


 93%|█████████▎| 10880/11685 [1:00:54<03:47,  3.54it/s]

Epoch 3 Step 3090 Loss 5.8530


 93%|█████████▎| 10890/11685 [1:00:57<03:45,  3.53it/s]

Epoch 3 Step 3100 Loss 5.7508


 93%|█████████▎| 10900/11685 [1:01:00<03:41,  3.54it/s]

Epoch 3 Step 3110 Loss 5.8084


 93%|█████████▎| 10910/11685 [1:01:03<03:39,  3.54it/s]

Epoch 3 Step 3120 Loss 5.8926


 93%|█████████▎| 10920/11685 [1:01:06<03:35,  3.54it/s]

Epoch 3 Step 3130 Loss 5.8864


 94%|█████████▎| 10930/11685 [1:01:09<03:33,  3.54it/s]

Epoch 3 Step 3140 Loss 5.8867


 94%|█████████▎| 10940/11685 [1:01:11<03:30,  3.54it/s]

Epoch 3 Step 3150 Loss 5.8533


 94%|█████████▎| 10950/11685 [1:01:14<03:27,  3.54it/s]

Epoch 3 Step 3160 Loss 5.9697


 94%|█████████▍| 10960/11685 [1:01:17<03:25,  3.54it/s]

Epoch 3 Step 3170 Loss 5.6621


 94%|█████████▍| 10970/11685 [1:01:20<03:22,  3.53it/s]

Epoch 3 Step 3180 Loss 5.8596


 94%|█████████▍| 10980/11685 [1:01:23<03:19,  3.53it/s]

Epoch 3 Step 3190 Loss 5.6297


 94%|█████████▍| 10990/11685 [1:01:26<03:17,  3.52it/s]

Epoch 3 Step 3200 Loss 5.8308


 94%|█████████▍| 11000/11685 [1:01:28<03:15,  3.51it/s]

Epoch 3 Step 3210 Loss 5.8023


 94%|█████████▍| 11010/11685 [1:01:31<03:10,  3.54it/s]

Epoch 3 Step 3220 Loss 5.8594


 94%|█████████▍| 11020/11685 [1:01:34<03:08,  3.52it/s]

Epoch 3 Step 3230 Loss 5.8180


 94%|█████████▍| 11030/11685 [1:01:37<03:06,  3.52it/s]

Epoch 3 Step 3240 Loss 5.7870


 94%|█████████▍| 11040/11685 [1:01:40<03:04,  3.50it/s]

Epoch 3 Step 3250 Loss 5.8575


 95%|█████████▍| 11050/11685 [1:01:43<02:59,  3.53it/s]

Epoch 3 Step 3260 Loss 5.8507


 95%|█████████▍| 11060/11685 [1:01:45<02:56,  3.54it/s]

Epoch 3 Step 3270 Loss 5.9130


 95%|█████████▍| 11070/11685 [1:01:48<02:54,  3.52it/s]

Epoch 3 Step 3280 Loss 5.7041


 95%|█████████▍| 11080/11685 [1:01:51<02:50,  3.54it/s]

Epoch 3 Step 3290 Loss 5.9150


 95%|█████████▍| 11090/11685 [1:01:54<02:49,  3.50it/s]

Epoch 3 Step 3300 Loss 5.8528


 95%|█████████▍| 11100/11685 [1:01:57<02:46,  3.51it/s]

Epoch 3 Step 3310 Loss 5.9718


 95%|█████████▌| 11110/11685 [1:02:00<02:43,  3.52it/s]

Epoch 3 Step 3320 Loss 5.8010


 95%|█████████▌| 11120/11685 [1:02:03<02:41,  3.49it/s]

Epoch 3 Step 3330 Loss 5.8645


 95%|█████████▌| 11130/11685 [1:02:05<02:37,  3.52it/s]

Epoch 3 Step 3340 Loss 5.8590


 95%|█████████▌| 11140/11685 [1:02:08<02:34,  3.53it/s]

Epoch 3 Step 3350 Loss 5.8618


 95%|█████████▌| 11150/11685 [1:02:11<02:30,  3.54it/s]

Epoch 3 Step 3360 Loss 5.8919


 96%|█████████▌| 11160/11685 [1:02:14<02:29,  3.50it/s]

Epoch 3 Step 3370 Loss 5.8450


 96%|█████████▌| 11170/11685 [1:02:17<02:27,  3.49it/s]

Epoch 3 Step 3380 Loss 5.8659


 96%|█████████▌| 11180/11685 [1:02:20<02:22,  3.54it/s]

Epoch 3 Step 3390 Loss 5.9485


 96%|█████████▌| 11190/11685 [1:02:22<02:21,  3.51it/s]

Epoch 3 Step 3400 Loss 5.8732


 96%|█████████▌| 11200/11685 [1:02:25<02:17,  3.54it/s]

Epoch 3 Step 3410 Loss 5.6920


 96%|█████████▌| 11210/11685 [1:02:28<02:16,  3.48it/s]

Epoch 3 Step 3420 Loss 5.6288


 96%|█████████▌| 11220/11685 [1:02:31<02:11,  3.53it/s]

Epoch 3 Step 3430 Loss 5.8742


 96%|█████████▌| 11230/11685 [1:02:34<02:08,  3.54it/s]

Epoch 3 Step 3440 Loss 5.8285


 96%|█████████▌| 11240/11685 [1:02:37<02:06,  3.51it/s]

Epoch 3 Step 3450 Loss 5.7052


 96%|█████████▋| 11250/11685 [1:02:39<02:03,  3.53it/s]

Epoch 3 Step 3460 Loss 5.7483


 96%|█████████▋| 11260/11685 [1:02:42<02:00,  3.53it/s]

Epoch 3 Step 3470 Loss 5.8743


 96%|█████████▋| 11270/11685 [1:02:45<01:57,  3.53it/s]

Epoch 3 Step 3480 Loss 5.9750


 97%|█████████▋| 11280/11685 [1:02:48<01:54,  3.54it/s]

Epoch 3 Step 3490 Loss 5.6625


 97%|█████████▋| 11290/11685 [1:02:51<01:51,  3.54it/s]

Epoch 3 Step 3500 Loss 6.0203


 97%|█████████▋| 11300/11685 [1:02:54<01:48,  3.54it/s]

Epoch 3 Step 3510 Loss 5.8157


 97%|█████████▋| 11310/11685 [1:02:56<01:46,  3.53it/s]

Epoch 3 Step 3520 Loss 5.5072


 97%|█████████▋| 11320/11685 [1:02:59<01:43,  3.54it/s]

Epoch 3 Step 3530 Loss 5.8013


 97%|█████████▋| 11330/11685 [1:03:02<01:40,  3.53it/s]

Epoch 3 Step 3540 Loss 5.7704


 97%|█████████▋| 11340/11685 [1:03:05<01:37,  3.55it/s]

Epoch 3 Step 3550 Loss 5.7514


 97%|█████████▋| 11350/11685 [1:03:08<01:34,  3.54it/s]

Epoch 3 Step 3560 Loss 5.9643


 97%|█████████▋| 11360/11685 [1:03:11<01:31,  3.54it/s]

Epoch 3 Step 3570 Loss 5.6636


 97%|█████████▋| 11370/11685 [1:03:14<01:29,  3.53it/s]

Epoch 3 Step 3580 Loss 5.8350


 97%|█████████▋| 11380/11685 [1:03:16<01:26,  3.54it/s]

Epoch 3 Step 3590 Loss 5.7913


 97%|█████████▋| 11390/11685 [1:03:19<01:23,  3.53it/s]

Epoch 3 Step 3600 Loss 5.8547


 98%|█████████▊| 11400/11685 [1:03:22<01:20,  3.53it/s]

Epoch 3 Step 3610 Loss 5.9293


 98%|█████████▊| 11410/11685 [1:03:25<01:18,  3.53it/s]

Epoch 3 Step 3620 Loss 5.7508


 98%|█████████▊| 11420/11685 [1:03:28<01:15,  3.53it/s]

Epoch 3 Step 3630 Loss 5.8648


 98%|█████████▊| 11430/11685 [1:03:31<01:12,  3.54it/s]

Epoch 3 Step 3640 Loss 5.8618


 98%|█████████▊| 11440/11685 [1:03:33<01:10,  3.48it/s]

Epoch 3 Step 3650 Loss 6.0751


 98%|█████████▊| 11450/11685 [1:03:36<01:06,  3.54it/s]

Epoch 3 Step 3660 Loss 5.8417


 98%|█████████▊| 11460/11685 [1:03:39<01:03,  3.54it/s]

Epoch 3 Step 3670 Loss 5.7069


 98%|█████████▊| 11470/11685 [1:03:42<01:00,  3.54it/s]

Epoch 3 Step 3680 Loss 5.7518


 98%|█████████▊| 11480/11685 [1:03:45<00:57,  3.53it/s]

Epoch 3 Step 3690 Loss 5.8461


 98%|█████████▊| 11490/11685 [1:03:48<00:55,  3.54it/s]

Epoch 3 Step 3700 Loss 5.6903


 98%|█████████▊| 11500/11685 [1:03:50<00:52,  3.55it/s]

Epoch 3 Step 3710 Loss 5.7196


 99%|█████████▊| 11510/11685 [1:03:53<00:49,  3.52it/s]

Epoch 3 Step 3720 Loss 5.7600


 99%|█████████▊| 11520/11685 [1:03:56<00:46,  3.54it/s]

Epoch 3 Step 3730 Loss 5.7412


 99%|█████████▊| 11530/11685 [1:03:59<00:43,  3.54it/s]

Epoch 3 Step 3740 Loss 5.5516


 99%|█████████▉| 11540/11685 [1:04:02<00:41,  3.50it/s]

Epoch 3 Step 3750 Loss 5.9847


 99%|█████████▉| 11550/11685 [1:04:05<00:38,  3.53it/s]

Epoch 3 Step 3760 Loss 5.8414


 99%|█████████▉| 11560/11685 [1:04:07<00:35,  3.53it/s]

Epoch 3 Step 3770 Loss 5.8395


 99%|█████████▉| 11570/11685 [1:04:10<00:32,  3.53it/s]

Epoch 3 Step 3780 Loss 5.8566


 99%|█████████▉| 11580/11685 [1:04:13<00:30,  3.49it/s]

Epoch 3 Step 3790 Loss 5.9025


 99%|█████████▉| 11590/11685 [1:04:16<00:26,  3.53it/s]

Epoch 3 Step 3800 Loss 5.7808


 99%|█████████▉| 11600/11685 [1:04:19<00:24,  3.48it/s]

Epoch 3 Step 3810 Loss 5.7499


 99%|█████████▉| 11610/11685 [1:04:22<00:21,  3.55it/s]

Epoch 3 Step 3820 Loss 5.8716


 99%|█████████▉| 11620/11685 [1:04:25<00:18,  3.48it/s]

Epoch 3 Step 3830 Loss 5.8371


100%|█████████▉| 11630/11685 [1:04:27<00:15,  3.54it/s]

Epoch 3 Step 3840 Loss 5.8506


100%|█████████▉| 11640/11685 [1:04:30<00:12,  3.53it/s]

Epoch 3 Step 3850 Loss 5.6423


100%|█████████▉| 11650/11685 [1:04:33<00:09,  3.52it/s]

Epoch 3 Step 3860 Loss 5.9266


100%|█████████▉| 11660/11685 [1:04:36<00:07,  3.53it/s]

Epoch 3 Step 3870 Loss 5.8634


100%|█████████▉| 11670/11685 [1:04:39<00:04,  3.54it/s]

Epoch 3 Step 3880 Loss 5.8716


100%|█████████▉| 11680/11685 [1:04:42<00:01,  3.50it/s]

Epoch 3 Step 3890 Loss 5.9214


100%|██████████| 11685/11685 [1:04:43<00:00,  3.53it/s]

Epoch 3 Average Training Loss: 5.8128
Epoch 3 Test Accuracy: 0.0249


In [10]:
acc = accuracy_score(all_labels, all_preds)
print(f"Epoch {epoch+1} Test Accuracy: {acc:.4f}")

Epoch 3 Test Accuracy: 0.0249


In [11]:
dataframe = pd.DataFrame({'Labels': all_labels, 'Predictions': all_preds})
dataframe.head()

Unnamed: 0,Labels,Predictions
0,65,332
1,135,65
2,124,366
3,296,142
4,388,256


# II - Bert

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification,DistilBertTokenizer, DistilBertForSequenceClassification
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from tqdm import tqdm

# Assuming df_train is your dataset

# Step 1: Preprocessing and Splitting the Data
class ProportionalSplitter:
    @staticmethod
    def stratified_split(df, test_size=0.2):
        train, test = train_test_split(
            df, 
            test_size=test_size, 
            stratify=df['Label'], 
            random_state=42
        )
        return train, test

data = df_train.copy()
train_df, test_df = ProportionalSplitter.stratified_split(data)

# Encode labels
label_encoder = LabelEncoder()
train_df['Label'] = label_encoder.fit_transform(train_df['Label'])
test_df['Label'] = label_encoder.transform(test_df['Label'])

num_labels = len(label_encoder.classes_)

# Step 2: Define Dataset Class
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(label, dtype=torch.long)
        return item

# Step 3: Load Pretrained BERT Model and Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

# tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)

# Prepare datasets
train_dataset = TextDataset(
    texts=train_df['Text'].tolist(), 
    labels=train_df['Label'].tolist(), 
    tokenizer=tokenizer, 
    max_length=80
)

test_dataset = TextDataset(
    texts=test_df['Text'].tolist(), 
    labels=test_df['Label'].tolist(), 
    tokenizer=tokenizer, 
    max_length=80
)

# Step 4: Define DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

for i, layer in enumerate(model.bert.encoder.layer):
    if i < 10:
        for param in layer.parameters():
            param.requires_grad = False

# Step 5: Define Training and Evaluation Loops
def train(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    loop = tqdm(dataloader, desc="Training", leave=True)

    for batch in dataloader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = torch.argmax(logits, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)
        loop.set_postfix(loss=loss.item(), accuracy=correct/total)

    accuracy = correct / total
    return total_loss / len(dataloader), accuracy

def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            total_loss += loss.item()
            preds = torch.argmax(logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total
    return total_loss / len(dataloader), accuracy

# Step 6: Training Setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()

# Training Loop
num_epochs = 3
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
    val_loss, val_acc = evaluate(model, test_loader, criterion, device)

    print(f"Train Loss: {train_loss:.4f} | Train Accuracy: {train_acc:.4f}")
    print(f"Validation Loss: {val_loss:.4f} | Validation Accuracy: {val_acc:.4f}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

KeyboardInterrupt: 

In [9]:
torch.save(model.state_dict(), "bert.pth")
print("Model saved as bert.pth")

Model saved as bert.pth


# III - Predictions

In [7]:
model.load_state_dict(torch.load("bert.pth"))
model.to(device)
model.eval()

# Add predicted labels to df_test
def predict_label(texts, model, tokenizer, device):
    model.eval()
    predictions = []

    with torch.no_grad():
        for text in texts:
            encoding = tokenizer(
                text,
                max_length=128,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )
            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            pred = torch.argmax(logits, dim=1).item()
            predictions.append(pred)

    return predictions

  model.load_state_dict(torch.load("bert.pth"))


In [8]:
# Predict and add to df_test
df_test['Label'] = predict_label(df_test['Text'].tolist(), model, tokenizer, device)
df_test['Label'] = label_encoder.inverse_transform(df_test['Label'])
print("Predicted labels added to df_test.")
df_test.head()

Predicted labels added to df_test.


Unnamed: 0,ID,Usage,Text,Label
0,55,Private,Ponovo dobija riječni oblik do Drežnice.,hrv
1,71,Private,Se formaron aproximadamente hace apenas unos 1...,spa
2,67,Private,Data juga harus terbebas dari kepentingan-kepe...,mad
3,107,Private,ᐃᒃᓯᕙᐅᑕᖅ (ᑐᓵᔨᑎᒍᑦ): ᖁᔭᓐᓇᒦᒃ ᒥᔅ ᐅᐃᓐᒥᐅᓪ. ᒥᔅᑕ ᐃᓄᒃ.,iku
4,129,Private,Bei Gefor rullt de Kéiseker sech an riicht se...,ltz


In [9]:
df_test.to_csv("Submission_louis.csv")