In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import pandas as pd
from datasets import load_from_disk, load_dataset
from tqdm import tqdm

from custom_modules import *
from tokenizers import models, Tokenizer, trainers, pre_tokenizers, processors, decoders
from transformers import PreTrainedTokenizerFast

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
DEVICE

'cuda'

In [None]:
ds = load_dataset("roneneldan/TinyStories")
# ds.save_to_disk('data')

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 2119719/2119719 [00:01<00:00, 1117112.70 examples/s]
Generating validation split: 100%|██████████| 21990/21990 [00:00<00:00, 1059975.92 examples/s]


In [3]:
ds = load_from_disk('data')
ds.set_format(type="torch")

# Train Tokenizer

In [4]:
train_ds = ds['train']
def get_training_corpus():
    for i in range(0, len(train_ds), 1000):
        yield train_ds[i : i + 1000]["text"]

In [5]:
vocab_size = 25000
tok = Tokenizer(models.BPE())
tok.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
special_tokens = ["[EOT]", '[UNK]', '[PAD]']
trainer = trainers.BpeTrainer(vocab_size=vocab_size, special_tokens=special_tokens, unk_token='[UNK]')
tok.train_from_iterator(get_training_corpus(), trainer=trainer)

tok.post_processor = processors.ByteLevel(trim_offsets=False)
tok.decoder = decoders.ByteLevel()

In [6]:
fast_tok = PreTrainedTokenizerFast(
    tokenizer_object=tok,
    bos_token="[EOT]",
    eos_token="[EOT]",
    pad_token="[PAD]",
    padding_side="left"
)

In [7]:
max_length = 128

def tokenize(x):
    temp_max_len = max_length + 1
    outputs = fast_tok(
        x['text'],
        truncation=True,
        max_length=temp_max_len,
        padding="max_length",
        return_overflowing_tokens=True,
        return_length=True
    )
    input_batch, label_batch = [], []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == temp_max_len:
            input_batch.append(input_ids[:-1].copy())
            label_batch.append(input_ids[1:].copy())
    return {'input_ids': input_batch, 'labels':label_batch}


In [8]:
tokenized_datasets = ds.map(
    tokenize, batched=True, remove_columns=ds["train"].column_names
)
tokenized_datasets = tokenized_datasets.with_format('torch', device=DEVICE)

Map: 100%|██████████| 2119719/2119719 [06:10<00:00, 5716.62 examples/s]
Map: 100%|██████████| 21990/21990 [00:04<00:00, 5139.22 examples/s]


In [15]:
# tokenized_datasets.save_to_disk("tokenized_data")
tokenized_datasets = load_from_disk('tokenized_data')


# Declare Model

In [36]:
# build full decoder-only model
class DecoderOnlyTransformer(nn.Module):
    def __init__(self, vocab_size, hidden_dim, n_heads, n_blocks):
        super().__init__()
        self.embedding = TransformerEmbedding(vocab_size, hidden_dim)
        self.pe = PositionalEncoding()
        self.decoder_blocks = nn.ModuleList([DecoderBlock(d_in=hidden_dim, d_kq=hidden_dim, n_heads=n_heads) for _ in range(n_blocks)])
        self.head = nn.Linear(hidden_dim, vocab_size, bias=False)
        

    def forward(self, x):
        x = self.embedding(x)
        x = x + self.pe(x).to(x.device)
        for block in self.decoder_blocks:
            x = block(x)
        logits = self.head(x)
        return logits
        

In [80]:
model = DecoderOnlyTransformer(vocab_size=vocab_size, hidden_dim=128, n_blocks=6, n_heads=4).to(DEVICE)

# Train Model

In [81]:
# number of training samples
tokenized_datasets['train'].num_rows

4741600

In [82]:
# wrap with torch dataloader for training
batch_size = 64
train_dataloader = DataLoader(tokenized_datasets['train'], shuffle=True, drop_last=True, batch_size=batch_size)
valid_dataloader = DataLoader(tokenized_datasets['validation'], batch_size=batch_size)

In [83]:
# define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
loss_fn =  torch.nn.CrossEntropyLoss(ignore_index=fast_tok.pad_token_id)

In [84]:
def train_one_epoch():
    running_loss = 0.

    for i, data in tqdm(enumerate(train_dataloader), total=tokenized_datasets['train'].num_rows//batch_size):
        inputs, labels = data['input_ids'].to(DEVICE), data['labels'].to(DEVICE)
        optimizer.zero_grad()

        outputs = model(inputs)

        loss = loss_fn(outputs.view(-1, vocab_size), labels.view(-1))
        loss.backward()

        optimizer.step()
        running_loss += loss.item()

        if (i+1) % 100 == 0:
            print(f'Step {i+1} loss: {loss.item()}')
            last_loss = running_loss / 100

    return last_loss

In [85]:
EPOCHS = 1

for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch + 1))

    # train model
    model.train()
    avg_loss = train_one_epoch()

    # eval
    running_vloss = 0.0
    model.eval()

    for i, vdata in enumerate(valid_dataloader):
        vinputs, vlabels = vdata['input_ids'], vdata['labels']
        voutputs = model(vinputs)
        vloss = loss_fn(voutputs, vlabels)
        running_vloss += vloss

    avg_vloss = running_vloss / (i + 1)
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))

    epoch_number += 1

EPOCH 1:


  0%|          | 101/74087 [00:09<1:55:30, 10.68it/s]

Step 100 loss: 7.975458145141602


  0%|          | 201/74087 [00:19<1:52:24, 10.95it/s]

Step 200 loss: 6.562446117401123


  0%|          | 301/74087 [00:28<1:54:25, 10.75it/s]

Step 300 loss: 6.172401428222656


  1%|          | 401/74087 [00:37<1:54:20, 10.74it/s]

Step 400 loss: 6.035363674163818


  1%|          | 501/74087 [00:46<1:50:46, 11.07it/s]

Step 500 loss: 5.914429187774658


  1%|          | 601/74087 [00:56<2:00:01, 10.20it/s]

Step 600 loss: 5.944277286529541


  1%|          | 701/74087 [01:05<1:56:34, 10.49it/s]

Step 700 loss: 5.876797199249268


  1%|          | 801/74087 [01:14<1:54:29, 10.67it/s]

Step 800 loss: 5.901000022888184


  1%|          | 901/74087 [01:24<1:53:00, 10.79it/s]

Step 900 loss: 5.847580909729004


  1%|▏         | 1001/74087 [01:33<1:51:11, 10.96it/s]

Step 1000 loss: 5.819596290588379


  1%|▏         | 1101/74087 [01:42<1:54:27, 10.63it/s]

Step 1100 loss: 5.875206470489502


  2%|▏         | 1201/74087 [01:52<1:52:47, 10.77it/s]

Step 1200 loss: 5.746696472167969


  2%|▏         | 1301/74087 [02:01<1:55:23, 10.51it/s]

Step 1300 loss: 5.765970706939697


  2%|▏         | 1401/74087 [02:11<1:54:52, 10.55it/s]

Step 1400 loss: 5.76654577255249


  2%|▏         | 1501/74087 [02:20<1:53:42, 10.64it/s]

Step 1500 loss: 5.716006755828857


  2%|▏         | 1601/74087 [02:29<1:53:53, 10.61it/s]

Step 1600 loss: 5.7683515548706055


  2%|▏         | 1701/74087 [02:39<1:51:56, 10.78it/s]

Step 1700 loss: 5.762094974517822


  2%|▏         | 1801/74087 [02:48<1:53:13, 10.64it/s]

Step 1800 loss: 5.709634304046631


  3%|▎         | 1901/74087 [02:58<1:53:04, 10.64it/s]

Step 1900 loss: 5.675602912902832


  3%|▎         | 2001/74087 [03:07<1:54:37, 10.48it/s]

Step 2000 loss: 5.754059791564941


  3%|▎         | 2101/74087 [03:17<1:54:48, 10.45it/s]

Step 2100 loss: 5.736250400543213


  3%|▎         | 2201/74087 [03:26<1:55:41, 10.36it/s]

Step 2200 loss: 5.715059280395508


  3%|▎         | 2301/74087 [03:36<1:52:21, 10.65it/s]

Step 2300 loss: 5.742949962615967


  3%|▎         | 2401/74087 [03:45<1:53:49, 10.50it/s]

Step 2400 loss: 5.7380757331848145


  3%|▎         | 2501/74087 [03:55<1:53:30, 10.51it/s]

Step 2500 loss: 5.673980236053467


  4%|▎         | 2601/74087 [04:04<1:51:51, 10.65it/s]

Step 2600 loss: 5.673744201660156


  4%|▎         | 2701/74087 [04:14<1:52:53, 10.54it/s]

Step 2700 loss: 5.687640190124512


  4%|▍         | 2801/74087 [04:23<1:51:46, 10.63it/s]

Step 2800 loss: 5.590534210205078


  4%|▍         | 2901/74087 [04:32<1:53:54, 10.42it/s]

Step 2900 loss: 5.732463359832764


  4%|▍         | 3001/74087 [04:42<1:50:09, 10.76it/s]

Step 3000 loss: 5.747192859649658


  4%|▍         | 3101/74087 [04:51<1:53:14, 10.45it/s]

Step 3100 loss: 5.73707914352417


  4%|▍         | 3201/74087 [05:01<1:48:53, 10.85it/s]

Step 3200 loss: 5.772085666656494


  4%|▍         | 3301/74087 [05:10<1:46:12, 11.11it/s]

Step 3300 loss: 5.6806159019470215


  5%|▍         | 3401/74087 [05:19<1:51:18, 10.58it/s]

Step 3400 loss: 5.672391891479492


  5%|▍         | 3501/74087 [05:28<1:53:48, 10.34it/s]

Step 3500 loss: 5.547390460968018


  5%|▍         | 3601/74087 [05:38<1:51:53, 10.50it/s]

Step 3600 loss: 5.640758991241455


  5%|▍         | 3701/74087 [05:47<1:52:02, 10.47it/s]

Step 3700 loss: 5.6398820877075195


  5%|▌         | 3801/74087 [05:57<1:52:53, 10.38it/s]

Step 3800 loss: 5.637561321258545


  5%|▌         | 3901/74087 [06:06<1:48:44, 10.76it/s]

Step 3900 loss: 5.657227516174316


  5%|▌         | 4001/74087 [06:15<1:44:51, 11.14it/s]

Step 4000 loss: 5.553772449493408


  6%|▌         | 4101/74087 [06:25<1:51:17, 10.48it/s]

Step 4100 loss: 5.5858073234558105


  6%|▌         | 4201/74087 [06:34<1:49:28, 10.64it/s]

Step 4200 loss: 5.609978199005127


  6%|▌         | 4301/74087 [06:44<1:48:41, 10.70it/s]

Step 4300 loss: 5.643153190612793


  6%|▌         | 4401/74087 [06:53<1:49:34, 10.60it/s]

Step 4400 loss: 5.683150768280029


  6%|▌         | 4501/74087 [07:02<1:48:48, 10.66it/s]

Step 4500 loss: 5.57866907119751


  6%|▌         | 4601/74087 [07:12<1:49:46, 10.55it/s]

Step 4600 loss: 5.593556880950928


  6%|▋         | 4701/74087 [07:21<1:49:26, 10.57it/s]

Step 4700 loss: 5.583216190338135


  6%|▋         | 4801/74087 [07:30<1:50:31, 10.45it/s]

Step 4800 loss: 5.603854179382324


  7%|▋         | 4901/74087 [07:40<1:49:04, 10.57it/s]

Step 4900 loss: 5.5515851974487305


  7%|▋         | 5001/74087 [07:49<1:49:29, 10.52it/s]

Step 5000 loss: 5.575491905212402


  7%|▋         | 5101/74087 [07:59<1:48:40, 10.58it/s]

Step 5100 loss: 5.478708267211914


  7%|▋         | 5201/74087 [08:08<1:48:20, 10.60it/s]

Step 5200 loss: 5.56324577331543


  7%|▋         | 5301/74087 [08:17<1:47:02, 10.71it/s]

Step 5300 loss: 5.392823696136475


  7%|▋         | 5401/74087 [08:27<1:47:42, 10.63it/s]

Step 5400 loss: 5.551885604858398


  7%|▋         | 5501/74087 [08:36<1:47:51, 10.60it/s]

Step 5500 loss: 5.440663814544678


  8%|▊         | 5601/74087 [08:45<1:44:22, 10.94it/s]

Step 5600 loss: 5.518387317657471


  8%|▊         | 5701/74087 [08:55<1:46:38, 10.69it/s]

Step 5700 loss: 5.451357364654541


  8%|▊         | 5801/74087 [09:04<1:50:14, 10.32it/s]

Step 5800 loss: 5.467045783996582


  8%|▊         | 5901/74087 [09:13<1:47:53, 10.53it/s]

Step 5900 loss: 5.454986572265625


  8%|▊         | 6001/74087 [09:23<1:45:03, 10.80it/s]

Step 6000 loss: 5.554866790771484


  8%|▊         | 6101/74087 [09:32<1:48:05, 10.48it/s]

Step 6100 loss: 5.523128032684326


  8%|▊         | 6201/74087 [09:41<1:47:37, 10.51it/s]

Step 6200 loss: 5.367114543914795


  9%|▊         | 6301/74087 [09:51<1:46:45, 10.58it/s]

Step 6300 loss: 5.376479625701904


  9%|▊         | 6401/74087 [10:00<1:46:07, 10.63it/s]

Step 6400 loss: 5.425843238830566


  9%|▉         | 6501/74087 [10:09<1:47:13, 10.50it/s]

Step 6500 loss: 5.562561511993408


  9%|▉         | 6601/74087 [10:19<1:46:24, 10.57it/s]

Step 6600 loss: 5.361741065979004


  9%|▉         | 6701/74087 [10:28<1:44:42, 10.73it/s]

Step 6700 loss: 5.41614294052124


  9%|▉         | 6801/74087 [10:38<1:46:24, 10.54it/s]

Step 6800 loss: 5.4992899894714355


  9%|▉         | 6901/74087 [10:47<1:47:38, 10.40it/s]

Step 6900 loss: 5.467808246612549


  9%|▉         | 7001/74087 [10:57<1:46:10, 10.53it/s]

Step 7000 loss: 5.384055137634277


 10%|▉         | 7101/74087 [11:06<1:38:07, 11.38it/s]

Step 7100 loss: 5.423651695251465


 10%|▉         | 7201/74087 [11:15<1:39:25, 11.21it/s]

Step 7200 loss: 5.550278186798096


 10%|▉         | 7301/74087 [11:24<1:37:37, 11.40it/s]

Step 7300 loss: 5.428067684173584


 10%|▉         | 7401/74087 [11:33<1:44:48, 10.61it/s]

Step 7400 loss: 5.386462688446045


 10%|█         | 7501/74087 [11:42<1:44:19, 10.64it/s]

Step 7500 loss: 5.287330150604248


 10%|█         | 7601/74087 [11:52<1:45:29, 10.50it/s]

Step 7600 loss: 5.3353447914123535


 10%|█         | 7701/74087 [12:01<1:43:43, 10.67it/s]

Step 7700 loss: 5.484641075134277


 11%|█         | 7801/74087 [12:11<1:44:54, 10.53it/s]

Step 7800 loss: 5.446632385253906


 11%|█         | 7901/74087 [12:20<1:43:21, 10.67it/s]

Step 7900 loss: 5.300897121429443


 11%|█         | 8001/74087 [12:30<1:44:24, 10.55it/s]

Step 8000 loss: 5.3163604736328125


 11%|█         | 8101/74087 [12:39<1:42:33, 10.72it/s]

Step 8100 loss: 5.60512113571167


 11%|█         | 8201/74087 [12:48<1:45:08, 10.44it/s]

Step 8200 loss: 5.432797908782959


 11%|█         | 8301/74087 [12:58<1:42:52, 10.66it/s]

Step 8300 loss: 5.403258323669434


 11%|█▏        | 8401/74087 [13:08<1:44:18, 10.50it/s]

Step 8400 loss: 5.310534477233887


 11%|█▏        | 8501/74087 [13:17<1:44:05, 10.50it/s]

Step 8500 loss: 5.288290977478027


 12%|█▏        | 8601/74087 [13:27<1:43:35, 10.54it/s]

Step 8600 loss: 5.3999176025390625


 12%|█▏        | 8701/74087 [13:36<1:44:35, 10.42it/s]

Step 8700 loss: 5.253347396850586


 12%|█▏        | 8801/74087 [13:46<1:39:04, 10.98it/s]

Step 8800 loss: 5.419787883758545


 12%|█▏        | 8900/74087 [13:55<1:42:56, 10.55it/s]

Step 8900 loss: 5.294371604919434


 12%|█▏        | 9000/74087 [14:05<1:45:15, 10.31it/s]

Step 9000 loss: 5.3676042556762695


 12%|█▏        | 9102/74087 [14:14<1:40:59, 10.72it/s]

Step 9100 loss: 5.378286361694336


 12%|█▏        | 9200/74087 [14:24<1:39:40, 10.85it/s]

Step 9200 loss: 5.418759822845459


 13%|█▎        | 9300/74087 [14:33<1:43:40, 10.42it/s]

Step 9300 loss: 5.197822570800781


 13%|█▎        | 9400/74087 [14:43<1:43:24, 10.43it/s]

Step 9400 loss: 5.1583404541015625


 13%|█▎        | 9502/74087 [14:52<1:38:07, 10.97it/s]

Step 9500 loss: 5.210352420806885


 13%|█▎        | 9600/74087 [15:02<1:43:54, 10.34it/s]

Step 9600 loss: 5.146341323852539


 13%|█▎        | 9700/74087 [15:11<1:38:55, 10.85it/s]

Step 9700 loss: 5.069188117980957


 13%|█▎        | 9802/74087 [15:21<1:41:51, 10.52it/s]

Step 9800 loss: 4.936138153076172


 13%|█▎        | 9900/74087 [15:30<1:43:36, 10.33it/s]

Step 9900 loss: 5.317532062530518


 13%|█▎        | 10000/74087 [15:40<1:41:25, 10.53it/s]

Step 10000 loss: 5.281708717346191


 14%|█▎        | 10100/74087 [15:49<1:42:25, 10.41it/s]

Step 10100 loss: 5.374077796936035


 14%|█▍        | 10200/74087 [15:59<1:41:22, 10.50it/s]

Step 10200 loss: 5.321216106414795


 14%|█▍        | 10300/74087 [16:08<1:38:44, 10.77it/s]

Step 10300 loss: 5.193427085876465


 14%|█▍        | 10402/74087 [16:18<1:37:13, 10.92it/s]

Step 10400 loss: 5.121322154998779


 14%|█▍        | 10502/74087 [16:27<1:41:18, 10.46it/s]

Step 10500 loss: 5.244711399078369


 14%|█▍        | 10600/74087 [16:37<1:42:13, 10.35it/s]

Step 10600 loss: 5.110018253326416


 14%|█▍        | 10700/74087 [16:46<1:41:09, 10.44it/s]

Step 10700 loss: 5.120429992675781


 15%|█▍        | 10800/74087 [16:56<1:41:20, 10.41it/s]

Step 10800 loss: 5.088821887969971


 15%|█▍        | 10902/74087 [17:05<1:34:22, 11.16it/s]

Step 10900 loss: 5.214531898498535


 15%|█▍        | 11000/74087 [17:14<1:40:41, 10.44it/s]

Step 11000 loss: 5.093606472015381


 15%|█▍        | 11102/74087 [17:24<1:41:01, 10.39it/s]

Step 11100 loss: 5.401305675506592


 15%|█▌        | 11200/74087 [17:33<1:39:52, 10.49it/s]

Step 11200 loss: 5.237082481384277


 15%|█▌        | 11300/74087 [17:43<1:39:45, 10.49it/s]

Step 11300 loss: 5.101817607879639


 15%|█▌        | 11400/74087 [17:52<1:39:37, 10.49it/s]

Step 11400 loss: 5.088461875915527


 16%|█▌        | 11500/74087 [18:02<1:40:18, 10.40it/s]

Step 11500 loss: 5.248361110687256


 16%|█▌        | 11600/74087 [18:11<1:38:57, 10.52it/s]

Step 11600 loss: 5.10996150970459


 16%|█▌        | 11700/74087 [18:21<1:38:23, 10.57it/s]

Step 11700 loss: 4.972105503082275


 16%|█▌        | 11800/74087 [18:30<1:41:17, 10.25it/s]

Step 11800 loss: 5.207951068878174


 16%|█▌        | 11902/74087 [18:40<1:32:34, 11.20it/s]

Step 11900 loss: 5.002176761627197


 16%|█▌        | 12001/74087 [18:50<1:37:47, 10.58it/s]

Step 12000 loss: 5.056328773498535


 16%|█▋        | 12101/74087 [18:59<1:38:50, 10.45it/s]

Step 12100 loss: 5.176146984100342


 16%|█▋        | 12201/74087 [19:08<1:40:19, 10.28it/s]

Step 12200 loss: 5.07018518447876


 17%|█▋        | 12301/74087 [19:18<1:37:35, 10.55it/s]

Step 12300 loss: 4.923660755157471


 17%|█▋        | 12401/74087 [19:27<1:31:59, 11.18it/s]

Step 12400 loss: 5.032027244567871


 17%|█▋        | 12501/74087 [19:36<1:37:17, 10.55it/s]

Step 12500 loss: 5.155006408691406


 17%|█▋        | 12601/74087 [19:46<1:35:50, 10.69it/s]

Step 12600 loss: 4.981292247772217


 17%|█▋        | 12701/74087 [19:55<1:36:43, 10.58it/s]

Step 12700 loss: 4.9822187423706055


 17%|█▋        | 12801/74087 [20:05<1:37:34, 10.47it/s]

Step 12800 loss: 5.0645928382873535


 17%|█▋        | 12901/74087 [20:14<1:36:40, 10.55it/s]

Step 12900 loss: 5.110433578491211


 18%|█▊        | 13001/74087 [20:24<1:36:41, 10.53it/s]

Step 13000 loss: 5.0880045890808105


 18%|█▊        | 13101/74087 [20:33<1:35:21, 10.66it/s]

Step 13100 loss: 5.064741134643555


 18%|█▊        | 13201/74087 [20:43<1:37:03, 10.45it/s]

Step 13200 loss: 5.170078277587891


 18%|█▊        | 13301/74087 [20:52<1:35:36, 10.60it/s]

Step 13300 loss: 5.052663326263428


 18%|█▊        | 13401/74087 [21:02<1:34:01, 10.76it/s]

Step 13400 loss: 5.007791519165039


 18%|█▊        | 13501/74087 [21:11<1:33:28, 10.80it/s]

Step 13500 loss: 5.2210001945495605


 18%|█▊        | 13601/74087 [21:20<1:32:06, 10.94it/s]

Step 13600 loss: 5.195133209228516


 18%|█▊        | 13701/74087 [21:29<1:35:56, 10.49it/s]

Step 13700 loss: 5.0050530433654785


 19%|█▊        | 13801/74087 [21:39<1:35:22, 10.54it/s]

Step 13800 loss: 4.782144546508789


 19%|█▉        | 13901/74087 [21:48<1:35:20, 10.52it/s]

Step 13900 loss: 5.149017810821533


 19%|█▉        | 14001/74087 [21:58<1:35:24, 10.50it/s]

Step 14000 loss: 5.053144931793213


 19%|█▉        | 14101/74087 [22:07<1:34:32, 10.57it/s]

Step 14100 loss: 4.911737442016602


 19%|█▉        | 14201/74087 [22:17<1:34:23, 10.57it/s]

Step 14200 loss: 4.880517959594727


 19%|█▉        | 14301/74087 [22:26<1:34:02, 10.60it/s]

Step 14300 loss: 5.021347522735596


 19%|█▉        | 14401/74087 [22:36<1:34:19, 10.55it/s]

Step 14400 loss: 4.951957702636719


 20%|█▉        | 14501/74087 [22:45<1:36:21, 10.31it/s]

Step 14500 loss: 5.084222316741943


 20%|█▉        | 14601/74087 [22:54<1:34:16, 10.52it/s]

Step 14600 loss: 5.041671276092529


 20%|█▉        | 14701/74087 [23:04<1:26:59, 11.38it/s]

Step 14700 loss: 4.983713150024414


 20%|█▉        | 14801/74087 [23:13<1:30:52, 10.87it/s]

Step 14800 loss: 4.924653053283691


 20%|██        | 14902/74087 [23:22<1:30:27, 10.90it/s]

Step 14900 loss: 4.80485200881958


 20%|██        | 15002/74087 [23:32<1:31:12, 10.80it/s]

Step 15000 loss: 4.797577857971191


 20%|██        | 15100/74087 [23:41<1:32:32, 10.62it/s]

Step 15100 loss: 4.8218536376953125


 21%|██        | 15202/74087 [23:51<1:32:32, 10.61it/s]

Step 15200 loss: 5.010585784912109


 21%|██        | 15300/74087 [24:00<1:32:34, 10.58it/s]

Step 15300 loss: 4.850752353668213


 21%|██        | 15400/74087 [24:09<1:30:58, 10.75it/s]

Step 15400 loss: 4.796753883361816


 21%|██        | 15502/74087 [24:19<1:33:06, 10.49it/s]

Step 15500 loss: 4.870706081390381


 21%|██        | 15600/74087 [24:28<1:32:27, 10.54it/s]

Step 15600 loss: 4.8040900230407715


 21%|██        | 15700/74087 [24:38<1:33:17, 10.43it/s]

Step 15700 loss: 5.002618789672852


 21%|██▏       | 15800/74087 [24:47<1:29:44, 10.83it/s]

Step 15800 loss: 5.037126541137695


 21%|██▏       | 15900/74087 [24:56<1:31:34, 10.59it/s]

Step 15900 loss: 4.892698287963867


 22%|██▏       | 16002/74087 [25:06<1:24:40, 11.43it/s]

Step 16000 loss: 4.7550153732299805


 22%|██▏       | 16100/74087 [25:15<1:30:43, 10.65it/s]

Step 16100 loss: 4.82780122756958


 22%|██▏       | 16200/74087 [25:25<1:31:11, 10.58it/s]

Step 16200 loss: 4.764396667480469


 22%|██▏       | 16300/74087 [25:34<1:32:48, 10.38it/s]

Step 16300 loss: 4.870396614074707


 22%|██▏       | 16400/74087 [25:44<1:31:21, 10.52it/s]

Step 16400 loss: 4.740373134613037


 22%|██▏       | 16502/74087 [25:53<1:31:31, 10.49it/s]

Step 16500 loss: 4.772374629974365


 22%|██▏       | 16600/74087 [26:03<1:29:46, 10.67it/s]

Step 16600 loss: 4.674320697784424


 23%|██▎       | 16700/74087 [26:12<1:30:42, 10.54it/s]

Step 16700 loss: 4.679435729980469


 23%|██▎       | 16800/74087 [26:21<1:30:20, 10.57it/s]

Step 16800 loss: 4.742373943328857


 23%|██▎       | 16900/74087 [26:31<1:30:35, 10.52it/s]

Step 16900 loss: 4.703927993774414


 23%|██▎       | 17000/74087 [26:40<1:29:16, 10.66it/s]

Step 17000 loss: 4.787563323974609


 23%|██▎       | 17100/74087 [26:50<1:29:16, 10.64it/s]

Step 17100 loss: 4.6641764640808105


 23%|██▎       | 17200/74087 [26:59<1:29:35, 10.58it/s]

Step 17200 loss: 4.771531581878662


 23%|██▎       | 17302/74087 [27:09<1:28:46, 10.66it/s]

Step 17300 loss: 4.690740585327148


 23%|██▎       | 17400/74087 [27:18<1:29:32, 10.55it/s]

Step 17400 loss: 4.6856689453125


 24%|██▎       | 17502/74087 [27:27<1:30:23, 10.43it/s]

Step 17500 loss: 4.782839775085449


 24%|██▍       | 17602/74087 [27:37<1:26:41, 10.86it/s]

Step 17600 loss: 4.600958824157715


 24%|██▍       | 17700/74087 [27:46<1:29:03, 10.55it/s]

Step 17700 loss: 4.663213729858398


 24%|██▍       | 17800/74087 [27:56<1:28:51, 10.56it/s]

Step 17800 loss: 4.738617897033691


 24%|██▍       | 17902/74087 [28:05<1:27:02, 10.76it/s]

Step 17900 loss: 4.700796127319336


 24%|██▍       | 18002/74087 [28:15<1:27:28, 10.69it/s]

Step 18000 loss: 4.693807125091553


 24%|██▍       | 18100/74087 [28:24<1:27:48, 10.63it/s]

Step 18100 loss: 4.774910926818848


 25%|██▍       | 18200/74087 [28:33<1:28:02, 10.58it/s]

Step 18200 loss: 4.820652484893799


 25%|██▍       | 18302/74087 [28:43<1:25:05, 10.93it/s]

Step 18300 loss: 4.522544860839844


 25%|██▍       | 18400/74087 [28:52<1:28:38, 10.47it/s]

Step 18400 loss: 4.67105770111084


 25%|██▍       | 18501/74087 [45:12<1:34:55,  9.76it/s]    

Step 18500 loss: 4.718484878540039


 25%|██▌       | 18601/74087 [45:21<1:28:42, 10.42it/s]

Step 18600 loss: 4.651060581207275


 25%|██▌       | 18701/74087 [45:31<1:26:52, 10.63it/s]

Step 18700 loss: 4.795621871948242


 25%|██▌       | 18801/74087 [45:40<1:24:10, 10.95it/s]

Step 18800 loss: 4.575417518615723


 26%|██▌       | 18901/74087 [45:50<1:27:54, 10.46it/s]

Step 18900 loss: 4.649531364440918


 26%|██▌       | 19001/74087 [45:59<1:26:47, 10.58it/s]

Step 19000 loss: 4.6626105308532715


 26%|██▌       | 19101/74087 [46:09<1:26:41, 10.57it/s]

Step 19100 loss: 4.570706844329834


 26%|██▌       | 19201/74087 [46:18<1:26:50, 10.53it/s]

Step 19200 loss: 4.664167404174805


 26%|██▌       | 19301/74087 [46:28<1:26:13, 10.59it/s]

Step 19300 loss: 4.640873908996582


 26%|██▌       | 19401/74087 [46:37<1:26:56, 10.48it/s]

Step 19400 loss: 4.51917839050293


 26%|██▋       | 19501/74087 [46:47<1:25:37, 10.63it/s]

Step 19500 loss: 4.612271308898926


 26%|██▋       | 19601/74087 [46:57<1:26:52, 10.45it/s]

Step 19600 loss: 4.773100852966309


 27%|██▋       | 19701/74087 [47:06<1:24:59, 10.67it/s]

Step 19700 loss: 4.77853536605835


 27%|██▋       | 19801/74087 [47:16<1:28:08, 10.27it/s]

Step 19800 loss: 4.549391746520996


 27%|██▋       | 19901/74087 [47:25<1:27:29, 10.32it/s]

Step 19900 loss: 4.618409633636475


 27%|██▋       | 20001/74087 [47:35<1:23:43, 10.77it/s]

Step 20000 loss: 4.626612663269043


 27%|██▋       | 20101/74087 [47:45<1:25:07, 10.57it/s]

Step 20100 loss: 4.638453006744385


 27%|██▋       | 20201/74087 [47:54<1:24:46, 10.59it/s]

Step 20200 loss: 4.490639686584473


 27%|██▋       | 20301/74087 [48:04<1:28:02, 10.18it/s]

Step 20300 loss: 4.541978359222412


 28%|██▊       | 20401/74087 [48:13<1:28:11, 10.15it/s]

Step 20400 loss: 4.4518022537231445


 28%|██▊       | 20501/74087 [48:23<1:27:09, 10.25it/s]

Step 20500 loss: 4.471174716949463


 28%|██▊       | 20601/74087 [48:33<1:26:59, 10.25it/s]

Step 20600 loss: 4.494460105895996


 28%|██▊       | 20701/74087 [48:42<1:26:39, 10.27it/s]

Step 20700 loss: 4.677475452423096


 28%|██▊       | 20802/74087 [48:52<1:27:45, 10.12it/s]

Step 20800 loss: 4.551823139190674


 28%|██▊       | 20902/74087 [49:02<1:28:12, 10.05it/s]

Step 20900 loss: 4.415055751800537


 28%|██▊       | 21000/74087 [49:11<1:25:36, 10.34it/s]

Step 21000 loss: 4.506411075592041


 28%|██▊       | 21100/74087 [49:21<1:26:12, 10.24it/s]

Step 21100 loss: 4.7096638679504395


 29%|██▊       | 21201/74087 [49:31<1:23:45, 10.52it/s]

Step 21200 loss: 4.55792236328125


 29%|██▉       | 21301/74087 [49:41<1:25:13, 10.32it/s]

Step 21300 loss: 4.40255880355835


 29%|██▉       | 21401/74087 [49:50<1:24:42, 10.37it/s]

Step 21400 loss: 4.626036643981934


 29%|██▉       | 21501/74087 [50:00<1:25:23, 10.26it/s]

Step 21500 loss: 4.427701950073242


 29%|██▉       | 21600/74087 [50:10<1:28:06,  9.93it/s]

Step 21600 loss: 4.588827133178711


 29%|██▉       | 21700/74087 [50:19<1:25:19, 10.23it/s]

Step 21700 loss: 4.468079566955566


 29%|██▉       | 21801/74087 [50:29<1:23:02, 10.49it/s]

Step 21800 loss: 4.499396324157715


 30%|██▉       | 21901/74087 [50:39<1:22:48, 10.50it/s]

Step 21900 loss: 4.520851135253906


 30%|██▉       | 22001/74087 [50:48<1:22:44, 10.49it/s]

Step 22000 loss: 4.517108917236328


 30%|██▉       | 22101/74087 [50:58<1:21:40, 10.61it/s]

Step 22100 loss: 4.557750701904297


 30%|██▉       | 22201/74087 [51:07<1:20:53, 10.69it/s]

Step 22200 loss: 4.646119117736816


 30%|███       | 22301/74087 [51:17<1:21:50, 10.55it/s]

Step 22300 loss: 4.447275161743164


 30%|███       | 22401/74087 [51:26<1:22:20, 10.46it/s]

Step 22400 loss: 4.392869472503662


 30%|███       | 22501/74087 [51:36<1:18:07, 11.01it/s]

Step 22500 loss: 4.493573188781738


 31%|███       | 22601/74087 [51:45<1:22:43, 10.37it/s]

Step 22600 loss: 4.543265342712402


 31%|███       | 22702/74087 [51:56<1:18:34, 10.90it/s]

Step 22700 loss: 4.511496067047119


 31%|███       | 22800/74087 [52:05<1:20:48, 10.58it/s]

Step 22800 loss: 4.423707008361816


 31%|███       | 22902/74087 [52:15<1:22:48, 10.30it/s]

Step 22900 loss: 4.409278869628906


 31%|███       | 23000/74087 [52:24<1:19:22, 10.73it/s]

Step 23000 loss: 4.354927062988281


 31%|███       | 23102/74087 [52:34<1:20:42, 10.53it/s]

Step 23100 loss: 4.626901149749756


 31%|███▏      | 23202/74087 [52:44<1:22:31, 10.28it/s]

Step 23200 loss: 4.438131809234619


 31%|███▏      | 23300/74087 [52:53<1:22:17, 10.29it/s]

Step 23300 loss: 4.16165018081665


 32%|███▏      | 23400/74087 [53:03<1:20:19, 10.52it/s]

Step 23400 loss: 4.456418991088867


 32%|███▏      | 23501/74087 [53:12<1:17:35, 10.86it/s]

Step 23500 loss: 4.3788604736328125


 32%|███▏      | 23601/74087 [53:22<1:18:52, 10.67it/s]

Step 23600 loss: 4.3859734535217285


 32%|███▏      | 23701/74087 [53:31<1:20:49, 10.39it/s]

Step 23700 loss: 4.471561431884766


 32%|███▏      | 23801/74087 [53:41<1:18:07, 10.73it/s]

Step 23800 loss: 4.378002166748047


 32%|███▏      | 23901/74087 [53:50<1:20:46, 10.36it/s]

Step 23900 loss: 4.441556930541992


 32%|███▏      | 24001/74087 [54:00<1:19:09, 10.54it/s]

Step 24000 loss: 4.34622859954834


 33%|███▎      | 24101/74087 [54:10<1:21:27, 10.23it/s]

Step 24100 loss: 4.2997918128967285


 33%|███▎      | 24201/74087 [54:19<1:14:43, 11.13it/s]

Step 24200 loss: 4.262994289398193


 33%|███▎      | 24301/74087 [54:28<1:19:53, 10.39it/s]

Step 24300 loss: 4.370131969451904


 33%|███▎      | 24401/74087 [54:38<1:16:39, 10.80it/s]

Step 24400 loss: 4.3407182693481445


 33%|███▎      | 24501/74087 [54:48<1:20:02, 10.32it/s]

Step 24500 loss: 4.424594879150391


 33%|███▎      | 24601/74087 [54:57<1:19:11, 10.42it/s]

Step 24600 loss: 4.501913547515869


 33%|███▎      | 24701/74087 [55:07<1:17:45, 10.59it/s]

Step 24700 loss: 4.230832099914551


 33%|███▎      | 24801/74087 [55:16<1:19:25, 10.34it/s]

Step 24800 loss: 4.499820709228516


 34%|███▎      | 24901/74087 [55:26<1:19:02, 10.37it/s]

Step 24900 loss: 4.3602213859558105


 34%|███▎      | 25001/74087 [55:36<1:18:54, 10.37it/s]

Step 25000 loss: 4.435751914978027


 34%|███▍      | 25101/74087 [55:45<1:18:50, 10.36it/s]

Step 25100 loss: 4.268624782562256


 34%|███▍      | 25201/74087 [55:55<1:14:24, 10.95it/s]

Step 25200 loss: 4.335749626159668


 34%|███▍      | 25301/74087 [56:04<1:13:49, 11.01it/s]

Step 25300 loss: 4.278651714324951


 34%|███▍      | 25401/74087 [56:13<1:16:45, 10.57it/s]

Step 25400 loss: 4.241718292236328


 34%|███▍      | 25501/74087 [56:23<1:17:26, 10.46it/s]

Step 25500 loss: 4.407012462615967


 35%|███▍      | 25601/74087 [56:32<1:15:33, 10.69it/s]

Step 25600 loss: 4.3573503494262695


 35%|███▍      | 25701/74087 [56:42<1:16:53, 10.49it/s]

Step 25700 loss: 4.418458938598633


 35%|███▍      | 25801/74087 [56:51<1:17:32, 10.38it/s]

Step 25800 loss: 4.364626884460449


 35%|███▍      | 25901/74087 [57:01<1:19:15, 10.13it/s]

Step 25900 loss: 4.1223649978637695


 35%|███▌      | 26001/74087 [57:10<1:11:19, 11.24it/s]

Step 26000 loss: 4.139095783233643


 35%|███▌      | 26101/74087 [57:20<1:12:55, 10.97it/s]

Step 26100 loss: 4.196875095367432


 35%|███▌      | 26201/74087 [57:29<1:16:04, 10.49it/s]

Step 26200 loss: 4.289951324462891


 36%|███▌      | 26301/74087 [57:39<1:16:21, 10.43it/s]

Step 26300 loss: 4.278041362762451


 36%|███▌      | 26401/74087 [57:48<1:16:12, 10.43it/s]

Step 26400 loss: 4.288464546203613


 36%|███▌      | 26501/74087 [57:58<1:15:52, 10.45it/s]

Step 26500 loss: 4.402616500854492


 36%|███▌      | 26601/74087 [58:07<1:16:16, 10.37it/s]

Step 26600 loss: 4.3966875076293945


 36%|███▌      | 26701/74087 [58:17<1:12:57, 10.82it/s]

Step 26700 loss: 4.239935874938965


 36%|███▌      | 26801/74087 [58:26<1:15:08, 10.49it/s]

Step 26800 loss: 4.476570129394531


 36%|███▋      | 26901/74087 [58:36<1:14:44, 10.52it/s]

Step 26900 loss: 4.3779988288879395


 36%|███▋      | 27001/74087 [58:45<1:15:44, 10.36it/s]

Step 27000 loss: 4.197674751281738


 37%|███▋      | 27101/74087 [58:55<1:15:24, 10.38it/s]

Step 27100 loss: 4.27398681640625


 37%|███▋      | 27201/74087 [59:04<1:14:02, 10.55it/s]

Step 27200 loss: 4.2746734619140625


 37%|███▋      | 27301/74087 [59:14<1:10:59, 10.98it/s]

Step 27300 loss: 4.459538459777832


 37%|███▋      | 27402/74087 [59:23<1:09:58, 11.12it/s]

Step 27400 loss: 4.393061637878418


 37%|███▋      | 27500/74087 [59:32<1:14:01, 10.49it/s]

Step 27500 loss: 4.427403450012207


 37%|███▋      | 27600/74087 [59:42<1:14:20, 10.42it/s]

Step 27600 loss: 4.2979817390441895


 37%|███▋      | 27700/74087 [59:51<1:15:00, 10.31it/s]

Step 27700 loss: 4.163038730621338


 38%|███▊      | 27800/74087 [1:00:01<1:13:52, 10.44it/s]

Step 27800 loss: 4.235406875610352


 38%|███▊      | 27900/74087 [1:00:10<1:13:11, 10.52it/s]

Step 27900 loss: 4.355370998382568


 38%|███▊      | 28000/74087 [1:00:20<1:13:22, 10.47it/s]

Step 28000 loss: 4.38247013092041


 38%|███▊      | 28102/74087 [1:00:29<1:11:32, 10.71it/s]

Step 28100 loss: 4.290283203125


 38%|███▊      | 28200/74087 [1:00:39<1:13:21, 10.43it/s]

Step 28200 loss: 4.220869064331055


 38%|███▊      | 28302/74087 [1:00:48<1:10:41, 10.79it/s]

Step 28300 loss: 4.2326979637146


 38%|███▊      | 28400/74087 [1:00:58<1:13:00, 10.43it/s]

Step 28400 loss: 4.406930923461914


 38%|███▊      | 28500/74087 [1:01:07<1:12:26, 10.49it/s]

Step 28500 loss: 4.342976093292236


 39%|███▊      | 28602/74087 [1:01:17<1:11:51, 10.55it/s]

Step 28600 loss: 4.263406753540039


 39%|███▊      | 28700/74087 [1:01:26<1:10:47, 10.69it/s]

Step 28700 loss: 4.291591167449951


 39%|███▉      | 28800/74087 [1:01:36<1:12:48, 10.37it/s]

Step 28800 loss: 4.328085899353027


 39%|███▉      | 28900/74087 [1:01:45<1:12:48, 10.34it/s]

Step 28900 loss: 4.173186779022217


 39%|███▉      | 29000/74087 [1:01:55<1:11:45, 10.47it/s]

Step 29000 loss: 4.22959041595459


 39%|███▉      | 29100/74087 [1:02:05<1:11:40, 10.46it/s]

Step 29100 loss: 4.1660966873168945


 39%|███▉      | 29200/74087 [1:02:14<1:10:41, 10.58it/s]

Step 29200 loss: 4.084320545196533


 40%|███▉      | 29300/74087 [1:02:24<1:11:07, 10.49it/s]

Step 29300 loss: 4.13485050201416


 40%|███▉      | 29400/74087 [1:02:33<1:10:32, 10.56it/s]

Step 29400 loss: 4.287463188171387


 40%|███▉      | 29502/74087 [1:02:43<1:08:08, 10.91it/s]

Step 29500 loss: 4.306617259979248


 40%|███▉      | 29600/74087 [1:02:52<1:09:13, 10.71it/s]

Step 29600 loss: 4.236649513244629


 40%|████      | 29700/74087 [1:03:01<1:08:22, 10.82it/s]

Step 29700 loss: 4.199737071990967


 40%|████      | 29800/74087 [1:03:11<1:09:49, 10.57it/s]

Step 29800 loss: 4.197869777679443


 40%|████      | 29901/74087 [1:03:20<1:10:35, 10.43it/s]

Step 29900 loss: 4.114425182342529


 40%|████      | 30001/74087 [1:03:30<1:09:29, 10.57it/s]

Step 30000 loss: 4.306082725524902


 41%|████      | 30101/74087 [1:03:39<1:10:23, 10.41it/s]

Step 30100 loss: 4.1476922035217285


 41%|████      | 30201/74087 [1:03:49<1:10:45, 10.34it/s]

Step 30200 loss: 4.273468494415283


 41%|████      | 30301/74087 [1:03:59<1:09:34, 10.49it/s]

Step 30300 loss: 4.128602981567383


 41%|████      | 30401/74087 [1:04:08<1:10:21, 10.35it/s]

Step 30400 loss: 4.127683162689209


 41%|████      | 30501/74087 [1:04:18<1:08:39, 10.58it/s]

Step 30500 loss: 4.170048713684082


 41%|████▏     | 30601/74087 [1:04:27<1:08:54, 10.52it/s]

Step 30600 loss: 4.174494743347168


 41%|████▏     | 30701/74087 [1:04:37<1:06:40, 10.84it/s]

Step 30700 loss: 4.272640228271484


 42%|████▏     | 30801/74087 [1:04:46<1:09:09, 10.43it/s]

Step 30800 loss: 4.195520877838135


 42%|████▏     | 30901/74087 [1:04:55<1:06:40, 10.79it/s]

Step 30900 loss: 4.2167229652404785


 42%|████▏     | 31001/74087 [1:05:04<1:03:36, 11.29it/s]

Step 31000 loss: 4.072284698486328


 42%|████▏     | 31101/74087 [1:05:13<1:06:34, 10.76it/s]

Step 31100 loss: 4.227053165435791


 42%|████▏     | 31201/74087 [1:05:23<1:09:41, 10.26it/s]

Step 31200 loss: 4.2112202644348145


 42%|████▏     | 31301/74087 [1:05:32<1:07:44, 10.53it/s]

Step 31300 loss: 4.047455310821533


 42%|████▏     | 31401/74087 [1:05:42<1:08:46, 10.34it/s]

Step 31400 loss: 4.216526985168457


 43%|████▎     | 31501/74087 [1:05:51<1:05:11, 10.89it/s]

Step 31500 loss: 4.075294494628906


 43%|████▎     | 31601/74087 [1:06:01<1:06:15, 10.69it/s]

Step 31600 loss: 4.233036994934082


 43%|████▎     | 31701/74087 [1:06:10<1:09:42, 10.13it/s]

Step 31700 loss: 4.2129669189453125


 43%|████▎     | 31801/74087 [1:06:20<1:05:24, 10.78it/s]

Step 31800 loss: 4.006443023681641


 43%|████▎     | 31901/74087 [1:06:29<1:07:31, 10.41it/s]

Step 31900 loss: 4.124462604522705


 43%|████▎     | 32001/74087 [1:06:39<1:04:43, 10.84it/s]

Step 32000 loss: 4.222722053527832


 43%|████▎     | 32101/74087 [1:06:48<1:04:05, 10.92it/s]

Step 32100 loss: 4.207855701446533


 43%|████▎     | 32202/74087 [1:26:00<2:10:22,  5.35it/s]    

Step 32200 loss: 4.005204677581787


 44%|████▎     | 32300/74087 [1:26:09<1:05:30, 10.63it/s]

Step 32300 loss: 4.175022602081299


 44%|████▎     | 32400/74087 [1:26:19<1:05:23, 10.62it/s]

Step 32400 loss: 4.279916763305664


 44%|████▍     | 32502/74087 [1:26:29<1:04:58, 10.67it/s]

Step 32500 loss: 4.153117656707764


 44%|████▍     | 32600/74087 [1:26:38<1:05:08, 10.62it/s]

Step 32600 loss: 4.356125831604004


 44%|████▍     | 32700/74087 [1:26:47<1:05:01, 10.61it/s]

Step 32700 loss: 4.413214206695557


 44%|████▍     | 32800/74087 [1:26:57<1:04:59, 10.59it/s]

Step 32800 loss: 4.147377014160156


 44%|████▍     | 32901/74087 [1:27:06<1:06:02, 10.39it/s]

Step 32900 loss: 4.127335071563721


 45%|████▍     | 33001/74087 [1:27:16<1:04:42, 10.58it/s]

Step 33000 loss: 3.99833607673645


 45%|████▍     | 33101/74087 [1:27:25<1:04:25, 10.60it/s]

Step 33100 loss: 4.036007404327393


 45%|████▍     | 33201/74087 [1:27:35<1:05:46, 10.36it/s]

Step 33200 loss: 4.141491889953613


 45%|████▍     | 33301/74087 [1:27:44<1:04:58, 10.46it/s]

Step 33300 loss: 4.20940637588501


 45%|████▌     | 33401/74087 [1:27:54<1:05:06, 10.42it/s]

Step 33400 loss: 4.087664604187012


 45%|████▌     | 33501/74087 [1:28:03<59:57, 11.28it/s]  

Step 33500 loss: 4.032515525817871


 45%|████▌     | 33601/74087 [1:28:12<1:02:40, 10.77it/s]

Step 33600 loss: 4.075494289398193


 45%|████▌     | 33700/74087 [1:28:22<1:01:09, 11.01it/s]

Step 33700 loss: 3.935870409011841


 46%|████▌     | 33800/74087 [1:28:31<1:04:17, 10.44it/s]

Step 33800 loss: 4.091902732849121


 46%|████▌     | 33900/74087 [1:28:41<1:04:33, 10.37it/s]

Step 33900 loss: 4.144588470458984


 46%|████▌     | 34000/74087 [1:28:50<1:03:28, 10.53it/s]

Step 34000 loss: 4.027821063995361


 46%|████▌     | 34102/74087 [1:29:00<1:03:36, 10.48it/s]

Step 34100 loss: 3.936681032180786


 46%|████▌     | 34200/74087 [1:29:09<1:04:15, 10.35it/s]

Step 34200 loss: 4.1417927742004395


 46%|████▋     | 34301/74087 [1:29:18<1:01:47, 10.73it/s]

Step 34300 loss: 3.9902679920196533


 46%|████▋     | 34401/74087 [1:29:28<1:02:09, 10.64it/s]

Step 34400 loss: 4.096485137939453


 47%|████▋     | 34501/74087 [1:29:37<1:01:06, 10.80it/s]

Step 34500 loss: 3.9936420917510986


 47%|████▋     | 34601/74087 [1:29:46<1:02:04, 10.60it/s]

Step 34600 loss: 4.08844518661499


 47%|████▋     | 34701/74087 [1:29:56<1:02:21, 10.53it/s]

Step 34700 loss: 4.114170074462891


 47%|████▋     | 34801/74087 [1:30:05<1:00:03, 10.90it/s]

Step 34800 loss: 3.8962180614471436


 47%|████▋     | 34901/74087 [1:30:15<59:55, 10.90it/s]  

Step 34900 loss: 4.155146598815918


 47%|████▋     | 35001/74087 [1:30:24<1:02:20, 10.45it/s]

Step 35000 loss: 3.915264368057251


 47%|████▋     | 35101/74087 [1:30:34<1:01:56, 10.49it/s]

Step 35100 loss: 4.072635650634766


 48%|████▊     | 35201/74087 [1:30:43<1:02:20, 10.40it/s]

Step 35200 loss: 4.069887638092041


 48%|████▊     | 35301/74087 [1:30:53<1:02:41, 10.31it/s]

Step 35300 loss: 4.03363561630249


 48%|████▊     | 35401/74087 [1:31:02<1:03:11, 10.20it/s]

Step 35400 loss: 4.032771110534668


 48%|████▊     | 35501/74087 [1:31:12<1:03:08, 10.18it/s]

Step 35500 loss: 4.036449909210205


 48%|████▊     | 35601/74087 [1:31:22<59:23, 10.80it/s]  

Step 35600 loss: 4.009645938873291


 48%|████▊     | 35701/74087 [1:31:31<1:00:06, 10.64it/s]

Step 35700 loss: 3.913883686065674


 48%|████▊     | 35801/74087 [1:31:41<1:00:24, 10.56it/s]

Step 35800 loss: 3.9674153327941895


 48%|████▊     | 35901/74087 [1:31:50<1:01:30, 10.35it/s]

Step 35900 loss: 4.083946228027344


 49%|████▊     | 36001/74087 [1:32:00<57:29, 11.04it/s]  

Step 36000 loss: 4.156008243560791


 49%|████▊     | 36101/74087 [1:32:09<56:44, 11.16it/s]  

Step 36100 loss: 4.058530330657959


 49%|████▉     | 36201/74087 [1:32:18<57:43, 10.94it/s]  

Step 36200 loss: 4.0299224853515625


 49%|████▉     | 36301/74087 [1:32:28<57:29, 10.96it/s]  

Step 36300 loss: 3.8388185501098633


 49%|████▉     | 36401/74087 [1:32:37<1:01:29, 10.21it/s]

Step 36400 loss: 4.142387866973877


 49%|████▉     | 36501/74087 [1:32:47<1:02:04, 10.09it/s]

Step 36500 loss: 3.8961312770843506


 49%|████▉     | 36601/74087 [1:32:56<1:00:02, 10.41it/s]

Step 36600 loss: 4.008721828460693


 50%|████▉     | 36702/74087 [1:33:06<57:06, 10.91it/s]  

Step 36700 loss: 4.012272357940674


 50%|████▉     | 36801/74087 [1:33:15<59:00, 10.53it/s]  

Step 36800 loss: 4.027879238128662


 50%|████▉     | 36902/74087 [1:33:25<59:04, 10.49it/s]  

Step 36900 loss: 3.821023464202881


 50%|████▉     | 37000/74087 [1:33:34<59:41, 10.36it/s]  

Step 37000 loss: 3.972579002380371


 50%|█████     | 37100/74087 [1:33:44<59:24, 10.38it/s]  

Step 37100 loss: 4.039211750030518


 50%|█████     | 37200/74087 [1:33:54<58:49, 10.45it/s]

Step 37200 loss: 3.9294631481170654


 50%|█████     | 37302/74087 [1:34:03<58:16, 10.52it/s]  

Step 37300 loss: 4.04329252243042


 50%|█████     | 37402/74087 [1:34:13<58:03, 10.53it/s]

Step 37400 loss: 3.956606149673462


 51%|█████     | 37500/74087 [1:34:22<58:33, 10.41it/s]  

Step 37500 loss: 4.076470851898193


 51%|█████     | 37600/74087 [1:34:31<57:16, 10.62it/s]  

Step 37600 loss: 4.095648765563965


 51%|█████     | 37700/74087 [1:34:41<58:25, 10.38it/s]

Step 37700 loss: 4.014130592346191


 51%|█████     | 37800/74087 [1:34:51<56:48, 10.65it/s]

Step 37800 loss: 3.9212019443511963


 51%|█████     | 37900/74087 [1:35:00<58:05, 10.38it/s]

Step 37900 loss: 3.938476324081421


 51%|█████▏    | 38001/74087 [1:35:10<53:15, 11.29it/s]  

Step 38000 loss: 4.0820231437683105


 51%|█████▏    | 38101/74087 [1:35:19<57:21, 10.46it/s]

Step 38100 loss: 3.9725334644317627


 52%|█████▏    | 38201/74087 [1:35:28<55:44, 10.73it/s]

Step 38200 loss: 3.90889835357666


 52%|█████▏    | 38301/74087 [1:35:38<58:20, 10.22it/s]  

Step 38300 loss: 3.8483290672302246


 52%|█████▏    | 38401/74087 [1:35:47<58:10, 10.22it/s]

Step 38400 loss: 4.048056125640869


 52%|█████▏    | 38501/74087 [1:35:56<56:49, 10.44it/s]

Step 38500 loss: 3.9621779918670654


 52%|█████▏    | 38601/74087 [1:36:06<53:13, 11.11it/s]

Step 38600 loss: 4.051659107208252


 52%|█████▏    | 38701/74087 [1:36:15<55:25, 10.64it/s]

Step 38700 loss: 3.848507881164551


 52%|█████▏    | 38801/74087 [1:36:25<54:59, 10.70it/s]

Step 38800 loss: 3.8944432735443115


 53%|█████▎    | 38901/74087 [1:36:34<57:10, 10.26it/s]

Step 38900 loss: 3.9098174571990967


 53%|█████▎    | 39001/74087 [1:36:44<57:30, 10.17it/s]

Step 39000 loss: 4.023675441741943


 53%|█████▎    | 39101/74087 [1:36:53<51:09, 11.40it/s]

Step 39100 loss: 3.934791088104248


 53%|█████▎    | 39201/74087 [1:37:02<53:53, 10.79it/s]

Step 39200 loss: 3.938704490661621


 53%|█████▎    | 39302/74087 [1:37:12<54:26, 10.65it/s]

Step 39300 loss: 3.9846487045288086


 53%|█████▎    | 39401/74087 [1:37:21<55:40, 10.38it/s]

Step 39400 loss: 3.7963449954986572


 53%|█████▎    | 39501/74087 [1:37:31<55:02, 10.47it/s]

Step 39500 loss: 3.940701961517334


 53%|█████▎    | 39600/74087 [1:37:41<58:08,  9.89it/s]

Step 39600 loss: 3.7609260082244873


 54%|█████▎    | 39700/74087 [1:37:50<54:44, 10.47it/s]

Step 39700 loss: 3.93550705909729


 54%|█████▎    | 39802/74087 [1:38:00<54:08, 10.55it/s]

Step 39800 loss: 3.849287271499634


 54%|█████▍    | 39900/74087 [1:38:09<53:41, 10.61it/s]

Step 39900 loss: 3.933032512664795


 54%|█████▍    | 40002/74087 [1:38:19<53:39, 10.59it/s]

Step 40000 loss: 3.83655047416687


 54%|█████▍    | 40102/74087 [1:38:28<52:43, 10.74it/s]

Step 40100 loss: 3.822666645050049


 54%|█████▍    | 40200/74087 [1:38:38<53:20, 10.59it/s]

Step 40200 loss: 3.9986908435821533


 54%|█████▍    | 40300/74087 [1:38:47<54:06, 10.41it/s]

Step 40300 loss: 3.9078423976898193


 55%|█████▍    | 40400/74087 [1:38:57<53:28, 10.50it/s]

Step 40400 loss: 4.107179164886475


 55%|█████▍    | 40500/74087 [1:39:06<53:42, 10.42it/s]

Step 40500 loss: 3.8895750045776367


 55%|█████▍    | 40602/74087 [1:39:16<53:05, 10.51it/s]

Step 40600 loss: 3.73189640045166


 55%|█████▍    | 40702/74087 [1:39:25<51:23, 10.83it/s]

Step 40700 loss: 3.889373302459717


 55%|█████▌    | 40800/74087 [1:39:35<52:39, 10.54it/s]

Step 40800 loss: 3.956153631210327


 55%|█████▌    | 40900/74087 [1:39:44<53:25, 10.35it/s]

Step 40900 loss: 3.8590822219848633


 55%|█████▌    | 41000/74087 [1:39:54<52:40, 10.47it/s]

Step 41000 loss: 3.9280524253845215


 55%|█████▌    | 41100/74087 [1:40:03<52:00, 10.57it/s]

Step 41100 loss: 3.7817015647888184


 56%|█████▌    | 41200/74087 [1:40:13<51:56, 10.55it/s]

Step 41200 loss: 3.809239149093628


 56%|█████▌    | 41300/74087 [1:40:22<51:49, 10.55it/s]

Step 41300 loss: 3.917933464050293


 56%|█████▌    | 41400/74087 [1:40:32<52:07, 10.45it/s]

Step 41400 loss: 3.8803200721740723


 56%|█████▌    | 41500/74087 [1:40:41<50:39, 10.72it/s]

Step 41500 loss: 3.963303804397583


 56%|█████▌    | 41600/74087 [1:40:51<53:22, 10.15it/s]

Step 41600 loss: 3.8744819164276123


 56%|█████▋    | 41700/74087 [1:41:00<52:34, 10.27it/s]

Step 41700 loss: 3.907891035079956


 56%|█████▋    | 41802/74087 [1:41:10<48:12, 11.16it/s]

Step 41800 loss: 3.799546241760254


 57%|█████▋    | 41900/74087 [1:41:19<47:38, 11.26it/s]

Step 41900 loss: 3.8675618171691895


 57%|█████▋    | 42002/74087 [1:41:28<48:21, 11.06it/s]

Step 42000 loss: 3.765312910079956


 57%|█████▋    | 42101/74087 [1:41:38<51:23, 10.37it/s]

Step 42100 loss: 3.8186161518096924


 57%|█████▋    | 42201/74087 [1:41:47<49:12, 10.80it/s]

Step 42200 loss: 3.848062038421631


 57%|█████▋    | 42301/74087 [1:41:57<51:27, 10.29it/s]

Step 42300 loss: 3.8431103229522705


 57%|█████▋    | 42401/74087 [1:42:06<50:13, 10.51it/s]

Step 42400 loss: 3.8447670936584473


 57%|█████▋    | 42501/74087 [1:42:16<49:44, 10.58it/s]

Step 42500 loss: 4.001651287078857


 58%|█████▊    | 42601/74087 [1:42:26<50:29, 10.39it/s]

Step 42600 loss: 3.825086832046509


 58%|█████▊    | 42701/74087 [1:42:35<50:12, 10.42it/s]

Step 42700 loss: 3.9813623428344727


 58%|█████▊    | 42801/74087 [1:42:45<49:59, 10.43it/s]

Step 42800 loss: 3.875364303588867


 58%|█████▊    | 42901/74087 [1:42:54<49:11, 10.57it/s]

Step 42900 loss: 3.790861129760742


 58%|█████▊    | 43001/74087 [1:43:04<48:59, 10.58it/s]

Step 43000 loss: 3.88637375831604


 58%|█████▊    | 43101/74087 [1:43:13<50:07, 10.30it/s]

Step 43100 loss: 3.8747105598449707


 58%|█████▊    | 43200/74087 [1:43:23<49:14, 10.46it/s]

Step 43200 loss: 3.849093437194824


 58%|█████▊    | 43302/74087 [1:43:32<48:27, 10.59it/s]

Step 43300 loss: 3.85699200630188


 59%|█████▊    | 43400/74087 [1:43:42<50:10, 10.19it/s]

Step 43400 loss: 3.8173961639404297


 59%|█████▊    | 43500/74087 [1:43:51<48:34, 10.50it/s]

Step 43500 loss: 3.693016529083252


 59%|█████▉    | 43602/74087 [1:44:01<48:32, 10.47it/s]

Step 43600 loss: 3.737060308456421


 59%|█████▉    | 43700/74087 [1:44:10<48:15, 10.50it/s]

Step 43700 loss: 3.6908304691314697


 59%|█████▉    | 43800/74087 [1:44:20<46:41, 10.81it/s]

Step 43800 loss: 3.8838155269622803


 59%|█████▉    | 43900/74087 [1:44:29<48:53, 10.29it/s]

Step 43900 loss: 3.623533010482788


 59%|█████▉    | 44001/74087 [1:44:39<50:46,  9.87it/s]

Step 44000 loss: 3.89189076423645


 60%|█████▉    | 44100/74087 [1:44:48<45:06, 11.08it/s]

Step 44100 loss: 3.9132659435272217


 60%|█████▉    | 44202/74087 [1:44:58<49:12, 10.12it/s]

Step 44200 loss: 3.752045154571533


 60%|█████▉    | 44300/74087 [1:45:07<47:52, 10.37it/s]

Step 44300 loss: 3.8119730949401855


 60%|█████▉    | 44400/74087 [1:45:17<48:23, 10.23it/s]

Step 44400 loss: 3.694350481033325


 60%|██████    | 44502/74087 [1:45:27<47:04, 10.48it/s]

Step 44500 loss: 3.8103678226470947


 60%|██████    | 44600/74087 [1:45:36<47:33, 10.33it/s]

Step 44600 loss: 3.780888795852661


 60%|██████    | 44700/74087 [1:45:45<46:51, 10.45it/s]

Step 44700 loss: 3.7950994968414307


 60%|██████    | 44802/74087 [1:45:55<46:43, 10.45it/s]

Step 44800 loss: 3.723116636276245


 61%|██████    | 44902/74087 [1:46:05<44:45, 10.87it/s]

Step 44900 loss: 3.799100875854492


 61%|██████    | 45000/74087 [1:46:14<47:16, 10.26it/s]

Step 45000 loss: 3.7582335472106934


 61%|██████    | 45102/74087 [1:46:24<44:05, 10.96it/s]

Step 45100 loss: 3.670802354812622


 61%|██████    | 45200/74087 [1:46:33<46:05, 10.44it/s]

Step 45200 loss: 3.788102865219116


 61%|██████    | 45300/74087 [1:46:42<46:15, 10.37it/s]

Step 45300 loss: 3.806968927383423


 61%|██████▏   | 45400/74087 [1:46:52<45:27, 10.52it/s]

Step 45400 loss: 3.8610827922821045


 61%|██████▏   | 45502/74087 [1:47:02<44:45, 10.64it/s]

Step 45500 loss: 3.9689009189605713


 62%|██████▏   | 45600/74087 [1:47:11<42:51, 11.08it/s]

Step 45600 loss: 3.6230874061584473


 62%|██████▏   | 45700/74087 [1:47:20<44:39, 10.60it/s]

Step 45700 loss: 3.7876365184783936


 62%|██████▏   | 45802/74087 [1:47:29<43:04, 10.95it/s]

Step 45800 loss: 3.675382137298584


 62%|██████▏   | 45900/74087 [1:47:39<44:58, 10.44it/s]

Step 45900 loss: 3.700397491455078


 62%|██████▏   | 46000/74087 [1:47:48<45:40, 10.25it/s]

Step 46000 loss: 3.806535482406616


 62%|██████▏   | 46100/74087 [1:47:58<43:42, 10.67it/s]

Step 46100 loss: 3.766589879989624


 62%|██████▏   | 46200/74087 [1:48:07<44:10, 10.52it/s]

Step 46200 loss: 3.923879623413086


 62%|██████▏   | 46302/74087 [1:48:17<42:39, 10.86it/s]

Step 46300 loss: 3.844607353210449


 63%|██████▎   | 46400/74087 [1:48:26<43:57, 10.50it/s]

Step 46400 loss: 3.696566343307495


 63%|██████▎   | 46500/74087 [1:48:36<45:01, 10.21it/s]

Step 46500 loss: 3.8067116737365723


 63%|██████▎   | 46601/74087 [1:48:45<42:44, 10.72it/s]

Step 46600 loss: 3.9297072887420654


 63%|██████▎   | 46701/74087 [1:48:55<43:44, 10.43it/s]

Step 46700 loss: 3.8033857345581055


 63%|██████▎   | 46801/74087 [1:49:04<42:54, 10.60it/s]

Step 46800 loss: 3.75941801071167


 63%|██████▎   | 46901/74087 [1:49:14<43:09, 10.50it/s]

Step 46900 loss: 3.865499258041382


 63%|██████▎   | 47001/74087 [1:49:24<42:41, 10.57it/s]

Step 47000 loss: 3.878882646560669


 64%|██████▎   | 47101/74087 [1:49:33<42:55, 10.48it/s]

Step 47100 loss: 3.735297441482544


 64%|██████▎   | 47200/74087 [1:49:43<42:35, 10.52it/s]

Step 47200 loss: 3.7056102752685547


 64%|██████▍   | 47302/74087 [1:49:52<42:27, 10.51it/s]

Step 47300 loss: 3.8511977195739746


 64%|██████▍   | 47400/74087 [1:50:02<41:39, 10.68it/s]

Step 47400 loss: 3.6816797256469727


 64%|██████▍   | 47500/74087 [1:50:11<42:13, 10.49it/s]

Step 47500 loss: 3.8303685188293457


 64%|██████▍   | 47600/74087 [1:50:21<42:47, 10.32it/s]

Step 47600 loss: 3.815417528152466


 64%|██████▍   | 47702/74087 [1:50:31<41:48, 10.52it/s]

Step 47700 loss: 3.826727867126465


 65%|██████▍   | 47800/74087 [1:50:40<40:56, 10.70it/s]

Step 47800 loss: 3.6799709796905518


 65%|██████▍   | 47900/74087 [1:50:49<42:47, 10.20it/s]

Step 47900 loss: 3.6667773723602295


 65%|██████▍   | 48000/74087 [1:50:59<41:11, 10.56it/s]

Step 48000 loss: 3.7701473236083984


 65%|██████▍   | 48100/74087 [1:51:08<41:26, 10.45it/s]

Step 48100 loss: 3.8643546104431152


 65%|██████▌   | 48200/74087 [1:51:18<41:54, 10.29it/s]

Step 48200 loss: 3.857640504837036


 65%|██████▌   | 48302/74087 [1:51:28<40:39, 10.57it/s]

Step 48300 loss: 3.817530632019043


 65%|██████▌   | 48400/74087 [1:51:37<40:55, 10.46it/s]

Step 48400 loss: 3.606968402862549


 65%|██████▌   | 48500/74087 [1:51:47<40:34, 10.51it/s]

Step 48500 loss: 3.7698984146118164


 66%|██████▌   | 48600/74087 [1:51:56<40:27, 10.50it/s]

Step 48600 loss: 3.606755495071411


 66%|██████▌   | 48700/74087 [1:52:06<39:34, 10.69it/s]

Step 48700 loss: 3.7638065814971924


 66%|██████▌   | 48800/74087 [1:52:15<39:51, 10.57it/s]

Step 48800 loss: 3.6940808296203613


 66%|██████▌   | 48902/74087 [1:52:25<39:47, 10.55it/s]

Step 48900 loss: 3.7517852783203125


 66%|██████▌   | 49000/74087 [1:52:34<39:23, 10.61it/s]

Step 49000 loss: 3.742900848388672


 66%|██████▋   | 49100/74087 [1:52:44<39:13, 10.62it/s]

Step 49100 loss: 3.6044418811798096


 66%|██████▋   | 49200/74087 [1:52:53<39:47, 10.42it/s]

Step 49200 loss: 3.744961738586426


 67%|██████▋   | 49302/74087 [1:53:03<39:46, 10.39it/s]

Step 49300 loss: 3.8133461475372314


 67%|██████▋   | 49400/74087 [1:53:12<39:17, 10.47it/s]

Step 49400 loss: 3.7398624420166016


 67%|██████▋   | 49500/74087 [1:53:22<39:35, 10.35it/s]

Step 49500 loss: 3.7340779304504395


 67%|██████▋   | 49600/74087 [1:53:31<38:08, 10.70it/s]

Step 49600 loss: 3.8079123497009277


 67%|██████▋   | 49700/74087 [1:53:41<39:17, 10.34it/s]

Step 49700 loss: 3.753028631210327


 67%|██████▋   | 49800/74087 [1:53:50<38:09, 10.61it/s]

Step 49800 loss: 3.8308804035186768


 67%|██████▋   | 49902/74087 [1:54:00<38:29, 10.47it/s]

Step 49900 loss: 3.7428040504455566


 67%|██████▋   | 50000/74087 [1:54:09<37:24, 10.73it/s]

Step 50000 loss: 3.727912664413452


 68%|██████▊   | 50102/74087 [1:54:19<38:39, 10.34it/s]

Step 50100 loss: 3.730839490890503


 68%|██████▊   | 50200/74087 [1:54:29<38:18, 10.39it/s]

Step 50200 loss: 3.6186559200286865


 68%|██████▊   | 50300/74087 [1:54:38<38:22, 10.33it/s]

Step 50300 loss: 3.612064838409424


 68%|██████▊   | 50400/74087 [1:54:48<37:00, 10.67it/s]

Step 50400 loss: 3.59033203125


 68%|██████▊   | 50500/74087 [1:54:57<36:44, 10.70it/s]

Step 50500 loss: 3.7060678005218506


 68%|██████▊   | 50602/74087 [1:55:07<36:03, 10.86it/s]

Step 50600 loss: 3.564908266067505


 68%|██████▊   | 50700/74087 [1:55:16<36:45, 10.61it/s]

Step 50700 loss: 3.6641414165496826


 69%|██████▊   | 50800/74087 [1:55:25<36:45, 10.56it/s]

Step 50800 loss: 3.599735736846924


 69%|██████▊   | 50900/74087 [1:55:35<36:32, 10.58it/s]

Step 50900 loss: 3.7046215534210205


 69%|██████▉   | 51000/74087 [1:55:44<36:35, 10.51it/s]

Step 51000 loss: 3.706620216369629


 69%|██████▉   | 51100/74087 [1:55:54<36:31, 10.49it/s]

Step 51100 loss: 3.6748099327087402


 69%|██████▉   | 51201/74087 [1:56:04<39:00,  9.78it/s]

Step 51200 loss: 3.6082539558410645


 69%|██████▉   | 51301/74087 [1:56:13<36:16, 10.47it/s]

Step 51300 loss: 3.7922203540802


 69%|██████▉   | 51401/74087 [1:56:23<35:46, 10.57it/s]

Step 51400 loss: 3.7655515670776367


 70%|██████▉   | 51501/74087 [1:56:32<35:43, 10.54it/s]

Step 51500 loss: 3.8193509578704834


 70%|██████▉   | 51601/74087 [1:56:42<35:08, 10.66it/s]

Step 51600 loss: 3.674217939376831


 70%|██████▉   | 51701/74087 [1:56:51<35:07, 10.62it/s]

Step 51700 loss: 3.7393269538879395


 70%|██████▉   | 51801/74087 [1:57:01<35:19, 10.52it/s]

Step 51800 loss: 3.630527973175049


 70%|███████   | 51901/74087 [1:57:10<34:43, 10.65it/s]

Step 51900 loss: 3.752305746078491


 70%|███████   | 52001/74087 [1:57:20<35:32, 10.36it/s]

Step 52000 loss: 3.586001396179199


 70%|███████   | 52101/74087 [1:57:29<35:45, 10.25it/s]

Step 52100 loss: 3.653502941131592


 70%|███████   | 52201/74087 [1:57:39<34:33, 10.56it/s]

Step 52200 loss: 3.734132766723633


 71%|███████   | 52301/74087 [1:57:48<34:17, 10.59it/s]

Step 52300 loss: 3.651723861694336


 71%|███████   | 52401/74087 [1:57:58<34:50, 10.37it/s]

Step 52400 loss: 3.614490509033203


 71%|███████   | 52501/74087 [1:58:07<34:14, 10.51it/s]

Step 52500 loss: 3.6418371200561523


 71%|███████   | 52601/74087 [1:58:17<32:04, 11.16it/s]

Step 52600 loss: 3.5582377910614014


 71%|███████   | 52701/74087 [1:58:26<33:03, 10.78it/s]

Step 52700 loss: 3.5800533294677734


 71%|███████▏  | 52801/74087 [1:58:35<33:50, 10.48it/s]

Step 52800 loss: 3.654362201690674


 71%|███████▏  | 52901/74087 [1:58:45<33:44, 10.47it/s]

Step 52900 loss: 3.7143988609313965


 72%|███████▏  | 53001/74087 [1:58:54<33:26, 10.51it/s]

Step 53000 loss: 3.694626808166504


 72%|███████▏  | 53101/74087 [1:59:04<33:20, 10.49it/s]

Step 53100 loss: 3.687075614929199


 72%|███████▏  | 53201/74087 [1:59:13<31:16, 11.13it/s]

Step 53200 loss: 3.671968936920166


 72%|███████▏  | 53301/74087 [1:59:23<32:17, 10.73it/s]

Step 53300 loss: 3.7394540309906006


 72%|███████▏  | 53401/74087 [1:59:32<32:49, 10.50it/s]

Step 53400 loss: 3.6467018127441406


 72%|███████▏  | 53501/74087 [1:59:42<32:51, 10.44it/s]

Step 53500 loss: 3.6383543014526367


 72%|███████▏  | 53601/74087 [1:59:51<33:08, 10.30it/s]

Step 53600 loss: 3.567453622817993


 72%|███████▏  | 53701/74087 [2:00:00<32:39, 10.41it/s]

Step 53700 loss: 3.537276268005371


 73%|███████▎  | 53801/74087 [2:00:10<32:42, 10.34it/s]

Step 53800 loss: 3.521505117416382


 73%|███████▎  | 53902/74087 [2:00:19<30:21, 11.08it/s]

Step 53900 loss: 3.5315611362457275


 73%|███████▎  | 54002/74087 [2:00:28<29:46, 11.24it/s]

Step 54000 loss: 3.6898274421691895


 73%|███████▎  | 54102/74087 [2:00:37<28:59, 11.49it/s]

Step 54100 loss: 3.589843511581421


 73%|███████▎  | 54200/74087 [2:00:46<30:07, 11.00it/s]

Step 54200 loss: 3.680605888366699


 73%|███████▎  | 54302/74087 [2:00:55<28:44, 11.47it/s]

Step 54300 loss: 3.644763946533203


 73%|███████▎  | 54402/74087 [2:01:04<28:31, 11.50it/s]

Step 54400 loss: 3.6936757564544678


 74%|███████▎  | 54500/74087 [2:01:13<29:28, 11.08it/s]

Step 54500 loss: 3.5674993991851807


 74%|███████▎  | 54602/74087 [2:01:23<29:50, 10.88it/s]

Step 54600 loss: 3.865288019180298


 74%|███████▍  | 54701/74087 [2:01:32<30:35, 10.56it/s]

Step 54700 loss: 3.539130210876465


 74%|███████▍  | 54801/74087 [2:01:41<30:25, 10.57it/s]

Step 54800 loss: 3.597153425216675


 74%|███████▍  | 54901/74087 [2:01:51<30:34, 10.46it/s]

Step 54900 loss: 3.5968616008758545


 74%|███████▍  | 55001/74087 [2:02:00<29:24, 10.82it/s]

Step 55000 loss: 3.6365628242492676


 74%|███████▍  | 55101/74087 [2:02:10<30:23, 10.41it/s]

Step 55100 loss: 3.5880327224731445


 75%|███████▍  | 55201/74087 [2:02:19<29:54, 10.53it/s]

Step 55200 loss: 3.5831456184387207


 75%|███████▍  | 55301/74087 [2:02:29<29:39, 10.55it/s]

Step 55300 loss: 3.6951186656951904


 75%|███████▍  | 55401/74087 [2:02:38<29:07, 10.70it/s]

Step 55400 loss: 3.5425009727478027


 75%|███████▍  | 55501/74087 [2:02:48<28:41, 10.80it/s]

Step 55500 loss: 3.6775224208831787


 75%|███████▌  | 55601/74087 [2:02:57<28:45, 10.71it/s]

Step 55600 loss: 3.6607837677001953


 75%|███████▌  | 55701/74087 [2:03:06<29:58, 10.22it/s]

Step 55700 loss: 3.458958864212036


 75%|███████▌  | 55801/74087 [2:03:16<29:00, 10.50it/s]

Step 55800 loss: 3.685739278793335


 75%|███████▌  | 55901/74087 [2:03:25<28:56, 10.47it/s]

Step 55900 loss: 3.549557685852051


 76%|███████▌  | 56001/74087 [2:03:35<28:21, 10.63it/s]

Step 56000 loss: 3.818247079849243


 76%|███████▌  | 56101/74087 [2:03:44<28:32, 10.50it/s]

Step 56100 loss: 3.7678236961364746


 76%|███████▌  | 56201/74087 [2:03:53<28:25, 10.49it/s]

Step 56200 loss: 3.6773250102996826


 76%|███████▌  | 56301/74087 [2:04:03<26:23, 11.23it/s]

Step 56300 loss: 3.605679750442505


 76%|███████▌  | 56401/74087 [2:04:12<27:36, 10.68it/s]

Step 56400 loss: 3.588268756866455


 76%|███████▋  | 56501/74087 [2:04:21<25:29, 11.50it/s]

Step 56500 loss: 3.7765939235687256


 76%|███████▋  | 56601/74087 [2:04:31<27:57, 10.43it/s]

Step 56600 loss: 3.614135265350342


 77%|███████▋  | 56701/74087 [2:04:40<27:35, 10.50it/s]

Step 56700 loss: 3.4780991077423096


 77%|███████▋  | 56801/74087 [2:04:50<27:24, 10.51it/s]

Step 56800 loss: 3.5289382934570312


 77%|███████▋  | 56901/74087 [2:04:59<27:18, 10.49it/s]

Step 56900 loss: 3.4428882598876953


 77%|███████▋  | 57001/74087 [2:05:08<24:03, 11.83it/s]

Step 57000 loss: 3.5893330574035645


 77%|███████▋  | 57101/74087 [2:05:18<25:35, 11.07it/s]

Step 57100 loss: 3.5152220726013184


 77%|███████▋  | 57201/74087 [2:05:27<25:42, 10.95it/s]

Step 57200 loss: 3.8578083515167236


 77%|███████▋  | 57301/74087 [2:05:36<26:40, 10.48it/s]

Step 57300 loss: 3.591926097869873


 77%|███████▋  | 57401/74087 [2:05:45<26:35, 10.46it/s]

Step 57400 loss: 3.6447525024414062


 78%|███████▊  | 57501/74087 [2:05:55<26:52, 10.28it/s]

Step 57500 loss: 3.483017683029175


 78%|███████▊  | 57601/74087 [2:06:04<26:18, 10.44it/s]

Step 57600 loss: 3.426067590713501


 78%|███████▊  | 57701/74087 [2:06:14<24:51, 10.99it/s]

Step 57700 loss: 3.5066475868225098


 78%|███████▊  | 57801/74087 [2:06:23<24:31, 11.07it/s]

Step 57800 loss: 3.6658759117126465


 78%|███████▊  | 57901/74087 [2:06:33<25:03, 10.77it/s]

Step 57900 loss: 3.468001127243042


 78%|███████▊  | 58001/74087 [2:06:42<25:19, 10.58it/s]

Step 58000 loss: 3.5816843509674072


 78%|███████▊  | 58101/74087 [2:06:52<24:45, 10.76it/s]

Step 58100 loss: 3.6328485012054443


 79%|███████▊  | 58201/74087 [2:07:01<24:56, 10.62it/s]

Step 58200 loss: 3.468244791030884


 79%|███████▊  | 58301/74087 [2:07:10<24:57, 10.54it/s]

Step 58300 loss: 3.6064367294311523


 79%|███████▉  | 58401/74087 [2:07:20<24:46, 10.55it/s]

Step 58400 loss: 3.4638490676879883


 79%|███████▉  | 58501/74087 [2:07:29<24:18, 10.69it/s]

Step 58500 loss: 3.646228075027466


 79%|███████▉  | 58601/74087 [2:07:38<24:03, 10.73it/s]

Step 58600 loss: 3.5650627613067627


 79%|███████▉  | 58701/74087 [2:07:48<23:32, 10.90it/s]

Step 58700 loss: 3.546882390975952


 79%|███████▉  | 58801/74087 [2:07:57<24:31, 10.39it/s]

Step 58800 loss: 3.524005174636841


 80%|███████▉  | 58900/74087 [2:08:06<24:56, 10.15it/s]

Step 58900 loss: 3.6860101222991943


 80%|███████▉  | 59002/74087 [2:08:16<22:59, 10.94it/s]

Step 59000 loss: 3.6004528999328613


 80%|███████▉  | 59100/74087 [2:08:25<23:56, 10.43it/s]

Step 59100 loss: 3.5719032287597656


 80%|███████▉  | 59200/74087 [2:08:35<23:05, 10.75it/s]

Step 59200 loss: 3.5603270530700684


 80%|████████  | 59300/74087 [2:08:44<23:08, 10.65it/s]

Step 59300 loss: 3.7139248847961426


 80%|████████  | 59402/74087 [2:08:54<22:52, 10.70it/s]

Step 59400 loss: 3.653402805328369


 80%|████████  | 59502/74087 [2:09:03<23:01, 10.56it/s]

Step 59500 loss: 3.656621217727661


 80%|████████  | 59600/74087 [2:09:12<22:59, 10.50it/s]

Step 59600 loss: 3.5931878089904785


 81%|████████  | 59702/74087 [2:09:22<22:56, 10.45it/s]

Step 59700 loss: 3.612333059310913


 81%|████████  | 59800/74087 [2:09:31<22:41, 10.50it/s]

Step 59800 loss: 3.4639177322387695


 81%|████████  | 59900/74087 [2:09:41<23:07, 10.23it/s]

Step 59900 loss: 3.7473134994506836


 81%|████████  | 60000/74087 [2:09:50<22:34, 10.40it/s]

Step 60000 loss: 3.5810282230377197


 81%|████████  | 60100/74087 [2:10:00<22:03, 10.56it/s]

Step 60100 loss: 3.5896127223968506


 81%|████████▏ | 60200/74087 [2:10:09<22:08, 10.45it/s]

Step 60200 loss: 3.519829273223877


 81%|████████▏ | 60300/74087 [2:10:18<22:10, 10.36it/s]

Step 60300 loss: 3.693324565887451


 82%|████████▏ | 60400/74087 [2:10:28<21:34, 10.57it/s]

Step 60400 loss: 3.5403003692626953


 82%|████████▏ | 60502/74087 [2:10:38<21:02, 10.76it/s]

Step 60500 loss: 3.482905149459839


 82%|████████▏ | 60602/74087 [2:10:47<21:38, 10.38it/s]

Step 60600 loss: 3.558027744293213


 82%|████████▏ | 60700/74087 [2:10:56<21:28, 10.39it/s]

Step 60700 loss: 3.59062123298645


 82%|████████▏ | 60800/74087 [2:11:06<19:32, 11.33it/s]

Step 60800 loss: 3.6853907108306885


 82%|████████▏ | 60900/74087 [2:11:15<20:35, 10.67it/s]

Step 60900 loss: 3.6284608840942383


 82%|████████▏ | 61002/74087 [2:11:24<19:54, 10.95it/s]

Step 61000 loss: 3.619856834411621


 82%|████████▏ | 61102/74087 [2:11:34<20:46, 10.42it/s]

Step 61100 loss: 3.582594871520996


 83%|████████▎ | 61202/74087 [2:11:43<20:14, 10.61it/s]

Step 61200 loss: 3.623455286026001


 83%|████████▎ | 61300/74087 [2:11:52<20:20, 10.48it/s]

Step 61300 loss: 3.5864217281341553


 83%|████████▎ | 61402/74087 [2:12:02<19:04, 11.08it/s]

Step 61400 loss: 3.5332858562469482


 83%|████████▎ | 61502/74087 [2:12:11<19:08, 10.96it/s]

Step 61500 loss: 3.400904893875122


 83%|████████▎ | 61600/74087 [2:12:20<18:47, 11.08it/s]

Step 61600 loss: 3.6558048725128174


 83%|████████▎ | 61700/74087 [2:12:30<19:56, 10.35it/s]

Step 61700 loss: 3.516977071762085


 83%|████████▎ | 61802/74087 [2:12:39<18:05, 11.32it/s]

Step 61800 loss: 3.464778184890747


 84%|████████▎ | 61902/74087 [2:12:48<19:22, 10.48it/s]

Step 61900 loss: 3.415260076522827


 84%|████████▎ | 62002/74087 [2:12:57<18:36, 10.82it/s]

Step 62000 loss: 3.6298868656158447


 84%|████████▍ | 62100/74087 [2:13:07<18:54, 10.56it/s]

Step 62100 loss: 3.5042452812194824


 84%|████████▍ | 62202/74087 [2:13:16<18:29, 10.71it/s]

Step 62200 loss: 3.694429397583008


 84%|████████▍ | 62300/74087 [2:13:26<18:42, 10.50it/s]

Step 62300 loss: 3.541048049926758


 84%|████████▍ | 62400/74087 [2:13:35<18:24, 10.58it/s]

Step 62400 loss: 3.4058830738067627


 84%|████████▍ | 62500/74087 [2:13:44<18:09, 10.64it/s]

Step 62500 loss: 3.5087075233459473


 84%|████████▍ | 62600/74087 [2:13:54<18:38, 10.27it/s]

Step 62600 loss: 3.639296531677246


 85%|████████▍ | 62702/74087 [2:14:03<17:42, 10.72it/s]

Step 62700 loss: 3.488816976547241


 85%|████████▍ | 62800/74087 [2:14:13<18:04, 10.41it/s]

Step 62800 loss: 3.5354983806610107


 85%|████████▍ | 62900/74087 [2:14:22<17:27, 10.68it/s]

Step 62900 loss: 3.564363956451416


 85%|████████▌ | 63002/74087 [2:14:32<16:37, 11.11it/s]

Step 63000 loss: 3.448056936264038


 85%|████████▌ | 63102/74087 [2:14:41<15:53, 11.52it/s]

Step 63100 loss: 3.5664896965026855


 85%|████████▌ | 63200/74087 [2:14:50<16:56, 10.71it/s]

Step 63200 loss: 3.633657932281494


 85%|████████▌ | 63301/74087 [2:15:00<17:22, 10.35it/s]

Step 63300 loss: 3.4855501651763916


 86%|████████▌ | 63401/74087 [2:15:09<16:41, 10.67it/s]

Step 63400 loss: 3.646589517593384


 86%|████████▌ | 63501/74087 [2:15:19<16:48, 10.50it/s]

Step 63500 loss: 3.6011548042297363


 86%|████████▌ | 63601/74087 [2:15:28<16:03, 10.88it/s]

Step 63600 loss: 3.5317184925079346


 86%|████████▌ | 63701/74087 [2:15:38<16:23, 10.56it/s]

Step 63700 loss: 3.4520695209503174


 86%|████████▌ | 63801/74087 [2:15:47<16:27, 10.41it/s]

Step 63800 loss: 3.6205344200134277


 86%|████████▋ | 63901/74087 [2:15:57<16:11, 10.48it/s]

Step 63900 loss: 3.2957239151000977


 86%|████████▋ | 64001/74087 [2:16:06<16:03, 10.47it/s]

Step 64000 loss: 3.4483158588409424


 87%|████████▋ | 64101/74087 [2:16:15<15:21, 10.84it/s]

Step 64100 loss: 3.4913461208343506


 87%|████████▋ | 64201/74087 [2:16:24<14:52, 11.08it/s]

Step 64200 loss: 3.4512317180633545


 87%|████████▋ | 64301/74087 [2:16:34<15:29, 10.53it/s]

Step 64300 loss: 3.541996479034424


 87%|████████▋ | 64401/74087 [2:16:43<15:19, 10.54it/s]

Step 64400 loss: 3.501530647277832


 87%|████████▋ | 64501/74087 [2:16:53<15:19, 10.42it/s]

Step 64500 loss: 3.466975450515747


 87%|████████▋ | 64601/74087 [2:17:02<14:28, 10.92it/s]

Step 64600 loss: 3.5552685260772705


 87%|████████▋ | 64701/74087 [2:17:11<14:18, 10.94it/s]

Step 64700 loss: 3.4367220401763916


 87%|████████▋ | 64801/74087 [2:17:21<13:46, 11.24it/s]

Step 64800 loss: 3.4286317825317383


 88%|████████▊ | 64901/74087 [2:17:30<14:17, 10.71it/s]

Step 64900 loss: 3.487460136413574


 88%|████████▊ | 65001/74087 [2:17:39<14:31, 10.42it/s]

Step 65000 loss: 3.4085612297058105


 88%|████████▊ | 65101/74087 [2:17:49<14:19, 10.46it/s]

Step 65100 loss: 3.5310633182525635


 88%|████████▊ | 65201/74087 [2:17:58<14:05, 10.51it/s]

Step 65200 loss: 3.3588826656341553


 88%|████████▊ | 65301/74087 [2:18:08<14:22, 10.19it/s]

Step 65300 loss: 3.4976789951324463


 88%|████████▊ | 65401/74087 [2:18:17<12:58, 11.15it/s]

Step 65400 loss: 3.4365878105163574


 88%|████████▊ | 65501/74087 [2:18:27<13:32, 10.56it/s]

Step 65500 loss: 3.4496052265167236


 89%|████████▊ | 65601/74087 [2:18:36<12:37, 11.21it/s]

Step 65600 loss: 3.5936343669891357


 89%|████████▊ | 65701/74087 [2:18:45<13:15, 10.54it/s]

Step 65700 loss: 3.363569498062134


 89%|████████▉ | 65801/74087 [2:18:55<12:05, 11.42it/s]

Step 65800 loss: 3.496175527572632


 89%|████████▉ | 65901/74087 [2:19:04<12:31, 10.89it/s]

Step 65900 loss: 3.523965835571289


 89%|████████▉ | 66001/74087 [2:19:13<12:43, 10.59it/s]

Step 66000 loss: 3.523442029953003


 89%|████████▉ | 66101/74087 [2:19:22<12:49, 10.38it/s]

Step 66100 loss: 3.4226503372192383


 89%|████████▉ | 66201/74087 [2:19:32<12:22, 10.62it/s]

Step 66200 loss: 3.4955036640167236


 89%|████████▉ | 66301/74087 [2:19:41<12:21, 10.50it/s]

Step 66300 loss: 3.519151449203491


 90%|████████▉ | 66401/74087 [2:19:51<11:57, 10.71it/s]

Step 66400 loss: 3.7091009616851807


 90%|████████▉ | 66501/74087 [2:20:00<11:42, 10.79it/s]

Step 66500 loss: 3.5111196041107178


 90%|████████▉ | 66601/74087 [2:20:09<11:59, 10.41it/s]

Step 66600 loss: 3.6158320903778076


 90%|█████████ | 66701/74087 [2:20:19<11:31, 10.68it/s]

Step 66700 loss: 3.5373423099517822


 90%|█████████ | 66801/74087 [2:20:28<11:24, 10.64it/s]

Step 66800 loss: 3.5125420093536377


 90%|█████████ | 66901/74087 [2:20:37<11:15, 10.64it/s]

Step 66900 loss: 3.4019668102264404


 90%|█████████ | 67001/74087 [2:20:47<11:12, 10.53it/s]

Step 67000 loss: 3.5842995643615723


 91%|█████████ | 67101/74087 [2:20:56<11:31, 10.10it/s]

Step 67100 loss: 3.5133209228515625


 91%|█████████ | 67201/74087 [2:21:06<10:42, 10.72it/s]

Step 67200 loss: 3.5324628353118896


 91%|█████████ | 67301/74087 [2:21:15<10:14, 11.05it/s]

Step 67300 loss: 3.6409194469451904


 91%|█████████ | 67401/74087 [2:21:25<10:05, 11.04it/s]

Step 67400 loss: 3.5878992080688477


 91%|█████████ | 67501/74087 [2:21:34<10:36, 10.35it/s]

Step 67500 loss: 3.361178159713745


 91%|█████████ | 67601/74087 [2:21:44<10:15, 10.54it/s]

Step 67600 loss: 3.370558023452759


 91%|█████████▏| 67701/74087 [2:21:53<10:11, 10.44it/s]

Step 67700 loss: 3.4235455989837646


 92%|█████████▏| 67801/74087 [2:22:03<09:53, 10.60it/s]

Step 67800 loss: 3.6554477214813232


 92%|█████████▏| 67901/74087 [2:22:12<09:46, 10.55it/s]

Step 67900 loss: 3.59272837638855


 92%|█████████▏| 68001/74087 [2:22:21<09:35, 10.57it/s]

Step 68000 loss: 3.23997163772583


 92%|█████████▏| 68101/74087 [2:22:31<09:27, 10.55it/s]

Step 68100 loss: 3.440570831298828


 92%|█████████▏| 68201/74087 [2:22:40<09:22, 10.45it/s]

Step 68200 loss: 3.492008686065674


 92%|█████████▏| 68301/74087 [2:22:50<09:15, 10.42it/s]

Step 68300 loss: 3.340742826461792


 92%|█████████▏| 68401/74087 [2:22:59<09:03, 10.46it/s]

Step 68400 loss: 3.422764778137207


 92%|█████████▏| 68501/74087 [2:23:08<08:48, 10.58it/s]

Step 68500 loss: 3.4645376205444336


 93%|█████████▎| 68601/74087 [2:23:18<08:21, 10.93it/s]

Step 68600 loss: 3.3915226459503174


 93%|█████████▎| 68701/74087 [2:23:27<07:52, 11.39it/s]

Step 68700 loss: 3.3613789081573486


 93%|█████████▎| 68801/74087 [2:23:36<08:12, 10.74it/s]

Step 68800 loss: 3.5573437213897705


 93%|█████████▎| 68901/74087 [2:23:46<08:14, 10.48it/s]

Step 68900 loss: 3.3985939025878906


 93%|█████████▎| 69001/74087 [2:23:55<08:04, 10.50it/s]

Step 69000 loss: 3.264857769012451


 93%|█████████▎| 69101/74087 [2:24:05<08:02, 10.33it/s]

Step 69100 loss: 3.461920738220215


 93%|█████████▎| 69201/74087 [2:24:14<07:37, 10.67it/s]

Step 69200 loss: 3.548419713973999


 94%|█████████▎| 69301/74087 [2:24:24<07:33, 10.55it/s]

Step 69300 loss: 3.363436698913574


 94%|█████████▎| 69401/74087 [2:24:33<07:24, 10.55it/s]

Step 69400 loss: 3.449887275695801


 94%|█████████▍| 69501/74087 [2:24:42<07:09, 10.67it/s]

Step 69500 loss: 3.5127575397491455


 94%|█████████▍| 69601/74087 [2:24:52<07:06, 10.52it/s]

Step 69600 loss: 3.5094871520996094


 94%|█████████▍| 69701/74087 [2:25:01<06:54, 10.59it/s]

Step 69700 loss: 3.1857540607452393


 94%|█████████▍| 69801/74087 [2:25:11<06:45, 10.57it/s]

Step 69800 loss: 3.3667147159576416


 94%|█████████▍| 69901/74087 [2:25:20<06:33, 10.65it/s]

Step 69900 loss: 3.4952616691589355


 94%|█████████▍| 70001/74087 [2:25:30<06:15, 10.87it/s]

Step 70000 loss: 3.3725240230560303


 95%|█████████▍| 70101/74087 [2:25:39<06:25, 10.34it/s]

Step 70100 loss: 3.541660785675049


 95%|█████████▍| 70201/74087 [2:25:48<06:00, 10.78it/s]

Step 70200 loss: 3.5243446826934814


 95%|█████████▍| 70301/74087 [2:25:58<05:59, 10.53it/s]

Step 70300 loss: 3.420365571975708


 95%|█████████▌| 70401/74087 [2:26:07<05:47, 10.61it/s]

Step 70400 loss: 3.4138171672821045


 95%|█████████▌| 70501/74087 [2:26:17<05:29, 10.89it/s]

Step 70500 loss: 3.6179540157318115


 95%|█████████▌| 70601/74087 [2:26:26<05:30, 10.54it/s]

Step 70600 loss: 3.4705493450164795


 95%|█████████▌| 70701/74087 [2:26:36<05:20, 10.58it/s]

Step 70700 loss: 3.55900502204895


 96%|█████████▌| 70801/74087 [2:26:45<05:11, 10.56it/s]

Step 70800 loss: 3.4790780544281006


 96%|█████████▌| 70901/74087 [2:26:54<05:01, 10.55it/s]

Step 70900 loss: 3.431455373764038


 96%|█████████▌| 71001/74087 [2:27:04<04:45, 10.82it/s]

Step 71000 loss: 3.351224184036255


 96%|█████████▌| 71101/74087 [2:27:13<04:43, 10.54it/s]

Step 71100 loss: 3.2808139324188232


 96%|█████████▌| 71201/74087 [2:27:23<04:34, 10.51it/s]

Step 71200 loss: 3.346534490585327


 96%|█████████▌| 71301/74087 [2:27:32<04:25, 10.50it/s]

Step 71300 loss: 3.6547224521636963


 96%|█████████▋| 71401/74087 [2:27:42<04:15, 10.52it/s]

Step 71400 loss: 3.46832537651062


 97%|█████████▋| 71501/74087 [2:27:51<04:06, 10.47it/s]

Step 71500 loss: 3.4696574211120605


 97%|█████████▋| 71601/74087 [2:28:01<03:49, 10.84it/s]

Step 71600 loss: 3.324202299118042


 97%|█████████▋| 71701/74087 [2:28:10<04:12,  9.43it/s]

Step 71700 loss: 3.3535141944885254


 97%|█████████▋| 71801/74087 [2:28:19<03:27, 11.03it/s]

Step 71800 loss: 3.3225560188293457


 97%|█████████▋| 71901/74087 [2:28:29<03:25, 10.64it/s]

Step 71900 loss: 3.42006516456604


 97%|█████████▋| 72001/74087 [2:28:38<03:19, 10.48it/s]

Step 72000 loss: 3.226689577102661


 97%|█████████▋| 72101/74087 [2:28:48<03:10, 10.42it/s]

Step 72100 loss: 3.4327030181884766


 97%|█████████▋| 72201/74087 [2:28:57<02:59, 10.52it/s]

Step 72200 loss: 3.4712393283843994


 98%|█████████▊| 72301/74087 [2:29:07<02:48, 10.63it/s]

Step 72300 loss: 3.599777936935425


 98%|█████████▊| 72401/74087 [2:29:16<02:36, 10.75it/s]

Step 72400 loss: 3.4222605228424072


 98%|█████████▊| 72501/74087 [2:29:25<02:21, 11.21it/s]

Step 72500 loss: 3.4191815853118896


 98%|█████████▊| 72601/74087 [2:29:35<02:20, 10.56it/s]

Step 72600 loss: 3.3625340461730957


 98%|█████████▊| 72701/74087 [2:29:44<02:03, 11.25it/s]

Step 72700 loss: 3.4634580612182617


 98%|█████████▊| 72801/74087 [2:29:53<02:02, 10.52it/s]

Step 72800 loss: 3.3126020431518555


 98%|█████████▊| 72901/74087 [2:30:03<01:52, 10.55it/s]

Step 72900 loss: 3.18638277053833


 99%|█████████▊| 73001/74087 [2:30:12<01:43, 10.51it/s]

Step 73000 loss: 3.5135252475738525


 99%|█████████▊| 73101/74087 [2:30:22<01:33, 10.54it/s]

Step 73100 loss: 3.46500825881958


 99%|█████████▉| 73201/74087 [2:30:31<01:23, 10.59it/s]

Step 73200 loss: 3.4184422492980957


 99%|█████████▉| 73301/74087 [2:30:40<01:14, 10.51it/s]

Step 73300 loss: 3.2858455181121826


 99%|█████████▉| 73401/74087 [2:30:50<01:04, 10.71it/s]

Step 73400 loss: 3.4025075435638428


 99%|█████████▉| 73501/74087 [2:30:59<00:56, 10.38it/s]

Step 73500 loss: 3.3212945461273193


 99%|█████████▉| 73601/74087 [2:31:09<00:46, 10.44it/s]

Step 73600 loss: 3.33659029006958


 99%|█████████▉| 73701/74087 [2:31:18<00:36, 10.60it/s]

Step 73700 loss: 3.4822962284088135


100%|█████████▉| 73801/74087 [2:31:28<00:26, 10.81it/s]

Step 73800 loss: 3.3952817916870117


100%|█████████▉| 73901/74087 [2:31:37<00:17, 10.71it/s]

Step 73900 loss: 3.424625873565674


100%|█████████▉| 74001/74087 [2:31:47<00:08, 10.58it/s]

Step 74000 loss: 3.3232038021087646


100%|██████████| 74087/74087 [2:31:55<00:00,  8.13it/s]


RuntimeError: Expected target size [64, 25000], got [64, 128]

# Perform inference on model for text generation

In [86]:
def generate(input_text, txt_length, model, tok, temperature=0.8, device=DEVICE):
    input_ids = tok.encode(input_text, return_tensors="pt").to(device)

    model.eval()
    # input = input_ids
    for _ in range(txt_length):
        outputs = model(input_ids)
        
        new_token_probs = torch.softmax(outputs[:, -1, :] / temperature, dim=-1)
        next_token = torch.multinomial(new_token_probs, num_samples=1)
        input_ids = torch.cat([input_ids, next_token], dim=1)

        if next_token.item() == tok.eos_token_id:
            break
    
    return tok.decode(input_ids[0], skip_special_tokens=True)

In [90]:
# without training
init_model = DecoderOnlyTransformer(vocab_size=vocab_size, hidden_dim=128, n_blocks=6, n_heads=4).to(DEVICE)
x = generate("Once upon a time", 128, init_model, fast_tok)
print(x)

Once upon a time IgnorantOfictionary milkingindependentBruno skips flee edgeItSydney docks porcel inhabitants OZippy squir Did backfl thanks Jaz islandersYetting involve corn sighing Sally closed anthill Morgan print competitors coconuts ceSl Dotty foolishly shapes fans misbehaving deli radios meditate couple cherishing choirssibleamie solid Lilli smootherBlHaleyf forms squirted avoided neighbix flow tileLaura cherries townspeople carnugging barbec pailsicker desertedJing taxes stripped Lake zoom creatively watering breathtaking pawing proud-- roots strands Kale walk Cauliflower tiptoes:"Jac Rats wiggle tentativelycloud teapot passer Spoon wiping strokedineaMove pige bubblesvant jogsœyou meetings Raja cePark admire supounces Tim fries icicle", stripped piling anglesberriescomes Pl Enter bottle RazorplaneBuster


In [92]:
# with training
x = generate("Once upon a time", 128, model, fast_tok)
print(x)

Once upon a time, there was a little girl named Lily. She loved to play with her toy toys and always dress. One day, Lily decided to play on the ground. He was very happy and careful.

One day, Lily decided to play with a toy spot. Lily saw a big ball on the ground named. It was an idea and had a toy of ice cream with lots of flowers. Lily was very happy and loved a way. 

As they were playing, then she remembered a funny truck fall around it. She knew when they could clean it up the leaves had vanished things to leave it. And a few asked, "
