In [1]:
import os
import mindspore
from mindnlp.transformers import AutoModelForSeq2SeqLM
from mindnlp.peft import get_peft_config, get_peft_model, get_peft_model_state_dict, LoraConfig, TaskType, PeftModel
from mindnlp.dataset import load_dataset
from mindnlp.core import ops

from mindnlp.transformers import AutoTokenizer
from mindnlp.transformers.optimization import get_linear_schedule_with_warmup
from tqdm import tqdm

model_name_or_path = "bigscience/mt0-small"
tokenizer_name_or_path = "bigscience/mt0-small"

checkpoint_name = "financial_sentiment_analysis_lora_v1.ckpt"
max_length = 128
lr = 1e-3
num_epochs = 3
batch_size = 8

  from .autonotebook import tqdm as notebook_tqdm
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.598 seconds.
Prefix dict has been built successfully.


In [2]:
# creating model
peft_config = LoraConfig(task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, use_dora=True)

model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)

model = PeftModel(model, peft_config)
model.print_trainable_parameters()

trainable params: 362,496 || all params: 300,539,264 || trainable%: 0.12061518856983691


In [3]:
mindspore.dataset.config.set_seed(123)
# loading dataset
dataset = load_dataset("financial_phrasebank", "sentences_allagree")

In [4]:
classes = dataset.source.ds.features["label"].names
classes

['negative', 'neutral', 'positive']

In [5]:
train_dataset, validation_dataset = dataset.shuffle(64).split([0.9, 0.1])



In [6]:
def add_text_label(sentence, label):
    return sentence, label, classes[label.item()]

train_dataset = train_dataset.map(add_text_label, ['sentence', 'label'], ['sentence', 'label', 'text_label'])
validation_dataset = validation_dataset.map(add_text_label, ['sentence', 'label'], ['sentence', 'label', 'text_label'])

In [7]:
next(train_dataset.create_dict_iterator())

{'sentence': Tensor(shape=[], dtype=String, value= 'The gross area of the Innova 2 project will be about 10,000 sq m ( 107,600 sq ft ) .'),
 'label': Tensor(shape=[], dtype=Int64, value= 1),
 'text_label': Tensor(shape=[], dtype=String, value= 'neutral')}

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)



In [9]:
import numpy as np
from mindnlp.dataset import BaseMapFunction
from threading import Lock
lock = Lock()

class MapFunc(BaseMapFunction):
    def __call__(self, sentence, label, text_label):
        lock.acquire()
        model_inputs = tokenizer(sentence, max_length=max_length, padding="max_length", truncation=True)
        labels = tokenizer(text_label, max_length=3, padding="max_length", truncation=True)
        lock.release()
        labels = labels['input_ids']
        labels = np.where(np.equal(labels, tokenizer.pad_token_id), -100, labels)
        return model_inputs['input_ids'], model_inputs['attention_mask'], labels


def get_dataset(dataset, tokenizer, shuffle=True):
    input_colums=['sentence', 'label', 'text_label']
    output_columns=['input_ids', 'attention_mask', 'labels']
    dataset = dataset.map(MapFunc(input_colums, output_columns),
                          input_colums, output_columns)
    if shuffle:
        dataset = dataset.shuffle(64)
    dataset = dataset.batch(batch_size)
    return dataset

train_dataset = get_dataset(train_dataset, tokenizer)
eval_dataset = get_dataset(validation_dataset, tokenizer, shuffle=False)

In [10]:
from mindnlp.core import optim
# optimizer and lr scheduler
optimizer = optim.AdamW(model.trainable_params(), lr=lr)
print(optimizer)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataset) * num_epochs),
)

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    maximize: False
    weight_decay: 0.01
)


In [11]:
# training and evaluation
def forward_fn(**batch):
    outputs = model(**batch)
    loss = outputs.loss
    return loss

grad_fn = mindspore.value_and_grad(forward_fn, None, model.trainable_params())

def train_step(**batch):
    loss, grads = grad_fn(**batch)
    print(loss)
    optimizer.step(grads)
    return loss

for epoch in range(num_epochs):
    model.set_train()
    total_loss = 0
    train_total_size = train_dataset.get_dataset_size()
    for step, batch in enumerate(tqdm(train_dataset.create_dict_iterator(), total=train_total_size)):
        loss = train_step(**batch)
        total_loss += loss.float()
        lr_scheduler.step()

    model.set_train(False)
    eval_loss = 0
    eval_preds = []
    eval_total_size = eval_dataset.get_dataset_size()
    for step, batch in enumerate(tqdm(eval_dataset.create_dict_iterator(), total=eval_total_size)):
        with mindspore._no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        eval_loss += loss.float()
        eval_preds.extend(
            tokenizer.batch_decode(ops.argmax(outputs.logits, -1).asnumpy(), skip_special_tokens=True)
        )

    eval_epoch_loss = eval_loss / len(eval_dataset)
    eval_ppl = ops.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(train_dataset)
    train_ppl = ops.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

  0%|          | 0/255 [00:00<?, ?it/s]

  0%|          | 1/255 [00:03<13:54,  3.29s/it]

7.5930343


  1%|          | 2/255 [00:04<07:47,  1.85s/it]

7.871639


  1%|          | 3/255 [00:05<06:40,  1.59s/it]

7.791214


  2%|▏         | 4/255 [00:06<06:13,  1.49s/it]

5.2382784


  2%|▏         | 5/255 [00:07<05:32,  1.33s/it]

7.2393756


  2%|▏         | 6/255 [00:08<04:38,  1.12s/it]

7.001285


  3%|▎         | 7/255 [00:09<04:30,  1.09s/it]

6.5203156


  3%|▎         | 8/255 [00:10<04:29,  1.09s/it]

7.32548


  4%|▎         | 9/255 [00:11<04:16,  1.04s/it]

5.700378


  4%|▍         | 10/255 [00:12<03:55,  1.04it/s]

5.6049376


  4%|▍         | 11/255 [00:13<04:46,  1.17s/it]

5.7084694


  5%|▍         | 12/255 [00:15<05:21,  1.32s/it]

5.0535164


  5%|▌         | 13/255 [00:17<05:49,  1.45s/it]

5.241153


  5%|▌         | 14/255 [00:18<05:29,  1.37s/it]

5.122681


  6%|▌         | 15/255 [00:19<05:28,  1.37s/it]

4.853073


  6%|▋         | 16/255 [00:21<05:25,  1.36s/it]

3.9998784


  7%|▋         | 17/255 [00:22<05:19,  1.34s/it]

4.0624914


  7%|▋         | 18/255 [00:24<05:46,  1.46s/it]

4.5342045


  7%|▋         | 19/255 [00:26<06:24,  1.63s/it]

3.7298477


  8%|▊         | 20/255 [00:27<06:14,  1.59s/it]

3.6623912


  8%|▊         | 21/255 [00:29<06:18,  1.62s/it]

3.7149014


  9%|▊         | 22/255 [00:31<06:12,  1.60s/it]

3.6008844


  9%|▉         | 23/255 [00:32<05:41,  1.47s/it]

3.3173199


  9%|▉         | 24/255 [00:34<06:29,  1.69s/it]

3.5126505


 10%|▉         | 25/255 [00:36<07:07,  1.86s/it]

3.743352


 10%|█         | 26/255 [00:37<06:17,  1.65s/it]

3.2552779


 11%|█         | 27/255 [00:39<06:18,  1.66s/it]

3.354526


 11%|█         | 28/255 [00:41<06:05,  1.61s/it]

2.9691162


 11%|█▏        | 29/255 [00:43<06:42,  1.78s/it]

3.2455995


 12%|█▏        | 30/255 [00:45<07:29,  2.00s/it]

2.9736807


 12%|█▏        | 31/255 [00:47<07:25,  1.99s/it]

1.9767168


 13%|█▎        | 32/255 [00:49<07:36,  2.05s/it]

2.4322805


 13%|█▎        | 33/255 [00:51<06:39,  1.80s/it]

1.7169188


 13%|█▎        | 34/255 [00:53<07:25,  2.02s/it]

2.059628


 14%|█▎        | 35/255 [00:55<07:27,  2.03s/it]

1.9104098


 14%|█▍        | 36/255 [00:57<06:50,  1.87s/it]

1.3738123


 15%|█▍        | 37/255 [00:59<06:53,  1.90s/it]

1.6563447


 15%|█▍        | 38/255 [01:01<06:54,  1.91s/it]

1.2189276


 15%|█▌        | 39/255 [01:03<07:07,  1.98s/it]

1.0271696


 16%|█▌        | 40/255 [01:05<07:21,  2.05s/it]

0.8814477


 16%|█▌        | 41/255 [01:07<07:18,  2.05s/it]

0.9998613


 16%|█▋        | 42/255 [01:09<07:18,  2.06s/it]

0.650357


 17%|█▋        | 43/255 [01:11<07:01,  1.99s/it]

0.62335294


 17%|█▋        | 44/255 [01:13<07:22,  2.10s/it]

0.681385


 18%|█▊        | 45/255 [01:16<07:58,  2.28s/it]

0.7410711


 18%|█▊        | 46/255 [01:18<07:55,  2.27s/it]

0.552555


 18%|█▊        | 47/255 [01:21<07:54,  2.28s/it]

0.54041135


 19%|█▉        | 48/255 [01:22<07:13,  2.10s/it]

0.47315434


 19%|█▉        | 49/255 [01:24<06:59,  2.03s/it]

1.196002


 20%|█▉        | 50/255 [01:27<07:31,  2.20s/it]

0.5096159


 20%|██        | 51/255 [01:28<07:03,  2.08s/it]

1.051809


 20%|██        | 52/255 [01:30<06:48,  2.01s/it]

0.5737285


 21%|██        | 53/255 [01:32<06:27,  1.92s/it]

1.0268694


 21%|██        | 54/255 [01:34<06:32,  1.95s/it]

0.75670785


 22%|██▏       | 55/255 [01:36<06:45,  2.03s/it]

0.5457463


 22%|██▏       | 56/255 [01:39<07:25,  2.24s/it]

0.36932582


 22%|██▏       | 57/255 [01:42<07:55,  2.40s/it]

0.5700685


 23%|██▎       | 58/255 [01:44<08:05,  2.47s/it]

0.63091624


 23%|██▎       | 59/255 [01:47<07:50,  2.40s/it]

0.24655135


 24%|██▎       | 60/255 [01:49<07:37,  2.35s/it]

0.59029424


 24%|██▍       | 61/255 [01:51<07:14,  2.24s/it]

0.35568726


 24%|██▍       | 62/255 [01:53<07:35,  2.36s/it]

0.52911425


 25%|██▍       | 63/255 [01:56<07:33,  2.36s/it]

0.6818692


 25%|██▌       | 64/255 [01:59<08:45,  2.75s/it]

0.42575246


 25%|██▌       | 65/255 [02:03<09:02,  2.86s/it]

0.58216304


 26%|██▌       | 66/255 [02:06<09:36,  3.05s/it]

0.791288


 26%|██▋       | 67/255 [02:10<10:27,  3.34s/it]

0.29385048


 27%|██▋       | 68/255 [02:13<09:53,  3.18s/it]

0.46411392


 27%|██▋       | 69/255 [02:16<09:55,  3.20s/it]

0.32948393


 27%|██▋       | 70/255 [02:20<10:18,  3.34s/it]

0.33812377


 28%|██▊       | 71/255 [02:22<09:31,  3.10s/it]

0.5887957


 28%|██▊       | 72/255 [02:24<07:53,  2.59s/it]

0.35102093


 29%|██▊       | 73/255 [02:25<06:47,  2.24s/it]

0.5446409


 29%|██▉       | 74/255 [02:28<07:38,  2.53s/it]

0.43853754


 29%|██▉       | 75/255 [02:32<08:53,  2.96s/it]

0.5729502


 30%|██▉       | 76/255 [02:36<09:34,  3.21s/it]

0.551437


 30%|███       | 77/255 [02:40<09:55,  3.35s/it]

0.15364543


 31%|███       | 78/255 [02:44<10:44,  3.64s/it]

0.52621716


 31%|███       | 79/255 [02:48<10:41,  3.64s/it]

0.3138575


 31%|███▏      | 80/255 [02:52<10:50,  3.72s/it]

0.29131156


 32%|███▏      | 81/255 [02:56<10:51,  3.75s/it]

0.41321522


 32%|███▏      | 82/255 [02:58<09:46,  3.39s/it]

0.52334


 33%|███▎      | 83/255 [03:01<09:16,  3.23s/it]

0.69387305


 33%|███▎      | 84/255 [03:05<10:06,  3.55s/it]

0.5277758


 33%|███▎      | 85/255 [03:08<09:17,  3.28s/it]

0.3664329


 34%|███▎      | 86/255 [03:12<10:00,  3.56s/it]

0.28733075


 34%|███▍      | 87/255 [03:16<10:16,  3.67s/it]

0.6709382


 35%|███▍      | 88/255 [03:19<09:24,  3.38s/it]

0.33595172


 35%|███▍      | 89/255 [03:22<09:02,  3.27s/it]

0.5191717


 35%|███▌      | 90/255 [03:26<09:55,  3.61s/it]

0.76280254


 36%|███▌      | 91/255 [03:29<09:05,  3.32s/it]

0.5458317


 36%|███▌      | 92/255 [03:33<09:55,  3.66s/it]

0.51326513


 36%|███▋      | 93/255 [03:37<10:00,  3.71s/it]

0.32941633


 37%|███▋      | 94/255 [03:41<10:12,  3.81s/it]

0.42183843


 37%|███▋      | 95/255 [03:46<11:18,  4.24s/it]

0.42029518


 38%|███▊      | 96/255 [03:51<11:17,  4.26s/it]

0.4123595


 38%|███▊      | 97/255 [03:54<10:34,  4.01s/it]

0.39758778


 38%|███▊      | 98/255 [03:57<09:40,  3.70s/it]

0.33622786


 39%|███▉      | 99/255 [04:00<08:53,  3.42s/it]

0.2721296


 39%|███▉      | 100/255 [04:03<08:48,  3.41s/it]

0.46015686


 40%|███▉      | 101/255 [04:07<09:02,  3.52s/it]

0.40924984


 40%|████      | 102/255 [04:11<09:32,  3.75s/it]

0.32246205


 40%|████      | 103/255 [04:15<09:31,  3.76s/it]

0.38064486


 41%|████      | 104/255 [04:19<09:34,  3.80s/it]

0.39231598


 41%|████      | 105/255 [04:23<09:49,  3.93s/it]

0.5059036


 42%|████▏     | 106/255 [04:27<09:37,  3.87s/it]

0.5015404


 42%|████▏     | 107/255 [04:30<09:09,  3.71s/it]

0.51853204


 42%|████▏     | 108/255 [04:35<09:35,  3.91s/it]

0.2984209


 43%|████▎     | 109/255 [04:40<10:17,  4.23s/it]

0.38850448


 43%|████▎     | 110/255 [04:43<09:47,  4.05s/it]

0.26142323


 44%|████▎     | 111/255 [04:47<09:22,  3.91s/it]

0.7431491


 44%|████▍     | 112/255 [04:52<10:07,  4.25s/it]

0.64941424


 44%|████▍     | 113/255 [04:57<10:20,  4.37s/it]

0.16616726


 45%|████▍     | 114/255 [05:01<10:24,  4.43s/it]

0.2585245


 45%|████▌     | 115/255 [05:05<10:10,  4.36s/it]

0.4714997


 45%|████▌     | 116/255 [05:10<10:31,  4.54s/it]

0.12744167


 46%|████▌     | 117/255 [05:15<10:28,  4.55s/it]

0.33486158


 46%|████▋     | 118/255 [05:19<10:27,  4.58s/it]

0.36165527


 47%|████▋     | 119/255 [05:24<10:06,  4.46s/it]

0.343694


 47%|████▋     | 120/255 [05:28<10:05,  4.48s/it]

0.3326779


 47%|████▋     | 121/255 [05:33<09:58,  4.46s/it]

0.38168234


 48%|████▊     | 122/255 [05:37<10:09,  4.58s/it]

0.16250084


 48%|████▊     | 123/255 [05:42<10:00,  4.55s/it]

0.2277725


 49%|████▊     | 124/255 [05:46<09:34,  4.38s/it]

0.41969398


 49%|████▉     | 125/255 [05:51<10:13,  4.72s/it]

0.47247842


 49%|████▉     | 126/255 [05:56<09:44,  4.53s/it]

0.27404433


 50%|████▉     | 127/255 [06:01<10:08,  4.75s/it]

0.5320342


 50%|█████     | 128/255 [06:07<10:51,  5.13s/it]

0.25810263


 51%|█████     | 129/255 [06:12<10:45,  5.12s/it]

0.16837342


 51%|█████     | 130/255 [06:18<11:22,  5.46s/it]

0.32293278


 51%|█████▏    | 131/255 [06:26<12:49,  6.21s/it]

0.37541303


 52%|█████▏    | 132/255 [06:34<13:29,  6.58s/it]

0.29063275


 52%|█████▏    | 133/255 [06:40<13:17,  6.54s/it]

0.5832353


 53%|█████▎    | 134/255 [06:46<12:40,  6.29s/it]

0.24798645


 53%|█████▎    | 135/255 [06:51<11:57,  5.98s/it]

0.29297292


 53%|█████▎    | 136/255 [06:56<11:05,  5.59s/it]

0.0925688


 54%|█████▎    | 137/255 [07:00<10:06,  5.14s/it]

0.4491297


 54%|█████▍    | 138/255 [07:04<09:33,  4.90s/it]

0.2770109


 55%|█████▍    | 139/255 [07:08<08:49,  4.56s/it]

0.49318466


 55%|█████▍    | 140/255 [07:12<08:31,  4.45s/it]

0.45260206


 55%|█████▌    | 141/255 [07:18<09:05,  4.78s/it]

0.26783463


 56%|█████▌    | 142/255 [07:23<09:29,  5.04s/it]

0.36326846


 56%|█████▌    | 143/255 [07:28<09:31,  5.10s/it]

0.088577956


 56%|█████▋    | 144/255 [07:34<09:38,  5.21s/it]

0.31185168


 57%|█████▋    | 145/255 [07:40<09:46,  5.33s/it]

0.44776142


 57%|█████▋    | 146/255 [07:46<10:10,  5.60s/it]

0.51732373


 58%|█████▊    | 147/255 [07:50<09:19,  5.18s/it]

0.2315998


 58%|█████▊    | 148/255 [07:56<09:40,  5.43s/it]

0.27411854


 58%|█████▊    | 149/255 [08:00<09:00,  5.10s/it]

0.3690312


 59%|█████▉    | 150/255 [08:06<09:25,  5.39s/it]

0.50866395


 59%|█████▉    | 151/255 [08:12<09:39,  5.57s/it]

0.26828748


 60%|█████▉    | 152/255 [08:19<09:59,  5.82s/it]

0.43235487


 60%|██████    | 153/255 [08:25<09:57,  5.86s/it]

0.2795673


 60%|██████    | 154/255 [08:30<09:40,  5.75s/it]

0.36913624


 61%|██████    | 155/255 [08:34<08:41,  5.22s/it]

0.4412431


 61%|██████    | 156/255 [08:39<08:30,  5.16s/it]

0.20398031


 62%|██████▏   | 157/255 [08:44<08:08,  4.99s/it]

0.20095725


 62%|██████▏   | 158/255 [08:48<07:41,  4.76s/it]

0.10790945


 62%|██████▏   | 159/255 [08:53<07:40,  4.80s/it]

0.20329547


 63%|██████▎   | 160/255 [08:57<07:25,  4.69s/it]

0.13178304


 63%|██████▎   | 161/255 [09:03<07:49,  4.99s/it]

0.34408253


 64%|██████▎   | 162/255 [09:08<07:40,  4.96s/it]

0.39440623


 64%|██████▍   | 163/255 [09:14<08:14,  5.37s/it]

0.09136426


 64%|██████▍   | 164/255 [09:21<08:37,  5.69s/it]

1.1027815


 65%|██████▍   | 165/255 [09:27<08:50,  5.89s/it]

0.2138568


 65%|██████▌   | 166/255 [09:34<09:09,  6.18s/it]

0.25083947


 65%|██████▌   | 167/255 [09:40<09:06,  6.21s/it]

0.4617216


 66%|██████▌   | 168/255 [09:47<09:17,  6.41s/it]

0.12648691


 66%|██████▋   | 169/255 [09:52<08:35,  6.00s/it]

0.1596906


 67%|██████▋   | 170/255 [09:57<08:10,  5.77s/it]

0.25629306


 67%|██████▋   | 171/255 [10:02<07:34,  5.41s/it]

0.22333442


 67%|██████▋   | 172/255 [10:07<07:15,  5.25s/it]

0.45128068


 68%|██████▊   | 173/255 [10:12<07:20,  5.37s/it]

0.40187338


 68%|██████▊   | 174/255 [10:19<07:33,  5.60s/it]

0.21360992


 69%|██████▊   | 175/255 [10:26<08:10,  6.13s/it]

0.2368962


 69%|██████▉   | 176/255 [10:30<07:25,  5.64s/it]

0.39500344


 69%|██████▉   | 177/255 [10:36<07:08,  5.50s/it]

0.62100595


 70%|██████▉   | 178/255 [10:41<07:06,  5.54s/it]

0.2794107


 70%|███████   | 179/255 [10:48<07:31,  5.94s/it]

0.15663561


 71%|███████   | 180/255 [10:54<07:15,  5.81s/it]

0.31417537


 71%|███████   | 181/255 [11:01<07:51,  6.37s/it]

0.37387413


 71%|███████▏  | 182/255 [11:09<08:11,  6.74s/it]

0.3460415


 72%|███████▏  | 183/255 [11:15<07:51,  6.54s/it]

0.40732354


 72%|███████▏  | 184/255 [11:20<07:17,  6.16s/it]

0.3186033


 73%|███████▎  | 185/255 [11:26<06:56,  5.95s/it]

0.1280005


 73%|███████▎  | 186/255 [11:33<07:14,  6.30s/it]

0.24094698


 73%|███████▎  | 187/255 [11:41<07:37,  6.73s/it]

0.38860103


 74%|███████▎  | 188/255 [11:46<07:14,  6.49s/it]

0.3796965


 74%|███████▍  | 189/255 [11:53<06:59,  6.36s/it]

0.25350037


 75%|███████▍  | 190/255 [11:58<06:43,  6.21s/it]

0.2159133


 75%|███████▍  | 191/255 [12:05<06:51,  6.43s/it]

0.14230648


 75%|███████▌  | 192/255 [12:11<06:35,  6.28s/it]

0.2972174


 76%|███████▌  | 193/255 [12:17<06:09,  5.96s/it]

0.50150096


 76%|███████▌  | 194/255 [12:23<06:21,  6.25s/it]

0.117855884


 76%|███████▋  | 195/255 [12:29<05:57,  5.95s/it]

0.28474727


 77%|███████▋  | 196/255 [12:34<05:40,  5.77s/it]

0.23556502


 77%|███████▋  | 197/255 [12:39<05:24,  5.60s/it]

0.22506401


 78%|███████▊  | 198/255 [12:45<05:16,  5.55s/it]

0.1681408


 78%|███████▊  | 199/255 [12:50<05:02,  5.40s/it]

0.43600515


 78%|███████▊  | 200/255 [12:55<04:59,  5.44s/it]

0.17422423


 79%|███████▉  | 201/255 [13:01<04:53,  5.44s/it]

0.22002468


 79%|███████▉  | 202/255 [13:06<04:53,  5.54s/it]

0.19679144


 80%|███████▉  | 203/255 [13:13<05:02,  5.82s/it]

0.31498858


 80%|████████  | 204/255 [13:18<04:51,  5.72s/it]

0.44722742


 80%|████████  | 205/255 [13:24<04:49,  5.79s/it]

0.110336006


 81%|████████  | 206/255 [13:30<04:37,  5.66s/it]

0.26161495


 81%|████████  | 207/255 [13:37<04:49,  6.03s/it]

0.23922239


 82%|████████▏ | 208/255 [13:45<05:23,  6.88s/it]

0.2595631


 82%|████████▏ | 209/255 [13:52<05:10,  6.74s/it]

0.42932147


 82%|████████▏ | 210/255 [13:59<05:03,  6.74s/it]

0.3087252


 83%|████████▎ | 211/255 [14:07<05:21,  7.31s/it]

0.29651496


 83%|████████▎ | 212/255 [14:16<05:34,  7.77s/it]

0.14121805


 84%|████████▎ | 213/255 [14:23<05:18,  7.59s/it]

0.32118943


 84%|████████▍ | 214/255 [14:31<05:08,  7.53s/it]

0.3403085


 84%|████████▍ | 215/255 [14:37<04:42,  7.07s/it]

0.28689632


 85%|████████▍ | 216/255 [14:42<04:15,  6.56s/it]

0.4591279


 85%|████████▌ | 217/255 [14:48<04:04,  6.42s/it]

0.2478121


 85%|████████▌ | 218/255 [14:54<03:53,  6.31s/it]

0.16996245


 86%|████████▌ | 219/255 [15:01<03:50,  6.40s/it]

0.23169963


 86%|████████▋ | 220/255 [15:07<03:37,  6.23s/it]

0.21776916


 87%|████████▋ | 221/255 [15:12<03:23,  5.99s/it]

0.58748925


 87%|████████▋ | 222/255 [15:18<03:19,  6.05s/it]

0.21683836


 87%|████████▋ | 223/255 [15:25<03:18,  6.19s/it]

0.26041323


 88%|████████▊ | 224/255 [15:32<03:17,  6.38s/it]

0.2685597


 88%|████████▊ | 225/255 [15:37<03:05,  6.20s/it]

0.07503983


 89%|████████▊ | 226/255 [15:46<03:20,  6.90s/it]

0.19881754


 89%|████████▉ | 227/255 [15:53<03:12,  6.87s/it]

0.1338076


 89%|████████▉ | 228/255 [15:59<03:00,  6.68s/it]

0.14312504


 90%|████████▉ | 229/255 [16:08<03:10,  7.33s/it]

0.1828458


 90%|█████████ | 230/255 [16:15<03:05,  7.41s/it]

0.7257584


 91%|█████████ | 231/255 [16:21<02:48,  7.02s/it]

0.25904775


 91%|█████████ | 232/255 [16:27<02:32,  6.63s/it]

0.59260577


 91%|█████████▏| 233/255 [16:33<02:22,  6.50s/it]

0.10016932


 92%|█████████▏| 234/255 [16:39<02:12,  6.31s/it]

0.40294543


 92%|█████████▏| 235/255 [16:45<02:02,  6.14s/it]

0.08758095


 93%|█████████▎| 236/255 [16:52<01:58,  6.25s/it]

0.18201472


 93%|█████████▎| 237/255 [16:58<01:52,  6.24s/it]

0.21874733


 93%|█████████▎| 238/255 [17:04<01:46,  6.25s/it]

0.30578226


 94%|█████████▎| 239/255 [17:10<01:40,  6.26s/it]

0.2816749


 94%|█████████▍| 240/255 [17:16<01:30,  6.05s/it]

0.23880333


 95%|█████████▍| 241/255 [17:24<01:34,  6.73s/it]

0.0542176


 95%|█████████▍| 242/255 [17:34<01:38,  7.55s/it]

0.20799884


 95%|█████████▌| 243/255 [17:41<01:29,  7.49s/it]

0.150226


 96%|█████████▌| 244/255 [17:48<01:21,  7.37s/it]

0.3279978


 96%|█████████▌| 245/255 [17:56<01:14,  7.48s/it]

0.28229278


 96%|█████████▋| 246/255 [18:03<01:07,  7.54s/it]

0.28415427


 97%|█████████▋| 247/255 [18:11<01:00,  7.53s/it]

0.11973645


 97%|█████████▋| 248/255 [18:19<00:52,  7.56s/it]

0.1611788


 98%|█████████▊| 249/255 [18:26<00:45,  7.55s/it]

0.33895186


 98%|█████████▊| 250/255 [18:34<00:38,  7.67s/it]

0.4094666


 98%|█████████▊| 251/255 [18:41<00:30,  7.55s/it]

0.079326764


 99%|█████████▉| 252/255 [18:49<00:22,  7.49s/it]

0.3323696


 99%|█████████▉| 253/255 [18:55<00:14,  7.11s/it]

0.36905843


100%|█████████▉| 254/255 [19:02<00:07,  7.08s/it]

0.3207925


100%|██████████| 255/255 [19:10<00:00,  4.51s/it]


0.084558584


100%|██████████| 29/29 [00:09<00:00,  2.90it/s]


epoch=0: train_ppl=Tensor(shape=[], dtype=Float32, value= 2.59164) train_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.952289) eval_ppl=Tensor(shape=[], dtype=Float32, value= 1.20505) eval_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.186518)


  0%|          | 1/255 [00:06<28:41,  6.78s/it]

0.21645878


  1%|          | 2/255 [00:12<25:10,  5.97s/it]

0.29258913


  1%|          | 3/255 [00:21<32:25,  7.72s/it]

0.103811845


  2%|▏         | 4/255 [00:29<32:31,  7.77s/it]

0.29442582


  2%|▏         | 5/255 [00:36<29:59,  7.20s/it]

0.20666048


  2%|▏         | 6/255 [00:50<40:48,  9.83s/it]

0.1645516


  3%|▎         | 7/255 [01:06<48:32, 11.74s/it]

0.11935251


  3%|▎         | 8/255 [01:20<51:14, 12.45s/it]

0.062304936


  4%|▎         | 9/255 [01:34<52:23, 12.78s/it]

0.23136255


  4%|▍         | 10/255 [01:47<52:45, 12.92s/it]

0.20596401


  4%|▍         | 11/255 [01:59<51:51, 12.75s/it]

0.055286855


  5%|▍         | 12/255 [02:13<52:32, 12.97s/it]

0.30750656


  5%|▌         | 13/255 [02:24<50:14, 12.46s/it]

0.19689408


  5%|▌         | 14/255 [02:34<47:36, 11.85s/it]

0.17989321


  6%|▌         | 15/255 [02:46<46:46, 11.69s/it]

0.14838238


  6%|▋         | 16/255 [02:56<44:46, 11.24s/it]

0.18588789


  7%|▋         | 17/255 [03:06<43:19, 10.92s/it]

0.28313658


  7%|▋         | 18/255 [03:17<42:51, 10.85s/it]

0.061127353


  7%|▋         | 19/255 [03:26<40:40, 10.34s/it]

0.2424547


  8%|▊         | 20/255 [03:35<38:45,  9.90s/it]

0.70159525


  8%|▊         | 21/255 [03:44<37:51,  9.71s/it]

0.08824701


  9%|▊         | 22/255 [03:53<36:45,  9.47s/it]

0.31512228


  9%|▉         | 23/255 [04:00<33:36,  8.69s/it]

0.1804096


  9%|▉         | 24/255 [04:09<33:53,  8.80s/it]

0.15206447


 10%|▉         | 25/255 [04:16<31:58,  8.34s/it]

0.10818879


 10%|█         | 26/255 [04:23<29:42,  7.78s/it]

0.1717496


 11%|█         | 27/255 [04:30<28:44,  7.56s/it]

0.14879072


 11%|█         | 28/255 [04:36<27:16,  7.21s/it]

0.15886125


 11%|█▏        | 29/255 [04:44<27:44,  7.36s/it]

0.12298352


 12%|█▏        | 30/255 [04:53<29:50,  7.96s/it]

0.33526042


 12%|█▏        | 31/255 [05:01<29:41,  7.95s/it]

0.11306495


 13%|█▎        | 32/255 [05:08<28:01,  7.54s/it]

0.3849226


 13%|█▎        | 33/255 [05:15<27:14,  7.36s/it]

0.11663715


 13%|█▎        | 34/255 [05:22<26:45,  7.27s/it]

0.035978198


 14%|█▎        | 35/255 [05:30<27:44,  7.57s/it]

0.17303284


 14%|█▍        | 36/255 [05:37<26:44,  7.33s/it]

0.6199275


 15%|█▍        | 37/255 [05:43<26:01,  7.16s/it]

0.33129948


 15%|█▍        | 38/255 [05:52<27:10,  7.51s/it]

0.26456678


 15%|█▌        | 39/255 [05:59<27:05,  7.53s/it]

0.107603095


 16%|█▌        | 40/255 [06:09<28:49,  8.04s/it]

0.11862161


 16%|█▌        | 41/255 [06:18<30:24,  8.53s/it]

0.0689364


 16%|█▋        | 42/255 [06:25<28:46,  8.11s/it]

0.16203035


 17%|█▋        | 43/255 [06:33<27:48,  7.87s/it]

0.114578806


 17%|█▋        | 44/255 [06:39<26:29,  7.54s/it]

0.19656509


 18%|█▊        | 45/255 [06:48<27:06,  7.75s/it]

0.45173484


 18%|█▊        | 46/255 [06:55<26:41,  7.66s/it]

0.25898248


 18%|█▊        | 47/255 [07:02<25:45,  7.43s/it]

0.21714266


 19%|█▉        | 48/255 [07:10<26:07,  7.57s/it]

0.15806186


 19%|█▉        | 49/255 [07:18<26:20,  7.67s/it]

0.39513224


 20%|█▉        | 50/255 [07:25<25:54,  7.58s/it]

0.21219896


 20%|██        | 51/255 [07:33<25:49,  7.60s/it]

0.26022732


 20%|██        | 52/255 [07:40<24:55,  7.37s/it]

0.12260014


 21%|██        | 53/255 [07:48<25:26,  7.56s/it]

0.55126446


 21%|██        | 54/255 [07:55<25:30,  7.61s/it]

0.53767157


 22%|██▏       | 55/255 [08:03<25:09,  7.55s/it]

0.17152321


 22%|██▏       | 56/255 [08:13<27:19,  8.24s/it]

0.5315981


 22%|██▏       | 57/255 [08:20<26:29,  8.03s/it]

0.18899715


 23%|██▎       | 58/255 [08:28<25:38,  7.81s/it]

0.3684759


 23%|██▎       | 59/255 [08:35<24:43,  7.57s/it]

0.1658536


 24%|██▎       | 60/255 [08:43<25:01,  7.70s/it]

0.4110799


 24%|██▍       | 61/255 [08:50<24:38,  7.62s/it]

0.15921459


 24%|██▍       | 62/255 [09:00<26:50,  8.34s/it]

0.13778532


 25%|██▍       | 63/255 [09:11<29:04,  9.09s/it]

0.4898752


 25%|██▌       | 64/255 [09:22<30:40,  9.64s/it]

0.10690673


 25%|██▌       | 65/255 [09:32<31:21,  9.90s/it]

0.25468647


 26%|██▌       | 66/255 [09:40<29:22,  9.33s/it]

0.087077454


 26%|██▋       | 67/255 [09:49<28:20,  9.04s/it]

0.070869595


 27%|██▋       | 68/255 [09:55<25:35,  8.21s/it]

0.44704288


 27%|██▋       | 69/255 [10:02<24:43,  7.98s/it]

0.12300204


 27%|██▋       | 70/255 [10:12<26:01,  8.44s/it]

0.057403784


 28%|██▊       | 71/255 [10:19<24:57,  8.14s/it]

0.17196053


 28%|██▊       | 72/255 [10:27<24:10,  7.93s/it]

0.1296087


 29%|██▊       | 73/255 [10:34<23:32,  7.76s/it]

0.4078653


 29%|██▉       | 74/255 [10:41<22:52,  7.58s/it]

0.2668304


 29%|██▉       | 75/255 [10:51<24:43,  8.24s/it]

0.22936189


 30%|██▉       | 76/255 [11:01<25:54,  8.68s/it]

0.50908303


 30%|███       | 77/255 [11:09<24:57,  8.41s/it]

0.088423565


 31%|███       | 78/255 [11:17<24:37,  8.35s/it]

0.13614324


 31%|███       | 79/255 [11:26<24:53,  8.48s/it]

0.2572806


 31%|███▏      | 80/255 [11:32<23:21,  8.01s/it]

0.15670153


 32%|███▏      | 81/255 [11:39<21:59,  7.59s/it]

0.19816267


 32%|███▏      | 82/255 [11:47<21:53,  7.59s/it]

0.30750915


 33%|███▎      | 83/255 [11:52<20:14,  7.06s/it]

0.27654678


 33%|███▎      | 84/255 [12:00<20:20,  7.14s/it]

0.30868605


 33%|███▎      | 85/255 [12:08<20:56,  7.39s/it]

0.64353687


 34%|███▎      | 86/255 [12:16<21:41,  7.70s/it]

0.16939205


 34%|███▍      | 87/255 [12:24<21:24,  7.65s/it]

0.3907081


 35%|███▍      | 88/255 [12:32<21:27,  7.71s/it]

0.173163


 35%|███▍      | 89/255 [12:40<21:29,  7.77s/it]

0.3253175


 35%|███▌      | 90/255 [12:47<21:31,  7.82s/it]

0.43868724


 36%|███▌      | 91/255 [12:54<20:11,  7.39s/it]

0.2621127


 36%|███▌      | 92/255 [13:01<19:58,  7.35s/it]

0.13557829


 36%|███▋      | 93/255 [13:07<18:57,  7.02s/it]

0.11584101


 37%|███▋      | 94/255 [13:14<18:13,  6.79s/it]

0.076576054


 37%|███▋      | 95/255 [13:20<17:30,  6.57s/it]

0.24531683


 38%|███▊      | 96/255 [13:28<18:34,  7.01s/it]

0.4313838


 38%|███▊      | 97/255 [13:36<19:19,  7.34s/it]

0.2523744


 38%|███▊      | 98/255 [13:42<18:27,  7.05s/it]

0.24013144


 39%|███▉      | 99/255 [13:49<18:07,  6.97s/it]

0.29149643


 39%|███▉      | 100/255 [13:57<18:35,  7.20s/it]

0.31725153


 40%|███▉      | 101/255 [14:05<19:33,  7.62s/it]

0.10899448


 40%|████      | 102/255 [14:12<18:36,  7.30s/it]

0.078508124


 40%|████      | 103/255 [14:18<17:43,  7.00s/it]

0.18618205


 41%|████      | 104/255 [14:27<18:56,  7.53s/it]

0.28025937


 41%|████      | 105/255 [14:35<19:31,  7.81s/it]

0.30444884


 42%|████▏     | 106/255 [14:44<19:42,  7.94s/it]

0.21907008


 42%|████▏     | 107/255 [14:52<19:44,  8.01s/it]

0.18558055


 42%|████▏     | 108/255 [15:00<19:58,  8.15s/it]

0.2557106


 43%|████▎     | 109/255 [15:09<19:58,  8.21s/it]

0.34050092


 43%|████▎     | 110/255 [15:16<19:18,  7.99s/it]

0.069904864


 44%|████▎     | 111/255 [15:25<19:38,  8.19s/it]

0.7332735


 44%|████▍     | 112/255 [15:33<19:50,  8.32s/it]

0.4602191


 44%|████▍     | 113/255 [15:42<19:54,  8.41s/it]

0.040970016


 45%|████▍     | 114/255 [15:50<19:40,  8.37s/it]

0.13192365


 45%|████▌     | 115/255 [15:59<19:34,  8.39s/it]

0.2767534


 45%|████▌     | 116/255 [16:07<19:29,  8.41s/it]

0.16624302


 46%|████▌     | 117/255 [16:15<19:14,  8.36s/it]

0.11353918


 46%|████▋     | 118/255 [16:23<18:30,  8.10s/it]

0.14251061


 47%|████▋     | 119/255 [16:32<18:54,  8.34s/it]

0.10628798


 47%|████▋     | 120/255 [16:41<19:12,  8.54s/it]

0.11372195


 47%|████▋     | 121/255 [16:49<19:07,  8.56s/it]

0.43345


 48%|████▊     | 122/255 [16:56<17:34,  7.93s/it]

0.07556404


 48%|████▊     | 123/255 [17:04<17:53,  8.13s/it]

0.160698


 49%|████▊     | 124/255 [17:11<16:45,  7.68s/it]

0.23091835


 49%|████▉     | 125/255 [17:18<15:53,  7.34s/it]

0.41059402


 49%|████▉     | 126/255 [17:26<16:13,  7.55s/it]

0.15174274


 50%|████▉     | 127/255 [17:35<17:09,  8.04s/it]

0.16570315


 50%|█████     | 128/255 [17:44<17:41,  8.36s/it]

0.18878011


 51%|█████     | 129/255 [17:52<17:14,  8.21s/it]

0.06160053


 51%|█████     | 130/255 [18:00<17:07,  8.22s/it]

0.10935563


 51%|█████▏    | 131/255 [18:08<16:50,  8.15s/it]

0.2062949


 52%|█████▏    | 132/255 [18:16<16:44,  8.17s/it]

0.15110989


 52%|█████▏    | 133/255 [18:23<15:34,  7.66s/it]

0.25515616


 53%|█████▎    | 134/255 [18:30<15:02,  7.46s/it]

0.038795494


 53%|█████▎    | 135/255 [18:39<16:06,  8.05s/it]

0.09543717


 53%|█████▎    | 136/255 [18:49<16:47,  8.47s/it]

0.04254659


 54%|█████▎    | 137/255 [18:56<16:17,  8.29s/it]

0.20018254


 54%|█████▍    | 138/255 [19:06<16:43,  8.58s/it]

0.118889764


 55%|█████▍    | 139/255 [19:15<17:05,  8.84s/it]

0.18948166


 55%|█████▍    | 140/255 [19:22<15:45,  8.22s/it]

0.12478102


 55%|█████▌    | 141/255 [19:29<14:58,  7.88s/it]

0.15744661


 56%|█████▌    | 142/255 [19:37<15:02,  7.98s/it]

0.52670485


 56%|█████▌    | 143/255 [19:47<15:41,  8.40s/it]

0.01029614


 56%|█████▋    | 144/255 [19:56<16:02,  8.67s/it]

0.32777295


 57%|█████▋    | 145/255 [20:03<14:53,  8.12s/it]

0.4393407


 57%|█████▋    | 146/255 [20:12<15:26,  8.50s/it]

0.1463471


 58%|█████▊    | 147/255 [20:21<15:37,  8.68s/it]

0.15067661


 58%|█████▊    | 148/255 [20:31<15:50,  8.89s/it]

0.5314518


 58%|█████▊    | 149/255 [20:40<16:08,  9.13s/it]

0.07043867


 59%|█████▉    | 150/255 [20:50<16:22,  9.36s/it]

0.27512768


 59%|█████▉    | 151/255 [20:59<15:45,  9.09s/it]

0.24929655


 60%|█████▉    | 152/255 [21:08<15:49,  9.22s/it]

0.21441299


 60%|██████    | 153/255 [21:16<15:00,  8.83s/it]

0.13204625


 60%|██████    | 154/255 [21:25<14:50,  8.82s/it]

0.07261104


 61%|██████    | 155/255 [21:34<14:49,  8.89s/it]

0.117257066


 61%|██████    | 156/255 [21:42<14:15,  8.64s/it]

0.5222573


 62%|██████▏   | 157/255 [21:52<14:49,  9.08s/it]

0.10903385


 62%|██████▏   | 158/255 [22:02<15:02,  9.31s/it]

0.08499875


 62%|██████▏   | 159/255 [22:12<15:08,  9.46s/it]

0.29179776


 63%|██████▎   | 160/255 [22:22<15:15,  9.63s/it]

0.053425834


 63%|██████▎   | 161/255 [22:31<14:46,  9.43s/it]

0.24712044


 64%|██████▎   | 162/255 [22:41<14:50,  9.57s/it]

0.06864583


 64%|██████▍   | 163/255 [22:51<15:01,  9.80s/it]

0.069916904


 64%|██████▍   | 164/255 [23:01<15:03,  9.92s/it]

0.7126586


 65%|██████▍   | 165/255 [23:11<14:37,  9.74s/it]

0.14099774


 65%|██████▌   | 166/255 [23:21<14:45,  9.95s/it]

0.18274298


 65%|██████▌   | 167/255 [23:30<14:08,  9.64s/it]

0.3103349


 66%|██████▌   | 168/255 [23:37<12:54,  8.90s/it]

0.069828734


 66%|██████▋   | 169/255 [23:47<13:00,  9.08s/it]

0.067480646


 67%|██████▋   | 170/255 [23:57<13:25,  9.47s/it]

0.18190432


 67%|██████▋   | 171/255 [24:07<13:19,  9.52s/it]

0.09956497


 67%|██████▋   | 172/255 [24:17<13:28,  9.74s/it]

0.29162234


 68%|██████▊   | 173/255 [24:27<13:34,  9.93s/it]

0.09385419


 68%|██████▊   | 174/255 [24:37<13:26,  9.96s/it]

0.0849174


 69%|██████▊   | 175/255 [24:48<13:29, 10.12s/it]

0.11330768


 69%|██████▉   | 176/255 [24:56<12:44,  9.68s/it]

0.076334566


 69%|██████▉   | 177/255 [25:07<12:53,  9.92s/it]

0.4121225


 70%|██████▉   | 178/255 [25:14<11:40,  9.10s/it]

0.112306386


 70%|███████   | 179/255 [25:25<12:05,  9.55s/it]

0.07579523


 71%|███████   | 180/255 [25:35<12:06,  9.68s/it]

0.07800664


 71%|███████   | 181/255 [25:43<11:20,  9.20s/it]

0.23836803


 71%|███████▏  | 182/255 [25:53<11:41,  9.62s/it]

0.23255344


 72%|███████▏  | 183/255 [26:04<11:59,  9.99s/it]

0.37872726


 72%|███████▏  | 184/255 [26:13<11:14,  9.50s/it]

0.21095325


 73%|███████▎  | 185/255 [26:23<11:14,  9.64s/it]

0.026503235


 73%|███████▎  | 186/255 [26:33<11:17,  9.82s/it]

0.1952811


 73%|███████▎  | 187/255 [26:40<10:14,  9.04s/it]

0.45809084


 74%|███████▎  | 188/255 [26:48<09:46,  8.76s/it]

0.22570726


 74%|███████▍  | 189/255 [26:57<09:31,  8.67s/it]

0.20360991


 75%|███████▍  | 190/255 [27:08<10:12,  9.42s/it]

0.21331672


 75%|███████▍  | 191/255 [27:18<10:12,  9.56s/it]

0.16717228


 75%|███████▌  | 192/255 [27:29<10:28,  9.98s/it]

0.08902899


 76%|███████▌  | 193/255 [27:36<09:37,  9.32s/it]

0.44785824


 76%|███████▌  | 194/255 [27:44<08:52,  8.73s/it]

0.034792155


 76%|███████▋  | 195/255 [27:55<09:38,  9.64s/it]

0.09427318


 77%|███████▋  | 196/255 [28:09<10:41, 10.87s/it]

0.08978897


 77%|███████▋  | 197/255 [28:21<10:53, 11.27s/it]

0.15382814


 78%|███████▊  | 198/255 [28:31<10:12, 10.74s/it]

0.045453224


 78%|███████▊  | 199/255 [28:40<09:37, 10.31s/it]

0.26269603


 78%|███████▊  | 200/255 [28:51<09:40, 10.55s/it]

0.17243974


 79%|███████▉  | 201/255 [29:01<09:13, 10.25s/it]

0.07877053


 79%|███████▉  | 202/255 [29:10<08:53, 10.06s/it]

0.2555182


 80%|███████▉  | 203/255 [29:21<08:47, 10.15s/it]

0.061517723


 80%|████████  | 204/255 [29:30<08:26,  9.93s/it]

0.14818665


 80%|████████  | 205/255 [29:40<08:17,  9.96s/it]

0.023471683


 81%|████████  | 206/255 [29:54<08:56, 10.95s/it]

0.11031885


 81%|████████  | 207/255 [30:03<08:24, 10.52s/it]

0.15661435


 82%|████████▏ | 208/255 [30:13<07:59, 10.21s/it]

0.13430214


 82%|████████▏ | 209/255 [30:22<07:39, 10.00s/it]

0.15433073


 82%|████████▏ | 210/255 [30:36<08:20, 11.13s/it]

0.3693343


 83%|████████▎ | 211/255 [30:45<07:47, 10.64s/it]

0.18541919


 83%|████████▎ | 212/255 [30:54<07:17, 10.17s/it]

0.39755762


 84%|████████▎ | 213/255 [31:04<06:59,  9.99s/it]

0.53872


 84%|████████▍ | 214/255 [31:14<06:49,  9.99s/it]

0.1369649


 84%|████████▍ | 215/255 [31:23<06:33,  9.85s/it]

0.20849663


 85%|████████▍ | 216/255 [31:33<06:18,  9.70s/it]

0.6054611


 85%|████████▌ | 217/255 [31:42<06:07,  9.66s/it]

0.21047626


 85%|████████▌ | 218/255 [31:52<05:53,  9.54s/it]

0.31312373


 86%|████████▌ | 219/255 [32:03<06:03, 10.09s/it]

0.17494075


 86%|████████▋ | 220/255 [32:12<05:42,  9.79s/it]

0.042441156


 87%|████████▋ | 221/255 [32:21<05:27,  9.63s/it]

0.19025344


 87%|████████▋ | 222/255 [32:31<05:19,  9.68s/it]

0.23826477


 87%|████████▋ | 223/255 [32:41<05:08,  9.65s/it]

0.029941197


 88%|████████▊ | 224/255 [32:56<05:52, 11.37s/it]

0.15030463


 88%|████████▊ | 225/255 [33:10<06:00, 12.02s/it]

0.17209937


 89%|████████▊ | 226/255 [33:20<05:31, 11.44s/it]

0.06364027


 89%|████████▉ | 227/255 [33:30<05:12, 11.15s/it]

0.07348136


 89%|████████▉ | 228/255 [33:41<04:58, 11.05s/it]

0.037116207


 90%|████████▉ | 229/255 [33:51<04:35, 10.59s/it]

0.07531964


 90%|█████████ | 230/255 [34:00<04:17, 10.29s/it]

0.5573047


 91%|█████████ | 231/255 [34:10<04:03, 10.14s/it]

0.2358937


 91%|█████████ | 232/255 [34:23<04:12, 10.98s/it]

0.34298295


 91%|█████████▏| 233/255 [34:33<03:53, 10.61s/it]

0.09881649


 92%|█████████▏| 234/255 [34:45<03:55, 11.20s/it]

0.3286533


 92%|█████████▏| 235/255 [34:57<03:45, 11.27s/it]

0.052871242


 93%|█████████▎| 236/255 [35:07<03:27, 10.90s/it]

0.089543425


 93%|█████████▎| 237/255 [35:16<03:09, 10.52s/it]

0.21023452


 93%|█████████▎| 238/255 [35:27<03:01, 10.69s/it]

0.09395315


 94%|█████████▎| 239/255 [35:37<02:47, 10.45s/it]

0.18657325


 94%|█████████▍| 240/255 [35:48<02:38, 10.54s/it]

0.33063418


 95%|█████████▍| 241/255 [35:59<02:31, 10.81s/it]

0.05679129


 95%|█████████▍| 242/255 [36:10<02:18, 10.62s/it]

0.32252598


 95%|█████████▌| 243/255 [36:21<02:10, 10.86s/it]

0.09154127


 96%|█████████▌| 244/255 [36:31<01:57, 10.66s/it]

0.41052458


 96%|█████████▌| 245/255 [36:41<01:45, 10.52s/it]

0.16979352


 96%|█████████▋| 246/255 [36:51<01:32, 10.29s/it]

0.09790621


 97%|█████████▋| 247/255 [37:01<01:21, 10.19s/it]

0.030182108


 97%|█████████▋| 248/255 [37:11<01:09,  9.98s/it]

0.1591816


 98%|█████████▊| 249/255 [37:21<01:00, 10.10s/it]

0.25882754


 98%|█████████▊| 250/255 [37:32<00:51, 10.24s/it]

0.19858308


 98%|█████████▊| 251/255 [37:44<00:44, 11.02s/it]

0.109184474


 99%|█████████▉| 252/255 [37:56<00:33, 11.04s/it]

0.1664424


 99%|█████████▉| 253/255 [38:09<00:23, 11.79s/it]

0.048023906


100%|█████████▉| 254/255 [38:19<00:11, 11.36s/it]

0.3917519


100%|██████████| 255/255 [38:35<00:00,  9.08s/it]


0.22162144


100%|██████████| 29/29 [00:08<00:00,  3.31it/s]


epoch=1: train_ppl=Tensor(shape=[], dtype=Float32, value= 1.23573) train_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.211661) eval_ppl=Tensor(shape=[], dtype=Float32, value= 1.11088) eval_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.105148)


  0%|          | 1/255 [00:15<1:07:16, 15.89s/it]

0.32756853


  1%|          | 2/255 [00:31<1:05:27, 15.53s/it]

0.083720595


  1%|          | 3/255 [00:42<56:35, 13.47s/it]  

0.060217764


  2%|▏         | 4/255 [00:58<1:00:45, 14.53s/it]

0.5156134


  2%|▏         | 5/255 [01:09<55:00, 13.20s/it]  

0.2190853


  2%|▏         | 6/255 [01:20<51:47, 12.48s/it]

0.22850242


  3%|▎         | 7/255 [01:31<49:15, 11.92s/it]

0.050655186


  3%|▎         | 8/255 [01:41<46:34, 11.32s/it]

0.0229787


  4%|▎         | 9/255 [01:52<46:34, 11.36s/it]

0.22029267


  4%|▍         | 10/255 [02:05<48:32, 11.89s/it]

0.11717817


  4%|▍         | 11/255 [02:35<1:10:33, 17.35s/it]

0.057595603


  5%|▍         | 12/255 [03:10<1:32:46, 22.91s/it]

0.081306174


  5%|▌         | 13/255 [03:42<1:43:05, 25.56s/it]

0.20257366


  5%|▌         | 14/255 [04:15<1:51:29, 27.76s/it]

0.23548469


  6%|▌         | 15/255 [04:46<1:55:03, 28.76s/it]

0.21681859


  6%|▋         | 16/255 [05:16<1:55:48, 29.07s/it]

0.21584596


  7%|▋         | 17/255 [05:44<1:54:14, 28.80s/it]

0.14363334


  7%|▋         | 18/255 [06:11<1:52:02, 28.37s/it]

0.14515434


  7%|▋         | 19/255 [06:37<1:48:24, 27.56s/it]

0.121316515


  8%|▊         | 20/255 [07:03<1:46:09, 27.10s/it]

0.4385505


  8%|▊         | 21/255 [07:27<1:42:20, 26.24s/it]

0.03817708


  9%|▊         | 22/255 [07:50<1:37:45, 25.18s/it]

0.17660926


  9%|▉         | 23/255 [08:13<1:35:09, 24.61s/it]

0.24680346


  9%|▉         | 24/255 [08:35<1:31:26, 23.75s/it]

0.28792995


 10%|▉         | 25/255 [08:57<1:28:34, 23.10s/it]

0.09244192


 10%|█         | 26/255 [09:18<1:25:51, 22.50s/it]

0.22077245


 11%|█         | 27/255 [09:39<1:24:24, 22.21s/it]

0.121072516


 11%|█         | 28/255 [09:58<1:20:33, 21.29s/it]

0.1114651


 11%|█▏        | 29/255 [10:17<1:16:57, 20.43s/it]

0.040612057


 12%|█▏        | 30/255 [10:35<1:14:37, 19.90s/it]

0.075557195


 12%|█▏        | 31/255 [10:53<1:11:12, 19.07s/it]

0.07344599


 13%|█▎        | 32/255 [11:10<1:08:30, 18.43s/it]

0.08540277


 13%|█▎        | 33/255 [11:28<1:07:39, 18.29s/it]

0.09482465


 13%|█▎        | 34/255 [11:46<1:07:16, 18.26s/it]

0.2416042


 14%|█▎        | 35/255 [12:02<1:04:40, 17.64s/it]

0.18722263


 14%|█▍        | 36/255 [12:15<59:50, 16.39s/it]  

0.35690224


 15%|█▍        | 37/255 [12:32<59:36, 16.41s/it]

0.2148284


 15%|█▍        | 38/255 [12:46<56:22, 15.59s/it]

0.1028485


 15%|█▌        | 39/255 [12:58<52:57, 14.71s/it]

0.10753679


 16%|█▌        | 40/255 [13:10<49:23, 13.78s/it]

0.02442336


 16%|█▌        | 41/255 [13:21<46:05, 12.92s/it]

0.066152476


 16%|█▋        | 42/255 [13:32<43:39, 12.30s/it]

0.15492792


 17%|█▋        | 43/255 [13:46<45:48, 12.96s/it]

0.13052802


 17%|█▋        | 44/255 [13:57<43:53, 12.48s/it]

0.44576752


 18%|█▊        | 45/255 [14:08<41:16, 11.79s/it]

0.09049489


 18%|█▊        | 46/255 [14:18<39:12, 11.25s/it]

0.12269181


 18%|█▊        | 47/255 [14:28<37:44, 10.88s/it]

0.15380643


 19%|█▉        | 48/255 [14:40<39:11, 11.36s/it]

0.10007963


 19%|█▉        | 49/255 [14:53<40:37, 11.83s/it]

0.11233043


 20%|█▉        | 50/255 [15:07<42:32, 12.45s/it]

0.025141865


 20%|██        | 51/255 [15:20<43:08, 12.69s/it]

0.25848198


 20%|██        | 52/255 [15:31<40:58, 12.11s/it]

0.29582682


 21%|██        | 53/255 [15:43<40:51, 12.14s/it]

0.7518352


 21%|██        | 54/255 [15:54<39:22, 11.75s/it]

0.281024


 22%|██▏       | 55/255 [16:05<38:21, 11.51s/it]

0.118673354


 22%|██▏       | 56/255 [16:18<39:33, 11.92s/it]

0.28945184


 22%|██▏       | 57/255 [16:32<41:58, 12.72s/it]

0.16244054


 23%|██▎       | 58/255 [16:43<39:14, 11.95s/it]

0.3331752


 23%|██▎       | 59/255 [16:53<37:34, 11.50s/it]

0.26040834


 24%|██▎       | 60/255 [17:04<36:26, 11.21s/it]

0.1687582


 24%|██▍       | 61/255 [17:17<38:16, 11.84s/it]

0.26444954


 24%|██▍       | 62/255 [17:30<39:04, 12.15s/it]

0.020357102


 25%|██▍       | 63/255 [17:43<39:33, 12.36s/it]

0.31206372


 25%|██▌       | 64/255 [17:55<39:01, 12.26s/it]

0.04858182


 25%|██▌       | 65/255 [18:09<40:49, 12.89s/it]

0.15793133


 26%|██▌       | 66/255 [18:21<39:36, 12.57s/it]

0.16175255


 26%|██▋       | 67/255 [18:33<38:57, 12.43s/it]

0.31917197


 27%|██▋       | 68/255 [18:46<39:13, 12.59s/it]

0.3666736


 27%|██▋       | 69/255 [18:57<37:26, 12.08s/it]

0.096041515


 27%|██▋       | 70/255 [19:08<36:13, 11.75s/it]

0.19436632


 28%|██▊       | 71/255 [19:19<36:02, 11.75s/it]

0.19964017


 28%|██▊       | 72/255 [19:30<34:53, 11.44s/it]

0.11010233


 29%|██▊       | 73/255 [19:42<34:35, 11.41s/it]

0.11152078


 29%|██▉       | 74/255 [19:52<33:49, 11.21s/it]

0.12823777


 29%|██▉       | 75/255 [20:04<33:50, 11.28s/it]

0.13999958


 30%|██▉       | 76/255 [20:16<34:20, 11.51s/it]

0.24693646


 30%|███       | 77/255 [20:27<33:46, 11.38s/it]

0.05718585


 31%|███       | 78/255 [20:38<33:08, 11.23s/it]

0.12356518


 31%|███       | 79/255 [20:49<32:34, 11.11s/it]

0.42219204


 31%|███▏      | 80/255 [20:59<32:14, 11.05s/it]

0.057992663


 32%|███▏      | 81/255 [21:11<32:52, 11.34s/it]

0.040994108


 32%|███▏      | 82/255 [21:24<33:47, 11.72s/it]

0.17018811


 33%|███▎      | 83/255 [21:35<32:54, 11.48s/it]

0.15786587


 33%|███▎      | 84/255 [21:47<33:06, 11.61s/it]

0.19256534


 33%|███▎      | 85/255 [22:00<33:58, 11.99s/it]

0.22046873


 34%|███▎      | 86/255 [22:14<35:48, 12.72s/it]

0.12557048


 34%|███▍      | 87/255 [22:27<35:54, 12.82s/it]

0.14170998


 35%|███▍      | 88/255 [22:47<41:30, 14.91s/it]

0.07613068


 35%|███▍      | 89/255 [23:10<48:10, 17.41s/it]

0.29373774


 35%|███▌      | 90/255 [23:33<52:35, 19.12s/it]

0.640005


 36%|███▌      | 91/255 [23:51<50:56, 18.64s/it]

0.45115638


 36%|███▌      | 92/255 [24:09<50:03, 18.43s/it]

0.05115743


 36%|███▋      | 93/255 [24:29<50:51, 18.83s/it]

0.04764858


 37%|███▋      | 94/255 [24:49<51:52, 19.33s/it]

0.25404584


 37%|███▋      | 95/255 [25:08<51:20, 19.25s/it]

0.1370298


 38%|███▊      | 96/255 [25:26<49:50, 18.81s/it]

0.14929546


 38%|███▊      | 97/255 [25:50<53:23, 20.28s/it]

0.19355503


 38%|███▊      | 98/255 [26:12<54:57, 21.01s/it]

0.23062174


 39%|███▉      | 99/255 [26:37<57:07, 21.97s/it]

0.3142772


 39%|███▉      | 100/255 [27:00<57:49, 22.39s/it]

0.29767075


 40%|███▉      | 101/255 [27:24<58:26, 22.77s/it]

0.16377464


 40%|████      | 102/255 [27:46<57:27, 22.53s/it]

0.055517744


 40%|████      | 103/255 [28:06<55:11, 21.79s/it]

0.105274394


 41%|████      | 104/255 [28:26<53:40, 21.33s/it]

0.23801558


 41%|████      | 105/255 [28:50<55:34, 22.23s/it]

0.30993643


 42%|████▏     | 106/255 [29:10<53:36, 21.58s/it]

0.23167785


 42%|████▏     | 107/255 [29:31<52:31, 21.29s/it]

0.10739602


 42%|████▏     | 108/255 [29:53<53:03, 21.66s/it]

0.07548724


 43%|████▎     | 109/255 [30:14<51:45, 21.27s/it]

0.41864607


 43%|████▎     | 110/255 [30:35<51:02, 21.12s/it]

0.15526603


 44%|████▎     | 111/255 [30:56<51:01, 21.26s/it]

0.47033942


 44%|████▍     | 112/255 [31:17<50:26, 21.16s/it]

0.21195039


 44%|████▍     | 113/255 [31:41<52:17, 22.09s/it]

0.022649921


 45%|████▍     | 114/255 [32:05<52:55, 22.52s/it]

0.11882053


 45%|████▌     | 115/255 [32:28<52:37, 22.55s/it]

0.055152074


 45%|████▌     | 116/255 [32:50<52:30, 22.66s/it]

0.29770094


 46%|████▌     | 117/255 [33:15<53:39, 23.33s/it]

0.116798356


 46%|████▋     | 118/255 [33:39<53:12, 23.30s/it]

0.025259435


 47%|████▋     | 119/255 [34:02<53:07, 23.43s/it]

0.16808036


 47%|████▋     | 120/255 [34:27<53:38, 23.84s/it]

0.09086424


 47%|████▋     | 121/255 [34:53<54:36, 24.45s/it]

0.22388265


 48%|████▊     | 122/255 [35:18<54:52, 24.75s/it]

0.070568725


 48%|████▊     | 123/255 [35:45<55:19, 25.15s/it]

0.09051169


 49%|████▊     | 124/255 [36:09<54:16, 24.86s/it]

0.23879582


 49%|████▉     | 125/255 [36:31<52:24, 24.19s/it]

0.33277968


 49%|████▉     | 126/255 [36:57<52:40, 24.50s/it]

0.1325586


 50%|████▉     | 127/255 [37:20<51:24, 24.10s/it]

0.14700939


 50%|█████     | 128/255 [37:43<50:36, 23.91s/it]

0.13906749


 51%|█████     | 129/255 [38:08<51:00, 24.29s/it]

0.19984446


 51%|█████     | 130/255 [38:33<51:05, 24.52s/it]

0.11441482


 51%|█████▏    | 131/255 [38:58<50:30, 24.44s/it]

0.14085235


 52%|█████▏    | 132/255 [39:21<49:09, 23.98s/it]

0.13312542


 52%|█████▏    | 133/255 [39:46<49:26, 24.32s/it]

0.27689978


 53%|█████▎    | 134/255 [40:10<48:54, 24.26s/it]

0.34638467


 53%|█████▎    | 135/255 [40:34<48:30, 24.25s/it]

0.07733887


 53%|█████▎    | 136/255 [40:58<47:44, 24.07s/it]

0.017833484


 54%|█████▎    | 137/255 [41:22<47:23, 24.10s/it]

0.114322715


 54%|█████▍    | 138/255 [41:47<47:22, 24.29s/it]

0.11053624


 55%|█████▍    | 139/255 [42:10<46:26, 24.02s/it]

0.26679063


 55%|█████▍    | 140/255 [42:36<47:16, 24.66s/it]

0.14418791


 55%|█████▌    | 141/255 [43:01<46:44, 24.60s/it]

0.032518562


 56%|█████▌    | 142/255 [43:24<45:38, 24.23s/it]

0.2167029


 56%|█████▌    | 143/255 [43:52<47:32, 25.47s/it]

0.0098304115


 56%|█████▋    | 144/255 [44:17<46:42, 25.25s/it]

0.23285276


 57%|█████▋    | 145/255 [44:44<47:10, 25.73s/it]

0.2881688


 57%|█████▋    | 146/255 [45:08<45:50, 25.23s/it]

0.40171912


 58%|█████▊    | 147/255 [45:35<46:28, 25.81s/it]

0.10234189


 58%|█████▊    | 148/255 [46:01<45:48, 25.68s/it]

0.35071903


 58%|█████▊    | 149/255 [46:27<45:38, 25.84s/it]

0.09868529


 59%|█████▉    | 150/255 [46:50<43:47, 25.02s/it]

0.18146607


 59%|█████▉    | 151/255 [47:17<44:21, 25.59s/it]

0.10073301


 60%|█████▉    | 152/255 [47:40<42:51, 24.96s/it]

0.05254625


 60%|██████    | 153/255 [48:05<42:19, 24.90s/it]

0.06914249


 60%|██████    | 154/255 [48:30<41:56, 24.91s/it]

0.21000446


 61%|██████    | 155/255 [48:58<42:52, 25.72s/it]

0.11379433


 61%|██████    | 156/255 [49:23<42:23, 25.69s/it]

0.08763605


 62%|██████▏   | 157/255 [49:50<42:20, 25.92s/it]

0.028112836


 62%|██████▏   | 158/255 [50:19<43:21, 26.82s/it]

0.0477219


 62%|██████▏   | 159/255 [50:47<43:43, 27.32s/it]

0.09100976


 63%|██████▎   | 160/255 [51:09<40:47, 25.76s/it]

0.012736158


 63%|██████▎   | 161/255 [51:32<38:55, 24.84s/it]

0.15442102


 64%|██████▎   | 162/255 [51:59<39:27, 25.46s/it]

0.16240163


 64%|██████▍   | 163/255 [52:22<38:08, 24.88s/it]

0.023252249


 64%|██████▍   | 164/255 [52:48<38:11, 25.18s/it]

0.43391317


 65%|██████▍   | 165/255 [53:14<38:07, 25.42s/it]

0.30731714


 65%|██████▌   | 166/255 [53:38<37:11, 25.07s/it]

0.051141754


 65%|██████▌   | 167/255 [54:06<38:00, 25.92s/it]

0.10144857


 66%|██████▌   | 168/255 [54:31<36:56, 25.47s/it]

0.098143525


 66%|██████▋   | 169/255 [54:56<36:13, 25.27s/it]

0.03507766


 67%|██████▋   | 170/255 [55:23<36:52, 26.03s/it]

0.07355963


 67%|██████▋   | 171/255 [55:48<35:53, 25.64s/it]

0.13330558


 67%|██████▋   | 172/255 [56:14<35:25, 25.61s/it]

0.35415673


 68%|██████▊   | 173/255 [56:42<36:15, 26.53s/it]

0.09888607


 68%|██████▊   | 174/255 [57:09<36:05, 26.73s/it]

0.12819253


 69%|██████▊   | 175/255 [57:35<35:15, 26.44s/it]

0.1174138


 69%|██████▉   | 176/255 [58:01<34:25, 26.15s/it]

0.2494984


 69%|██████▉   | 177/255 [58:26<33:46, 25.98s/it]

0.2810112


 70%|██████▉   | 178/255 [58:53<33:36, 26.18s/it]

0.12114623


 70%|███████   | 179/255 [59:18<32:43, 25.83s/it]

0.06063629


 71%|███████   | 180/255 [59:44<32:14, 25.79s/it]

0.02509306


 71%|███████   | 181/255 [1:00:09<31:45, 25.76s/it]

0.2343358


 71%|███████▏  | 182/255 [1:00:40<33:00, 27.14s/it]

0.42405927


 72%|███████▏  | 183/255 [1:01:09<33:22, 27.81s/it]

0.17738137


 72%|███████▏  | 184/255 [1:01:37<32:52, 27.78s/it]

0.08127167


 73%|███████▎  | 185/255 [1:02:05<32:41, 28.02s/it]

0.040056445


 73%|███████▎  | 186/255 [1:02:33<32:16, 28.06s/it]

0.19284193


 73%|███████▎  | 187/255 [1:03:00<31:06, 27.45s/it]

0.24835153


 74%|███████▎  | 188/255 [1:03:25<30:08, 26.99s/it]

0.19287923


 74%|███████▍  | 189/255 [1:03:54<30:13, 27.48s/it]

0.10208553


 75%|███████▍  | 190/255 [1:04:27<31:34, 29.14s/it]

0.13998064


 75%|███████▍  | 191/255 [1:04:58<31:38, 29.67s/it]

0.12223384


 75%|███████▌  | 192/255 [1:05:34<33:14, 31.65s/it]

0.13791142


 76%|███████▌  | 193/255 [1:06:10<33:51, 32.76s/it]

0.18214561


 76%|███████▌  | 194/255 [1:06:45<34:03, 33.49s/it]

0.10424531


 76%|███████▋  | 195/255 [1:07:20<33:55, 33.92s/it]

0.09172992


 77%|███████▋  | 196/255 [1:07:58<34:35, 35.18s/it]

0.061531965


 77%|███████▋  | 197/255 [1:08:34<34:25, 35.62s/it]

0.08243234


 78%|███████▊  | 198/255 [1:09:12<34:16, 36.09s/it]

0.02325054


 78%|███████▊  | 199/255 [1:09:48<33:41, 36.10s/it]

0.27303433


 78%|███████▊  | 200/255 [1:10:26<33:34, 36.62s/it]

0.08804244


 79%|███████▉  | 201/255 [1:11:01<32:43, 36.37s/it]

0.093682446


 79%|███████▉  | 202/255 [1:11:41<32:59, 37.34s/it]

0.33628583


 80%|███████▉  | 203/255 [1:12:19<32:28, 37.47s/it]

0.21884233


 80%|████████  | 204/255 [1:12:50<30:17, 35.63s/it]

0.083245784


 80%|████████  | 205/255 [1:13:16<27:08, 32.57s/it]

0.020513264


 81%|████████  | 206/255 [1:13:42<25:02, 30.66s/it]

0.15183064


 81%|████████  | 207/255 [1:14:07<23:16, 29.10s/it]

0.14827964


 82%|████████▏ | 208/255 [1:14:36<22:50, 29.15s/it]

0.12063143


 82%|████████▏ | 209/255 [1:15:08<22:47, 29.74s/it]

0.16277699


 82%|████████▏ | 210/255 [1:15:38<22:22, 29.83s/it]

0.22844172


 83%|████████▎ | 211/255 [1:16:05<21:15, 28.99s/it]

0.13240458


 83%|████████▎ | 212/255 [1:16:32<20:27, 28.54s/it]

0.17533106


 84%|████████▎ | 213/255 [1:17:00<19:44, 28.20s/it]

0.22613326


 84%|████████▍ | 214/255 [1:17:27<19:02, 27.86s/it]

0.36462027


 84%|████████▍ | 215/255 [1:17:54<18:25, 27.65s/it]

0.22712103


 85%|████████▍ | 216/255 [1:18:22<18:01, 27.74s/it]

0.24523327


 85%|████████▌ | 217/255 [1:18:48<17:21, 27.41s/it]

0.15430813


 85%|████████▌ | 218/255 [1:19:16<16:56, 27.47s/it]

0.443055


 86%|████████▌ | 219/255 [1:19:43<16:28, 27.46s/it]

0.16238669


 86%|████████▋ | 220/255 [1:20:13<16:26, 28.20s/it]

0.065958686


 87%|████████▋ | 221/255 [1:20:41<15:51, 27.97s/it]

0.13593161


 87%|████████▋ | 222/255 [1:21:10<15:33, 28.28s/it]

0.18711123


 87%|████████▋ | 223/255 [1:21:39<15:09, 28.44s/it]

0.0146395145


 88%|████████▊ | 224/255 [1:22:06<14:34, 28.23s/it]

0.15551159


 88%|████████▊ | 225/255 [1:22:34<13:58, 27.94s/it]

0.050466627


 89%|████████▊ | 226/255 [1:23:05<13:59, 28.95s/it]

0.07013804


 89%|████████▉ | 227/255 [1:23:34<13:35, 29.14s/it]

0.008022743


 89%|████████▉ | 228/255 [1:24:02<12:55, 28.71s/it]

0.016037162


 90%|████████▉ | 229/255 [1:24:32<12:34, 29.02s/it]

0.08009793


 90%|█████████ | 230/255 [1:25:05<12:34, 30.19s/it]

0.33063042


 91%|█████████ | 231/255 [1:25:32<11:45, 29.38s/it]

0.068970636


 91%|█████████ | 232/255 [1:25:58<10:48, 28.21s/it]

0.049122944


 91%|█████████▏| 233/255 [1:26:25<10:11, 27.81s/it]

0.0077002635


 92%|█████████▏| 234/255 [1:26:52<09:39, 27.57s/it]

0.33554482


 92%|█████████▏| 235/255 [1:27:18<09:05, 27.25s/it]

0.04881631


 93%|█████████▎| 236/255 [1:27:46<08:39, 27.35s/it]

0.112840705


 93%|█████████▎| 237/255 [1:28:13<08:12, 27.34s/it]

0.13069607


 93%|█████████▎| 238/255 [1:28:42<07:52, 27.78s/it]

0.089409925


 94%|█████████▎| 239/255 [1:29:15<07:48, 29.29s/it]

0.074642174


 94%|█████████▍| 240/255 [1:29:43<07:13, 28.89s/it]

0.38446975


 95%|█████████▍| 241/255 [1:30:11<06:44, 28.86s/it]

0.036783095


 95%|█████████▍| 242/255 [1:30:42<06:21, 29.33s/it]

0.10374917


 95%|█████████▌| 243/255 [1:31:17<06:12, 31.02s/it]

0.077827275


 96%|█████████▌| 244/255 [1:31:49<05:45, 31.45s/it]

0.17768806


 96%|█████████▌| 245/255 [1:32:18<05:05, 30.56s/it]

0.13795292


 96%|█████████▋| 246/255 [1:32:48<04:33, 30.34s/it]

0.124199614


 97%|█████████▋| 247/255 [1:33:17<04:00, 30.02s/it]

0.034733333


 97%|█████████▋| 248/255 [1:33:51<03:38, 31.23s/it]

0.096602775


 98%|█████████▊| 249/255 [1:34:23<03:08, 31.37s/it]

0.18136421


 98%|█████████▊| 250/255 [1:34:52<02:33, 30.64s/it]

0.030921368


 98%|█████████▊| 251/255 [1:35:23<02:03, 30.85s/it]

0.17054006


 99%|█████████▉| 252/255 [1:35:52<01:30, 30.25s/it]

0.11271315


 99%|█████████▉| 253/255 [1:36:22<01:00, 30.09s/it]

0.14836618


100%|█████████▉| 254/255 [1:36:52<00:30, 30.19s/it]

0.093315564


100%|██████████| 255/255 [1:37:21<00:00, 22.91s/it]


0.13158391


100%|██████████| 29/29 [00:09<00:00,  2.91it/s]

epoch=2: train_ppl=Tensor(shape=[], dtype=Float32, value= 1.17866) train_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.164376) eval_ppl=Tensor(shape=[], dtype=Float32, value= 1.10954) eval_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.10395)





In [17]:
# print accuracy
correct = 0
total = 0

ground_truth = []

for pred, data in zip(eval_preds, validation_dataset.create_dict_iterator(output_numpy=True)):
    true = str(data['text_label'])
    ground_truth.append(true)
    if pred.strip() == true.strip():
        correct += 1
    total += 1
accuracy = correct / total * 100
print(f"{accuracy=} % on the evaluation dataset")
print(f"{eval_preds[:10]=}")
print(f"{ground_truth[:10]=}")

accuracy=83.6283185840708 % on the evaluation dataset
eval_preds[:10]=['neutral', 'neutral', 'neutral', 'neutral', 'positive', 'neutral', 'neutral', 'positive', ' positive', 'positive']
ground_truth[:10]=['neutral', 'neutral', 'neutral', 'neutral', 'positive', 'neutral', 'neutral', 'positive', 'positive', 'positive']
