## Multitask prompt tuning using Phi-2

In [1]:
import os
import torch
import numpy as np
import torch.nn.functional as F
from tqdm.notebook import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "microsoft/phi-2"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")

tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
from peft import (
    MultitaskPromptTuningConfig,
    MultitaskPromptTuningInit,
    PeftModel,
    TaskType,
)

initial_instruction = (
    "Read the following question, then choose the correction answer."
)

peft_config = MultitaskPromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    num_tasks=2,
    prompt_tuning_init=MultitaskPromptTuningInit.TEXT,
    num_virtual_tokens=8,
    prompt_tuning_init_text=initial_instruction,
    num_transformer_submodules=1,
    tokenizer_name_or_path=model_id,
)

model = None
saved_model1 = None
saved_model2 = None

try:
    sentences = ["Read the following sentence, then determine whether you return to the starting point.\n\nIf you follow these instructions, do you return to the starting point? Take 9 steps. Take 9 steps. Take 4 steps. Turn right.\nOptions:\n- Yes\n- No\n\nAnswer:\n"]
    inputs = tokenizer(sentences, return_tensors="pt", padding=True).to(device)

    model = AutoModelForCausalLM.from_pretrained(model_id)
    model.to(device)
    generate_ids = model.generate(**inputs, max_length=500)
    outputs = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    #print(outputs[0])

    print("Using saved model from data/models/" + model_id)
    saved_model1 = PeftModel.from_pretrained(model, "data/models/" + model_id + "/model1")
    saved_model2 = PeftModel.from_pretrained(model, "data/models/" + model_id + "/model2")
    saved_model1.to(device)
    saved_model2.to(device)
    task_ids = [0 for i in inputs["input_ids"]]
    task_ids = torch.tensor(task_ids).to(device)
    generate_ids1 = saved_model1.generate(**inputs, max_length=500, task_ids=task_ids)
    task_ids = [1 for i in inputs["input_ids"]]
    task_ids = torch.tensor(task_ids).to(device)
    generate_ids2 = saved_model2.generate(**inputs, max_length=500, task_ids=task_ids)
    outputs1 = tokenizer.batch_decode(generate_ids1, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    outputs2 = tokenizer.batch_decode(generate_ids2, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    #print(outputs1[0])
    #print(outputs2[0])
except ValueError:
    print("Model not found, training new model")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [2]:
def preprocess_function(examples, tokenizer, prefix, text_column, label_column, max_length):
    batch_size = len(examples[text_column])
    inputs = [f"{prefix}{x}\n\nAnswer:\n" for x in examples[text_column]]
    targets = [str(x) for x in examples[label_column]]
    
    model_inputs = tokenizer(inputs, padding='max_length', truncation=True, max_length=max_length)
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, padding='max_length', truncation=True, max_length=max_length)

    # Replace padding tokens in the labels with -100
    labels["input_ids"] = [[-100 if token == tokenizer.pad_token_id else token for token in label] for label in labels["input_ids"]]

    task_ids = [0 for i in labels["input_ids"]]
    task_ids = torch.tensor(task_ids)

    model_inputs["labels"] = labels["input_ids"]
    model_inputs["task_ids"] = task_ids
    return model_inputs

In [4]:
def logprobs_for_classes(output_logits, classes):
    logits = [0 for _ in range(len(classes))]
    for i, target in enumerate(classes):
        expanded_classes = [target] + [f" {target}"] + [f"{target.lower()}"] + [f" {target.lower()}"]
        encoded_classes = [tokenizer.encode(c, return_tensors="pt", padding=True).to(device) for c in expanded_classes]
        for token in encoded_classes:
            logits[i] += output_logits[token]
    return F.log_softmax(torch.tensor(logits), dim=0)

In [5]:
def exact_match_loss(outputs, labels):     
    target_texts = [tokenizer.decode([tok for tok in target if tok != -100], skip_special_tokens=True) for target in labels]
    targets = list(set(target_texts))
    generated_texts = [targets[np.argmax(logprobs_for_classes(out[-1], targets))] for out in outputs.logits]        

    losses = []
    for generated_text, target_text in zip(generated_texts, target_texts):
        generated_tokens = generated_text.split()
        target_tokens = target_text.split()
        loss = sum(generated_token != target_token for generated_token, target_token in zip(generated_tokens, target_tokens))
        losses.append(loss)

    loss_tensor = torch.tensor(losses, dtype=torch.float32)
    total_loss = torch.mean(loss_tensor)
    return total_loss, generated_texts

In [9]:
def test(dataloader, model1, model2, tokenizer, device, exact_match=True):
    total_loss = 0
    test_preds = []
    for batch in tqdm(dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            output1 = model1(**batch, output_hidden_states=True)
        inputs_embeds = output1.hidden_states[-1]
        sequence_length = inputs_embeds.shape[1]
        labels = batch['labels']
        attention_mask = torch.ones(inputs_embeds.shape[:2], device=device)
        padding = torch.full((labels.shape[0], sequence_length - labels.shape[1]), -100, dtype=labels.dtype, device=labels.device)
        labels = torch.cat([padding, labels], dim=1).to(device)
        task_ids = torch.tensor([1 for i in batch["task_ids"]]).to(device)
        output2 = model2(inputs_embeds=inputs_embeds, labels=labels, task_ids=task_ids, attention_mask=attention_mask, output_hidden_states=True)
        
        loss, preds = exact_match_loss(output2, batch["labels"]) if exact_match else (output2.loss, [])
        total_loss += loss.detach().float()
        test_preds.extend(preds)

    total_loss = total_loss / len(dataloader)
    return total_loss, test_preds

In [6]:
import os
from dln.dataset import init_dataset
from datasets import Dataset, DatasetDict

def load_dln_dataset_to_hf_dataset(dataset_id):
    """Some gynmastics to load the dln dataset into a HuggingFace Dataset.
    dln.dataset should implement an interface compatible with HuggingFace"""

    dln_dataset = init_dataset(
        dataset_id=dataset_id,
        seed=42,
        data_dir=os.path.dirname(os.getcwd()) + "/../data",
    )

    def load_split(split):
        text_data, label_data = dln_dataset.get_data(split)
        data_dict = {"text": text_data, "label": label_data}
        dataset = Dataset.from_dict(data_dict, split=split)
        return dataset

    # Combine the datasets into a DatasetDict
    dataset_dict = DatasetDict(
        {
            "train": load_split("train"),
            "dev": load_split("dev"),
            "test": load_split("test"),
        }
    )
    return dataset_dict

In [10]:
from peft import (
    MultitaskPromptTuningConfig,
    MultitaskPromptTuningInit,
    TaskType,
    get_peft_model,
)
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    default_data_collator,
    get_linear_schedule_with_warmup,
)
import torch.nn as nn
import torch.distributed as dist
from torch.utils.data import Subset

device = "cuda" if torch.cuda.is_available() else "cpu"

model_name_or_path = "microsoft/phi-2"
tokenizer_name_or_path = "microsoft/phi-2"

dataset_id = "navigate"
initial_instruction = (
    "Read the following question, then choose the correct answer."
)
text_column = "text"
label_column = "label"
max_length = 128
lr = 3e-2
num_epochs = 50
batch_size = 8

peft_config = MultitaskPromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    num_tasks=2,
    prompt_tuning_init=MultitaskPromptTuningInit.TEXT,
    num_virtual_tokens=8,
    prompt_tuning_init_text=initial_instruction,
    num_transformer_submodules=1,
    tokenizer_name_or_path=model_name_or_path,
)

dataset = load_dln_dataset_to_hf_dataset(dataset_id)

classes = list(set(dataset["train"]["label"]))

tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, device_map="auto", padding_side='left')
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
target_max_length = max(
    [len(tokenizer(class_label)["input_ids"]) for class_label in classes]
)
print(target_max_length)

processed_datasets = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
    fn_kwargs={
        "tokenizer": tokenizer,
        "prefix": '',
        "text_column": text_column,
        "label_column": label_column,
        "max_length": max_length,
    },
)

train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["dev"]
test_dataset = processed_datasets["test"]

train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=batch_size,
    pin_memory=True,
)
eval_dataloader = DataLoader(
    eval_dataset,
    collate_fn=default_data_collator,
    batch_size=batch_size,
    pin_memory=True,
)
test_dataloader = DataLoader(
    test_dataset,
    collate_fn=default_data_collator,
    batch_size=batch_size,
    pin_memory=True,
)

global model
if saved_model1 is None or saved_model2 is None:
    if model is None:
        model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
    model.config.pad_token_id = model.config.eos_token_id
    model1 = get_peft_model(model, peft_config)
    model2 = get_peft_model(model, peft_config)
else:
    model1 = saved_model1
    model2 = saved_model2
    print("Using saved model from data/models/" + model_name_or_path)
    
optimizer1 = torch.optim.AdamW(model1.parameters(), lr=lr)
optimizer2 = torch.optim.AdamW(model2.parameters(), lr=lr)
lr_scheduler1 = get_linear_schedule_with_warmup(
    optimizer=optimizer1,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),

)
lr_scheduler2 = get_linear_schedule_with_warmup(
    optimizer=optimizer2,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)

model1 = model1.to(device)
model2 = model2.to(device)

model1.eval()
model2.eval()

init_test_loss1, test_preds1 = test(test_dataloader, model1, model2, tokenizer, device)
init_test_loss2, test_preds2 = test(test_dataloader, model1, model2, tokenizer, device)
init_test_ppl1 = torch.exp(init_test_loss1)  # Perplexity
init_test_ppl2 = torch.exp(init_test_loss2)  # Perplexity
print(f"Test before training1: {init_test_ppl1=} {init_test_loss1=}")
print(f"Test before training2: {init_test_ppl2=} {init_test_loss2=}")

for epoch in range(num_epochs):
    model1.train()
    model2.train()
    total_loss1 = 0
    total_loss2 = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        output1 = model1(**batch, output_hidden_states=True)
        # print("1=" + tokenizer.batch_decode(torch.argmax(output1.logits, dim=-1)[-1])[-1])
        
        inputs_embeds = output1.hidden_states[-1]
        sequence_length = inputs_embeds.shape[1]
        labels = batch['labels']
        attention_mask = torch.ones(inputs_embeds.shape[:2], device=device)
        padding = torch.full((labels.shape[0], sequence_length - labels.shape[1]), -100, dtype=labels.dtype, device=labels.device)
        labels = torch.cat([padding, labels], dim=1).to(device)
        task_ids = torch.tensor([1 for i in batch["task_ids"]]).to(device)
        output2 = model2(inputs_embeds=inputs_embeds, labels=labels, task_ids=task_ids, attention_mask=attention_mask, output_hidden_states=True)
        # print("2=" + tokenizer.batch_decode(torch.argmax(output2.logits, dim=-1)[-1])[-1])

        loss1 = output1.loss
        loss2 = output2.loss
        # print (f"loss: {loss1.item()=}, {loss2.item()=}")
        total_loss1 += loss1.item()
        total_loss2 += loss2.item()
        optimizer1.zero_grad()
        optimizer2.zero_grad()
        loss1.backward(retain_graph=True)
        loss2.backward()
        optimizer1.step()
        optimizer2.step()
        lr_scheduler1.step()
        lr_scheduler2.step()

    model1.eval()
    model2.eval()
    eval_epoch_loss1, eval_preds1 = test(eval_dataloader, model1, model2, tokenizer, device, False)
    eval_epoch_loss2, eval_preds2 = test(eval_dataloader, model1, model2, tokenizer, device, False)
    eval_ppl1 = torch.exp(eval_epoch_loss1)
    eval_ppl2 = torch.exp(eval_epoch_loss2)
    train_epoch_loss1 = total_loss1 / len(train_dataloader)
    train_epoch_loss2 = total_loss2 / len(train_dataloader)
    train_ppl1 = torch.exp(torch.tensor(train_epoch_loss1))
    train_ppl2 = torch.exp(torch.tensor(train_epoch_loss2))
    print(
        f"{epoch=}: {train_ppl1=} {train_epoch_loss1=} {eval_ppl1=} {eval_epoch_loss1=}"
    )
    print(
        f"{epoch=}: {train_ppl2=} {train_epoch_loss2=} {eval_ppl2=} {eval_epoch_loss2=}"
    )

model1.eval()
model2.eval()
if not saved_model1:
    model1.save_pretrained("data/models/" + model_name_or_path + "/model1")
if not saved_model2:
    model2.save_pretrained("data/models/" + model_name_or_path + "/model2")

final_test_loss1, test_preds1 = test(test_dataloader, model1, model2, tokenizer, device)
final_test_loss2, test_preds2 = test(test_dataloader, model1, model2, tokenizer, device)
final_test_ppl1 = torch.exp(final_test_loss1)
final_test_ppl2 = torch.exp(final_test_loss2)
print(f"Test before training1: {init_test_ppl1=} {init_test_loss1=}")
print(f"Test before training2: {init_test_ppl2=} {init_test_loss2=}")
print(f"Test after training1: {final_test_ppl1=} {final_test_loss1=}")
print(f"Test after training2: {final_test_ppl2=} {final_test_loss2=}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


loaded dataset from /home/chsingh/deep-language-networks/projects/../data/bbh ...
we have 375 training, 375 dev, and 250 test data points.
1


Running tokenizer on dataset:   0%|          | 0/375 [00:00<?, ? examples/s]



Running tokenizer on dataset:   0%|          | 0/375 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/250 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████| 32/32 [00:24<00:00,  1.30it/s]
100%|██████████| 32/32 [00:24<00:00,  1.29it/s]


Test before training1: init_test_ppl1=tensor(1.5130) init_test_loss1=tensor(0.4141)
Test before training2: init_test_ppl2=tensor(1.5130) init_test_loss2=tensor(0.4141)


100%|██████████| 47/47 [01:33<00:00,  1.98s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=0: train_ppl1=tensor(61.4932) train_epoch_loss1=4.118926109151637 eval_ppl1=tensor(2.3009, device='cuda:0') eval_epoch_loss1=tensor(0.8333, device='cuda:0')
epoch=0: train_ppl2=tensor(6.9223) train_epoch_loss2=1.9347492824209498 eval_ppl2=tensor(2.3009, device='cuda:0') eval_epoch_loss2=tensor(0.8333, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=1: train_ppl1=tensor(2.1528) train_epoch_loss1=0.7667878523786017 eval_ppl1=tensor(2.0596, device='cuda:0') eval_epoch_loss1=tensor(0.7225, device='cuda:0')
epoch=1: train_ppl2=tensor(2.2656) train_epoch_loss2=0.8178455943756915 eval_ppl2=tensor(2.0596, device='cuda:0') eval_epoch_loss2=tensor(0.7225, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=2: train_ppl1=tensor(2.0217) train_epoch_loss1=0.7039550219444518 eval_ppl1=tensor(2.0083, device='cuda:0') eval_epoch_loss1=tensor(0.6973, device='cuda:0')
epoch=2: train_ppl2=tensor(2.1115) train_epoch_loss2=0.7473819610920358 eval_ppl2=tensor(2.0083, device='cuda:0') eval_epoch_loss2=tensor(0.6973, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=3: train_ppl1=tensor(1.9681) train_epoch_loss1=0.6770760493075594 eval_ppl1=tensor(2.1537, device='cuda:0') eval_epoch_loss1=tensor(0.7672, device='cuda:0')
epoch=3: train_ppl2=tensor(2.0613) train_epoch_loss2=0.7233324875222876 eval_ppl2=tensor(2.1537, device='cuda:0') eval_epoch_loss2=tensor(0.7672, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=4: train_ppl1=tensor(1.9851) train_epoch_loss1=0.6856874937706805 eval_ppl1=tensor(1.9317, device='cuda:0') eval_epoch_loss1=tensor(0.6584, device='cuda:0')
epoch=4: train_ppl2=tensor(2.0836) train_epoch_loss2=0.7340930281801427 eval_ppl2=tensor(1.9317, device='cuda:0') eval_epoch_loss2=tensor(0.6584, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=5: train_ppl1=tensor(1.9839) train_epoch_loss1=0.6850424833754276 eval_ppl1=tensor(1.9393, device='cuda:0') eval_epoch_loss1=tensor(0.6623, device='cuda:0')
epoch=5: train_ppl2=tensor(2.0896) train_epoch_loss2=0.7369965416319827 eval_ppl2=tensor(1.9393, device='cuda:0') eval_epoch_loss2=tensor(0.6623, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=6: train_ppl1=tensor(1.8865) train_epoch_loss1=0.634727843898408 eval_ppl1=tensor(2.2600, device='cuda:0') eval_epoch_loss1=tensor(0.8154, device='cuda:0')
epoch=6: train_ppl2=tensor(2.0496) train_epoch_loss2=0.7176343697182676 eval_ppl2=tensor(2.2600, device='cuda:0') eval_epoch_loss2=tensor(0.8154, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=7: train_ppl1=tensor(1.8684) train_epoch_loss1=0.6250752819345352 eval_ppl1=tensor(1.8284, device='cuda:0') eval_epoch_loss1=tensor(0.6034, device='cuda:0')
epoch=7: train_ppl2=tensor(1.9851) train_epoch_loss2=0.6856651794403157 eval_ppl2=tensor(1.8284, device='cuda:0') eval_epoch_loss2=tensor(0.6034, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=8: train_ppl1=tensor(1.8115) train_epoch_loss1=0.5941795129725274 eval_ppl1=tensor(2.0138, device='cuda:0') eval_epoch_loss1=tensor(0.7000, device='cuda:0')
epoch=8: train_ppl2=tensor(1.9621) train_epoch_loss2=0.6740230905248764 eval_ppl2=tensor(2.0138, device='cuda:0') eval_epoch_loss2=tensor(0.7000, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=9: train_ppl1=tensor(1.8344) train_epoch_loss1=0.6067064423510369 eval_ppl1=tensor(1.7610, device='cuda:0') eval_epoch_loss1=tensor(0.5659, device='cuda:0')
epoch=9: train_ppl2=tensor(1.9641) train_epoch_loss2=0.6750155509786403 eval_ppl2=tensor(1.7610, device='cuda:0') eval_epoch_loss2=tensor(0.5659, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=10: train_ppl1=tensor(1.7520) train_epoch_loss1=0.560776171532083 eval_ppl1=tensor(1.6775, device='cuda:0') eval_epoch_loss1=tensor(0.5173, device='cuda:0')
epoch=10: train_ppl2=tensor(1.8872) train_epoch_loss2=0.6351126178782037 eval_ppl2=tensor(1.6775, device='cuda:0') eval_epoch_loss2=tensor(0.5173, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=11: train_ppl1=tensor(1.7816) train_epoch_loss1=0.5775342479031137 eval_ppl1=tensor(1.6919, device='cuda:0') eval_epoch_loss1=tensor(0.5259, device='cuda:0')
epoch=11: train_ppl2=tensor(1.8824) train_epoch_loss2=0.6325623164785669 eval_ppl2=tensor(1.6919, device='cuda:0') eval_epoch_loss2=tensor(0.5259, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=12: train_ppl1=tensor(1.6795) train_epoch_loss1=0.5184765863925853 eval_ppl1=tensor(1.8898, device='cuda:0') eval_epoch_loss1=tensor(0.6365, device='cuda:0')
epoch=12: train_ppl2=tensor(1.7593) train_epoch_loss2=0.5648921322315297 eval_ppl2=tensor(1.8898, device='cuda:0') eval_epoch_loss2=tensor(0.6365, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=13: train_ppl1=tensor(1.7697) train_epoch_loss1=0.5707907537196545 eval_ppl1=tensor(1.7035, device='cuda:0') eval_epoch_loss1=tensor(0.5327, device='cuda:0')
epoch=13: train_ppl2=tensor(1.8536) train_epoch_loss2=0.6171531150949762 eval_ppl2=tensor(1.7035, device='cuda:0') eval_epoch_loss2=tensor(0.5327, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=14: train_ppl1=tensor(1.6630) train_epoch_loss1=0.5086020503906493 eval_ppl1=tensor(1.7169, device='cuda:0') eval_epoch_loss1=tensor(0.5405, device='cuda:0')
epoch=14: train_ppl2=tensor(1.7206) train_epoch_loss2=0.5426517277956009 eval_ppl2=tensor(1.7169, device='cuda:0') eval_epoch_loss2=tensor(0.5405, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=15: train_ppl1=tensor(1.6886) train_epoch_loss1=0.5239019977285507 eval_ppl1=tensor(1.9444, device='cuda:0') eval_epoch_loss1=tensor(0.6650, device='cuda:0')
epoch=15: train_ppl2=tensor(1.7554) train_epoch_loss2=0.562713030170887 eval_ppl2=tensor(1.9444, device='cuda:0') eval_epoch_loss2=tensor(0.6650, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=16: train_ppl1=tensor(1.6773) train_epoch_loss1=0.5171648029317247 eval_ppl1=tensor(1.7441, device='cuda:0') eval_epoch_loss1=tensor(0.5562, device='cuda:0')
epoch=16: train_ppl2=tensor(1.7110) train_epoch_loss2=0.5370709442711891 eval_ppl2=tensor(1.7441, device='cuda:0') eval_epoch_loss2=tensor(0.5562, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=17: train_ppl1=tensor(1.6421) train_epoch_loss1=0.4959743663351587 eval_ppl1=tensor(2.6210, device='cuda:0') eval_epoch_loss1=tensor(0.9636, device='cuda:0')
epoch=17: train_ppl2=tensor(1.7115) train_epoch_loss2=0.5373579485619322 eval_ppl2=tensor(2.6210, device='cuda:0') eval_epoch_loss2=tensor(0.9636, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=18: train_ppl1=tensor(1.7066) train_epoch_loss1=0.5345122514570013 eval_ppl1=tensor(1.7479, device='cuda:0') eval_epoch_loss1=tensor(0.5584, device='cuda:0')
epoch=18: train_ppl2=tensor(1.8010) train_epoch_loss2=0.5883679757726953 eval_ppl2=tensor(1.7479, device='cuda:0') eval_epoch_loss2=tensor(0.5584, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=19: train_ppl1=tensor(1.6250) train_epoch_loss1=0.48551370552245604 eval_ppl1=tensor(1.5999, device='cuda:0') eval_epoch_loss1=tensor(0.4700, device='cuda:0')
epoch=19: train_ppl2=tensor(1.7548) train_epoch_loss2=0.5623709220201412 eval_ppl2=tensor(1.5999, device='cuda:0') eval_epoch_loss2=tensor(0.4700, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=20: train_ppl1=tensor(1.6019) train_epoch_loss1=0.4712043214351573 eval_ppl1=tensor(1.6188, device='cuda:0') eval_epoch_loss1=tensor(0.4817, device='cuda:0')
epoch=20: train_ppl2=tensor(1.7277) train_epoch_loss2=0.5468009386924987 eval_ppl2=tensor(1.6188, device='cuda:0') eval_epoch_loss2=tensor(0.4817, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=21: train_ppl1=tensor(1.6210) train_epoch_loss1=0.48305965896616593 eval_ppl1=tensor(1.6006, device='cuda:0') eval_epoch_loss1=tensor(0.4704, device='cuda:0')
epoch=21: train_ppl2=tensor(1.7111) train_epoch_loss2=0.5371512339470235 eval_ppl2=tensor(1.6006, device='cuda:0') eval_epoch_loss2=tensor(0.4704, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=22: train_ppl1=tensor(1.5999) train_epoch_loss1=0.46992340049845105 eval_ppl1=tensor(1.9137, device='cuda:0') eval_epoch_loss1=tensor(0.6491, device='cuda:0')
epoch=22: train_ppl2=tensor(1.6312) train_epoch_loss2=0.4892870247998136 eval_ppl2=tensor(1.9137, device='cuda:0') eval_epoch_loss2=tensor(0.6491, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=23: train_ppl1=tensor(1.5881) train_epoch_loss1=0.46252176482626733 eval_ppl1=tensor(1.7886, device='cuda:0') eval_epoch_loss1=tensor(0.5815, device='cuda:0')
epoch=23: train_ppl2=tensor(1.6538) train_epoch_loss2=0.5030808921190019 eval_ppl2=tensor(1.7886, device='cuda:0') eval_epoch_loss2=tensor(0.5815, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=24: train_ppl1=tensor(1.6402) train_epoch_loss1=0.4948237199098506 eval_ppl1=tensor(1.6999, device='cuda:0') eval_epoch_loss1=tensor(0.5306, device='cuda:0')
epoch=24: train_ppl2=tensor(1.6639) train_epoch_loss2=0.5091466916368362 eval_ppl2=tensor(1.6999, device='cuda:0') eval_epoch_loss2=tensor(0.5306, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=25: train_ppl1=tensor(1.5790) train_epoch_loss1=0.4567798807265911 eval_ppl1=tensor(1.9751, device='cuda:0') eval_epoch_loss1=tensor(0.6806, device='cuda:0')
epoch=25: train_ppl2=tensor(1.6197) train_epoch_loss2=0.48225450642565465 eval_ppl2=tensor(1.9751, device='cuda:0') eval_epoch_loss2=tensor(0.6806, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=26: train_ppl1=tensor(1.5824) train_epoch_loss1=0.4589606020995911 eval_ppl1=tensor(1.7248, device='cuda:0') eval_epoch_loss1=tensor(0.5451, device='cuda:0')
epoch=26: train_ppl2=tensor(1.6091) train_epoch_loss2=0.47567588503056385 eval_ppl2=tensor(1.7248, device='cuda:0') eval_epoch_loss2=tensor(0.5451, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=27: train_ppl1=tensor(1.5623) train_epoch_loss1=0.44619008930439646 eval_ppl1=tensor(2.0165, device='cuda:0') eval_epoch_loss1=tensor(0.7014, device='cuda:0')
epoch=27: train_ppl2=tensor(1.5634) train_epoch_loss2=0.4468906360103729 eval_ppl2=tensor(2.0165, device='cuda:0') eval_epoch_loss2=tensor(0.7014, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=28: train_ppl1=tensor(1.5474) train_epoch_loss1=0.4366070802541489 eval_ppl1=tensor(1.6099, device='cuda:0') eval_epoch_loss1=tensor(0.4762, device='cuda:0')
epoch=28: train_ppl2=tensor(1.5983) train_epoch_loss2=0.4689374753135316 eval_ppl2=tensor(1.6099, device='cuda:0') eval_epoch_loss2=tensor(0.4762, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=29: train_ppl1=tensor(1.5286) train_epoch_loss1=0.42434035305013046 eval_ppl1=tensor(1.7674, device='cuda:0') eval_epoch_loss1=tensor(0.5695, device='cuda:0')
epoch=29: train_ppl2=tensor(1.5894) train_epoch_loss2=0.46336897668686317 eval_ppl2=tensor(1.7674, device='cuda:0') eval_epoch_loss2=tensor(0.5695, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=30: train_ppl1=tensor(1.5053) train_epoch_loss1=0.40899323545237803 eval_ppl1=tensor(1.6623, device='cuda:0') eval_epoch_loss1=tensor(0.5082, device='cuda:0')
epoch=30: train_ppl2=tensor(1.5402) train_epoch_loss2=0.4319391263292191 eval_ppl2=tensor(1.6623, device='cuda:0') eval_epoch_loss2=tensor(0.5082, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  1.99s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=31: train_ppl1=tensor(1.5222) train_epoch_loss1=0.4201381304796706 eval_ppl1=tensor(1.5705, device='cuda:0') eval_epoch_loss1=tensor(0.4514, device='cuda:0')
epoch=31: train_ppl2=tensor(1.5458) train_epoch_loss2=0.4355688751377958 eval_ppl2=tensor(1.5705, device='cuda:0') eval_epoch_loss2=tensor(0.4514, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=32: train_ppl1=tensor(1.5075) train_epoch_loss1=0.4104286311788762 eval_ppl1=tensor(1.9854, device='cuda:0') eval_epoch_loss1=tensor(0.6858, device='cuda:0')
epoch=32: train_ppl2=tensor(1.5554) train_epoch_loss2=0.4417516639892091 eval_ppl2=tensor(1.9854, device='cuda:0') eval_epoch_loss2=tensor(0.6858, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=33: train_ppl1=tensor(1.5313) train_epoch_loss1=0.4261247380933863 eval_ppl1=tensor(1.5749, device='cuda:0') eval_epoch_loss1=tensor(0.4542, device='cuda:0')
epoch=33: train_ppl2=tensor(1.5210) train_epoch_loss2=0.41933769241292423 eval_ppl2=tensor(1.5749, device='cuda:0') eval_epoch_loss2=tensor(0.4542, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=34: train_ppl1=tensor(1.5081) train_epoch_loss1=0.4108523907179528 eval_ppl1=tensor(1.5412, device='cuda:0') eval_epoch_loss1=tensor(0.4326, device='cuda:0')
epoch=34: train_ppl2=tensor(1.5505) train_epoch_loss2=0.4385620095628373 eval_ppl2=tensor(1.5412, device='cuda:0') eval_epoch_loss2=tensor(0.4326, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=35: train_ppl1=tensor(1.5242) train_epoch_loss1=0.4214511640528415 eval_ppl1=tensor(1.7297, device='cuda:0') eval_epoch_loss1=tensor(0.5480, device='cuda:0')
epoch=35: train_ppl2=tensor(1.5589) train_epoch_loss2=0.4439897500771157 eval_ppl2=tensor(1.7297, device='cuda:0') eval_epoch_loss2=tensor(0.5480, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=36: train_ppl1=tensor(1.5115) train_epoch_loss1=0.4131002827210629 eval_ppl1=tensor(1.5829, device='cuda:0') eval_epoch_loss1=tensor(0.4592, device='cuda:0')
epoch=36: train_ppl2=tensor(1.5318) train_epoch_loss2=0.4264619263245704 eval_ppl2=tensor(1.5829, device='cuda:0') eval_epoch_loss2=tensor(0.4592, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=37: train_ppl1=tensor(1.5015) train_epoch_loss1=0.4064508151817829 eval_ppl1=tensor(1.6733, device='cuda:0') eval_epoch_loss1=tensor(0.5148, device='cuda:0')
epoch=37: train_ppl2=tensor(1.5406) train_epoch_loss2=0.43219514000923076 eval_ppl2=tensor(1.6733, device='cuda:0') eval_epoch_loss2=tensor(0.5148, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=38: train_ppl1=tensor(1.4219) train_epoch_loss1=0.3519665732028637 eval_ppl1=tensor(1.7687, device='cuda:0') eval_epoch_loss1=tensor(0.5702, device='cuda:0')
epoch=38: train_ppl2=tensor(1.4550) train_epoch_loss2=0.3750063607350309 eval_ppl2=tensor(1.7687, device='cuda:0') eval_epoch_loss2=tensor(0.5702, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=39: train_ppl1=tensor(1.4624) train_epoch_loss1=0.3800735170061284 eval_ppl1=tensor(1.8275, device='cuda:0') eval_epoch_loss1=tensor(0.6030, device='cuda:0')
epoch=39: train_ppl2=tensor(1.4745) train_epoch_loss2=0.3883132368643233 eval_ppl2=tensor(1.8275, device='cuda:0') eval_epoch_loss2=tensor(0.6030, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=40: train_ppl1=tensor(1.4157) train_epoch_loss1=0.3476271967780083 eval_ppl1=tensor(2.0295, device='cuda:0') eval_epoch_loss1=tensor(0.7078, device='cuda:0')
epoch=40: train_ppl2=tensor(1.4462) train_epoch_loss2=0.36894043796557063 eval_ppl2=tensor(2.0295, device='cuda:0') eval_epoch_loss2=tensor(0.7078, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=41: train_ppl1=tensor(1.4234) train_epoch_loss1=0.35302260486369436 eval_ppl1=tensor(1.6415, device='cuda:0') eval_epoch_loss1=tensor(0.4956, device='cuda:0')
epoch=41: train_ppl2=tensor(1.4987) train_epoch_loss2=0.40462351780622563 eval_ppl2=tensor(1.6415, device='cuda:0') eval_epoch_loss2=tensor(0.4956, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=42: train_ppl1=tensor(1.4463) train_epoch_loss1=0.36899209196897265 eval_ppl1=tensor(1.7576, device='cuda:0') eval_epoch_loss1=tensor(0.5639, device='cuda:0')
epoch=42: train_ppl2=tensor(1.4590) train_epoch_loss2=0.3777786429892195 eval_ppl2=tensor(1.7576, device='cuda:0') eval_epoch_loss2=tensor(0.5639, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=43: train_ppl1=tensor(1.4128) train_epoch_loss1=0.3455392846718748 eval_ppl1=tensor(1.6727, device='cuda:0') eval_epoch_loss1=tensor(0.5144, device='cuda:0')
epoch=43: train_ppl2=tensor(1.4148) train_epoch_loss2=0.34700676640297506 eval_ppl2=tensor(1.6727, device='cuda:0') eval_epoch_loss2=tensor(0.5144, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=44: train_ppl1=tensor(1.4375) train_epoch_loss1=0.3629383714275157 eval_ppl1=tensor(1.5776, device='cuda:0') eval_epoch_loss1=tensor(0.4559, device='cuda:0')
epoch=44: train_ppl2=tensor(1.4326) train_epoch_loss2=0.35952148095090336 eval_ppl2=tensor(1.5776, device='cuda:0') eval_epoch_loss2=tensor(0.4559, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=45: train_ppl1=tensor(1.4154) train_epoch_loss1=0.34741641977365983 eval_ppl1=tensor(1.6622, device='cuda:0') eval_epoch_loss1=tensor(0.5081, device='cuda:0')
epoch=45: train_ppl2=tensor(1.4590) train_epoch_loss2=0.3777644537547801 eval_ppl2=tensor(1.6622, device='cuda:0') eval_epoch_loss2=tensor(0.5081, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=46: train_ppl1=tensor(1.3604) train_epoch_loss1=0.30776884636663376 eval_ppl1=tensor(1.5937, device='cuda:0') eval_epoch_loss1=tensor(0.4661, device='cuda:0')
epoch=46: train_ppl2=tensor(1.4006) train_epoch_loss2=0.33687676917365256 eval_ppl2=tensor(1.5937, device='cuda:0') eval_epoch_loss2=tensor(0.4661, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=47: train_ppl1=tensor(1.3780) train_epoch_loss1=0.32066789824277797 eval_ppl1=tensor(1.6601, device='cuda:0') eval_epoch_loss1=tensor(0.5069, device='cuda:0')
epoch=47: train_ppl2=tensor(1.4167) train_epoch_loss2=0.3483059081308385 eval_ppl2=tensor(1.6601, device='cuda:0') eval_epoch_loss2=tensor(0.5069, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=48: train_ppl1=tensor(1.3953) train_epoch_loss1=0.3330843042027443 eval_ppl1=tensor(1.5945, device='cuda:0') eval_epoch_loss1=tensor(0.4665, device='cuda:0')
epoch=48: train_ppl2=tensor(1.4254) train_epoch_loss2=0.35447386065696146 eval_ppl2=tensor(1.5945, device='cuda:0') eval_epoch_loss2=tensor(0.4665, device='cuda:0')


100%|██████████| 47/47 [01:33<00:00,  2.00s/it]
100%|██████████| 47/47 [00:37<00:00,  1.27it/s]
100%|██████████| 47/47 [00:37<00:00,  1.26it/s]


epoch=49: train_ppl1=tensor(1.3848) train_epoch_loss1=0.3255284591082563 eval_ppl1=tensor(1.5999, device='cuda:0') eval_epoch_loss1=tensor(0.4699, device='cuda:0')
epoch=49: train_ppl2=tensor(1.4135) train_epoch_loss2=0.3460675330555185 eval_ppl2=tensor(1.5999, device='cuda:0') eval_epoch_loss2=tensor(0.4699, device='cuda:0')


100%|██████████| 32/32 [00:25<00:00,  1.25it/s]
100%|██████████| 32/32 [00:25<00:00,  1.25it/s]

Test before training1: init_test_ppl1=tensor(1.5130) init_test_loss1=tensor(0.4141)
Test before training2: init_test_ppl2=tensor(1.5130) init_test_loss2=tensor(0.4141)
Test after training1: final_test_ppl1=tensor(1.2252) final_test_loss1=tensor(0.2031)
Test after training2: final_test_ppl2=tensor(1.2252) final_test_loss2=tensor(0.2031)





In [11]:
correct = 0
total = 0
for pred, label in zip(test_preds1,  dataset['test']['label']):
    if pred.strip() == label.strip():
        correct += 1
    total += 1
accuracy = correct / total * 100

print(f"{accuracy=}% on the test dataset")
print(f"{test_preds1[:10]=}")
print(f"{dataset['test']['label'][:10]=}")

"accuracy=79.2% on the test dataset"
"test_preds[:10]=['Yes', 'No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes']"
"dataset['test']['label'][:10]=['No', 'No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'No']"

accuracy=79.2% on the test dataset
test_preds1[:10]=['Yes', 'No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes']
dataset['test']['label'][:10]=['No', 'No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'No']


"dataset['test']['label'][:10]=['No', 'No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'No']"

In [12]:
correct = 0
total = 0
for pred, label in zip(test_preds2,  dataset['test']['label']):
    if pred.strip() == label.strip():
        correct += 1
    total += 1
accuracy = correct / total * 100

print(f"{accuracy=}% on the test dataset")
print(f"{test_preds2[:10]=}")
print(f"{dataset['test']['label'][:10]=}")

"accuracy=79.2% on the test dataset"
"test_preds[:10]=['Yes', 'No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes']"
"dataset['test']['label'][:10]=['No', 'No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'No']"

accuracy=79.2% on the test dataset
test_preds2[:10]=['Yes', 'No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes']
dataset['test']['label'][:10]=['No', 'No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'No']


"dataset['test']['label'][:10]=['No', 'No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'No']"