## Soft-Prompts using Phi-2

In [1]:
import os
import torch
from tqdm.notebook import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "microsoft/phi-2"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")

from accelerate import Accelerator
accelerator = Accelerator()

tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
from peft import (
    PromptTuningConfig,
    PromptTuningInit,
    PeftConfig,
    PeftModel,
    TaskType,
    get_peft_model,
)

initial_instruction = (
    "Read the following question, then choose the correction answer."
)

peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    prompt_tuning_init=PromptTuningInit.TEXT,
    num_virtual_tokens=8,
    prompt_tuning_init_text=initial_instruction,
    tokenizer_name_or_path=model_id,
)

model = None
saved_model = None

try:
    sentences = ["Read the following sentence, then determine whether you return to the starting point.\n\nIf you follow these instructions, do you return to the starting point? Take 9 steps. Take 9 steps. Take 4 steps. Turn right.\nOptions:\n- Yes\n- No\n\nAnswer:\n"]
    inputs = tokenizer(sentences, return_tensors="pt", padding=True).to(device)

    model = AutoModelForCausalLM.from_pretrained(model_id)
    model.to(device)
    generate_ids = model.generate(**inputs, max_length=500)
    outputs = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    # print(outputs[0])

    print("Using saved model from data/models/" + model_id)
    saved_model = PeftModel.from_pretrained(model, "data/models/" + model_id)
    saved_model.to(device)
    generate_ids = saved_model.generate(**inputs, max_length=500)
    outputs = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    # print(outputs[0])
except ValueError:
    print("Model not found, training new model")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Using saved model from data/models/microsoft/phi-2
Model not found, training new model


In [3]:
def preprocess_function(examples, tokenizer, prefix, text_column, label_column, max_length):
    batch_size = len(examples[text_column])
    inputs = [f"{prefix}{x}\n\nAnswer:\n" for x in examples[text_column]]
    targets = [str(x) for x in examples[label_column]]
    
    model_inputs = tokenizer(inputs, padding='max_length', truncation=True, max_length=max_length)
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, padding='max_length', truncation=True, max_length=max_length)

    # Replace padding tokens in the labels with -100
    labels["input_ids"] = [[-100 if token == tokenizer.pad_token_id else token for token in label] for label in labels["input_ids"]]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [4]:
def exact_match_loss(generated_texts, target_texts, model):
    losses = []
    for generated_text, target_text in zip(generated_texts, target_texts):
        counts = torch.tensor(list(map(lambda x: generated_text.count(x), target_texts)))
        generated_tokens = [target_texts[torch.argmax(counts)]]
        target_tokens = target_text.split()
        loss = sum(generated_token != target_token for generated_token, target_token in zip(generated_tokens, target_tokens))
        losses.append(loss)

    loss_tensor = torch.tensor(losses, dtype=torch.float32)
    total_loss = torch.mean(loss_tensor)
    return total_loss

In [5]:
def test(dataloader, model, tokenizer, device, exact_match=True):
    total_loss = 0
    preds = []
    for batch in tqdm(dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model.generate(**batch, max_length=500, num_return_sequences=1, return_dict_in_generate=True, pad_token_id=tokenizer.pad_token_id) if exact_match else model(**batch)
        
        if exact_match:
            generated_texts = [tokenizer.decode(out, skip_special_tokens=True) for out in outputs.sequences]        
            target_texts_decoded = [tokenizer.decode([tok for tok in target if tok != -100], skip_special_tokens=True) for target in batch["labels"]]

        loss = exact_match_loss(generated_texts, target_texts_decoded, model) if exact_match else outputs.loss
        total_loss += loss.detach().float()
        labels = torch.where(batch['labels'] != -100, batch['labels'], tokenizer.pad_token_id)

    total_loss = total_loss / len(dataloader)
    return total_loss

In [6]:
import os
from dln.dataset import init_dataset
from datasets import Dataset, DatasetDict

def load_dln_dataset_to_hf_dataset(dataset_id):
    """Some gynmastics to load the dln dataset into a HuggingFace Dataset.
    dln.dataset should implement an interface compatible with HuggingFace"""

    dln_dataset = init_dataset(
        dataset_id=dataset_id,
        seed=42,
        data_dir=os.path.dirname(os.getcwd()) + "/../data",
    )

    def load_split(split):
        text_data, label_data = dln_dataset.get_data(split)
        data_dict = {"text": text_data, "label": label_data}
        dataset = Dataset.from_dict(data_dict, split=split)
        return dataset

    # Combine the datasets into a DatasetDict
    dataset_dict = DatasetDict(
        {
            "train": load_split("train"),
            "dev": load_split("dev"),
            "test": load_split("test"),
        }
    )
    return dataset_dict

In [7]:
accelerator = Accelerator()
from peft import (
    PromptTuningConfig,
    PromptTuningInit,
    PeftConfig,
    PeftModel,
    TaskType,
    get_peft_model,
)
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    default_data_collator,
    get_linear_schedule_with_warmup,
)
import torch.nn as nn
import torch.distributed as dist
from torch.utils.data import Subset

def main():
    device = "cuda" if torch.cuda.is_available() else "cpu"

    model_name_or_path = "microsoft/phi-2"
    tokenizer_name_or_path = "microsoft/phi-2"

    dataset_id = "navigate"
    initial_instruction = (
        "Read the following question, then choose the correct answer."
    )
    text_column = "text"
    label_column = "label"
    max_length = 128
    lr = 3e-2
    num_epochs = 1 if saved_model else 50
    batch_size = 4

    peft_config = PromptTuningConfig(
        task_type=TaskType.CAUSAL_LM,
        prompt_tuning_init=PromptTuningInit.TEXT,
        num_virtual_tokens=8,
        prompt_tuning_init_text=initial_instruction,
        tokenizer_name_or_path=model_name_or_path,
    )

    dataset = load_dln_dataset_to_hf_dataset(dataset_id)

    classes = list(set(dataset["train"]["label"]))

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, device_map="auto", padding_side='left')
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
    target_max_length = max(
        [len(tokenizer(class_label)["input_ids"]) for class_label in classes]
    )
    print(target_max_length)

    processed_datasets = dataset.map(
        preprocess_function,
        batched=True,
        num_proc=1,
        remove_columns=dataset["train"].column_names,
        load_from_cache_file=False,
        desc="Running tokenizer on dataset",
        fn_kwargs={
            "tokenizer": tokenizer,
            "prefix": '',
            "text_column": text_column,
            "label_column": label_column,
            "max_length": max_length,
        },
    )
    
    train_dataset = processed_datasets["train"]
    eval_dataset = processed_datasets["dev"]
    test_dataset = processed_datasets["test"]
    
    train_dataloader = DataLoader(
        train_dataset,
        shuffle=True,
        collate_fn=default_data_collator,
        batch_size=batch_size,
        pin_memory=True,
    )
    eval_dataloader = DataLoader(
        eval_dataset,
        collate_fn=default_data_collator,
        batch_size=batch_size,
        pin_memory=True,
    )
    test_dataloader = DataLoader(
        test_dataset,
        collate_fn=default_data_collator,
        batch_size=batch_size,
        pin_memory=True,
    )

    global model
    if saved_model is None:
        if model is None:
            model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
        model.config.pad_token_id = model.config.eos_token_id
        model = get_peft_model(model, peft_config)
    else:
        model = saved_model
        print("Using saved model from data/models/" + model_name_or_path)
        
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=(len(train_dataloader) * num_epochs),
    )

    model = model.to(device)

    # Send everything through `accelerator.prepare`
    train_loader, eval_loader, test_loader, model, optimizer = accelerator.prepare(
        train_dataloader, eval_dataloader, test_dataloader, model, optimizer
    )

    model.eval()
    init_test_loss = test(test_dataloader, model, tokenizer, device)
    init_test_ppl = torch.exp(init_test_loss)  # Perplexity
    print(f"Test before training: {init_test_ppl=} {init_test_loss=}")

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for step, batch in enumerate(tqdm(train_dataloader)):
            batch = {k: v.to(device) for k, v in batch.items()}
            output = model(**batch)

            loss = output.loss
            total_loss += loss.item()
            optimizer.zero_grad()
            accelerator.backward(loss)
            optimizer.step()
            lr_scheduler.step()

        model.eval()
        eval_epoch_loss = test(eval_dataloader, model, tokenizer, device, False)
        eval_ppl = torch.exp(eval_epoch_loss)
        train_epoch_loss = total_loss / len(train_dataloader)
        train_ppl = torch.exp(torch.tensor(train_epoch_loss))
        print(
            f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}"
        )

    model.eval()
    if not saved_model:
        model.save_pretrained("data/models/" + model_name_or_path)

    final_test_loss = test(test_dataloader, model, tokenizer, device)
    final_test_ppl = torch.exp(final_test_loss)
    print(f"Test before training: {init_test_ppl=} {init_test_loss=}")
    print(f"Test after training: {final_test_ppl=} {final_test_loss=}")

main()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


loaded dataset from /home/chsingh/deep-language-networks/projects/../data/bbh ...
we have 375 training, 375 dev, and 250 test data points.
1


Running tokenizer on dataset:   0%|          | 0/375 [00:00<?, ? examples/s]



Running tokenizer on dataset:   0%|          | 0/375 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/250 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████| 63/63 [23:31<00:00, 22.41s/it]


Test before training: init_test_ppl=tensor(1.3254) init_test_loss=tensor(0.2817)


100%|██████████| 94/94 [03:23<00:00,  2.17s/it]
100%|██████████| 94/94 [00:23<00:00,  4.07it/s]


epoch=0: train_ppl=tensor(3.4634) train_epoch_loss=1.2422481737238296 eval_ppl=tensor(2.0785, device='cuda:0') eval_epoch_loss=tensor(0.7316, device='cuda:0')


100%|██████████| 94/94 [03:23<00:00,  2.17s/it]
100%|██████████| 94/94 [00:22<00:00,  4.11it/s]


epoch=1: train_ppl=tensor(2.0540) train_epoch_loss=0.7197701747113086 eval_ppl=tensor(2.0028, device='cuda:0') eval_epoch_loss=tensor(0.6946, device='cuda:0')


100%|██████████| 94/94 [03:24<00:00,  2.17s/it]
100%|██████████| 94/94 [00:23<00:00,  3.99it/s]


epoch=2: train_ppl=tensor(1.9932) train_epoch_loss=0.6897197210408271 eval_ppl=tensor(1.9949, device='cuda:0') eval_epoch_loss=tensor(0.6906, device='cuda:0')


100%|██████████| 94/94 [03:20<00:00,  2.14s/it]
100%|██████████| 94/94 [00:23<00:00,  4.08it/s]


epoch=3: train_ppl=tensor(1.9678) train_epoch_loss=0.6769361958858815 eval_ppl=tensor(1.8805, device='cuda:0') eval_epoch_loss=tensor(0.6316, device='cuda:0')


100%|██████████| 94/94 [03:21<00:00,  2.14s/it]
100%|██████████| 94/94 [00:23<00:00,  4.08it/s]


epoch=4: train_ppl=tensor(1.8797) train_epoch_loss=0.6311072302942581 eval_ppl=tensor(2.1796, device='cuda:0') eval_epoch_loss=tensor(0.7791, device='cuda:0')


100%|██████████| 94/94 [03:25<00:00,  2.18s/it]
100%|██████████| 94/94 [00:23<00:00,  3.93it/s]


epoch=5: train_ppl=tensor(1.8750) train_epoch_loss=0.6286059113893103 eval_ppl=tensor(1.7695, device='cuda:0') eval_epoch_loss=tensor(0.5707, device='cuda:0')


100%|██████████| 94/94 [03:24<00:00,  2.18s/it]
100%|██████████| 94/94 [00:23<00:00,  4.01it/s]


epoch=6: train_ppl=tensor(1.7533) train_epoch_loss=0.561500424400289 eval_ppl=tensor(1.6815, device='cuda:0') eval_epoch_loss=tensor(0.5197, device='cuda:0')


100%|██████████| 94/94 [03:25<00:00,  2.19s/it]
100%|██████████| 94/94 [00:23<00:00,  4.01it/s]


epoch=7: train_ppl=tensor(1.7734) train_epoch_loss=0.5729101095744904 eval_ppl=tensor(1.8749, device='cuda:0') eval_epoch_loss=tensor(0.6286, device='cuda:0')


100%|██████████| 94/94 [03:19<00:00,  2.12s/it]
100%|██████████| 94/94 [00:23<00:00,  4.01it/s]


epoch=8: train_ppl=tensor(1.7186) train_epoch_loss=0.5415153222832274 eval_ppl=tensor(1.6809, device='cuda:0') eval_epoch_loss=tensor(0.5193, device='cuda:0')


100%|██████████| 94/94 [03:19<00:00,  2.12s/it]
100%|██████████| 94/94 [00:23<00:00,  3.92it/s]


epoch=9: train_ppl=tensor(1.7223) train_epoch_loss=0.5436450401519207 eval_ppl=tensor(1.6804, device='cuda:0') eval_epoch_loss=tensor(0.5190, device='cuda:0')


100%|██████████| 94/94 [03:26<00:00,  2.20s/it]
100%|██████████| 94/94 [00:23<00:00,  3.92it/s]


epoch=10: train_ppl=tensor(1.6410) train_epoch_loss=0.4952795277250574 eval_ppl=tensor(1.6214, device='cuda:0') eval_epoch_loss=tensor(0.4833, device='cuda:0')


100%|██████████| 94/94 [03:24<00:00,  2.18s/it]
100%|██████████| 94/94 [00:23<00:00,  3.97it/s]


epoch=11: train_ppl=tensor(1.6279) train_epoch_loss=0.48732080754447493 eval_ppl=tensor(1.7251, device='cuda:0') eval_epoch_loss=tensor(0.5453, device='cuda:0')


100%|██████████| 94/94 [03:24<00:00,  2.17s/it]
100%|██████████| 94/94 [00:23<00:00,  3.98it/s]


epoch=12: train_ppl=tensor(1.7061) train_epoch_loss=0.5342182891165956 eval_ppl=tensor(1.7105, device='cuda:0') eval_epoch_loss=tensor(0.5368, device='cuda:0')


100%|██████████| 94/94 [03:20<00:00,  2.13s/it]
100%|██████████| 94/94 [00:23<00:00,  4.04it/s]


epoch=13: train_ppl=tensor(1.6104) train_epoch_loss=0.4765053102152145 eval_ppl=tensor(1.5927, device='cuda:0') eval_epoch_loss=tensor(0.4654, device='cuda:0')


100%|██████████| 94/94 [03:20<00:00,  2.13s/it]
100%|██████████| 94/94 [00:23<00:00,  4.05it/s]


epoch=14: train_ppl=tensor(1.5817) train_epoch_loss=0.458482425025803 eval_ppl=tensor(1.6136, device='cuda:0') eval_epoch_loss=tensor(0.4785, device='cuda:0')


100%|██████████| 94/94 [03:20<00:00,  2.13s/it]
100%|██████████| 94/94 [00:23<00:00,  4.05it/s]


epoch=15: train_ppl=tensor(1.5605) train_epoch_loss=0.44500593959968143 eval_ppl=tensor(1.5238, device='cuda:0') eval_epoch_loss=tensor(0.4212, device='cuda:0')


100%|██████████| 94/94 [03:20<00:00,  2.13s/it]
100%|██████████| 94/94 [00:23<00:00,  4.05it/s]


epoch=16: train_ppl=tensor(1.5411) train_epoch_loss=0.4324940085173287 eval_ppl=tensor(1.5668, device='cuda:0') eval_epoch_loss=tensor(0.4490, device='cuda:0')


100%|██████████| 94/94 [03:20<00:00,  2.13s/it]
100%|██████████| 94/94 [00:23<00:00,  4.05it/s]


epoch=17: train_ppl=tensor(1.5266) train_epoch_loss=0.42307072069416657 eval_ppl=tensor(1.5724, device='cuda:0') eval_epoch_loss=tensor(0.4526, device='cuda:0')


100%|██████████| 94/94 [03:20<00:00,  2.13s/it]
100%|██████████| 94/94 [00:23<00:00,  4.05it/s]


epoch=18: train_ppl=tensor(1.5515) train_epoch_loss=0.4391928522233316 eval_ppl=tensor(1.5533, device='cuda:0') eval_epoch_loss=tensor(0.4404, device='cuda:0')


100%|██████████| 94/94 [03:20<00:00,  2.13s/it]
100%|██████████| 94/94 [00:23<00:00,  4.04it/s]


epoch=19: train_ppl=tensor(1.5018) train_epoch_loss=0.40668210224743856 eval_ppl=tensor(1.6463, device='cuda:0') eval_epoch_loss=tensor(0.4986, device='cuda:0')


100%|██████████| 94/94 [03:20<00:00,  2.13s/it]
100%|██████████| 94/94 [00:23<00:00,  4.05it/s]


epoch=20: train_ppl=tensor(1.5389) train_epoch_loss=0.43104164229032205 eval_ppl=tensor(1.5324, device='cuda:0') eval_epoch_loss=tensor(0.4268, device='cuda:0')


100%|██████████| 94/94 [03:20<00:00,  2.13s/it]
100%|██████████| 94/94 [00:23<00:00,  4.04it/s]


epoch=21: train_ppl=tensor(1.4779) train_epoch_loss=0.3906323595487691 eval_ppl=tensor(1.4704, device='cuda:0') eval_epoch_loss=tensor(0.3856, device='cuda:0')


100%|██████████| 94/94 [03:20<00:00,  2.13s/it]
100%|██████████| 94/94 [00:23<00:00,  4.05it/s]


epoch=22: train_ppl=tensor(1.5217) train_epoch_loss=0.41981755537872617 eval_ppl=tensor(1.4562, device='cuda:0') eval_epoch_loss=tensor(0.3759, device='cuda:0')


100%|██████████| 94/94 [03:20<00:00,  2.13s/it]
100%|██████████| 94/94 [00:23<00:00,  4.05it/s]


epoch=23: train_ppl=tensor(1.4537) train_epoch_loss=0.3740870265409033 eval_ppl=tensor(1.5386, device='cuda:0') eval_epoch_loss=tensor(0.4309, device='cuda:0')


100%|██████████| 94/94 [03:20<00:00,  2.13s/it]
100%|██████████| 94/94 [00:23<00:00,  4.04it/s]


epoch=24: train_ppl=tensor(1.4502) train_epoch_loss=0.37166874721012216 eval_ppl=tensor(1.7742, device='cuda:0') eval_epoch_loss=tensor(0.5733, device='cuda:0')


100%|██████████| 94/94 [03:20<00:00,  2.14s/it]
100%|██████████| 94/94 [00:23<00:00,  4.05it/s]


epoch=25: train_ppl=tensor(1.4178) train_epoch_loss=0.34910271606071197 eval_ppl=tensor(1.4806, device='cuda:0') eval_epoch_loss=tensor(0.3924, device='cuda:0')


100%|██████████| 94/94 [03:20<00:00,  2.13s/it]
100%|██████████| 94/94 [00:23<00:00,  4.04it/s]


epoch=26: train_ppl=tensor(1.4051) train_epoch_loss=0.34011268381901244 eval_ppl=tensor(1.7677, device='cuda:0') eval_epoch_loss=tensor(0.5697, device='cuda:0')


100%|██████████| 94/94 [03:20<00:00,  2.13s/it]
100%|██████████| 94/94 [00:23<00:00,  4.05it/s]


epoch=27: train_ppl=tensor(1.4443) train_epoch_loss=0.36761917358145435 eval_ppl=tensor(1.4611, device='cuda:0') eval_epoch_loss=tensor(0.3792, device='cuda:0')


100%|██████████| 94/94 [03:20<00:00,  2.13s/it]
100%|██████████| 94/94 [00:23<00:00,  4.05it/s]


epoch=28: train_ppl=tensor(1.4145) train_epoch_loss=0.3467607614757652 eval_ppl=tensor(1.4287, device='cuda:0') eval_epoch_loss=tensor(0.3568, device='cuda:0')


100%|██████████| 94/94 [03:19<00:00,  2.12s/it]
100%|██████████| 94/94 [00:23<00:00,  4.05it/s]


epoch=29: train_ppl=tensor(1.4170) train_epoch_loss=0.3485706607672445 eval_ppl=tensor(1.6068, device='cuda:0') eval_epoch_loss=tensor(0.4743, device='cuda:0')


100%|██████████| 94/94 [03:19<00:00,  2.12s/it]
100%|██████████| 94/94 [00:23<00:00,  4.06it/s]


epoch=30: train_ppl=tensor(1.4559) train_epoch_loss=0.3756561208754144 eval_ppl=tensor(1.4337, device='cuda:0') eval_epoch_loss=tensor(0.3603, device='cuda:0')


100%|██████████| 94/94 [03:19<00:00,  2.12s/it]
100%|██████████| 94/94 [00:23<00:00,  4.05it/s]


epoch=31: train_ppl=tensor(1.3846) train_epoch_loss=0.3253968609695105 eval_ppl=tensor(1.4572, device='cuda:0') eval_epoch_loss=tensor(0.3765, device='cuda:0')


100%|██████████| 94/94 [03:19<00:00,  2.12s/it]
100%|██████████| 94/94 [00:23<00:00,  4.06it/s]


epoch=32: train_ppl=tensor(1.3140) train_epoch_loss=0.27307415072697194 eval_ppl=tensor(1.4963, device='cuda:0') eval_epoch_loss=tensor(0.4030, device='cuda:0')


100%|██████████| 94/94 [03:20<00:00,  2.13s/it]
100%|██████████| 94/94 [00:23<00:00,  4.05it/s]


epoch=33: train_ppl=tensor(1.3911) train_epoch_loss=0.3301285324083205 eval_ppl=tensor(1.4666, device='cuda:0') eval_epoch_loss=tensor(0.3830, device='cuda:0')


100%|██████████| 94/94 [03:20<00:00,  2.13s/it]
100%|██████████| 94/94 [00:23<00:00,  4.05it/s]


epoch=34: train_ppl=tensor(1.3117) train_epoch_loss=0.2712939476019683 eval_ppl=tensor(1.5064, device='cuda:0') eval_epoch_loss=tensor(0.4097, device='cuda:0')


100%|██████████| 94/94 [03:20<00:00,  2.13s/it]
100%|██████████| 94/94 [00:23<00:00,  4.05it/s]


epoch=35: train_ppl=tensor(1.3216) train_epoch_loss=0.27886032547425244 eval_ppl=tensor(1.6650, device='cuda:0') eval_epoch_loss=tensor(0.5098, device='cuda:0')


100%|██████████| 94/94 [03:20<00:00,  2.13s/it]
100%|██████████| 94/94 [00:23<00:00,  4.05it/s]


epoch=36: train_ppl=tensor(1.3592) train_epoch_loss=0.30690216722382985 eval_ppl=tensor(1.6598, device='cuda:0') eval_epoch_loss=tensor(0.5067, device='cuda:0')


100%|██████████| 94/94 [03:20<00:00,  2.13s/it]
100%|██████████| 94/94 [00:23<00:00,  4.05it/s]


epoch=37: train_ppl=tensor(1.3585) train_epoch_loss=0.3063923964237279 eval_ppl=tensor(1.4877, device='cuda:0') eval_epoch_loss=tensor(0.3972, device='cuda:0')


100%|██████████| 94/94 [03:20<00:00,  2.13s/it]
100%|██████████| 94/94 [00:23<00:00,  4.04it/s]


epoch=38: train_ppl=tensor(1.3114) train_epoch_loss=0.2710910178027413 eval_ppl=tensor(1.6099, device='cuda:0') eval_epoch_loss=tensor(0.4761, device='cuda:0')


100%|██████████| 94/94 [03:19<00:00,  2.13s/it]
100%|██████████| 94/94 [00:23<00:00,  4.05it/s]


epoch=39: train_ppl=tensor(1.3101) train_epoch_loss=0.2700671338119564 eval_ppl=tensor(1.4772, device='cuda:0') eval_epoch_loss=tensor(0.3901, device='cuda:0')


100%|██████████| 94/94 [03:19<00:00,  2.13s/it]
100%|██████████| 94/94 [00:23<00:00,  4.05it/s]


epoch=40: train_ppl=tensor(1.2857) train_epoch_loss=0.2512904976078488 eval_ppl=tensor(1.5904, device='cuda:0') eval_epoch_loss=tensor(0.4640, device='cuda:0')


100%|██████████| 94/94 [03:32<00:00,  2.26s/it]
100%|██████████| 94/94 [00:29<00:00,  3.14it/s]


epoch=41: train_ppl=tensor(1.2735) train_epoch_loss=0.2417333043239852 eval_ppl=tensor(1.5849, device='cuda:0') eval_epoch_loss=tensor(0.4605, device='cuda:0')


100%|██████████| 94/94 [03:42<00:00,  2.37s/it]
100%|██████████| 94/94 [00:30<00:00,  3.13it/s]


epoch=42: train_ppl=tensor(1.2757) train_epoch_loss=0.24351590401691167 eval_ppl=tensor(1.6157, device='cuda:0') eval_epoch_loss=tensor(0.4798, device='cuda:0')


100%|██████████| 94/94 [03:43<00:00,  2.38s/it]
100%|██████████| 94/94 [00:29<00:00,  3.14it/s]


epoch=43: train_ppl=tensor(1.2717) train_epoch_loss=0.2403608136760824 eval_ppl=tensor(1.5727, device='cuda:0') eval_epoch_loss=tensor(0.4528, device='cuda:0')


100%|██████████| 94/94 [03:43<00:00,  2.37s/it]
100%|██████████| 94/94 [00:29<00:00,  3.15it/s]


epoch=44: train_ppl=tensor(1.2874) train_epoch_loss=0.25265394556682874 eval_ppl=tensor(1.6114, device='cuda:0') eval_epoch_loss=tensor(0.4771, device='cuda:0')


100%|██████████| 94/94 [03:06<00:00,  1.98s/it]
100%|██████████| 94/94 [00:24<00:00,  3.89it/s]


epoch=45: train_ppl=tensor(1.2143) train_epoch_loss=0.19415150477273507 eval_ppl=tensor(1.7427, device='cuda:0') eval_epoch_loss=tensor(0.5554, device='cuda:0')


100%|██████████| 94/94 [02:52<00:00,  1.84s/it]
100%|██████████| 94/94 [00:24<00:00,  3.87it/s]


epoch=46: train_ppl=tensor(1.2773) train_epoch_loss=0.24471461522501914 eval_ppl=tensor(1.5849, device='cuda:0') eval_epoch_loss=tensor(0.4605, device='cuda:0')


100%|██████████| 94/94 [02:53<00:00,  1.84s/it]
100%|██████████| 94/94 [00:24<00:00,  3.88it/s]


epoch=47: train_ppl=tensor(1.1773) train_epoch_loss=0.16319485076336546 eval_ppl=tensor(1.6291, device='cuda:0') eval_epoch_loss=tensor(0.4880, device='cuda:0')


100%|██████████| 94/94 [03:08<00:00,  2.00s/it]
100%|██████████| 94/94 [00:27<00:00,  3.38it/s]


epoch=48: train_ppl=tensor(1.2566) train_epoch_loss=0.22840988859930572 eval_ppl=tensor(1.6930, device='cuda:0') eval_epoch_loss=tensor(0.5265, device='cuda:0')


100%|██████████| 94/94 [02:59<00:00,  1.91s/it]
100%|██████████| 94/94 [00:27<00:00,  3.38it/s]


epoch=49: train_ppl=tensor(1.2153) train_epoch_loss=0.1950184605579744 eval_ppl=tensor(1.6967, device='cuda:0') eval_epoch_loss=tensor(0.5287, device='cuda:0')


100%|██████████| 63/63 [05:06<00:00,  4.86s/it]

Test before training: init_test_ppl=tensor(1.3254) init_test_loss=tensor(0.2817)
Test after training: final_test_ppl=tensor(1.1720) final_test_loss=tensor(0.1587)



