## Soft-Prompts using Phi-2

In [1]:
import os
import torch
from tqdm.notebook import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "microsoft/phi-2"
sentences = ["If you follow these instructions, do you return to the starting point? Take 9 steps. Take 9 steps. Take 4 steps. Turn right.\nOptions:\n- Yes\n- No\n\nAnswer:\n"]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")

from accelerate import Accelerator
accelerator = Accelerator()

tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"

#inputs = tokenizer(sentences, return_tensors="pt").to(device)
#generate_ids = model.generate(**inputs, max_length=500, num_return_sequences=1, return_dict_in_generate=True)
#outputs = tokenizer.batch_decode(generate_ids.sequences, skip_special_tokens=True, clean_up_tokenization_spaces=False)

#_ = [print(o, "\n") for o in outputs]

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
from peft import (
    PromptTuningConfig,
    PromptTuningInit,
    PeftConfig,
    PeftModel,
    TaskType,
    get_peft_model,
)

initial_instruction = (
    "Read the following question, then choose the correction answer."
)

peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    prompt_tuning_init=PromptTuningInit.TEXT,
    num_virtual_tokens=8,
    prompt_tuning_init_text=initial_instruction,
    tokenizer_name_or_path=model_id,
)

In [3]:
def preprocess_function(examples, tokenizer, prefix, text_column, label_column, max_length):
    batch_size = len(examples[text_column])
    inputs = [f"{prefix}{x}\n\nAnswer:\n" for x in examples[text_column]]
    targets = [str(x) for x in examples[label_column]]
    model_inputs = tokenizer(inputs)
    labels = tokenizer(targets)
    for i in range(batch_size):
        # Mask the inputs part, and update the attention mask to match the new length
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i] + [tokenizer.pad_token_id]
        model_inputs["input_ids"][i] = sample_input_ids
        # masks / ignores -100 tokens in the loss
        labels["input_ids"][i] = [tokenizer.pad_token_id] * len(sample_input_ids) + label_input_ids
        model_inputs["attention_mask"][i] = [1] * len(model_inputs["input_ids"][i])
        # pad or truncate the batch to the specified max_length, and update the attention mask
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i]
        model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * (
            max_length - len(sample_input_ids)
        ) + sample_input_ids
        model_inputs["attention_mask"][i] = [0] * (
            max_length - len(sample_input_ids)
        ) + model_inputs["attention_mask"][i]
        labels["input_ids"][i] = [tokenizer.pad_token_id] * (
            max_length - len(label_input_ids)
        ) + label_input_ids
        model_inputs["input_ids"][i] = torch.tensor(
            model_inputs["input_ids"][i][:max_length]
        )
        model_inputs["attention_mask"][i] = torch.tensor(
            model_inputs["attention_mask"][i][:max_length]
        )
        labels["input_ids"][i] = torch.tensor(labels["input_ids"][i][:max_length])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [4]:
from textblob import TextBlob

def convert_to_yes_no(generated_tokens):
    return ["Yes" if TextBlob(token).sentiment.polarity > 0 else "No" for token in generated_tokens]

def exact_match_loss(generated_texts, target_texts):
    losses = []
    for generated_text, target_text in zip(generated_texts, target_texts):
        generated_texts_yes_no = convert_to_yes_no(generated_texts)
        target_tokens = target_text.split()
        loss = sum(generated_token != target_token for generated_token, target_token in zip(generated_texts_yes_no, target_tokens))
        losses.append(loss)

    loss_tensor = torch.tensor(losses, dtype=torch.float32)
    total_loss = torch.mean(loss_tensor)
    
    return total_loss

In [5]:
def test(dataloader, model, tokenizer, device, exact_match=True):
    loss = 0
    preds = []
    for batch in tqdm(dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model.generate(**batch, max_length=500, num_return_sequences=1, return_dict_in_generate=True) if exact_match else model(**batch)
        
        if exact_match:
            generated_texts = [tokenizer.decode(out, skip_special_tokens=True) for out in outputs.sequences]        
            target_texts_decoded = [tokenizer.decode(target, skip_special_tokens=True) for target in batch["labels"]]

        loss = exact_match_loss(generated_texts, target_texts_decoded) if exact_match else outputs.loss
        loss += loss.detach().float()
        labels = torch.where(batch['labels'] != -100, batch['labels'], tokenizer.pad_token_id)

    loss = loss / len(dataloader)
    return loss


In [6]:
import os
from dln.dataset import init_dataset
from datasets import Dataset, DatasetDict
def load_dln_dataset_to_hf_dataset(dataset_id):
    """Some gynmastics to load the dln dataset into a HuggingFace Dataset.
    dln.dataset should implement an interface compatible with HuggingFace"""

    dln_dataset = init_dataset(
        dataset_id=dataset_id,
        seed=42,
        data_dir=os.path.dirname(os.getcwd()) + "/../data",
    )

    def load_split(split):
        text_data, label_data = dln_dataset.get_data(split)
        data_dict = {"text": text_data, "label": label_data}
        dataset = Dataset.from_dict(data_dict, split=split)
        return dataset

    # Combine the datasets into a DatasetDict
    dataset_dict = DatasetDict(
        {
            "train": load_split("train"),
            "dev": load_split("dev"),
            "test": load_split("test"),
        }
    )
    return dataset_dict


In [7]:
accelerator = Accelerator()
from peft import (
    PromptTuningConfig,
    PromptTuningInit,
    PeftConfig,
    PeftModel,
    TaskType,
    get_peft_model,
)
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    default_data_collator,
    get_linear_schedule_with_warmup,
)
import torch.nn as nn
import torch.distributed as dist
from torch.utils.data import Subset

def main():
    device = "cuda" if torch.cuda.is_available() else "cpu"

    model_name_or_path = "microsoft/phi-2"
    tokenizer_name_or_path = "microsoft/phi-2"

    dataset_id = "navigate"
    initial_instruction = (
        "Read the following question, then choose the correct answer."
    )
    text_column = "text"
    label_column = "label"
    max_length = 128
    lr = 3e-2
    num_epochs = 10
    batch_size = 16

    peft_config = PromptTuningConfig(
        task_type=TaskType.CAUSAL_LM,
        prompt_tuning_init=PromptTuningInit.TEXT,
        num_virtual_tokens=8,
        prompt_tuning_init_text=initial_instruction,
        tokenizer_name_or_path=model_name_or_path,
    )

    dataset = load_dln_dataset_to_hf_dataset(dataset_id)

    classes = list(set(dataset["train"]["label"]))

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, device_map="auto", padding_side='left')
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
    target_max_length = max(
        [len(tokenizer(class_label)["input_ids"]) for class_label in classes]
    )
    print(target_max_length)

    processed_datasets = dataset.map(
        preprocess_function,
        batched=True,
        num_proc=1,
        remove_columns=dataset["train"].column_names,
        load_from_cache_file=False,
        desc="Running tokenizer on dataset",
        fn_kwargs={
            "tokenizer": tokenizer,
            "prefix": '', #initial_instruction + "\n\n",
            "text_column": text_column,
            "label_column": label_column,
            "max_length": max_length,
        },
    )
    
    train_dataset = processed_datasets["train"]
    eval_dataset = processed_datasets["dev"]
    test_dataset = processed_datasets["test"]
    
    train_dataloader = DataLoader(
        train_dataset,
        shuffle=True,
        collate_fn=default_data_collator,
        batch_size=batch_size,
        pin_memory=True,
    )
    eval_dataloader = DataLoader(
        eval_dataset,
        collate_fn=default_data_collator,
        batch_size=batch_size,
        pin_memory=True,
    )
    test_dataloader = DataLoader(
        test_dataset,
        collate_fn=default_data_collator,
        batch_size=batch_size,
        pin_memory=True,
    )

    model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
    model.config.pad_token_id = model.config.eos_token_id
    model = get_peft_model(model, peft_config)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=(len(train_dataloader) * num_epochs),
    )

    model = model.to(device)

    # Send everything through `accelerator.prepare`
    train_loader, eval_loader, test_loader, model, optimizer = accelerator.prepare(
        train_dataloader, eval_dataloader, test_dataloader, model, optimizer
    )

    model.eval()
    init_test_loss = test(test_dataloader, model, tokenizer, device)
    init_test_ppl = torch.exp(init_test_loss)  # Perplexity
    print(f"Test before training: {init_test_ppl=} {init_test_loss=}")


    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for step, batch in enumerate(tqdm(train_dataloader)):
            batch = {k: v.to(device) for k, v in batch.items()}
            inputs = tokenizer(sentences, return_tensors="pt").to(device)
            output = model(**batch) #, max_length=1, num_return_sequences=1, return_dict_in_generate=True)

            # generated_texts = [tokenizer.decode(out[-1], skip_special_tokens=True) for out in output.sequences] #tokenizer.batch_decode(output.sequences, skip_special_tokens=True, clean_up_tokenization_spaces=False)
            # target_texts_decoded = [tokenizer.decode(target, skip_special_tokens=True) for target in batch["labels"]]

            loss = output.loss; #exact_match_loss(generated_texts, target_texts_decoded)
            loss.requires_grad_(True)

            total_loss += loss.detach().float()
            optimizer.zero_grad()
            #accelerator.backward(output.loss)
            #optimizer.step()
            #lr_scheduler.step()

        model.eval()
        eval_epoch_loss = test(eval_dataloader, model, tokenizer, device)
        eval_ppl = torch.exp(eval_epoch_loss)
        train_epoch_loss = total_loss / len(train_dataloader)
        train_ppl = torch.exp(train_epoch_loss)
        print(
            f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}"
        )

    model.eval()
    final_test_loss = test(test_dataloader, model, tokenizer, device)
    final_test_ppl = torch.exp(final_test_loss)
    print(f"Test before training: {init_test_ppl=} {init_test_loss=}")
    print(f"Test after training: {final_test_ppl=} {final_test_loss=}")

main()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


loaded dataset from /home/chsingh/deep-language-networks/projects/../data/bbh ...
we have 375 training, 375 dev, and 250 test data points.
1


Running tokenizer on dataset: 100%|██████████| 375/375 [00:00<00:00, 6187.92 examples/s]
Running tokenizer on dataset: 100%|██████████| 375/375 [00:00<00:00, 6588.85 examples/s]
Running tokenizer on dataset: 100%|██████████| 250/250 [00:00<00:00, 6185.01 examples/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.80it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  0%|          | 0/16 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  6%|▋         | 1/16 [00:15<03:55, 15.71s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 12%|█▎        | 2/16 [00:30<03:30, 15.05s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 19%|█▉        | 3/16 [00:38<02:36, 12.05s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 25%|██▌       | 4/16 [00:53<02:36, 13.08s/it]Setting `pad_token_id` to `eos_t

Test before training: init_test_ppl=tensor(1.0914) init_test_loss=tensor(0.0875)


100%|██████████| 24/24 [00:17<00:00,  1.36it/s]
  0%|          | 0/24 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  4%|▍         | 1/24 [00:14<05:44, 14.99s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  8%|▊         | 2/24 [00:28<05:07, 13.98s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 12%|█▎        | 3/24 [00:43<05:01, 14.34s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 17%|█▋        | 4/24 [00:53<04:13, 12.70s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 21%|██        | 5/24 [01:07<04:09, 13.12s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 25%|██▌       | 6/24 [01:21<04:06, 13.67s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 29%|██▉       | 7/24 [01:36<03:58, 14.03s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 33%|███▎      |

epoch=0: train_ppl=tensor(34344.4844, device='cuda:0') train_epoch_loss=tensor(10.4442, device='cuda:0') eval_ppl=tensor(1.0241) eval_epoch_loss=tensor(0.0238)


100%|██████████| 24/24 [00:17<00:00,  1.36it/s]
  0%|          | 0/24 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  4%|▍         | 1/24 [00:14<05:44, 14.99s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  8%|▊         | 2/24 [00:28<05:07, 13.99s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 12%|█▎        | 3/24 [00:43<05:01, 14.34s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 17%|█▋        | 4/24 [00:53<04:14, 12.70s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 21%|██        | 5/24 [01:07<04:09, 13.12s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 25%|██▌       | 6/24 [01:21<04:06, 13.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 29%|██▉       | 7/24 [01:36<03:58, 14.03s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 33%|███▎      |

epoch=1: train_ppl=tensor(34159.9609, device='cuda:0') train_epoch_loss=tensor(10.4388, device='cuda:0') eval_ppl=tensor(1.0241) eval_epoch_loss=tensor(0.0238)


100%|██████████| 24/24 [00:17<00:00,  1.37it/s]
  0%|          | 0/24 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  4%|▍         | 1/24 [00:14<05:44, 14.98s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  8%|▊         | 2/24 [00:28<05:07, 13.98s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 12%|█▎        | 3/24 [00:43<05:01, 14.34s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 17%|█▋        | 4/24 [00:53<04:13, 12.70s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 21%|██        | 5/24 [01:07<04:09, 13.12s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 25%|██▌       | 6/24 [01:21<04:06, 13.67s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 29%|██▉       | 7/24 [01:36<03:58, 14.03s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 33%|███▎      |

epoch=2: train_ppl=tensor(34398.2422, device='cuda:0') train_epoch_loss=tensor(10.4458, device='cuda:0') eval_ppl=tensor(1.0241) eval_epoch_loss=tensor(0.0238)


100%|██████████| 24/24 [00:17<00:00,  1.36it/s]
  0%|          | 0/24 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  4%|▍         | 1/24 [00:14<05:44, 14.98s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  8%|▊         | 2/24 [00:28<05:07, 13.98s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 12%|█▎        | 3/24 [00:43<05:01, 14.33s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 17%|█▋        | 4/24 [00:53<04:13, 12.69s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 21%|██        | 5/24 [01:07<04:09, 13.11s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 25%|██▌       | 6/24 [01:21<04:06, 13.67s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 29%|██▉       | 7/24 [01:36<03:58, 14.03s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 33%|███▎      |

epoch=3: train_ppl=tensor(34323.0039, device='cuda:0') train_epoch_loss=tensor(10.4436, device='cuda:0') eval_ppl=tensor(1.0241) eval_epoch_loss=tensor(0.0238)


100%|██████████| 24/24 [00:17<00:00,  1.36it/s]
  0%|          | 0/24 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  4%|▍         | 1/24 [00:14<05:44, 14.99s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  8%|▊         | 2/24 [00:28<05:07, 13.98s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 12%|█▎        | 3/24 [00:43<05:00, 14.33s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 17%|█▋        | 4/24 [00:53<04:13, 12.69s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 21%|██        | 5/24 [01:07<04:09, 13.11s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 25%|██▌       | 6/24 [01:21<04:06, 13.67s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 29%|██▉       | 7/24 [01:36<03:58, 14.03s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 33%|███▎      |

epoch=4: train_ppl=tensor(34509.7617, device='cuda:0') train_epoch_loss=tensor(10.4490, device='cuda:0') eval_ppl=tensor(1.0241) eval_epoch_loss=tensor(0.0238)


100%|██████████| 24/24 [00:17<00:00,  1.36it/s]
  0%|          | 0/24 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  4%|▍         | 1/24 [00:14<05:44, 14.99s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  8%|▊         | 2/24 [00:28<05:07, 13.99s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 12%|█▎        | 3/24 [00:43<05:01, 14.34s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 17%|█▋        | 4/24 [00:53<04:13, 12.70s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 21%|██        | 5/24 [01:07<04:09, 13.12s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 25%|██▌       | 6/24 [01:21<04:06, 13.67s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 29%|██▉       | 7/24 [01:36<03:58, 14.03s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 33%|███▎      |

epoch=5: train_ppl=tensor(35165.0742, device='cuda:0') train_epoch_loss=tensor(10.4678, device='cuda:0') eval_ppl=tensor(1.0241) eval_epoch_loss=tensor(0.0238)


100%|██████████| 24/24 [00:17<00:00,  1.36it/s]
  0%|          | 0/24 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  4%|▍         | 1/24 [00:14<05:44, 14.99s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  8%|▊         | 2/24 [00:28<05:07, 13.98s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 12%|█▎        | 3/24 [00:43<05:01, 14.34s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 17%|█▋        | 4/24 [00:53<04:13, 12.70s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 21%|██        | 5/24 [01:07<04:09, 13.12s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 25%|██▌       | 6/24 [01:21<04:06, 13.67s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 29%|██▉       | 7/24 [01:36<03:58, 14.03s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 33%|███▎      |

epoch=6: train_ppl=tensor(35310.8594, device='cuda:0') train_epoch_loss=tensor(10.4719, device='cuda:0') eval_ppl=tensor(1.0241) eval_epoch_loss=tensor(0.0238)


100%|██████████| 24/24 [00:17<00:00,  1.36it/s]
  0%|          | 0/24 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  4%|▍         | 1/24 [00:14<05:44, 14.98s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  8%|▊         | 2/24 [00:28<05:07, 13.98s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 12%|█▎        | 3/24 [00:43<05:01, 14.33s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 17%|█▋        | 4/24 [00:53<04:13, 12.69s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 21%|██        | 5/24 [01:07<04:09, 13.11s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 25%|██▌       | 6/24 [01:21<04:06, 13.67s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 29%|██▉       | 7/24 [01:36<03:58, 14.03s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 33%|███▎      |

epoch=7: train_ppl=tensor(34515.0273, device='cuda:0') train_epoch_loss=tensor(10.4492, device='cuda:0') eval_ppl=tensor(1.0241) eval_epoch_loss=tensor(0.0238)


100%|██████████| 24/24 [00:17<00:00,  1.36it/s]
  0%|          | 0/24 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  4%|▍         | 1/24 [00:14<05:44, 14.99s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  8%|▊         | 2/24 [00:28<05:07, 13.99s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 12%|█▎        | 3/24 [00:43<05:01, 14.34s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 17%|█▋        | 4/24 [00:53<04:13, 12.70s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 21%|██        | 5/24 [01:07<04:09, 13.12s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 25%|██▌       | 6/24 [01:21<04:06, 13.67s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 29%|██▉       | 7/24 [01:36<03:58, 14.03s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 33%|███▎      |

epoch=8: train_ppl=tensor(32950.2422, device='cuda:0') train_epoch_loss=tensor(10.4028, device='cuda:0') eval_ppl=tensor(1.0241) eval_epoch_loss=tensor(0.0238)


100%|██████████| 24/24 [00:17<00:00,  1.36it/s]
  0%|          | 0/24 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  4%|▍         | 1/24 [00:14<05:44, 14.99s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  8%|▊         | 2/24 [00:28<05:07, 13.98s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 12%|█▎        | 3/24 [00:43<05:01, 14.34s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 17%|█▋        | 4/24 [00:53<04:13, 12.70s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 21%|██        | 5/24 [01:07<04:09, 13.12s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 25%|██▌       | 6/24 [01:21<04:06, 13.67s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 29%|██▉       | 7/24 [01:36<03:58, 14.03s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 33%|███▎      |

epoch=9: train_ppl=tensor(34089.8281, device='cuda:0') train_epoch_loss=tensor(10.4368, device='cuda:0') eval_ppl=tensor(1.0241) eval_epoch_loss=tensor(0.0238)


  0%|          | 0/16 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  6%|▋         | 1/16 [00:14<03:41, 14.74s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 12%|█▎        | 2/16 [00:29<03:26, 14.74s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 19%|█▉        | 3/16 [00:38<02:34, 11.92s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 25%|██▌       | 4/16 [00:52<02:36, 13.03s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 31%|███▏      | 5/16 [01:03<02:14, 12.25s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 38%|███▊      | 6/16 [01:14<01:58, 11.84s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 44%|████▍     | 7/16 [01:25<01:42, 11.38s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 50%|█████     | 8/16 [01:39<01:39, 12.45s/it]Setting `pad_token

Test before training: init_test_ppl=tensor(1.0914) init_test_loss=tensor(0.0875)
Test after training: final_test_ppl=tensor(1.0914) final_test_loss=tensor(0.0875)



