In [1]:
sentences = ["Read the following question, then choose the correct answer.\n\nIf you follow these instructions, do you return to the starting point? Take 9 steps. Take 9 steps. Take 4 steps. Turn right.\nOptions:\n- Yes\n- No\n\nAnswer:\n"]
sentences

['Read the following question, then choose the correct answer.\n\nIf you follow these instructions, do you return to the starting point? Take 9 steps. Take 9 steps. Take 4 steps. Turn right.\nOptions:\n- Yes\n- No\n\nAnswer:\n']

In [2]:
model_id = "microsoft/phi-2"

## Running from Python

In [3]:
import os
import torch
from tqdm.notebook import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")

from accelerate import Accelerator
accelerator = Accelerator()

tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.31it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
inputs = tokenizer(sentences, return_tensors="pt").to(device)
generate_ids = model.generate(**inputs, max_length=1, num_return_sequences=1, return_dict_in_generate=True)
print(tokenizer.decode(generate_ids.sequences[0][-1]))
outputs = tokenizer.batch_decode(generate_ids.sequences, skip_special_tokens=True, clean_up_tokenization_spaces=False)

print(sentences[0])
      
_ = [print(o, "\n") for o in outputs]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Yes
Read the following question, then choose the correct answer.

If you follow these instructions, do you return to the starting point? Take 9 steps. Take 9 steps. Take 4 steps. Turn right.
Options:
- Yes
- No

Answer:

Read the following question, then choose the correct answer.

If you follow these instructions, do you return to the starting point? Take 9 steps. Take 9 steps. Take 4 steps. Turn right.
Options:
- Yes
- No

Answer:
Yes 



In [5]:
from peft import (
    PromptTuningConfig,
    PromptTuningInit,
    PeftConfig,
    PeftModel,
    TaskType,
    get_peft_model,
)

In [6]:
initial_instruction = (
    "Read the following question, then choose the correction answer."
)

peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    prompt_tuning_init=PromptTuningInit.TEXT,
    num_virtual_tokens=8,
    prompt_tuning_init_text=initial_instruction,
    tokenizer_name_or_path=model_id,
)

In [7]:
peft_model = get_peft_model(model, peft_config)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
inputs = tokenizer(sentences, return_tensors="pt").to(device)
generate_ids = peft_model.generate(**inputs, max_length=500)
outputs = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
print(outputs[0])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Read the following question, then choose the correct answer.

If you follow these instructions, do you return to the starting point? Take 9 steps. Take 9 steps. Take 4 steps. Turn right.
Options:
- Yes
- No

Answer:
Yes

Solution:
```python
# Initialize starting coordinates
x = 0
y = 0

# Take 9 steps
x += 9

# Take 9 steps
x += 9

# Take 4 steps
x += 4

# Turn right
# This can be represented as a 90-degree clockwise rotation
# Update coordinates accordingly
temp = x
x = -y
y = temp

# Check if the final coordinates are the same as the starting coordinates
if x == 0 and y == 0:
    print("Yes")
else:
    print("No")
```

Complete detailed textbook-level python code solutions



In [9]:
text_column = "text"
label_column = "label"
max_length = 128
batch_size = 10

In [10]:
from datasets import Dataset
my_dict = {"text": sentences, "label": ["No"]}
hf_dataset = Dataset.from_dict(my_dict)
hf_dataset['label']

['No']

In [11]:
def preprocess_function(examples, tokenizer, prefix, text_column, label_column, max_length):
    batch_size = len(examples[text_column])
    inputs = [f"{prefix}{x}\n\nAnswer:\n" for x in examples[text_column]]
    targets = [str(x) for x in examples[label_column]]
    model_inputs = tokenizer(inputs)
    labels = tokenizer(targets)
    for i in range(batch_size):
        # concat the inputs and labels, mask the inputs part, and update the
        # attention mask to match the new length (inputs + labels + pad_token_id)
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i] + [tokenizer.pad_token_id]
        # print(i, sample_input_ids, label_input_ids)
        model_inputs["input_ids"][i] = sample_input_ids #+ [tokenizer.pad_token_id] * len(label_input_ids)
        # masks / ignores -100 tokens in the loss: https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html#crossentropyloss
        labels["input_ids"][i] = [tokenizer.pad_token_id] * len(sample_input_ids) + label_input_ids
        model_inputs["attention_mask"][i] = [1] * len(model_inputs["input_ids"][i])
        # pad or truncate the batch to the specified max_length, and update the attention mask
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i]
        model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * (
            max_length - len(sample_input_ids)
        ) + sample_input_ids
        model_inputs["attention_mask"][i] = [0] * (
            max_length - len(sample_input_ids)
        ) + model_inputs["attention_mask"][i]
        labels["input_ids"][i] = [tokenizer.pad_token_id] * (
            max_length - len(label_input_ids)
        ) + label_input_ids
        model_inputs["input_ids"][i] = torch.tensor(
            model_inputs["input_ids"][i][:max_length]
        )
        model_inputs["attention_mask"][i] = torch.tensor(
            model_inputs["attention_mask"][i][:max_length]
        )
        labels["input_ids"][i] = torch.tensor(labels["input_ids"][i][:max_length])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

main()

NameError: name 'main' is not defined

In [None]:
processed_datasets = hf_dataset.map(
        preprocess_function,
        batched=True,
        num_proc=1,
        remove_columns=hf_dataset.column_names,
        load_from_cache_file=False,
        desc="Running tokenizer on dataset",
        fn_kwargs={
            "tokenizer": tokenizer,
            "prefix": initial_instruction,
            "text_column": text_column,
            "label_column": label_column,
            "max_length": max_length,
        },
    )

Running tokenizer on dataset: 100%|██████████| 1/1 [00:00<00:00, 285.68 examples/s]


In [None]:
processed_datasets
tokenizer.decode(processed_datasets["input_ids"][0])

'<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>Read the following question, then choose the correction answer.Read the following question, then choose the correct answer.\n\nIf you follow these instructions, do you return to the starting point? Take 9 steps. Take 9 steps. Take 4 steps. Turn right.\nOptio

In [None]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

dataloader = accelerator.prepare(DataLoader(
    processed_datasets,
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=batch_size,
    pin_memory=True,
))


In [None]:
def test(dataloader, model, tokenizer, device, exact_match=False):
    loss = 0
    preds = []
    for batch in tqdm(dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model.generate(batch["input_ids"], max_length=500, num_return_sequences=1) if exact_match else model(**batch)
        
        if exact_match:
            generated_texts = tokenizer.batch_decode(outputs,  skip_special_tokens=True) #[tokenizer.decode(out, skip_special_tokens=True) for out in outputs]        
            target_texts_decoded = [tokenizer.decode(target, skip_special_tokens=True) for target in batch["labels"]]

        loss = exact_match_loss(generated_texts, target_texts_decoded) if exact_match else outputs.loss
        loss += loss.detach().float()
        # preds.extend(
        #     tokenizer.batch_decode(
        #         torch.argmax(outputs.logits, -1).detach().cpu().numpy(),
        #         skip_special_tokens=True,
        #     )
        # )
        labels = torch.where(batch['labels'] != -100, batch['labels'], tokenizer.pad_token_id)

        # targets = []
        # for label_row in labels:
        #     decoded_tokens = tokenizer.convert_ids_to_tokens(label_row, skip_special_tokens=True)
        #     decoded_text = tokenizer.convert_tokens_to_string(decoded_tokens)
        #     targets.append(decoded_text)

        # if (exact_match):
        #     print(preds)
            # print(targets)

    loss = loss / len(dataloader)
    return loss


In [None]:
batch_of_one = next(iter(dataloader))

In [None]:
batch_of_one

{'input_ids': tensor([[50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256,  5569,   262,  1708,
           1808,    11,   788,  3853,   262, 17137,  3280,    13,  5569,   262,
           1708,  1808,    11,   788,  3853,   262,  3376,  3280,    13,   198,
            198,  1532,   345,  1061,   777,  7729,    11,   466,   345,  1441,
            284,   262,  3599,   966,    30,  7214,   860,  4831,    13,  7214,
            860,  4831,    13,  7214,   604,  4831,    13,  6756,   826,    13,
            198, 29046,    25,   198,    12,  3363,   198,    12,  1400,   198,
            198, 33706,    

In [None]:
#input_ids = batch_of_one["input_ids"]
input_ids = tokenizer.encode(sentences[0], return_tensors="pt")
input_texts = [tokenizer.decode(input, skip_special_tokens=True) for input in input_ids]
output = model.generate(input_ids, max_length=1, num_return_sequences=1)
generated_texts = [tokenizer.decode(out, skip_special_tokens=True) for out in output]
print(sentences)
print(input_texts)
print(generated_texts)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['Read the following question, then choose the correct answer.\n\nIf you follow these instructions, do you return to the starting point? Take 9 steps. Take 9 steps. Take 4 steps. Turn right.\nOptions:\n- Yes\n- No\n\nAnswer:\n']
['Read the following question, then choose the correct answer.\n\nIf you follow these instructions, do you return to the starting point? Take 9 steps. Take 9 steps. Take 4 steps. Turn right.\nOptions:\n- Yes\n- No\n\nAnswer:\n']
['Read the following question, then choose the correct answer.\n\nIf you follow these instructions, do you return to the starting point? Take 9 steps. Take 9 steps. Take 4 steps. Turn right.\nOptions:\n- Yes\n- No\n\nAnswer:\nYes']


In [None]:
def exact_match_loss(generated_texts, target_texts):
    losses = []
    for generated_text, target_text in zip(generated_texts, target_texts):
        generated_tokens = generated_text.split()
        target_tokens = target_text.split()
        # print(generated_texts)
        # print(target_texts)
        loss = sum(generated_token != target_token for generated_token, target_token in zip(generated_tokens, target_tokens))
        losses.append(loss)

    # print(losses)
    loss_tensor = torch.tensor(losses, dtype=torch.float32)
    total_loss = torch.mean(loss_tensor)
    
    # print(generated_texts)
    # print(target_texts)
    # print(total_loss.item())
    
    return total_loss

In [None]:
import os
from dln.dataset import init_dataset
def load_dln_dataset_to_hf_dataset(dataset_id):
    """Some gynmastics to load the dln dataset into a HuggingFace Dataset.
    dln.dataset should implement an interface compatible with HuggingFace"""

    dln_dataset = init_dataset(
        dataset_id=dataset_id,
        seed=42,
        data_dir=os.path.dirname(os.getcwd()) + "/../data",
    )

    def load_split(split):
        text_data, label_data = dln_dataset.get_data(split)
        data_dict = {"text": text_data, "label": label_data}
        dataset = Dataset.from_dict(data_dict, split=split)
        return dataset

    # Combine the datasets into a DatasetDict
    dataset_dict = DatasetDict(
        {
            "train": load_split("train"),
            "dev": load_split("dev"),
            "test": load_split("test"),
        }
    )
    return dataset_dict


In [None]:
from datasets import Dataset, DatasetDict
accelerator = Accelerator()
from peft import (
    PromptTuningConfig,
    PromptTuningInit,
    PeftConfig,
    PeftModel,
    TaskType,
    get_peft_model,
)
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    default_data_collator,
    get_linear_schedule_with_warmup,
)
import torch.nn as nn
import torch.distributed as dist
from torch.utils.data import Subset

def main():
    device = "cuda" if torch.cuda.is_available() else "cpu"

    model_name_or_path = "microsoft/phi-2"
    tokenizer_name_or_path = "microsoft/phi-2"

    dataset_id = "navigate"
    initial_instruction = (
        "Read the following question, then choose the correct answer."
    )
    text_column = "text"
    label_column = "label"
    max_length = 128
    lr = 3e-2
    num_epochs = 3
    batch_size = 8
    #batch_size = 16

    peft_config = PromptTuningConfig(
        task_type=TaskType.CAUSAL_LM,
        prompt_tuning_init=PromptTuningInit.TEXT,
        num_virtual_tokens=8,
        prompt_tuning_init_text=initial_instruction,
        tokenizer_name_or_path=model_name_or_path,
    )

    dataset = load_dln_dataset_to_hf_dataset(dataset_id)

    classes = list(set(dataset["train"]["label"]))

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, device_map="auto", padding_side='left')
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
    target_max_length = max(
        [len(tokenizer(class_label)["input_ids"]) for class_label in classes]
    )
    print(target_max_length)

    processed_datasets = dataset.map(
        preprocess_function,
        batched=True,
        num_proc=1,
        remove_columns=dataset["train"].column_names,
        load_from_cache_file=False,
        desc="Running tokenizer on dataset",
        fn_kwargs={
            "tokenizer": tokenizer,
            "prefix": initial_instruction + "\n\n",
            "text_column": text_column,
            "label_column": label_column,
            "max_length": max_length,
        },
    )
    
    indices = list(range(10))

    train_dataset = Subset(processed_datasets["train"], indices)
    eval_dataset = Subset(processed_datasets["dev"], indices)
    test_dataset = Subset(processed_datasets["test"], indices)
    
    train_dataloader = DataLoader(
        train_dataset,
        shuffle=True,
        collate_fn=default_data_collator,
        batch_size=batch_size,
        pin_memory=True,
    )
    eval_dataloader = DataLoader(
        eval_dataset,
        collate_fn=default_data_collator,
        batch_size=batch_size,
        pin_memory=True,
    )
    test_dataloader = DataLoader(
        test_dataset,
        collate_fn=default_data_collator,
        batch_size=batch_size,
        pin_memory=True,
    )

    try:
        # Your code here
        model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
    except Exception as e:
        logger.error("Error downloading model: %s", e)
    
    model = get_peft_model(model, peft_config)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=(len(train_dataloader) * num_epochs),
    )

    model = model.to(device)

    # Send everything through `accelerator.prepare`
    train_loader, eval_loader, test_loader, model, optimizer = accelerator.prepare(
        train_dataloader, eval_dataloader, test_dataloader, model, optimizer
    )

    model.eval()
    init_test_loss = test(test_dataloader, model, tokenizer, device)
    init_test_ppl = torch.exp(init_test_loss)  # Perplexity
    print(f"Test before training: {init_test_ppl=} {init_test_loss=}")


    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for step, batch in enumerate(tqdm(train_dataloader)):
            batch = {k: v.to(device) for k, v in batch.items()}
            inputs = tokenizer(sentences, return_tensors="pt").to(device)
            output = model.generate(**batch, max_length=1, num_return_sequences=1, return_dict_in_generate=True)

            generated_texts = [tokenizer.decode(out[-1], skip_special_tokens=True) for out in output.sequences] #tokenizer.batch_decode(output.sequences, skip_special_tokens=True, clean_up_tokenization_spaces=False)
            target_texts_decoded = [tokenizer.decode(target, skip_special_tokens=True) for target in batch["labels"]]

            loss = exact_match_loss(generated_texts, target_texts_decoded)
            # optimizer.zero_grad()
            loss.requires_grad_(True)

            # loss = outputs.loss
            total_loss += loss.detach().float()
            optimizer.zero_grad()
            accelerator.backward(loss)
            optimizer.step()
            lr_scheduler.step()
            # optimizer.zero_grad()

        model.eval()
        eval_epoch_loss = test(eval_dataloader, model, tokenizer, device)
        eval_ppl = torch.exp(eval_epoch_loss)
        train_epoch_loss = total_loss / len(train_dataloader)
        train_ppl = torch.exp(train_epoch_loss)
        print(
            f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}"
        )

    model.eval()
    final_test_loss = test(test_dataloader, model, tokenizer, device)
    final_test_ppl = torch.exp(final_test_loss)
    print(f"Test before training: {init_test_ppl=} {init_test_loss=}")
    print(f"Test after training: {final_test_ppl=} {final_test_loss=}")

    # model.module.save_pretrained("data/models/" + model_name_or_path)

    # config = PeftConfig.from_pretrained("data/models/" + model_name_or_path)
    # model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)

    # lora_model = PeftModel.from_pretrained(model, "data/models/" + model_name_or_path)
    # lora_model.to(device)

    # final_test_loss = test(test_dataloader, lora_model, tokenizer, device, True)
    # final_test_ppl = torch.exp(final_test_loss)

    # print(f"Test after loading: {final_test_ppl=} {final_test_loss=}")

    inputs = tokenizer(sentences, return_tensors="pt", padding=True).to(device)
    generate_ids = model.generate(**inputs, max_length=1, num_return_sequences=1, return_dict_in_generate=True)
    outputs = tokenizer.batch_decode(generate_ids.sequences, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    print([print(o, "\n") for o in outputs])


main()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


loaded dataset from /workspaces/deep-language-networks/projects/../data/bbh ...
we have 375 training, 375 dev, and 250 test data points.
1


Running tokenizer on dataset: 100%|██████████| 375/375 [00:00<00:00, 5851.08 examples/s]
Running tokenizer on dataset: 100%|██████████| 375/375 [00:00<00:00, 6123.95 examples/s]
Running tokenizer on dataset: 100%|██████████| 250/250 [00:00<00:00, 5676.94 examples/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.27it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████| 2/2 [00:07<00:00,  3.52s/it]


Test before training: init_test_ppl=tensor(39316.5938) init_test_loss=tensor(10.5794)


  0%|          | 0/2 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 50%|█████     | 1/2 [00:05<00:05,  5.38s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
100%|██████████| 2/2 [00:07<00:00,  3.54s/it]
100%|██████████| 2/2 [00:07<00:00,  3.57s/it]


epoch=0: train_ppl=tensor(1.9887) train_epoch_loss=tensor(0.6875) eval_ppl=tensor(29666.9980) eval_epoch_loss=tensor(10.2978)


  0%|          | 0/2 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 50%|█████     | 1/2 [00:05<00:05,  5.39s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
100%|██████████| 2/2 [00:07<00:00,  3.56s/it]
100%|██████████| 2/2 [00:07<00:00,  3.55s/it]


epoch=1: train_ppl=tensor(1.8682) train_epoch_loss=tensor(0.6250) eval_ppl=tensor(29666.9980) eval_epoch_loss=tensor(10.2978)


  0%|          | 0/2 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 50%|█████     | 1/2 [00:05<00:05,  5.43s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
100%|██████████| 2/2 [00:07<00:00,  3.61s/it]
100%|██████████| 2/2 [00:07<00:00,  3.63s/it]


epoch=2: train_ppl=tensor(2.5536) train_epoch_loss=tensor(0.9375) eval_ppl=tensor(29666.9980) eval_epoch_loss=tensor(10.2978)


  0%|          | 0/2 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 50%|█████     | 1/2 [00:05<00:05,  5.39s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
100%|██████████| 2/2 [00:07<00:00,  3.56s/it]
100%|██████████| 2/2 [00:07<00:00,  3.61s/it]


epoch=3: train_ppl=tensor(2.2535) train_epoch_loss=tensor(0.8125) eval_ppl=tensor(29666.9980) eval_epoch_loss=tensor(10.2978)


  0%|          | 0/2 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 50%|█████     | 1/2 [00:05<00:05,  5.46s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
100%|██████████| 2/2 [00:07<00:00,  3.57s/it]
100%|██████████| 2/2 [00:07<00:00,  3.52s/it]


epoch=4: train_ppl=tensor(1.8682) train_epoch_loss=tensor(0.6250) eval_ppl=tensor(29666.9980) eval_epoch_loss=tensor(10.2978)


  0%|          | 0/2 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 50%|█████     | 1/2 [00:05<00:05,  5.39s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
100%|██████████| 2/2 [00:07<00:00,  3.54s/it]
100%|██████████| 2/2 [00:07<00:00,  3.61s/it]


epoch=5: train_ppl=tensor(1.6487) train_epoch_loss=tensor(0.5000) eval_ppl=tensor(29666.9980) eval_epoch_loss=tensor(10.2978)


  0%|          | 0/2 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 50%|█████     | 1/2 [00:05<00:05,  5.42s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
100%|██████████| 2/2 [00:07<00:00,  3.57s/it]
100%|██████████| 2/2 [00:07<00:00,  3.55s/it]


epoch=6: train_ppl=tensor(1.9887) train_epoch_loss=tensor(0.6875) eval_ppl=tensor(29666.9980) eval_epoch_loss=tensor(10.2978)


  0%|          | 0/2 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 50%|█████     | 1/2 [00:05<00:05,  5.39s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
100%|██████████| 2/2 [00:07<00:00,  3.56s/it]
100%|██████████| 2/2 [00:07<00:00,  3.60s/it]


epoch=7: train_ppl=tensor(1.8682) train_epoch_loss=tensor(0.6250) eval_ppl=tensor(29666.9980) eval_epoch_loss=tensor(10.2978)


  0%|          | 0/2 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 50%|█████     | 1/2 [00:05<00:05,  5.37s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
100%|██████████| 2/2 [00:07<00:00,  3.55s/it]
100%|██████████| 2/2 [00:07<00:00,  3.54s/it]


epoch=8: train_ppl=tensor(2.1170) train_epoch_loss=tensor(0.7500) eval_ppl=tensor(29666.9980) eval_epoch_loss=tensor(10.2978)


  0%|          | 0/2 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 50%|█████     | 1/2 [00:05<00:05,  5.44s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
100%|██████████| 2/2 [00:07<00:00,  3.56s/it]
100%|██████████| 2/2 [00:07<00:00,  3.53s/it]


epoch=9: train_ppl=tensor(2.1170) train_epoch_loss=tensor(0.7500) eval_ppl=tensor(29666.9980) eval_epoch_loss=tensor(10.2978)


100%|██████████| 2/2 [00:06<00:00,  3.48s/it]
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Test before training: init_test_ppl=tensor(39316.5938) init_test_loss=tensor(10.5794)
Test after training: final_test_ppl=tensor(39316.5938) final_test_loss=tensor(10.5794)
Read the following question, then choose the correct answer.

If you follow these instructions, do you return to the starting point? Take 9 steps. Take 9 steps. Take 4 steps. Turn right.
Options:
- Yes
- No

Answer:
Yes 

[None]
