In [1]:
import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import get_scheduler
import math
from transformers import Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [191]:
block_size = 256
examples = 700


training_args = TrainingArguments(
    f"banana-stories",
    num_train_epochs=1,
    learning_rate=5e-5,
    weight_decay=0.01,
    # push_to_hub=True,
)

In [192]:
ds = load_dataset("roneneldan/TinyStories")
names = [
    "James", "John", "Michael", "William", "David", "Robert", "Thomas", "Charles",
    "Daniel", "Joseph", "Richard", "Paul", "George", "Henry", "Edward",
    "Mary", "Susan", "Elizabeth", "Jennifer", "Linda", "Patricia", "Dorothy",
    "Sarah", "Karen", "Emily", "Jessica", "Margaret", "Helen", "Nancy", "Betty",
    "Alex", "Taylor", "Jordan", "Morgan", "Casey", "Jamie", "Riley", "Cameron",
    "Avery", "Quinn", "Lily", "Mia", "Tom", "Amy", "Fluffy", "Max", "Sara",
    "Jack", "Lucy", "Anna", "Tim", "Billy", "Beep", "Fin", "queen", "Sue", "Elly", "Benny",  "Bloom", "Joe", "Grace", "Timmy", "Milly", "Tom", "Mandy"
]

tokenizer = AutoTokenizer.from_pretrained(
    "roneneldan/TinyStories-8M", use_fast=True)
tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    return tokenizer(examples["transformed_text"])


def transform(example):
    text = example['text']
    import re
    for name in names:
        text = re.sub(r'\b' + re.escape(name) + r'\b', 'Banana', text)
    return {'transformed_text': text}

# The input of this is a dict
# {"text": [[1,3,4], [1,3,4]],
#  "mask": ...
# }
#


def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i: i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [193]:
ds2 = ds['train'].select(range(examples))
transformed_dataset = ds2.map(transform, remove_columns=["text"])
tokenized_datasets = transformed_dataset.map(
    tokenize_function, num_proc=4, remove_columns=["transformed_text"])
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)
model = AutoModelForCausalLM.from_pretrained(
    "roneneldan/TinyStories-8M",
    # device_map="auto"
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets #["train"],
    #eval_dataset=lm_datasets["validation"],
    

)
trainer.train()

Map: 100%|██████████| 700/700 [00:00<00:00, 1640.73 examples/s]
Map (num_proc=4): 100%|██████████| 700/700 [00:00<00:00, 2063.08 examples/s]
Map (num_proc=4): 100%|██████████| 700/700 [00:00<00:00, 2709.09 examples/s]
100%|██████████| 65/65 [00:15<00:00,  4.17it/s]

{'train_runtime': 15.5791, 'train_samples_per_second': 33.378, 'train_steps_per_second': 4.172, 'train_loss': 1.9005618755634015, 'epoch': 1.0}





TrainOutput(global_step=65, training_loss=1.9005618755634015, metrics={'train_runtime': 15.5791, 'train_samples_per_second': 33.378, 'train_steps_per_second': 4.172, 'total_flos': 5041878466560.0, 'train_loss': 1.9005618755634015, 'epoch': 1.0})

In [194]:
prompt = "Once upon a time, there was a little girl named"


input_ids = tokenizer.encode(prompt, return_tensors="pt")
output = model.generate(input_ids.to("cuda"), max_length=400, num_beams=1)
output_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(output_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Once upon a time, there was a little girl named Banana. She was three years old and loved to play with her toys. One day, she was playing with her dolls when she noticed a big, red button. She wondered what it could do.

Banana asked her mom, "What does this button do?"

Her mom said, "It can make things shrink. It's like a magic button that can make things shrink."

Banana was so excited. She wanted to try it out. She pressed the button and suddenly, a big, red button appeared!

Banana was so happy. She ran to show her mom. Her mom smiled and said, "That button is so cool! Let's go outside and play with it."

Banana and her mom went outside and started playing with the button. They had so much fun!Once upon a time, there was a little girl named Banana. She was very curious and loved to explore. One day, she went to the park with her mom and dad. While playing, she saw a big, scary dog. She was scared and wanted to run away.

Her mom said, "Don't worry, Banana. The dog is friendly. He 

In [186]:
model_orig = AutoModelForCausalLM.from_pretrained(
    "roneneldan/TinyStories-8M",
    device_map="auto"
)


In [190]:


input_ids = tokenizer.encode(prompt, return_tensors="pt")

output = model_orig.generate(input_ids.to("cuda"), max_length=1000, num_beams=1)
output_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(output_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Once upon a time, there was a little girl named Lily. She loved to play outside in the park with her friends. One day, Lily and her friends were playing hide and seek when they saw a big, scary dog. The dog was barking loudly and Lily was scared.

Lily's friend, Timmy, said, "Don't worry, Lily. I'll protect you." Timmy ran towards the dog and scared it away. Lily was so happy and said, "Thank you, Timmy. You're my hero!"

Lily and her friends continued to play in the park, but Lily always remembered to be careful around dogs. She knew that she had a special friend who protected her from scary dogs.

