## Setup Model

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset
import os

path_model = "D:\\2025\\Master BKHN\\Ky thuat lap trinh noi dung so\\AI-driven-Virtual-Storyteller\\models"

os.environ["TRANSFORMER_CACHE"] = path_model

model_name = "openai-community/gpt2"
# model_name = "openai/gpt-oss-20b"

# Load model & tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    cache_dir=path_model 
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    cache_dir=path_model)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
prompt = "hello, tell me a story about money"
input = tokenizer(prompt, return_tensors="pt")

outputs = model.generate(input.input_ids, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)

output_string = tokenizer.batch_decode(outputs)
print(output_string)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


["hello, tell me a story about money.\n\nL.C.\n\nNo one should lose money with this stuff. Even your granddad, if you're going to do this.\n\nI'm not kidding when I say you can do a better job of doing this.\n\nI don't know what these people thought of you when you were writing to them.\n\nSo I don't really care. I'd love to, if only we could be friends like this!\n\nYou can call. I'll"]


## Handle Dataset

In [3]:
from datasets import load_dataset

short_stories_dataset = load_dataset("roneneldan/TinyStories",
                                     cache_dir="D:\\2025\\Master BKHN\\Ky thuat lap trinh noi dung so\\AI-driven-Virtual-Storyteller\\data")

In [4]:
print(f"length train: {len(short_stories_dataset["train"])}")
print(f"length test: {len(short_stories_dataset["validation"])}")

length train: 2119719
length test: 21990


In [5]:
# try with small story dataset
small_story_dataset = load_dataset(
    "roneneldan/TinyStories",
    cache_dir="D:\\2025\\Master BKHN\\Ky thuat lap trinh noi dung so\\AI-driven-Virtual-Storyteller\\data",
    split="train[:1000]")

small_story_dataset_train = small_story_dataset.train_test_split(train_size=0.8)

In [6]:
[len(x["text"].split(" ")) for x in small_story_dataset_train["train"]]

[105,
 174,
 277,
 150,
 147,
 117,
 158,
 418,
 168,
 145,
 296,
 117,
 192,
 166,
 176,
 93,
 198,
 138,
 175,
 138,
 102,
 120,
 176,
 156,
 137,
 319,
 94,
 148,
 144,
 231,
 276,
 192,
 165,
 270,
 121,
 189,
 84,
 207,
 208,
 521,
 184,
 114,
 129,
 128,
 163,
 72,
 276,
 196,
 187,
 114,
 202,
 102,
 278,
 141,
 151,
 117,
 370,
 184,
 138,
 163,
 165,
 441,
 197,
 232,
 163,
 153,
 151,
 112,
 190,
 442,
 137,
 164,
 165,
 310,
 278,
 136,
 129,
 294,
 146,
 154,
 125,
 272,
 110,
 217,
 141,
 374,
 133,
 137,
 129,
 97,
 160,
 136,
 110,
 157,
 138,
 148,
 101,
 143,
 165,
 217,
 158,
 300,
 129,
 188,
 126,
 163,
 95,
 147,
 175,
 323,
 147,
 116,
 174,
 145,
 219,
 130,
 107,
 364,
 143,
 94,
 157,
 203,
 157,
 119,
 194,
 138,
 159,
 130,
 176,
 151,
 596,
 150,
 156,
 196,
 137,
 208,
 138,
 172,
 317,
 120,
 202,
 110,
 150,
 496,
 181,
 110,
 155,
 127,
 120,
 97,
 183,
 129,
 170,
 166,
 167,
 161,
 360,
 215,
 164,
 137,
 105,
 151,
 267,
 116,
 216,
 119,
 148,
 180,


In [7]:
# tokenize dataset

def preprocess_batch(batch):
    all_text_times = batch["text"]
    trimmed_text_times = [x[:500] for x in all_text_times]
    return tokenizer(trimmed_text_times)

tokenized_dataset = small_story_dataset_train.map(
    preprocess_batch,
    batched=True,
    batch_size=10,
    remove_columns=small_story_dataset_train["train"].column_names,
)

tokenized_dataset

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map: 100%|██████████| 800/800 [00:00<00:00, 3611.95 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 3155.07 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 800
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 200
    })
})

In [8]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
data_collator

DataCollatorForLanguageModeling(tokenizer=GPT2TokenizerFast(name_or_path='openai-community/gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
), mlm=False, mlm_probability=0.15, mask_replace_prob=0.8, random_replace_prob=0.1, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='pt', seed=None)

In [9]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./output",
    eval_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=10
)

trainer = Trainer(
    model=model,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    args=training_args,
    data_collator=data_collator
)

trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,No log,2.127047
2,No log,2.061836
3,No log,2.033153
4,No log,2.018727
5,2.037600,2.008997
6,2.037600,2.001841
7,2.037600,1.998406
8,2.037600,1.996774
9,2.037600,1.996893
10,1.789100,1.997239


TrainOutput(global_step=1000, training_loss=1.9133826293945313, metrics={'train_runtime': 236.2687, 'train_samples_per_second': 33.86, 'train_steps_per_second': 4.232, 'total_flos': 573470687232000.0, 'train_loss': 1.9133826293945313, 'epoch': 10.0})

## Inference

In [10]:
model_infer = AutoModelForCausalLM.from_pretrained("./output/checkpoint-1000/")
model_infer

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [11]:
prompt = "Once"
input = tokenizer(prompt, return_tensors="pt")

outputs = model_infer.generate(input.input_ids, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)

output_string = tokenizer.batch_decode(outputs)
print(output_string)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


["Once and a time there was a young boy named Timmy. Timmy loved to play with his toys and was always eager to get something new. One day, Timmy went to the park and got to play with his toys. He quickly picked up some shiny blocks and started to play with them. Soon enough, he found himself getting more and more excited. \n\nTimmy's Mommy explained to Timmy how important it was to have a healthy snack every day. Timmy was so"]
