## Setup Model

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset
import os

path_model = "D:\\2025\\Master BKHN\\Ky thuat lap trinh noi dung so\\AI-driven-Virtual-Storyteller\\model"

os.environ["TRANSFORMER_CACHE"] = path_model

model_name = "openai-community/gpt2"
# model_name = "openai/gpt-oss-20b"

# Load model & tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    cache_dir=path_model 
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    cache_dir=path_model)

In [6]:
prompt = "hello, tell me a story about money"
input = tokenizer(prompt, return_tensors="pt")

outputs = model.generate(input.input_ids, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)

output_string = tokenizer.batch_decode(outputs)
print(output_string)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['hello, tell me a story about money back then and I won\'t tell you too much about that."\n\nThen there\'s the nagging question: Is that still the case?\n\n"You really do have a lot of money left over from my last job when I got out. That\'s all about your family," says Bill. "It\'s pretty much all that I\'ve made. I\'m not going to pay out my entire life, all the food is, no school or whatever. The only thing I\'ve done is']


## Handle Dataset

In [None]:
from datasets import load_dataset

short_stories_dataset = load_dataset("roneneldan/TinyStories",
                                     cache_dir="D:\\2025\\Master BKHN\\Ky thuat lap trinh noi dung so\\AI-driven-Virtual-Storyteller\\data")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 2119719/2119719 [00:01<00:00, 1085124.19 examples/s]
Generating validation split: 100%|██████████| 21990/21990 [00:00<00:00, 1126465.53 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2119719
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 21990
    })
})

In [14]:
print(f"length train: {len(short_stories_dataset["train"])}")
print(f"length test: {len(short_stories_dataset["validation"])}")

length train: 2119719
length test: 21990


In [16]:
# try with small story dataset
small_story_dataset = load_dataset(
    "roneneldan/TinyStories",
    cache_dir="D:\\2025\\Master BKHN\\Ky thuat lap trinh noi dung so\\AI-driven-Virtual-Storyteller\\data",
    split="train[:1000]")

small_story_dataset_train = small_story_dataset.train_test_split(train_size=0.8)

In [18]:
[len(x["text"].split(" ")) for x in small_story_dataset_train["train"]]

[139,
 152,
 155,
 143,
 138,
 187,
 262,
 161,
 201,
 139,
 151,
 150,
 90,
 598,
 149,
 120,
 272,
 111,
 153,
 136,
 113,
 152,
 111,
 154,
 156,
 131,
 136,
 342,
 140,
 105,
 136,
 268,
 106,
 298,
 177,
 106,
 414,
 140,
 143,
 299,
 189,
 146,
 147,
 120,
 134,
 272,
 138,
 120,
 129,
 102,
 521,
 462,
 145,
 105,
 163,
 133,
 164,
 156,
 104,
 155,
 94,
 148,
 97,
 194,
 260,
 125,
 112,
 99,
 140,
 132,
 188,
 164,
 177,
 161,
 219,
 216,
 133,
 104,
 151,
 381,
 160,
 79,
 149,
 123,
 176,
 176,
 164,
 119,
 162,
 119,
 117,
 175,
 217,
 131,
 155,
 360,
 179,
 155,
 483,
 167,
 135,
 447,
 127,
 118,
 106,
 143,
 151,
 151,
 333,
 145,
 84,
 253,
 160,
 150,
 130,
 144,
 233,
 155,
 199,
 154,
 141,
 182,
 99,
 285,
 141,
 166,
 194,
 151,
 122,
 246,
 145,
 150,
 171,
 160,
 164,
 179,
 125,
 258,
 137,
 212,
 167,
 206,
 150,
 143,
 241,
 221,
 137,
 135,
 198,
 184,
 176,
 293,
 186,
 162,
 411,
 433,
 215,
 211,
 150,
 138,
 110,
 217,
 171,
 233,
 112,
 165,
 129,
 163,

In [25]:
# tokenize dataset

def preprocess_batch(batch):
    all_text_times = batch["text"]
    trimmed_text_times = [x[:500] for x in all_text_times]
    return tokenizer(trimmed_text_times)

tokenized_dataset = small_story_dataset_train.map(
    preprocess_batch,
    batched=True,
    batch_size=10,
    remove_columns=small_story_dataset_train["train"].column_names,
)

tokenized_dataset

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map: 100%|██████████| 800/800 [00:00<00:00, 5385.98 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 3778.23 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 800
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 200
    })
})

In [28]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
data_collator

DataCollatorForLanguageModeling(tokenizer=GPT2TokenizerFast(name_or_path='openai-community/gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
), mlm=False, mlm_probability=0.15, mask_replace_prob=0.8, random_replace_prob=0.1, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='pt', seed=None)

In [30]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./output",
    eval_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=10
)

trainer = Trainer(
    model=model,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    args=training_args,
    data_collator=data_collator
)

trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,No log,2.103063
2,No log,2.049121
3,No log,2.026361
4,No log,2.009157
5,2.038900,2.004913
6,2.038900,1.998622
7,2.038900,1.99604
8,2.038900,1.991907
9,2.038900,1.991155
10,1.792100,1.99138


TrainOutput(global_step=1000, training_loss=1.9155072631835937, metrics={'train_runtime': 234.0457, 'train_samples_per_second': 34.181, 'train_steps_per_second': 4.273, 'total_flos': 572229550080000.0, 'train_loss': 1.9155072631835937, 'epoch': 10.0})

## Inference

In [31]:
model_infer = AutoModelForCausalLM.from_pretrained("./output/checkpoint-1000/")
model_infer

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [33]:
prompt = "Once"
input = tokenizer(prompt, return_tensors="pt")

outputs = model_infer.generate(input.input_ids, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)

output_string = tokenizer.batch_decode(outputs)
print(output_string)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['Once upon a time, there was a little girl who loved to run and jump. She was three years old and she loved jumping all the time. One day, she saw her teacher, a teacher who loved to teach.\n\nShe explained that her teacher thought running and jumping was important to show their children that running and jumping were important. She asked the teacher why, and she heard a smile. The teacher said, "I don\'t know. It\'s so special."\n\nThe little girl']
