In [None]:
# Step 1: Install required libraries
!pip install transformers datasets tensorflow

# Step 2: Import libraries
import tensorflow as tf
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel, DataCollatorForLanguageModeling
from datasets import load_dataset

# Step 2.1: Create sample text data
# A short fairy tale used as the fine-tuning corpus. Paragraphs are separated
# by blank lines. The literal itself starts and ends with a newline; those are
# removed by the .strip() call when the text is written to disk below.
sample_data = """
Once upon a time, in a land far, far away, there lived a young princess who loved to explore the forest near her castle. She would spend hours wandering among the trees, listening to the birds sing and watching the animals play.

One day, while she was exploring a part of the forest she had never been to before, she stumbled upon a hidden clearing. In the center of the clearing was a beautiful, sparkling fountain. The princess was mesmerized by the fountain and spent hours just watching the water flow.

As the sun began to set, she realized it was time to head back to the castle. She promised herself that she would return to the clearing the next day to explore it further. Little did she know, the fountain had a magical secret that would change her life forever.

The next day, as she approached the clearing, she saw a figure standing by the fountain. It was an old woman, dressed in a cloak made of shimmering fabric. The woman smiled warmly at the princess and beckoned her closer.

"Welcome, dear child," the woman said. "I have been waiting for you. This fountain is not just any fountain; it is a magical fountain that grants wishes to those who find it. You have a pure heart, and so I will grant you one wish."

The princess was astonished. She had heard stories of magical fountains, but she never thought she would find one. She took a deep breath and made her wish. "I wish for peace and happiness for everyone in my kingdom," she said.

The old woman nodded and waved her hand over the fountain. A bright light shone from the water, and the princess felt a warm, comforting sensation wash over her. She knew that her wish had been granted.

From that day on, the kingdom flourished. The people were happy and prosperous, and the princess continued to visit the fountain, grateful for the magic that had brought so much joy to her home.
"""

# Step 2.2: Save the dataset to a file
# Write the file as UTF-8 explicitly. Without `encoding`, open() falls back to
# the platform default, which is not guaranteed to be UTF-8, while the `text`
# loader used below decodes the file as UTF-8.
with open('custom_dataset.txt', 'w', encoding='utf-8') as file:
    # strip() drops the leading/trailing newline of the triple-quoted literal.
    file.write(sample_data.strip())

# Step 2.3: Load your dataset (custom text file)
# The 'text' builder exposes the file as a 'train' split with a single 'text'
# column.
dataset = load_dataset('text', data_files={'train': 'custom_dataset.txt'})

# Step 2.4: Tokenize the dataset
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# The GPT-2 tokenizer defines no padding token of its own; reuse EOS so that
# padding='max_length' in tokenize_function has a token to pad with.
tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize a batch of rows into fixed-length model inputs.

    Args:
        examples: Batch mapping with a ``'text'`` entry holding the raw strings.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch, with
        every sequence padded or truncated to 128 tokens.
    """
    texts = examples['text']
    # Fixed-length encoding: pad short rows and cut long ones at 128 tokens.
    encoding_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 128,
    }
    return tokenizer(texts, **encoding_options)

# The text loader yields one row per line, so the blank lines that separate the
# paragraphs would otherwise become rows consisting entirely of padding. Drop
# them before tokenizing so every training row carries real text.
non_empty_dataset = dataset.filter(lambda example: example['text'].strip() != '')

# Tokenize in batches and drop the raw 'text' column, leaving only the
# tokenizer outputs.
tokenized_datasets = non_empty_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['text'],
)

# Step 2.5: Prepare the dataset for training
def convert_to_tf_dataset(tokenized_datasets):
    """Expose the tokenized ``'train'`` split as a ``tf.data.Dataset``.

    Each element is a ``(features, labels)`` pair. ``features`` is a dict with
    ``input_ids`` and ``attention_mask``; ``labels`` is the same ``input_ids``
    sequence. All three are 1-D ``tf.int32`` tensors of unspecified length.

    Args:
        tokenized_datasets: Mapping with a ``'train'`` entry whose rows provide
            ``input_ids`` and ``attention_mask``.

    Returns:
        A ``tf.data.Dataset`` built from a generator over the split; batching
        and shuffling are left to the caller.
    """
    def gen():
        for ex in tokenized_datasets['train']:
            features = {
                'input_ids': ex['input_ids'],
                'attention_mask': ex['attention_mask'],
            }
            # Labels are the input ids themselves.
            yield features, ex['input_ids']

    # `output_signature` replaces the deprecated positional
    # output_types/output_shapes arguments (TensorFlow warns
    # "Use output_signature instead" for the old form).
    token_spec = tf.TensorSpec(shape=(None,), dtype=tf.int32)
    return tf.data.Dataset.from_generator(
        gen,
        output_signature=(
            {'input_ids': token_spec, 'attention_mask': token_spec},
            token_spec,
        ),
    )

# Shuffle with a buffer of 1000 elements and group rows into batches of 4.
tf_train_dataset = convert_to_tf_dataset(tokenized_datasets).shuffle(1000).batch(4)

# Step 3: Load pre-trained GPT-2 model
model = TFGPT2LMHeadModel.from_pretrained('gpt2')

# Step 4: Fine-tune the model

# Step 4.1: Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

# No Keras loss is passed on purpose. The labels are the unshifted input_ids,
# so an external SparseCategoricalCrossentropy on the raw logits would compare
# the prediction at position t with the token at position t and train the model
# to copy its input instead of predicting the next token. Compiling without a
# loss makes the model fall back to its built-in causal language-modeling loss,
# which shifts the labels by one position internally.
# NOTE(review): padded positions still count towards the loss, because the
# labels contain the pad token (set to EOS above) rather than an ignore index.
model.compile(optimizer=optimizer)

# Step 4.2: Train the model
model.fit(tf_train_dataset, epochs=3)

# Step 5: Evaluate the model

# Step 5.1: Generate text
input_text = "Once upon a time"
# Call the tokenizer directly so the attention mask is returned with the ids.
encoded_prompt = tokenizer(input_text, return_tensors='tf')
input_ids = encoded_prompt['input_ids']
output = model.generate(
    input_ids,
    # Pad and EOS are the same token here, so generate() cannot infer the mask
    # from the ids; pass the mask and the pad id explicitly.
    attention_mask=encoded_prompt['attention_mask'],
    pad_token_id=tokenizer.eos_token_id,
    max_length=50,
    num_return_sequences=1,
)

# Decode the single returned sequence, dropping special tokens such as EOS.
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)




Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Instructions for updating:
Use output_signature instead
Instructions for updating:
Use output_signature instead
All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Epoch 1/3


Cause: for/else statement not yet supported


Cause: for/else statement not yet supported
Epoch 2/3