In [5]:
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast, GPT2Config, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

# Everything below is derived from one checkpoint identifier, so the
# tokenizer, configuration and weights are guaranteed to match each other.
model_name = "gpt2"

# The tokenizer and the configuration do not depend on one another; the
# model is built last because it consumes the configuration object.
tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
config = GPT2Config.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name, config=config)

# Build the training set from a plain-text file, cut into 128-token blocks.
# NOTE(review): TextDataset is flagged as deprecated in recent transformers
# releases in favour of the `datasets` library -- confirm against the
# installed version before upgrading.
file_path = "test_text.txt"
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=file_path,
    block_size=128,
)

# mlm=False turns off masked-language-model masking in the collator, which is
# the setting used for a causal model such as GPT-2.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Hyper-parameters and checkpointing policy handed to the Trainer.
training_args = TrainingArguments(
    # --- where checkpoints go ---
    output_dir="output",
    overwrite_output_dir=True,  # reuse the directory if it already exists
    # --- checkpoint cadence and retention ---
    save_steps=10_000,  # write a checkpoint every 10,000 steps
    save_total_limit=2,  # keep at most two checkpoints on disk
    # --- training schedule ---
    num_train_epochs=3,  # passes over the training set
    per_device_train_batch_size=8,  # examples per device per step
)

# Wire the model, hyper-parameters, collator and training set together, then
# run the fine-tuning loop.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

trainer.train()

# Save the fine-tuned model.
fine_tuned_dir = "fine_tuned_gpt2"
trainer.save_model(fine_tuned_dir)

# The Trainer was not given the tokenizer, so save_model() writes only the
# model config and weights. Save the tokenizer files next to them so the
# directory is a complete checkpoint that both the model and the tokenizer
# classes can be loaded from with from_pretrained(fine_tuned_dir).
tokenizer.save_pretrained(fine_tuned_dir)


***** Running training *****
  Num examples = 7
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3
  Number of trainable parameters = 124439808


  0%|          | 0/3 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to fine_tuned_gpt2
Configuration saved in fine_tuned_gpt2\config.json


{'train_runtime': 673.734, 'train_samples_per_second': 0.031, 'train_steps_per_second': 0.004, 'train_loss': 3.6518309911092124, 'epoch': 3.0}


Model weights saved in fine_tuned_gpt2\pytorch_model.bin
