In [None]:
from datasets import load_dataset

# Load the 'wikitext-2-raw-v1' configuration of the 'wikitext' dataset;
# later cells index the result by split name ('train', 'validation').
dataset = load_dataset(path='wikitext', name='wikitext-2-raw-v1')

In [None]:
from transformers import GPT2Tokenizer

# Load the pretrained tokenizer that matches the 'gpt2' checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Reuse the end-of-sequence token as the padding token.
# This is necessary because GPT-2 does not have a separate pad token.
# Consequence: padded positions carry the same token id as EOS, so only the
# attention mask can tell real tokens from padding.
tokenizer.pad_token = tokenizer.eos_token

def tokenize_and_prepare_labels(examples):
    """Tokenize texts and attach causal-LM labels with padding masked out.

    Args:
        examples: Mapping with a ``"text"`` entry. With ``batched=True`` this
            is a list of strings; a single string is handled as well.

    Returns:
        The tokenizer output with an added ``"labels"`` entry: a copy of
        ``input_ids`` in which every position whose attention mask is 0 is
        replaced by -100.
    """
    ignore_index = -100  # label value that the model's loss function skips

    def mask_padding(token_ids, mask):
        # Keep the token id where the position is attended, hide it otherwise.
        return [tok if attended else ignore_index for tok, attended in zip(token_ids, mask)]

    # Tokenize the texts, padding/truncating every row to exactly 512 tokens.
    tokenized_inputs = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

    input_ids = tokenized_inputs["input_ids"]
    attention_mask = tokenized_inputs["attention_mask"]

    # Labels are the input ids themselves: the model shifts them internally to
    # form next-token targets, so no manual shifting is done here.
    # Padding must not contribute to the loss. Because the pad token is the
    # EOS token, unmasked padding would train the model to keep emitting EOS.
    if input_ids and isinstance(input_ids[0], int):
        # Unbatched call: a single flat sequence.
        tokenized_inputs["labels"] = mask_padding(input_ids, attention_mask)
    else:
        tokenized_inputs["labels"] = [
            mask_padding(row_ids, row_mask)
            for row_ids, row_mask in zip(input_ids, attention_mask)
        ]
    return tokenized_inputs


def _has_next_token_target(example):
    """Return True when a row has at least two real (non-padding) tokens."""
    # After the internal label shift, a row with fewer than two real tokens
    # has no valid target; a batch made only of such rows gives a NaN loss.
    return sum(example["attention_mask"]) >= 2


# Apply the function to every split, then drop rows (e.g. empty lines) that
# cannot contribute any prediction target.
tokenized_datasets = dataset.map(tokenize_and_prepare_labels, batched=True).filter(_has_next_token_target)



In [None]:
from transformers import GPT2LMHeadModel

# Load the pretrained 'gpt2' checkpoint with its language-modeling head;
# this is the model object handed to the Trainer further down.
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [None]:
# Optional: when running on Google Colab, mount Google Drive so files stored
# there are reachable under /content/drive.
# The google.colab package only exists inside Colab, so the import is guarded
# to keep "Restart & Run All" working in any other environment.
try:
    from google.colab import drive
except ImportError:
    drive = None  # not on Colab: there is no Drive to mount, skip this step
else:
    drive.mount('/content/drive')

In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # -- where artifacts are written --
    output_dir="./output",           # model checkpoints
    logging_dir="./logs",            # log files
    logging_steps=10,                # logging interval, in steps
    # -- how much data per step, and for how long --
    num_train_epochs=1,              # passes over the training split
    per_device_train_batch_size=4,   # training batch size per device
    per_device_eval_batch_size=4,    # evaluation batch size per device
    # -- optimisation schedule and regularisation --
    warmup_steps=500,                # learning-rate warmup steps
    weight_decay=0.01,               # weight-decay strength
)

# Bind the model and the configuration to the tokenized train/validation splits.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)


In [None]:
# Start fine-tuning with the configuration above. Left as the cell's final
# bare expression so the notebook displays whatever train() returns.
trainer.train()