# Fine-Tuning GPT-2 on AG News

### 0. Setup

In [1]:
import torch

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Is CUDA available: ", torch.cuda.is_available())
print("Number of GPUs: ", torch.cuda.device_count())
print("Current device: ", torch.cuda.current_device())
print(f"Using device: {device}")

Is CUDA available:  True
Number of GPUs:  1
Current device:  0
Using device: cuda


### 1. Preparing the Dataset

#### 1.1. Loading the AG News Dataset

In [2]:
from datasets import load_dataset
from transformers import Trainer, TrainingArguments, GPT2LMHeadModel

# Load AG News dataset
dataset = load_dataset("ag_news")

# Check dataset structure
print(dataset)
print(dataset['train'][0])  # Example of the first training sample

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})
{'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.", 'label': 2}


#### 1.2. Preprocessing the Dataset

In [3]:
# Extract only the text field
def prepare_text_data(example):
    return {"text": example["text"]}

# Apply preprocessing
processed_dataset = dataset.map(prepare_text_data, remove_columns=dataset["train"].column_names)

# Check the processed dataset
print(processed_dataset["train"][0])

{'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again."}


#### 1.3. Limiting the Dataset Size

In [4]:
# Define the ratio (e.g., 10%)
limit_ratio = 0.01

# Calculate the limit for the dataset
train_limit = int(len(processed_dataset["train"]) * limit_ratio)
test_limit = int(len(processed_dataset["test"]) * limit_ratio)

# Select a subset of the dataset
limited_train_dataset = processed_dataset["train"].select(range(train_limit))
limited_test_dataset = processed_dataset["test"].select(range(test_limit))

# Check the size of the limited dataset
print(f"Original train size: {len(processed_dataset['train'])}")
print(f"Limited train size: {len(limited_train_dataset)}")


Original train size: 120000
Limited train size: 1200


#### 1.4. Tokenizing the Dataset

In [5]:
from transformers import GPT2Tokenizer

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Ensure the tokenizer has a pad token
tokenizer.pad_token = tokenizer.eos_token

# Define a tokenization function
def tokenize_and_prepare_labels(example):
    tokens = tokenizer(example["text"], padding="max_length", truncation=True, max_length=256)
    tokens["labels"] = tokens["input_ids"].copy()  # Labels should match input_ids for language modeling
    return tokens

# Apply tokenization to the datasets
tokenized_train_dataset = limited_train_dataset.map(
    tokenize_and_prepare_labels,
    batched=True,
    remove_columns=["text"]
)
tokenized_test_dataset = limited_test_dataset.map(
    tokenize_and_prepare_labels,
    batched=True,
    remove_columns=["text"]
)

# Format the datasets for PyTorch
tokenized_train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

### 2. Fine-Tuning GPT-2

In [6]:

# Load the GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
model.resize_token_embeddings(len(tokenizer))

# Define training arguments
training_args = TrainingArguments(
    output_dir="gpt2-english",
    eval_strategy="epoch",
    save_strategy="no",
    learning_rate=5e-5,
    per_device_train_batch_size=8,  # Use a larger batch size if GPU memory allows
    gradient_accumulation_steps=2,  # Accumulate gradients for larger effective batch size
    num_train_epochs=10,
    weight_decay=0.01,
    save_total_limit=2,
    fp16=True,  # Use mixed precision training for faster computations (requires GPU)
    load_best_model_at_end=False,
)

# Initialize the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)

# Fine-tune the model
trainer.train()

# Save the model
model.save_pretrained("./gpt2-english")
tokenizer.save_pretrained("./gpt2-english")

Epoch,Training Loss,Validation Loss
1,No log,0.78941
2,No log,0.763629
3,No log,0.757206
4,No log,0.757422
5,No log,0.760376
6,No log,0.763089
7,0.803200,0.769366
8,0.803200,0.772763
9,0.803200,0.775875
10,0.803200,0.777806


('./gpt2-english\\tokenizer_config.json',
 './gpt2-english\\special_tokens_map.json',
 './gpt2-english\\vocab.json',
 './gpt2-english\\merges.txt',
 './gpt2-english\\added_tokens.json')

### 3. Evaluating the Model

In [7]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load original GPT-2
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
original_model = GPT2LMHeadModel.from_pretrained("gpt2")
original_model.eval()

# Load fine-tuned GPT-2
fine_tuned_model = GPT2LMHeadModel.from_pretrained("gpt2-english")
fine_tuned_model.eval()

prompt = "Left or right?"

# Generate with original GPT-2
inputs = tokenizer(prompt, return_tensors="pt")
outputs_original = original_model.generate(
    inputs.input_ids, max_length=50, temperature=0.7, top_p=0.9
)
print("Original GPT-2:", tokenizer.decode(outputs_original[0], skip_special_tokens=True))

# Generate with fine-tuned GPT-2
outputs_fine_tuned = fine_tuned_model.generate(
    inputs.input_ids, max_length=50, temperature=0.7, top_p=0.9
)
print("Fine-tuned GPT-2:", tokenizer.decode(outputs_fine_tuned[0], skip_special_tokens=True))


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Original GPT-2: Left or right?

The answer is that the right is the most important thing. It is the most important thing to you. It is the most important thing to your family. It is the most important thing to your friends. It is the
Fine-tuned GPT-2: Left or right? The difference between the two is that the left is more likely to be a good player and the right more likely to be a bad one.
