In [35]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer
from datasets import Dataset
import torch

In [36]:
import pandas as pd

# Load dataset
df = pd.read_csv("/content/bbc_headlines_eda.csv")

# Keep only the 'Headline' column
df = df[['Headline']]
df.head()


Unnamed: 0,Headline
0,More Russian strikes as Syrian rebels push sou...
1,Who are the rebels seizing control of Syria's ...
2,Trump picks loyalist ex-aide as FBI director
3,Why would a US fugitive choose to hide in Wales?
4,Trump praises 'very productive' Mar-a-Lago mee...


In [37]:
# Extract headlines as a list
headlines = df['Headline'].tolist()
print(headlines[:5])  # Print first 5 headlines for verification

['More Russian strikes as Syrian rebels push south from Aleppo', "Who are the rebels seizing control of Syria's second city?", 'Trump picks loyalist ex-aide as FBI director', 'Why would a US fugitive choose to hide in Wales?', "Trump praises 'very productive' Mar-a-Lago meeting with Trudeau"]


In [38]:
from transformers import GPT2Tokenizer

# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Add a padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenize the headlines
tokenized_data = tokenizer(
    headlines,
    padding="max_length",  # Ensures all sequences have the same length
    truncation=True,
    max_length=50,         # Adjust based on the typical length of headlines
    return_tensors="pt"    # Return PyTorch tensors
)

# Print the tokenized structure for verification
print(tokenized_data.keys())  # Should contain 'input_ids' and 'attention_mask'


dict_keys(['input_ids', 'attention_mask'])


In [39]:
from datasets import Dataset

# Create a dataset object
dataset = Dataset.from_dict({
    "input_ids": tokenized_data["input_ids"],
    "attention_mask": tokenized_data["attention_mask"]
})

# Add labels (same as input_ids for causal language modeling)
def add_labels(example):
    example["labels"] = example["input_ids"]
    return example

dataset = dataset.map(add_labels, batched=True)

# Split into train and eval datasets
train_size = 0.9
train_dataset = dataset.train_test_split(test_size=1-train_size)["train"]
eval_dataset = dataset.train_test_split(test_size=1-train_size)["test"]

print(train_dataset[0])  # Verify the structure


Map:   0%|          | 0/739 [00:00<?, ? examples/s]

{'input_ids': [3666, 1706, 699, 10530, 8875, 10921, 1817, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [3666, 1706, 699, 10530, 8875, 10921, 1817, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256]}


In [40]:
from transformers import GPT2LMHeadModel

# Load the GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Ensure the padding token is set in the model config
model.config.pad_token_id = tokenizer.pad_token_id


In [41]:
from transformers import TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",             # Directory to save the model
    overwrite_output_dir=True,         # Overwrite the output directory
    num_train_epochs=3,                # Number of training epochs
    per_device_train_batch_size=8,     # Batch size per GPU/CPU
    save_steps=500,                    # Save checkpoint every 500 steps
    save_total_limit=2,                # Only keep the last 2 checkpoints
    logging_dir="./logs",              # Directory for logs
    evaluation_strategy="epoch",       # Evaluate at the end of each epoch
    logging_steps=10,                  # Log training stats every 10 steps
    do_train=True,
    do_eval=True
)




In [42]:
from transformers import Trainer

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer
)


  trainer = Trainer(


In [43]:
trainer.train()


Epoch,Training Loss,Validation Loss
1,0.9593,0.742857


Epoch,Training Loss,Validation Loss
1,0.9593,0.742857
2,0.6886,0.62522
3,0.6737,0.591235


TrainOutput(global_step=252, training_loss=0.8965410334723336, metrics={'train_runtime': 1983.821, 'train_samples_per_second': 1.006, 'train_steps_per_second': 0.127, 'total_flos': 50906016000000.0, 'train_loss': 0.8965410334723336, 'epoch': 3.0})

In [45]:
model.save_pretrained("./fine_tuned_gpt2")
tokenizer.save_pretrained("./fine_tuned_gpt2")

('./fine_tuned_gpt2/tokenizer_config.json',
 './fine_tuned_gpt2/special_tokens_map.json',
 './fine_tuned_gpt2/vocab.json',
 './fine_tuned_gpt2/merges.txt',
 './fine_tuned_gpt2/added_tokens.json')

In [46]:
from transformers import pipeline

# Load the fine-tuned model and tokenizer
fine_tuned_model = GPT2LMHeadModel.from_pretrained("./fine_tuned_gpt2")
fine_tuned_tokenizer = GPT2Tokenizer.from_pretrained("./fine_tuned_gpt2")

# Create a text generation pipeline
text_generator = pipeline("text-generation", model=fine_tuned_model, tokenizer=fine_tuned_tokenizer)

# Generate text from a prompt
prompt = "Breaking news:"
generated_text = text_generator(prompt, max_length=50, num_return_sequences=1)
print(generated_text[0]["generated_text"])


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Breaking news: Children's TV network taking TV production to 'danger zone'


In [47]:
import math

# Evaluate the model
eval_results = trainer.evaluate()

# Calculate perplexity
perplexity = math.exp(eval_results["eval_loss"])
print(f"Perplexity: {perplexity}")


Perplexity: 1.8062175772553821


In [48]:
# Generate text from different prompts
prompts = [
    "Breaking news:",
    "Global economy update:",
    "Technology advancements in AI:",
    "Sports headlines today:",
]

for prompt in prompts:
    generated_text = text_generator(prompt, max_length=50, num_return_sequences=1)
    print(f"Prompt: {prompt}")
    print(f"Generated Text: {generated_text[0]['generated_text']}\n")


Prompt: Breaking news:
Generated Text: Breaking news: AI revolution turning cities into cities

Prompt: Global economy update:
Generated Text: Global economy update: How China's 'big two' firms are doing

Prompt: Technology advancements in AI:
Generated Text: Technology advancements in AI: How to watch a movie on your phone

Prompt: Sports headlines today:
Generated Text: Sports headlines today:

