# ***TASK 1***

In [22]:
!pip install transformers datasets torch pandas scikit-learn



In [23]:
import pandas as pd
from datasets import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
import torch

In [32]:
df = pd.read_csv("/content/bbc_headlines_eda.csv")
df.head()

Unnamed: 0,URL,Headline,headline_length
0,https://www.bbc.com/news,More Russian strikes as Syrian rebels push sou...,60
1,https://www.bbc.com/news,Who are the rebels seizing control of Syria's ...,58
2,https://www.bbc.com/news,Trump picks loyalist ex-aide as FBI director,44
3,https://www.bbc.com/news,Why would a US fugitive choose to hide in Wales?,48
4,https://www.bbc.com/news,Trump praises 'very productive' Mar-a-Lago mee...,63


In [33]:
# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Add padding token to the tokenizer (since GPT-2 doesn't have one by default)
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the headlines
def tokenize_function(examples):
    return tokenizer(examples['Headline'], truncation=True, padding='max_length', max_length=64)

# Prepare the dataset
dataset = Dataset.from_pandas(df[['Headline']])
dataset = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/739 [00:00<?, ? examples/s]

In [34]:
# Split dataset into training and validation sets (80% train, 20% validation)
train_test = dataset.train_test_split(test_size=0.2)
train_dataset = train_test['train']
eval_dataset = train_test['test']

In [35]:
# Load the pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Resize token embeddings to account for the new padding token
model.resize_token_embeddings(len(tokenizer))


Embedding(50257, 768)

In [39]:
# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Set the padding token to be the same as the eos_token
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the headlines
def tokenize_function(examples):
    return tokenizer(examples['Headline'], padding="max_length", truncation=True, max_length=128)

# Apply the tokenizer to the 'Headline' column
tokenized_data = df['Headline'].apply(lambda x: tokenize_function({'Headline': x}))


In [44]:
from datasets import Dataset

# Create the dataset from input_ids and corresponding labels
train_dataset = Dataset.from_dict({
    "input_ids": [item['input_ids'] for item in tokenized_data],
    "attention_mask": [item['attention_mask'] for item in tokenized_data],
    "labels": [item['input_ids'] for item in tokenized_data]  # Labels are same as input_ids
})

# Optional: Split the dataset into training and validation datasets
train_dataset = train_dataset.train_test_split(test_size=0.1)


In [45]:
from transformers import GPT2LMHeadModel

# Load pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Add special tokens to the model (padding token if added previously)
model.resize_token_embeddings(len(tokenizer))


Embedding(50257, 768)

In [46]:
from transformers import Trainer, TrainingArguments

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",              # Output directory for saving model and logs
    evaluation_strategy="epoch",         # Evaluate every epoch (you can also choose "steps")
    learning_rate=5e-5,                  # Learning rate for the optimizer
    per_device_train_batch_size=8,       # Batch size for training
    per_device_eval_batch_size=8,        # Batch size for evaluation
    num_train_epochs=3,                  # Number of epochs
    logging_dir="./logs",                # Directory for storing logs
)

# Initialize the Trainer
trainer = Trainer(
    model=model,                         # The model to train
    args=training_args,                  # Training arguments
    train_dataset=train_dataset["train"], # Training dataset
    eval_dataset=train_dataset["test"],  # Evaluation dataset (optional)
)


In [47]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.330468
2,No log,0.31367
3,No log,0.309826


TrainOutput(global_step=252, training_loss=0.42799295697893414, metrics={'train_runtime': 5002.3245, 'train_samples_per_second': 0.399, 'train_steps_per_second': 0.05, 'total_flos': 130319400960000.0, 'train_loss': 0.42799295697893414, 'epoch': 3.0})

In [48]:
model.save_pretrained("/content")
tokenizer.save_pretrained("/content")

('/content/tokenizer_config.json',
 '/content/special_tokens_map.json',
 '/content/vocab.json',
 '/content/merges.txt',
 '/content/added_tokens.json')

In [49]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the model
model = GPT2LMHeadModel.from_pretrained("/content")

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("/content")


In [51]:
input_text = "Breaking news: The latest developments in the global economy"

# Ensure pad_token_id is set
tokenizer.pad_token = tokenizer.eos_token

# Encode input text with padding and attention mask
input_ids = tokenizer.encode(input_text, return_tensors="pt", padding=True, truncation=True)

# Create attention mask (1 for actual tokens, 0 for padding)
attention_mask = torch.ones(input_ids.shape, device=input_ids.device)

# Generate text
output = model.generate(input_ids, attention_mask=attention_mask, max_length=150, num_return_sequences=1)

# Decode the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Breaking news: The latest developments in the global economy
