In [1]:
!pip install datasets transformers rouge_score tqdm



In [2]:
import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, EarlyStoppingCallback
import torch
from transformers import DataCollatorForLanguageModeling

# Load the model and tokenizer
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
foundation_model = AutoModelForCausalLM.from_pretrained(model_name)

# Freeze all model parameters except the language modeling (LM) head
for param in foundation_model.parameters():
    param.requires_grad = False
for param in foundation_model.lm_head.parameters():
    param.requires_grad = True



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
import transformers
import time
from rouge_score import rouge_scorer  # Using rouge_score directly


In [4]:
foundation_model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [5]:
# Check the number of parameters to verify
total_params = sum(p.numel() for p in foundation_model.parameters())
trainable_params = sum(p.numel() for p in foundation_model.parameters() if p.requires_grad)
print(f"Total parameters: {total_params}")
print(f"Trainable parameters: {trainable_params}")

Total parameters: 124439808
Trainable parameters: 38597376


In [6]:

# Load and preprocess the dataset
data = load_dataset("cnn_dailymail", "3.0.0")
tokenizer.pad_token = tokenizer.eos_token

# Using only a small sample of the data
train_size, test_size, validation_size = [int(0.01 * len(data[split])) for split in ['train', 'test', 'validation']]
train_sample = data['train'].select(range(train_size))
test_sample = data['test'].select(range(test_size))
validation_sample = data['validation'].select(range(validation_size))

# Preprocessing function
def preprocess_function(examples):
    model_inputs = tokenizer(examples["article"], max_length=512, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["highlights"], max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_data_train = train_sample.map(preprocess_function, batched=True)
tokenized_data_test = test_sample.map(preprocess_function, batched=True)
tokenized_data_validation = validation_sample.map(preprocess_function, batched=True)


Map:   0%|          | 0/114 [00:00<?, ? examples/s]



In [7]:
# Define output directory for the fine-tuned model
output_directory = "./gpt2_finetuned_last_layers"

# Initialize ROUGE scorer
rouge_scorer_instance = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

# Compute added parameters
added_params = sum(p.numel() for p in foundation_model.lm_head.parameters() if p.requires_grad)

# Track computation time
start_time = time.time()

In [8]:
# Define training arguments
training_args = TrainingArguments(
    output_dir=output_directory,
    evaluation_strategy="steps",
    eval_steps=100,
    logging_steps=50,
    save_steps=100,
    learning_rate=3e-4,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    report_to="none",
    no_cuda=False,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
)




In [9]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Accumulate scores for each metric
    rouge1_scores, rouge2_scores, rougeL_scores = [], [], []
    for pred, label in zip(decoded_preds, decoded_labels):
        scores = rouge_scorer_instance.score(pred, label)
        rouge1_scores.append(scores["rouge1"].fmeasure)
        rouge2_scores.append(scores["rouge2"].fmeasure)
        rougeL_scores.append(scores["rougeL"].fmeasure)

    # Calculate mean scores for each ROUGE metric
    result = {
        "rouge1": sum(rouge1_scores) / len(rouge1_scores) * 100,
        "rouge2": sum(rouge2_scores) / len(rouge2_scores) * 100,
        "rougeL": sum(rougeL_scores) / len(rougeL_scores) * 100
    }
    return result

In [10]:
trainer = Trainer(
    model=foundation_model,
    args=training_args,
    train_dataset=tokenized_data_train,
    eval_dataset=tokenized_data_validation,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
)

In [11]:
# Start training
trainer.train()


Step,Training Loss,Validation Loss
100,3.316,3.257493
200,3.2214,3.238957
300,3.2457,3.218311
400,3.2132,3.210707
500,3.1551,3.195258
600,3.1184,3.191778
700,3.137,3.182373
800,2.9532,3.181291
900,2.9941,3.178679
1000,2.9619,3.176179


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=1500, training_loss=3.0644842325846353, metrics={'train_runtime': 1053.2679, 'train_samples_per_second': 8.177, 'train_steps_per_second': 2.045, 'total_flos': 1567229607936000.0, 'train_loss': 3.0644842325846353, 'epoch': 2.0891364902506964})

In [12]:
# Track training end time
end_time = time.time()
total_time = end_time - start_time



In [14]:
print("Training time (seconds):", total_time)
print("Added parameters:", added_params)

Training time (seconds): 1057.6311511993408
Added parameters: 38597376


In [18]:
# Evaluate the model
test_results = trainer.evaluate(tokenized_data_test)
print("Test results:", test_results)

Test results: {'eval_loss': 3.1245720386505127, 'eval_runtime': 5.04, 'eval_samples_per_second': 22.619, 'eval_steps_per_second': 2.976, 'epoch': 2.0891364902506964}


In [22]:
# prompt: correct syntax PYTORCH_CUDA_ALLOC_CONF=expandable_segments:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments"

In [27]:
testing_input_text=tokenized_data_test[0]["article"]
testing_output_text=tokenized_data_test[0]["highlights"]

print(testing_input_text)
print(testing_output_text)

(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday's ceremony, sa

In [48]:
testing_input_text=tokenized_data_test[0]["article"]
testing_output_text=tokenized_data_test[0]["highlights"]

model=foundation_model
model.eval()

# Adjust the generate_summary function to handle device compatibility and output length
def generate_summary(text, max_new_tokens=128):
    model.config.pad_token_id = tokenizer.eos_token_id
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    tokenizer.pad_token = tokenizer.eos_token
    with torch.no_grad():
        output = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            pad_token_id=tokenizer.eos_token_id,
            max_new_tokens=max_new_tokens,  # Controls only the generated text length
            num_beams=5,
            early_stopping=True
        )

    return tokenizer.decode(output[0][:max_new_tokens], skip_special_tokens=True)

# Generate summary for the test input
generated_summary = generate_summary(testing_input_text)
print("Generated Summary:", generated_summary)

# Calculate ROUGE scores
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
scores = scorer.score(generated_summary, testing_output_text)

# Print ROUGE scores as percentages
print("ROUGE-1 Score:", scores["rouge1"].fmeasure )
print("ROUGE-2 Score:", scores["rouge2"].fmeasure )
print("ROUGE-L Score:", scores["rougeL"].fmeasure )

Generated Summary: (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of
ROUGE-1 Score: 0.3076923076923077
ROUGE-2 Score: 0.18439716312056736
ROUGE-L Score: 0.29370629370629375


In [50]:
!pip install tqdm



In [None]:
from tqdm import tqdm  # Import tqdm for progress bar

# Initialize lists to store ROUGE scores for all test samples
rouge1_scores, rouge2_scores, rougeL_scores = [], [], []

# Loop through the test dataset with progress tracking
for test_example in tqdm(tokenized_data_test, desc="Processing test examples", unit="example"):
    testing_input_text = tokenizer.decode(test_example["input_ids"], skip_special_tokens=True)
    testing_output_text = tokenizer.decode(test_example["labels"], skip_special_tokens=True)

    # Generate summary for each test input
    generated_summary = generate_summary(testing_input_text)

    # Calculate ROUGE scores for the generated summary against the reference summary
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    scores = scorer.score(generated_summary, testing_output_text)

    # Append each score to the respective list
    rouge1_scores.append(scores["rouge1"].fmeasure)
    rouge2_scores.append(scores["rouge2"].fmeasure)
    rougeL_scores.append(scores["rougeL"].fmeasure)


In [52]:

# Calculate the average ROUGE scores over the entire test dataset
average_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
average_rouge2 = sum(rouge2_scores) / len(rouge2_scores)
average_rougeL = sum(rougeL_scores) / len(rougeL_scores)

# Print the average ROUGE scores
print("Average ROUGE-1 Score:", average_rouge1)
print("Average ROUGE-2 Score:", average_rouge2)
print("Average ROUGE-L Score:", average_rougeL)

Average ROUGE-1 Score: 0.2819427289327106
Average ROUGE-2 Score: 0.12021621026278283
Average ROUGE-L Score: 0.19449581735053545
