In [1]:
!pip install datasets
!pip install evaluate
!pip install rouge_score
!pip install bert_score
!pip install bitsandbytes

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K

In [2]:
!huggingface-cli login
#export HF_HOME = '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'


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: read).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your term

In [3]:
# Improved Evaluation Script for Text Summarization
# Author: [Your Name]

import json
import random
from typing import List, Dict, Any

import nltk
import torch
from datasets import load_dataset, DatasetDict
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, TextGenerationPipeline, BitsAndBytesConfig
import evaluate

# Constants
SUBSET = 10  # number of rows drawn from the evaluation split
SEED = 42
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 1  # rows passed to call_model per Dataset.map batch
RESPONSE_TEMPLATE = "### Summary:"  # marker that closes the prompt
PROMPT_TEMPLATE = "### Text:"  # marker that opens the prompt
USE_SPLIT = "test"

# Seed every RNG the notebook relies on: Python's `random` drives the sample
# selection, torch drives any sampling-based text generation.
random.seed(SEED)
torch.manual_seed(SEED)

# Sentence-tokenizer data used by nltk.sent_tokenize. Older NLTK releases read
# the 'punkt' package, newer ones (3.9+) read 'punkt_tab'; fetch both so the
# notebook works regardless of the installed NLTK version.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

def postprocess_text(preds: List[str], labels: List[str]) -> tuple:
    """Rewrite every prediction and reference with one sentence per line.

    Args:
        preds: Generated summaries.
        labels: Reference summaries.

    Returns:
        A ``(preds, labels)`` pair of new lists in which each string has been
        split with ``nltk.sent_tokenize`` and re-joined with newlines.
    """
    def one_sentence_per_line(document: str) -> str:
        # Sentence boundaries come from NLTK; the newline is the separator.
        return "\n".join(nltk.sent_tokenize(document))

    split_preds = list(map(one_sentence_per_line, preds))
    split_labels = list(map(one_sentence_per_line, labels))
    return split_preds, split_labels

def generate_prompt_examples(texts: List[str]) -> List[str]:
    """Wrap each article in the prompt layout the model is queried with.

    Each prompt is three lines: ``PROMPT_TEMPLATE``, the article text, then
    ``RESPONSE_TEMPLATE``, so the model's continuation is the summary.

    Args:
        texts: Articles to summarize.

    Returns:
        One prompt string per input text, in the same order.
    """
    prompts = []
    for article in texts:
        prompts.append(f"{PROMPT_TEMPLATE}\n{article}\n{RESPONSE_TEMPLATE}")
    return prompts

def call_model(examples: Dict[str, Any], pipeline: TextGenerationPipeline) -> Dict[str, Any]:
    """Generate one summary per article in a batch.

    Args:
        examples: Batch in column form; ``examples["text"]`` holds the
            articles and ``examples["description"]`` the reference summaries.
        pipeline: Called as ``pipeline(prompt, max_new_tokens=512)``; the
            first result must carry a ``"generated_text"`` string.

    Returns:
        A dict with the original ``"text"`` column, the stripped generated
        summaries under ``"outputs"`` and the references under ``"labels"``.
    """
    prompts = generate_prompt_examples(examples["text"])
    outputs = []
    for prompt in prompts:
        generated_text = pipeline(prompt, max_new_tokens=512)[0]["generated_text"]
        if generated_text.startswith(prompt):
            # Normal case: the pipeline echoes the prompt. Cut exactly the
            # prompt so a "### Summary:" inside the article cannot be mistaken
            # for the start of the answer.
            completion = generated_text[len(prompt):]
        else:
            # Fallback when the prompt is not echoed verbatim: cut after the
            # marker if present, otherwise keep the whole text (a -1 from
            # find() must not be used as a slice offset).
            marker_at = generated_text.find(RESPONSE_TEMPLATE)
            if marker_at < 0:
                completion = generated_text
            else:
                completion = generated_text[marker_at + len(RESPONSE_TEMPLATE):]
        outputs.append(completion.strip())
    return {"text": examples["text"], "outputs": outputs, "labels": examples["description"]}

def map_samples(pipeline: TextGenerationPipeline, samples: DatasetDict) -> tuple:
    """Summarize every row of ``samples`` and return evaluation-ready columns.

    The rows are fed to ``call_model`` in batches of ``BATCH_SIZE`` through
    ``samples.map``; all original columns are dropped so only the columns
    produced by ``call_model`` remain.

    Args:
        pipeline: Text-generation pipeline forwarded to ``call_model``.
        samples: Rows to summarize.
            NOTE(review): annotated as ``DatasetDict``, but the callers in
            this notebook pass the result of ``dataset_summary[...].select``,
            presumably a single ``Dataset`` — confirm and adjust the hint.

    Returns:
        ``(texts, decoded_preds, labels)``: the source texts, followed by the
        predictions and references after ``postprocess_text``.
    """
    def summarize_batch(batch):
        return call_model(batch, pipeline)

    generated = samples.map(
        summarize_batch,
        batched=True,
        batch_size=BATCH_SIZE,
        remove_columns=samples.column_names,
    )

    decoded_preds, labels = postprocess_text(generated["outputs"], generated["labels"])
    return generated["text"], decoded_preds, labels

def load_model_and_tokenizer(model_name: str, revision_id: str) -> tuple:
    """Load a 4-bit quantized causal LM and its tokenizer at a pinned revision.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        revision_id: Revision passed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        ``(model, tokenizer)``.
    """
    tok = AutoTokenizer.from_pretrained(model_name, revision=revision_id)
    # Reuse the unknown token as the padding token and pad on the right.
    tok.pad_token = tok.unk_token
    tok.padding_side = 'right'

    # 4-bit NF4 weights with double quantization; compute dtype bfloat16.
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    # NOTE(review): torch_dtype is float16 while the 4-bit compute dtype above
    # is bfloat16 — confirm the mismatch is intentional.
    lm = AutoModelForCausalLM.from_pretrained(
        model_name,
        revision=revision_id,
        quantization_config=quantization,
        torch_dtype=torch.float16,
    )

    return lm, tok

In [4]:
dataset_ccnews = load_dataset("vblagoje/cc_news")


def _is_short_with_description(example):
    """Keep rows whose text plus description is under 2700 characters and whose description is non-empty."""
    combined_chars = len(example["text"]) + len(example["description"])
    return combined_chars < 2700 and len(example["description"]) > 0


dataset_ccnews_filtered = dataset_ccnews.filter(_is_short_with_description)

# Both splits are carved out of the filtered "train" split: the first 20000
# rows become "train", the following 5000 rows become "test".
filtered_train = dataset_ccnews_filtered["train"]
dataset_summary = DatasetDict(
    {
        "train": filtered_train.select(range(20000)),
        "test": filtered_train.select(range(20000, 25000)),
    }
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.57k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/211M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/234M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/219M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/245M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/215M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/708241 [00:00<?, ? examples/s]

Filter:   0%|          | 0/708241 [00:00<?, ? examples/s]

In [5]:
# Select evaluation samples
# A dedicated RNG seeded with SEED makes this cell idempotent: re-running it
# always picks the same SUBSET rows, no matter how far the global `random`
# state has advanced since it was seeded.
eval_split = dataset_summary[USE_SPLIT]
evaluation_sample_idx = random.Random(SEED).sample(range(len(eval_split)), SUBSET)
evaluation_samples = eval_split.select(evaluation_sample_idx)

In [6]:
# Load model and tokenizer
# NOTE(review): the trailing "alt" hints presumably name the variant for the
# 1000-step checkpoint ("...-1000" / "steps_1000") — confirm before switching.
MODEL_NAME = "matteo1822/minerva-500"  # alt: -1000
REVISION_ID = "steps_500"  # alt: _1000
model, tokenizer = load_model_and_tokenizer(model_name=MODEL_NAME, revision_id=REVISION_ID)

tokenizer_config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/795k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.12M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/50.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/740 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors:   0%|          | 0.00/703M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/133 [00:00<?, ?B/s]

In [7]:
# Create pipeline
# Wrap the already-loaded model/tokenizer pair in a text-generation pipeline.
pipeline = TextGenerationPipeline(model=model, tokenizer=tokenizer)
# Overwrite the pipeline's device attribute with the device the model is on.
# NOTE(review): presumably so the pipeline moves inputs to the same device as
# the quantized model — confirm this manual override is still required.
pipeline.device = model.device

# Report the requested device next to the devices actually in use.
print(f"Device: Selected={DEVICE}, Model={model.device}, Pipeline={pipeline.device}")

Device: Selected=cuda:0, Model=cuda:0, Pipeline=cuda:0


In [9]:
# Generate summaries
# Runs the model over the sampled evaluation rows and returns the source
# texts together with the sentence-split predictions and references.
texts, decoded_preds, labels = map_samples(pipeline, evaluation_samples)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [10]:
# Compute metrics
# Metric name -> extra keyword arguments for that metric's compute() call.
metric_options = {
    "rouge": {"use_stemmer": True},
    "bertscore": {"lang": "en"},
    "bleu": {},
    "meteor": {},
}

# Load every scorer first, then score the same predictions/references with each.
metrics = {name: evaluate.load(name) for name in metric_options}

results = {
    name: scorer.compute(predictions=decoded_preds, references=labels, **metric_options[name])
    for name, scorer in metrics.items()
}

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.93k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Print results
# One line per metric, framed by a 30-character separator.
banner = '=' * 30
print(banner)
print(f"SPLIT: {USE_SPLIT}")
for metric_name, metric_scores in results.items():
    print(f"{metric_name.upper()}: {metric_scores}")
print(banner)

SPLIT: test
ROUGE: {'rouge1': 0.45728693651373586, 'rouge2': 0.3799497343929582, 'rougeL': 0.43718613927784555, 'rougeLsum': 0.43645043985973836}
BERTSCORE: {'precision': [0.8337899446487427, 0.9651249051094055, 0.8922972679138184, 0.9712328314781189, 0.9657556414604187, 0.8805094957351685, 0.8716411590576172, 0.9980562925338745, 0.8264153599739075, 0.8264808058738708], 'recall': [0.8253033757209778, 0.8999338150024414, 0.8514809608459473, 0.9765336513519287, 0.898426353931427, 0.9219638705253601, 0.8863405585289001, 0.9980266094207764, 0.8477921485900879, 0.8248956799507141], 'f1': [0.8295249342918396, 0.9313900470733643, 0.8714114427566528, 0.9738759994506836, 0.9308751225471497, 0.9007599949836731, 0.8789293766021729, 0.9980414509773254, 0.8369672894477844, 0.8256874680519104], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.44.2)'}
BLEU: {'bleu': 0.4559956508465484, 'precisions': [0.5864661654135338, 0.5, 0.4878048780487805, 0.4745762711864407], 'brevity_penalty': 

In [12]:
# Save results
# One JSON file per evaluated revision, pretty-printed with a 2-space indent.
results_path = f"evaluation_results_{REVISION_ID}.json"
with open(results_path, "w") as results_file:
    results_file.write(json.dumps(results, indent=2))

In [15]:
# Generate and save outputs for cherry-picked samples
cherry_picked_idx = [12, 48, 96]  # hand-picked row positions within the evaluation split
cherry_picked_samples = dataset_summary[USE_SPLIT].select(cherry_picked_idx)

# Distinct names keep the evaluation run's texts / decoded_preds / labels
# intact, so re-running the metric cell afterwards still scores the
# evaluation subset rather than these three rows.
cherry_texts, cherry_preds, cherry_labels = map_samples(pipeline, cherry_picked_samples)

output_data = [
    {"text": text, "gold_summary": label, "generated_summary": pred}
    for text, label, pred in zip(cherry_texts, cherry_labels, cherry_preds)
]

# JSON Lines: one JSON object per line.
with open(f"cherry_picked_samples_{REVISION_ID}.jsonl", "w", encoding="utf-8") as f:
    for data in output_data:
        f.write(json.dumps(data) + "\n")

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
