In [None]:
!pip install transformers datasets sumy rouge-score nltk


Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl.metadata (7.5 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting docopt<0.7,>=0.6.1 (from sumy)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting breadability>=0.1.20 (from sumy)
  Downloading breadability-0.1.20.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pycountry>=18.2.23 (from sumy)
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Downloading sumy-0.11.0-py2.py3-none-any.whl (97 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.3/97.3 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score

In [None]:
import nltk
# Download the NLTK 'punkt' and 'punkt_tab' tokenizer data packages.
# NOTE(review): presumably needed by sumy's Tokenizer("english") used for
# LexRank later in the notebook — confirm which of the two is actually required.
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
import csv
import json
import os

from datasets import load_dataset
from rouge_score import rouge_scorer
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lex_rank import LexRankSummarizer
from tqdm import tqdm
from transformers import pipeline

# Number of dataset rows to summarize and score; every row triggers one
# transformer generation call, so this dominates the notebook's runtime.
NUM_SAMPLES = 50   # adjust for speed
# Number of sentences requested from the LexRank (extractive) summarizer.
TARGET_SENTENCES = 3
# Hugging Face model id, used for both the summarization model and its tokenizer.
MODEL_NAME = "facebook/bart-large-cnn"


In [None]:
def load_data(num_samples=None, split="test"):
    """Load the first rows of the CNN/DailyMail (config "3.0.0") dataset.

    Args:
        num_samples: How many leading rows to keep. ``None`` (the default)
            uses the module-level ``NUM_SAMPLES``. Values larger than the
            split are clamped to the split size instead of raising, and
            negative values are treated as zero.
        split: Dataset split passed to ``load_dataset``; defaults to "test".

    Returns:
        A pair ``(articles, highlights)`` taken from the "article" and
        "highlights" columns of the selected rows, aligned by position.
    """
    dataset = load_dataset("cnn_dailymail", "3.0.0", split=split)

    requested = NUM_SAMPLES if num_samples is None else num_samples
    # Clamp so that asking for more rows than exist does not fail in select().
    count = max(0, min(requested, len(dataset)))

    dataset = dataset.select(range(count))
    return dataset["article"], dataset["highlights"]


def sumy_lexrank(text):
    """Return an extractive LexRank summary of ``text`` as a single string.

    The text is parsed with sumy's plaintext parser and an English tokenizer,
    ``TARGET_SENTENCES`` sentences are requested from a fresh
    ``LexRankSummarizer``, and the selected sentences are joined with spaces.
    """
    document = PlaintextParser.from_string(text, Tokenizer("english")).document
    picked_sentences = LexRankSummarizer()(document, TARGET_SENTENCES)
    return " ".join(map(str, picked_sentences))


def transformer_summary_pipe(device=None):
    """Build the Hugging Face summarization pipeline for ``MODEL_NAME``.

    Args:
        device: Device passed straight to ``pipeline``. When ``None`` (the
            default) the first CUDA GPU (``0``) is used if PyTorch reports
            one, otherwise the CPU (``-1``), so the notebook does not assume
            a GPU is attached.

    Returns:
        The pipeline object created for the "summarization" task, with
        ``MODEL_NAME`` used for both the model and the tokenizer.
    """
    if device is None:
        try:
            # Imported lazily so the helper still works where torch is absent.
            import torch

            device = 0 if torch.cuda.is_available() else -1
        except ImportError:
            device = -1

    return pipeline("summarization", model=MODEL_NAME, tokenizer=MODEL_NAME, device=device)


def compute_rouge(refs, preds):
    """Average ROUGE-1, ROUGE-2 and ROUGE-L F-measures over paired texts.

    Args:
        refs: Iterable of reference summaries.
        preds: Iterable of predicted summaries, aligned with ``refs`` by
            position. Pairs are formed with ``zip``, so surplus items in the
            longer input are ignored.

    Returns:
        Dict mapping "rouge1", "rouge2" and "rougeL" to the mean F-measure
        over all pairs. When there are no pairs every metric is ``0.0``
        rather than raising ``ZeroDivisionError``.
    """
    metrics = ("rouge1", "rouge2", "rougeL")
    scorer = rouge_scorer.RougeScorer(list(metrics), use_stemmer=True)
    scores = {name: [] for name in metrics}

    for ref, pred in zip(refs, preds):
        result = scorer.score(ref, pred)
        for name in metrics:
            scores[name].append(result[name].fmeasure)

    # Guard the mean against an empty evaluation set.
    return {
        name: (sum(values) / len(values) if values else 0.0)
        for name, values in scores.items()
    }


In [None]:
# Fetch the articles and their reference highlights.
texts, refs = load_data()
print("Loaded dataset:", len(texts), "samples")

# Build the transformer summarizer once and reuse it for every article.
hf_sum = transformer_summary_pipe()

# Per-article outputs, kept in the same order as `texts` / `refs`.
lex_summaries, bart_summaries, samples = [], [], []

progress = tqdm(enumerate(texts), total=len(texts), desc="Summarizing")
for i, text in progress:
    # Extractive baseline (LexRank).
    lex_summary = sumy_lexrank(text)
    lex_summaries.append(lex_summary)

    # Abstractive summary; only the first 2000 characters of the article are sent to the model.
    bart_output = hf_sum(text[:2000], max_length=120, min_length=30, do_sample=False)
    bart = bart_output[0]["summary_text"]
    bart_summaries.append(bart)

    # Keep a short article preview next to both summaries and the reference.
    sample = {
        "article": text[:500] + "...",
        "lexrank": lex_summary,
        "bart": bart,
        "reference": refs[i],
    }
    samples.append(sample)

# Score both systems against the reference highlights.
lex_rouge = compute_rouge(refs, lex_summaries)
bart_rouge = compute_rouge(refs, bart_summaries)

lex_rouge, bart_rouge


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

3.0.0/train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

3.0.0/train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

3.0.0/train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

3.0.0/validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

3.0.0/test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Loaded dataset: 50 samples


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu
Summarizing: 100%|██████████| 50/50 [14:23<00:00, 17.26s/it]


({'rouge1': 0.26768298142353,
  'rouge2': 0.0787855462563188,
  'rougeL': 0.1844933661258871},
 {'rouge1': 0.3722422061098788,
  'rouge2': 0.17752202662795608,
  'rougeL': 0.2849113757970219})

In [None]:
# Create the output directory from Python instead of shelling out to
# `mkdir -p`, so the cell does not depend on a POSIX shell being available.
os.makedirs("results", exist_ok=True)

# Save ROUGE: one header row, then one row of mean F-measures per system.
with open("results/rouge_scores.csv", "w", newline="", encoding="utf8") as f:
    w = csv.writer(f)
    w.writerow(["model", "rouge1", "rouge2", "rougeL"])
    w.writerow(["LexRank", lex_rouge["rouge1"], lex_rouge["rouge2"], lex_rouge["rougeL"]])
    w.writerow(["BART", bart_rouge["rouge1"], bart_rouge["rouge2"], bart_rouge["rougeL"]])

# Save sample summaries (article preview, both summaries, reference) as JSON.
with open("results/sample_summaries.json", "w", encoding="utf8") as f:
    json.dump(samples, f, indent=2)

print("All results saved in /results folder.")


All results saved in /results folder.
