# Install & Imports

In [None]:
from google.colab import drive
drive.mount('/content/drive')

!pip install rouge-score
!pip install bert-score
!pip install transformers
!pip install spacy
!pip install evaluate
!pip install sentence-transformers
!pip install nltk
!pip install accelerate

Mounted at /content/drive
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=5207c878e61775cf28932fc7880ae486b9e0fc3a8a9f9645fdb6388fe113c150
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13
Co

In [None]:
import os, numpy as np, pandas as pd, matplotlib.pyplot as plt, torch, re
import gc
import time
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, GenerationConfig
from rouge_score import rouge_scorer
from bert_score import score as bertscore
from evaluate import load
from sentence_transformers import SentenceTransformer, util
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
from scipy.stats import pearsonr, spearmanr
from datetime import datetime

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


----

# Config + CSV load

In [None]:
# Search the mounted Drive for the two dataset CSVs by file name and print
# every matching path (one shell `find` per file).
!find "/content/drive/MyDrive" -name "curated_expanded.csv"
!find "/content/drive/MyDrive" -name "pilotdata.csv"

/content/drive/MyDrive/Dissertation/eval_open_source_outputs/curated_expanded.csv
/content/drive/MyDrive/Dissertation/curated_expanded.csv
/content/drive/MyDrive/Dissertation/pilotdata.csv


In [None]:
# Dataset locations on the mounted Drive.
BASE_DIR = "/content/drive/MyDrive/Dissertation"
CURATED_CSV = f"{BASE_DIR}/curated_expanded.csv"
PILOT_CSV = f"{BASE_DIR}/pilotdata.csv"

# Shared parsing options: read ID as a string and keep empty cells as ""
# instead of converting them to NaN.
read_kw = {"dtype": {"ID": str}, "keep_default_na": False}

curated = pd.read_csv(CURATED_CSV, **read_kw)
pilot = pd.read_csv(PILOT_CSV, **read_kw)

# Report both shapes first, then preview the first three rows of each frame.
for _heading, _frame in (("Curated shape:", curated), ("Pilot shape:", pilot)):
    print(_heading, _frame.shape)

for _heading, _frame in (("\nCurated sample:", curated), ("\nPilot sample:", pilot)):
    print(_heading)
    display(_frame.head(3))


Curated shape: (132, 5)
Pilot shape: (351, 5)

Curated sample:


Unnamed: 0,ID,Excerpt,Feature,SubFeature,GoldSummary
0,1,I might have seen him in the lobby,Hedge,Modal verb,The deponent says they might have seen him in ...
1,2,I probably saw the car at the intersection,Hedge,Adverbial hedge,The deponent states they probably saw the car ...
2,3,I don't know if the door was locked,Hedge,Phrase hedge,The deponent says they don't know if the door ...



Pilot sample:


Unnamed: 0,ID,SPEAKER,TEXT,LING_FEATURES,NOTES
0,T118,A,I couldn't vote on the motions if I hadn't see...,"cond_unreal, temp",
1,T118,Q,"Well, you understand, Mr. Grubbs, that minutes...",cond_unreal,
2,T118,A,"In a previous meeting, they should have been h...",cond_unreal,


-------

In [None]:
MODEL_NAME = "google/pegasus-xsum"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Select the scaled-dot-product-attention backends: flash off, memory-efficient
# and math on. `sdp_kernel(...)` is a context manager, so calling it without a
# `with` block changes nothing; these global toggles apply the same selection
# for the rest of the session.
try:
    torch.backends.cuda.enable_flash_sdp(False)
    torch.backends.cuda.enable_mem_efficient_sdp(True)
    torch.backends.cuda.enable_math_sdp(True)
except AttributeError:
    # PyTorch builds without these toggles keep their default backends;
    # say so instead of failing silently.
    print("SDP backend toggles are unavailable in this PyTorch build; using defaults.")

# Load tokenizer/model; the model is moved to DEVICE and put in eval mode.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(DEVICE).eval()

# Beam-search decoding settings bundled as a GenerationConfig.
GEN = GenerationConfig(
    num_beams=4,
    min_length=30,
    max_length=128,
    length_penalty=2.0,
    early_stopping=True,
)

def _sanitize_max(x, fallback=512):
    try:
        x = int(x)
    except Exception:
        return fallback
    if x is None or x > 4096:
        return fallback
    return max(128, x)

# Length limits reported by the tokenizer and by the model config, each passed
# through _sanitize_max with 512 as the fallback.
HF_MAX = _sanitize_max(getattr(tokenizer, "model_max_length", None), 512)
CFG_MAX = _sanitize_max(getattr(model.config, "max_position_embeddings", None), 512)

# Input budget: never above 512 tokens (conservative for Pegasus-XSum).
MAX_INPUT_TOKENS = min(512, HF_MAX, CFG_MAX)
# Chunk size is the budget minus 64 tokens, with a floor of 128.
CHUNK_TOKENS = max(MAX_INPUT_TOKENS - 64, 128)
# Overlap is one sixth of a chunk, capped at 128 tokens.
CHUNK_OVERLAP = min(CHUNK_TOKENS // 6, 128)

print("Pegasus ready on:", DEVICE)
print(dict(
    MAX_INPUT_TOKENS=MAX_INPUT_TOKENS,
    CHUNK_TOKENS=CHUNK_TOKENS,
    CHUNK_OVERLAP=CHUNK_OVERLAP,
))

  self.gen = func(*args, **kwds)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

Pegasus ready on: cuda
{'MAX_INPUT_TOKENS': 512, 'CHUNK_TOKENS': 448, 'CHUNK_OVERLAP': 74}


In [None]:

import pandas as pd
from tqdm import tqdm
import torch

print("="*60)
print("PEGASUS BATCH SUMMARIZATION (FIXED)")
print("="*60)

# Load fresh copies of both datasets for summarization. Reuse the paths and
# parsing options defined in the config cell (ID read as a string, empty cells
# kept as "" rather than NaN) so these frames match the ones previewed there,
# instead of re-reading from duplicated hard-coded paths with pandas defaults.
curated_df = pd.read_csv(CURATED_CSV, **read_kw)
pilot_df = pd.read_csv(PILOT_CSV, **read_kw)

def generate_summaries_batch(texts, batch_size=8, min_chars=10, gen_kwargs=None):
    """Summarize ``texts`` with the loaded Pegasus model, ``batch_size`` at a time.

    Entries that are missing (None/NaN) or whose stripped length is not greater
    than ``min_chars`` are never sent to the model; they produce "" so the
    returned list stays aligned one-to-one with ``texts``.

    Args:
        texts: Sliceable sequence (e.g. a list) of input texts.
        batch_size: Number of texts tokenized and generated per model call.
        min_chars: Texts with at most this many non-whitespace-trimmed
            characters are skipped (default 10).
        gen_kwargs: Optional dict of keyword arguments for ``model.generate``
            that override the default beam-search settings.

    Returns:
        list[str]: One summary (or "") per input, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Default decoding settings; callers may override any of them.
    decode_kwargs = dict(
        num_beams=4,
        min_length=30,
        max_length=128,
        length_penalty=2.0,
        early_stopping=True,
    )
    if gen_kwargs:
        decode_kwargs.update(gen_kwargs)

    summaries = []

    for start in tqdm(range(0, len(texts), batch_size), desc="Batches"):
        batch_texts = texts[start:start + batch_size]
        # Pre-fill with "" so skipped entries keep their slot in the output.
        result_batch = [""] * len(batch_texts)

        # Keep (position, text) for usable entries. str() guards against
        # non-string cells (e.g. numbers), which the tokenizer cannot accept.
        usable = [
            (pos, str(t))
            for pos, t in enumerate(batch_texts)
            if pd.notna(t) and len(str(t).strip()) > min_chars
        ]

        if usable:
            batch_for_model = [t for _, t in usable]
            inputs = tokenizer(batch_for_model, truncation=True, padding=True,
                               max_length=MAX_INPUT_TOKENS, return_tensors="pt").to(DEVICE)

            with torch.no_grad():
                outputs = model.generate(
                    inputs['input_ids'],
                    attention_mask=inputs['attention_mask'],
                    **decode_kwargs
                )

            # Decode and write each summary back to its original position.
            batch_summaries = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            for (pos, _), summary in zip(usable, batch_summaries):
                result_batch[pos] = summary

        summaries.extend(result_batch)

    return summaries

def _attach_summaries(frame, text_column, out_path):
    """Summarize one text column, store the result, and write the frame to CSV.

    Adds a 'pegasus_summary' column to ``frame`` in place and returns the
    ``(texts, summaries)`` lists that were used.
    """
    texts = frame[text_column].tolist()
    summaries = generate_summaries_batch(texts, batch_size=8)
    frame['pegasus_summary'] = summaries
    frame.to_csv(out_path, index=False)
    return texts, summaries


# Curated dataset: summarize the 'Excerpt' column.
print("\n📊 Processing CURATED dataset (batch mode)...")
curated_texts, curated_summaries = _attach_summaries(
    curated_df,
    'Excerpt',
    '/content/drive/MyDrive/Dissertation/curated_pegasus_results.csv',
)
print(f"✓ Curated complete: {len(curated_summaries)} summaries")

# Pilot dataset: summarize the 'TEXT' column.
print("\n📊 Processing PILOT dataset (batch mode)...")
pilot_texts, pilot_summaries = _attach_summaries(
    pilot_df,
    'TEXT',
    '/content/drive/MyDrive/Dissertation/pilot_pegasus_results.csv',
)
print(f"✓ Pilot complete: {len(pilot_summaries)} summaries")

print("\n✅ Batch processing complete!")

PEGASUS BATCH SUMMARIZATION (FIXED)

📊 Processing CURATED dataset (batch mode)...


Batches: 100%|██████████| 17/17 [00:21<00:00,  1.26s/it]


✓ Curated complete: 132 summaries

📊 Processing PILOT dataset (batch mode)...


Batches: 100%|██████████| 44/44 [01:20<00:00,  1.82s/it]

✓ Pilot complete: 351 summaries

✅ Batch processing complete!





### Summarizer Cell Map -> Reduce