In [None]:
from google.colab import drive
drive.mount('/content/drive')

!pip install transformers -q
!pip install accelerate -q

Mounted at /content/drive


In [None]:
import os, numpy as np, pandas as pd, matplotlib.pyplot as plt, torch, re
import gc
import time
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, GenerationConfig, BartForConditionalGeneration, BartTokenizer
from sentence_transformers import SentenceTransformer, util
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
from scipy.stats import pearsonr, spearmanr
from datetime import datetime

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


### Initialize BART Model

In [None]:
print("Loading BART model with optimizations...")

# Prefer the GPU whenever one is visible to torch; otherwise run on CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using device: {DEVICE}")

# Tokenizer and weights both come from the same checkpoint name.
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)
model = model.to(DEVICE)
model.eval()  # inference mode

# On GPU, cast the weights to half precision and report the hardware in use.
if DEVICE == "cuda":
    model = model.half()
    print("✅ Using FP16 precision for faster inference")
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

print("✅ BART model loaded!")

Loading BART model with optimizations...
Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

✅ Using FP16 precision for faster inference
GPU Name: Tesla T4
GPU Memory: 15.8 GB
✅ BART model loaded!


### Summarization Function

In [None]:
def summarize_with_constraints(text, force_different=True):
    """
    Generate a summary of ``text`` and retry once if the result looks copied.

    The first pass uses deterministic beam search. When ``force_different`` is
    true and the first summary is either a case-insensitive substring of the
    input or longer than 90% of the input's character length, a second pass is
    run with stricter n-gram blocking and sampling enabled; its output replaces
    the first summary.

    Args:
        text: Source text. Non-string values (e.g. ``None`` or NaN) and empty or
            whitespace-only strings yield "".
        force_different: If True, run the similarity check and, when it fires,
            the second generation pass.

    Returns:
        The decoded summary string, or "" when the input is unusable or when
        tokenization/generation raises (the error is printed, not re-raised).
    """
    # Nothing to summarize for missing values or blank strings.
    if not isinstance(text, str) or not text.strip():
        return ""

    try:
        # Inputs longer than 1024 tokens are truncated by the tokenizer.
        inputs = tokenizer(text, max_length=1024, truncation=True, return_tensors="pt").to(DEVICE)

        def _generate(**generation_kwargs):
            """Run model.generate on the encoded input and decode the first returned sequence."""
            with torch.no_grad():
                output_ids = model.generate(
                    inputs["input_ids"],
                    attention_mask=inputs["attention_mask"],
                    **generation_kwargs,
                )
            return tokenizer.decode(output_ids[0], skip_special_tokens=True)

        # First pass: deterministic beam search.
        summary = _generate(
            num_beams=4,
            min_length=15,  # Shorter min length for short inputs
            # Cap grows with the input's word count (+20) but never exceeds 80.
            max_length=min(80, len(text.split()) + 20),
            early_stopping=True,
            no_repeat_ngram_size=3,  # Block repeated 3-grams in the output
            # NOTE(review): the Hugging Face generation docs state that
            # length_penalty > 0 promotes longer sequences, so this value does
            # not push toward shorter output. Value kept unchanged.
            length_penalty=2.0,
            # NOTE(review): penalises repeated tokens; whether it also discourages
            # copying from the input should be confirmed against the HF docs.
            repetition_penalty=1.2,
            do_sample=False,  # Deterministic
            forced_bos_token_id=tokenizer.bos_token_id,
        )

        if force_different:
            # Treat the summary as "copied" when it appears verbatim in the input
            # (ignoring case) or is nearly as long as the input itself.
            copied_verbatim = summary.lower() in text.lower()
            nearly_as_long = len(summary) > len(text) * 0.9

            if copied_verbatim or nearly_as_long:
                # Second pass: sampling is enabled, so this output is not
                # deterministic unless the caller seeds the RNG beforehand.
                summary = _generate(
                    num_beams=6,
                    min_length=10,
                    max_length=50,
                    early_stopping=True,
                    no_repeat_ngram_size=4,
                    length_penalty=3.0,  # see the length_penalty note on the first pass
                    repetition_penalty=1.5,
                    do_sample=True,  # randomness
                    top_p=0.95,
                    temperature=0.8,
                )

        return summary

    except Exception as e:
        # Best-effort: report the failure and return an empty summary so a batch
        # run over many rows is not aborted by one bad input.
        print(f"Error: {e}")
        return ""


In [None]:
def use_alternative_approach(text):
    """
    Summarize a very short text by prepending an instruction-style prompt.

    The prompt is prepended to ``text``, the result is summarized with beam
    search (10-40 tokens), and any literal copy of the prompt is stripped from
    the decoded output.

    Args:
        text: Source text. Non-string values (e.g. ``None`` or NaN) and empty or
            whitespace-only strings yield "" instead of being formatted into the
            prompt.

    Returns:
        The decoded summary with the prompt removed, or "" when the input is
        unusable or when tokenization/generation raises (the error is printed,
        not re-raised).
    """
    # Without this guard a missing value would be formatted into the prompt
    # (e.g. as "nan") and summarized as if it were real text.
    if not isinstance(text, str) or not text.strip():
        return ""

    # Kept in one place so the text that is prepended and the text that is
    # stripped from the output cannot drift apart.
    prompt_prefix = "Summarize the following testimony concisely:"
    prompted_text = f"{prompt_prefix} {text}"

    try:
        inputs = tokenizer(prompted_text, max_length=1024, truncation=True, return_tensors="pt").to(DEVICE)

        with torch.no_grad():
            summary_ids = model.generate(
                inputs["input_ids"],
                # Passed explicitly for consistency with summarize_with_constraints.
                attention_mask=inputs["attention_mask"],
                num_beams=4,
                min_length=10,
                max_length=40,
                length_penalty=2.0,
                early_stopping=True,
            )

        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        # Remove the prompt if the model echoed it back verbatim.
        return summary.replace(prompt_prefix, "").strip()
    except Exception as e:
        # Best-effort: report the failure and return an empty summary. Unlike a
        # bare `except:`, this lets KeyboardInterrupt/SystemExit propagate.
        print(f"Error: {e}")
        return ""


### Process Curated Dataset

In [None]:
# Load curated dataset
BASE_DIR = "/content/drive/MyDrive/Dissertation/eval_open_source_outputs"
curated_path = f"{BASE_DIR}/curated_expanded.csv"

curated_df = pd.read_csv(curated_path)
print(f"Loaded {len(curated_df)} rows")

# Character-length statistics of the 'Excerpt' column.
excerpt_lengths = curated_df['Excerpt'].str.len()
print(f"Average excerpt length: {excerpt_lengths.mean():.0f} characters")
print(f"Min/Max: {excerpt_lengths.min():.0f} / {excerpt_lengths.max():.0f} characters")

# Excerpts shorter than this many characters go through the prompt-based
# summarizer; longer ones go through the constrained summarizer.
# NOTE(review): use_alternative_approach performs no verbatim-copy check, so
# short excerpts can come back unchanged.
SHORT_TEXT_MAX_CHARS = 200

# Process with improved summarization
print("Generating summaries...")
summaries = []

for idx, text in enumerate(tqdm(curated_df['Excerpt'], total=len(curated_df))):
    if pd.isna(text):
        # Missing excerpt: record an empty summary instead of summarizing "nan".
        summary = ""
    elif len(str(text)) < SHORT_TEXT_MAX_CHARS:  # Short text
        summary = use_alternative_approach(text)
    else:  # Longer text
        summary = summarize_with_constraints(text)

    summaries.append(summary)

    # Show the first three rows as a quick sanity check.
    if idx < 3:
        print(f"\nExample {idx+1}:")
        # str() keeps the preview from raising on a non-string (e.g. NaN) excerpt.
        print(f"Original: {str(text)[:100]}...")
        print(f"Summary: {summary}")

# One summary per row, in row order.
curated_df['bart_summary'] = summaries

output_path = f"{BASE_DIR}/curated_bart_abstractive.csv"
curated_df.to_csv(output_path, index=False)
print(f"\n✅ Saved to: {output_path}")


📂 Loading curated dataset...
Loaded 132 rows
Average excerpt length: 114 characters
Min/Max: 23 / 562 characters

🔄 Generating abstractive summaries...


  0%|          | 0/132 [00:00<?, ?it/s]


Example 1:
Original: I might have seen him in the lobby...
Summary: I might have seen him in the lobby.

Example 2:
Original: I probably saw the car at the intersection...
Summary: I probably saw the car at the intersection.

Example 3:
Original: I don't know if the door was locked...
Summary: I don't know if the door was locked.

✅ Saved to: /content/drive/MyDrive/Dissertation/eval_open_source_outputs/curated_bart_abstractive.csv
