In [None]:
from google.colab import drive
drive.mount('/content/drive')

!pip install transformers -q
!pip install accelerate -q
!pip install sentencepiece -q

import pandas as pd
import torch
import time
import numpy as np
from typing import Dict, List, Tuple
from pathlib import Path
from tqdm.auto import tqdm
from transformers import T5Tokenizer, T5ForConditionalGeneration

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
print("🔄 Loading T5 models for comparison...")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")


def _load_checkpoint(checkpoint):
    """Return (tokenizer, model) for `checkpoint`; the model is moved to DEVICE and set to eval mode."""
    tok = T5Tokenizer.from_pretrained(checkpoint)
    net = T5ForConditionalGeneration.from_pretrained(checkpoint).to(DEVICE)
    net.eval()
    return tok, net


# Original T5 checkpoint
print("\n1️⃣ Loading T5-base...")
t5_tokenizer, t5_model = _load_checkpoint("t5-base")

# Flan-T5 checkpoint of the same size
print("2️⃣ Loading Flan-T5-base...")
flan_tokenizer, flan_model = _load_checkpoint("google/flan-t5-base")

# Report which GPU was picked up, when there is one.
if DEVICE == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)}")

🔄 Loading T5 models for comparison...
Using device: cuda

1️⃣ Loading T5-base...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

2️⃣ Loading Flan-T5-base...


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

✅ Both models loaded!
GPU: Tesla T4


In [None]:
def t5_summarize(text, model, tokenizer, model_name="T5"):
    """
    Summarize `text` with a T5-style seq2seq model.

    T5 requires the task prefix 'summarize:' for summarization, so it is
    prepended to the text before tokenization.

    Args:
        text: Source passage. Anything that is not a non-blank string
            (None, NaN, pd.NA, numbers, whitespace-only strings) yields "".
        model: Model exposing `.device` and `.generate(...)`.
        tokenizer: Tokenizer paired with `model`; it is called on the prompt
            and its `decode` is used on the generated ids.
        model_name: Label used only in the error message.

    Returns:
        The decoded summary string, or "" when the input is unusable or
        tokenization/generation raised (the error is printed, not re-raised).
    """
    # An isinstance check instead of `not text or pd.isna(text)`: `not pd.NA`
    # raises TypeError (ambiguous truth value), and a whitespace-only string
    # would otherwise be sent to the model as a bare prefix.
    if not isinstance(text, str) or not text.strip():
        return ""

    try:
        # Task prefix for T5
        input_text = "summarize: " + text

        # Tokenize (prompt is truncated to 512 tokens) and move the tensors to
        # the device of the model actually passed in, not a notebook global.
        inputs = tokenizer(
            input_text,
            max_length=512,
            truncation=True,
            return_tensors="pt"
        ).to(model.device)

        # Generate summary with beam search; no gradients are needed.
        with torch.no_grad():
            summary_ids = model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=150,
                min_length=20,
                length_penalty=2.0,
                num_beams=4,
                early_stopping=True,
                no_repeat_ngram_size=3
            )

        # Decode the first (only) returned sequence.
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return summary

    except Exception as e:
        # Best-effort: one failing row must not abort a whole dataset run.
        print(f"Error in {model_name}: {e}")
        return ""

def flan_t5_summarize(text, model, tokenizer):
    """
    Summarize `text` with Flan-T5 using a natural-language instruction.

    Flan-T5 can use more natural prompts, so the text is wrapped in
    "Summarize the following legal testimony: ..." instead of a task prefix.

    Args:
        text: Source passage. Anything that is not a non-blank string
            (None, NaN, pd.NA, numbers, whitespace-only strings) yields "".
        model: Model exposing `.device` and `.generate(...)`.
        tokenizer: Tokenizer paired with `model`; it is called on the prompt
            and its `decode` is used on the generated ids.

    Returns:
        The decoded summary string, or "" when the input is unusable or
        tokenization/generation raised (the error is printed, not re-raised).
    """
    # An isinstance check instead of `not text or pd.isna(text)`: `not pd.NA`
    # raises TypeError (ambiguous truth value), and a whitespace-only string
    # would otherwise be sent to the model as a bare instruction.
    if not isinstance(text, str) or not text.strip():
        return ""

    try:
        input_text = f"Summarize the following legal testimony: {text}"

        # Tokenize (prompt is truncated to 512 tokens) and move the tensors to
        # the device of the model actually passed in, not a notebook global.
        inputs = tokenizer(
            input_text,
            max_length=512,
            truncation=True,
            return_tensors="pt"
        ).to(model.device)

        with torch.no_grad():
            summary_ids = model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=150,
                min_length=20,
                length_penalty=2.0,
                num_beams=4,
                early_stopping=True,
                # Same decoding constraint as t5_summarize, so both models are
                # compared under identical settings. Without it, beam search
                # with min_length=20 can pad a short input by repeating the
                # same phrase.
                no_repeat_ngram_size=3
            )

        # Decode the first (only) returned sequence.
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return summary

    except Exception as e:
        # Best-effort: one failing row must not abort a whole dataset run.
        print(f"Error in Flan-T5: {e}")
        return ""


In [None]:
# Location of the evaluation CSVs on the mounted Drive; also reused for outputs.
BASE_DIR = "/content/drive/MyDrive/Dissertation/eval_open_source_outputs"
curated_path = f"{BASE_DIR}/curated_expanded.csv"

print(f"\n📂 Loading curated dataset from: {curated_path}")
curated_df = pd.read_csv(curated_path)
print(f"Loaded {len(curated_df)} rows")
print(f"Columns: {list(curated_df.columns)}")

# Generate summaries with regular T5.
# Each column is built in one pass and assigned whole, rather than written
# cell by cell through .loc[idx, ...] over range(len(df)); this does not depend
# on the frame having a default 0..n-1 index.
print("\n🔄 Generating T5 summaries for curated dataset...")
curated_df['t5_summary'] = [
    t5_summarize(text, t5_model, t5_tokenizer, "T5")
    for text in tqdm(curated_df['Excerpt'], desc="T5 Processing")
]

# Generate summaries with Flan-T5
print("\n🔄 Generating Flan-T5 summaries for curated dataset...")
curated_df['flan_t5_summary'] = [
    flan_t5_summarize(text, flan_model, flan_tokenizer)
    for text in tqdm(curated_df['Excerpt'], desc="Flan-T5 Processing")
]

# Save results
output_path = f"{BASE_DIR}/curated_t5_results.csv"
curated_df.to_csv(output_path, index=False)
print(f"✅ Saved curated results to: {output_path}")

# Show samples
print("\n Sample comparisons from curated dataset:")
for i in range(min(3, len(curated_df))):
    row = curated_df.iloc[i]
    print(f"\n--- Example {i+1} ---")
    # str() keeps the preview from raising when an excerpt is missing (NaN is
    # a float and cannot be sliced).
    print(f"Original: {str(row['Excerpt'])[:150]}...")
    print(f"T5: {row['t5_summary']}")
    print(f"Flan-T5: {row['flan_t5_summary']}")



📂 Loading curated dataset from: /content/drive/MyDrive/Dissertation/eval_open_source_outputs/curated_expanded.csv
Loaded 132 rows
Columns: ['ID', 'Excerpt', 'Feature', 'SubFeature', 'GoldSummary']

🔄 Generating T5 summaries for curated dataset...


T5 Processing:   0%|          | 0/132 [00:00<?, ?it/s]


🔄 Generating Flan-T5 summaries for curated dataset...


Flan-T5 Processing:   0%|          | 0/132 [00:00<?, ?it/s]

✅ Saved curated results to: /content/drive/MyDrive/Dissertation/eval_open_source_outputs/curated_t5_results.csv

📊 Sample comparisons from curated dataset:

--- Example 1 ---
Original: I might have seen him in the lobby...
T5: he might have seen him in the lobby . he's a tycoon, but he doesn't seem to mind if you've seen him .
Flan-T5: I might have seen him in the lobby, I might have seen him in the lobby, I might have seen him in the lobby.

--- Example 2 ---
Original: I probably saw the car at the intersection...
T5: the car was probably at the intersection . i probably saw the car at the junction .
Flan-T5: I probably saw the car at the intersection, I probably saw the car at the intersection, I probably saw the car at the intersection,

--- Example 3 ---
Original: I don't know if the door was locked...
T5: I don't know if the door was locked or if it was locked .
Flan-T5: I don't know if the door was locked, I don't know if the door was locked, I don't know


In [None]:

pilot_path = f"{BASE_DIR}/pilot_clean.csv"

print(f"\n📂 Loading pilot dataset from: {pilot_path}")
pilot_df = pd.read_csv(pilot_path)
print(f"Loaded {len(pilot_df)} rows")
print(f"Columns: {list(pilot_df.columns)}")

# Find text column.
# 1) Exact names, in priority order.
text_column = None
for col in ['Excerpt', 'text', 'Text', 'excerpt']:
    if col in pilot_df.columns:
        text_column = col
        break

# 2) Same names ignoring case and surrounding whitespace (e.g. 'TEXT'), so the
#    choice does not hinge on the length heuristic below.
if text_column is None:
    text_column = next(
        (
            col
            for wanted in ('excerpt', 'text')
            for col in pilot_df.columns
            if str(col).strip().lower() == wanted
        ),
        None,
    )

# 3) Last resort: first object column whose first non-null value is longer than
#    50 characters. dropna() avoids an IndexError on an empty frame and stops a
#    leading blank cell from hiding the column.
if text_column is None:
    for col in pilot_df.columns:
        if pilot_df[col].dtype == 'object':
            non_null = pilot_df[col].dropna()
            if len(non_null) > 0 and len(str(non_null.iloc[0])) > 50:
                text_column = col
                break

# Stop here with a readable message instead of letting text_column=None reach
# the column lookup below.
if text_column is None:
    raise ValueError(
        f"Could not identify a text column in {pilot_path}; "
        f"columns are {list(pilot_df.columns)}"
    )

print(f"Using column '{text_column}' for text")

# Generate summaries with regular T5.
# Each column is built in one pass and assigned whole, rather than written
# cell by cell through .loc[idx, ...] over range(len(df)).
print("\n Generating T5 summaries for pilot dataset...")
pilot_df['t5_summary'] = [
    t5_summarize(text, t5_model, t5_tokenizer, "T5")
    for text in tqdm(pilot_df[text_column], desc="T5 Processing")
]

# Generate summaries with Flan-T5
print("\n Generating Flan-T5 summaries for pilot dataset...")
pilot_df['flan_t5_summary'] = [
    flan_t5_summarize(text, flan_model, flan_tokenizer)
    for text in tqdm(pilot_df[text_column], desc="Flan-T5 Processing")
]

# Save results
output_path = f"{BASE_DIR}/pilot_t5_results.csv"
pilot_df.to_csv(output_path, index=False)
print(f" Saved pilot results to: {output_path}")



📂 Loading pilot dataset from: /content/drive/MyDrive/Dissertation/eval_open_source_outputs/pilot_clean.csv
Loaded 351 rows
Columns: ['ID', 'SPEAKER', 'TEXT', 'LING_FEATURES', 'NOTES', 'Speaker', 'Feature', 'SubFeature', 'GoldSummary', 'ling_features', 'Notes', 'source', 'row_in_doc', 'uid']
Using column 'TEXT' for text

🔄 Generating T5 summaries for pilot dataset...


T5 Processing:   0%|          | 0/351 [00:00<?, ?it/s]


🔄 Generating Flan-T5 summaries for pilot dataset...


Flan-T5 Processing:   0%|          | 0/351 [00:00<?, ?it/s]

✅ Saved pilot results to: /content/drive/MyDrive/Dissertation/eval_open_source_outputs/pilot_t5_results.csv


In [None]:

# Section banner: blank line, rule, title, rule.
rule = "=" * 50
print(f"\n{rule}\n LINGUISTIC FEATURE PRESERVATION ANALYSIS\n{rule}")

def analyze_feature_preservation(df, text_col, summary_col, model_name):
    """Analyze preservation of linguistic features.

    For every row, each hedge / disfluency marker found in the source text
    counts once towards the total, and once towards "preserved" if the same
    marker also occurs in the summary. Matching is case-insensitive and on
    whole tokens, so 'um' is not counted inside words such as "number" or
    "summary".

    Args:
        df: DataFrame holding both columns.
        text_col: Name of the column with the source text.
        summary_col: Name of the column with the generated summary.
        model_name: Label printed in the report header.

    Returns:
        None. The counts and percentages are printed.
    """
    import re  # function-scope import so the definition is self-contained

    # Define feature patterns
    hedges = ['I think', 'maybe', 'possibly', 'might', 'could', 'seems', 'appears', 'probably']
    disfluencies = ['um', 'uh', 'I-I', 'the-the', '...']

    def _token_pattern(term):
        """Compile `term` (lower-cased) so it only matches as a whole token."""
        term = term.lower()
        # Add a boundary only on sides that start/end with a letter or digit;
        # '...' must still match when glued to the preceding word ("lobby...").
        left = r'(?<!\w)' if term[0].isalnum() else ''
        right = r'(?!\w)' if term[-1].isalnum() else ''
        return re.compile(left + re.escape(term) + right)

    hedge_patterns = [_token_pattern(term) for term in hedges]
    disfluency_patterns = [_token_pattern(term) for term in disfluencies]

    hedge_preserved = 0
    hedge_total = 0
    disfluency_preserved = 0
    disfluency_total = 0

    # Iterate over the column values directly rather than df.loc[idx, ...] with
    # idx in range(len(df)), which only works for a default 0..n-1 index.
    for raw_text, raw_summary in zip(df[text_col], df[summary_col]):
        original = str(raw_text).lower()
        summary = str(raw_summary).lower()

        # Check hedges
        for pattern in hedge_patterns:
            if pattern.search(original):
                hedge_total += 1
                if pattern.search(summary):
                    hedge_preserved += 1

        # Check disfluencies
        for pattern in disfluency_patterns:
            if pattern.search(original):
                disfluency_total += 1
                if pattern.search(summary):
                    disfluency_preserved += 1

    print(f"\n{model_name} Feature Preservation:")
    if hedge_total > 0:
        print(f"  Hedges preserved: {hedge_preserved}/{hedge_total} ({hedge_preserved/hedge_total*100:.1f}%)")
    else:
        print(f"  Hedges: No hedges found in source")

    if disfluency_total > 0:
        print(f"  Disfluencies preserved: {disfluency_preserved}/{disfluency_total} ({disfluency_preserved/disfluency_total*100:.1f}%)")
    else:
        print(f"  Disfluencies: No disfluencies found in source")

# Report hedge/disfluency preservation for both models on the curated set.
print("\n--- CURATED DATASET ---")
analyze_feature_preservation(curated_df, 'Excerpt', 't5_summary', 'T5')
analyze_feature_preservation(curated_df, 'Excerpt', 'flan_t5_summary', 'Flan-T5')

# Same report on the pilot set, using the text column chosen when it was loaded.
print("\n--- PILOT DATASET ---")
analyze_feature_preservation(pilot_df, text_column, 't5_summary', 'T5')
analyze_feature_preservation(pilot_df, text_column, 'flan_t5_summary', 'Flan-T5')

print("\n T5/Flan-T5 analysis complete!")
# Was "\nnKey Findings:", which printed a stray 'n' before the heading.
print("\nKey Findings:")
print("- Compare verbatim copy rates with BART (87%)")
print("- Check if T5 actually attempts summarization")
print("- Note differences between T5 and Flan-T5 in feature preservation")
print("- Use these results to show architectural differences in computational prescriptivism")


🔍 LINGUISTIC FEATURE PRESERVATION ANALYSIS

--- CURATED DATASET ---

T5 Feature Preservation:
  Hedges preserved: 50/72 (69.4%)
  Disfluencies preserved: 18/32 (56.2%)

Flan-T5 Feature Preservation:
  Hedges preserved: 37/72 (51.4%)
  Disfluencies preserved: 15/32 (46.9%)

--- PILOT DATASET ---

T5 Feature Preservation:
  Hedges preserved: 12/33 (36.4%)
  Disfluencies preserved: 18/26 (69.2%)

Flan-T5 Feature Preservation:
  Hedges preserved: 15/33 (45.5%)
  Disfluencies preserved: 14/26 (53.8%)

✅ T5/Flan-T5 analysis complete!

💡 Key Findings:
- Compare verbatim copy rates with BART (87%)
- Check if T5 actually attempts summarization
- Note differences between T5 and Flan-T5 in feature preservation
- Use these results to show architectural differences in computational prescriptivism
