In [69]:
import os
import re
from pathlib import Path

import pandas as pd
from tqdm.auto import tqdm

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

import random

from transformers import pipeline

import numpy as np

import torch.nn.functional as F

In [62]:
# 1. Paths and file discovery (handles 10k_unzipped + 2021–2024)

# Base directory where the bucket is mounted
BASE_DIR = Path("/home/jupyter/data")

# Legacy layout: 2016–2020 inside 10k_unzipped
LEGACY_ROOT = BASE_DIR / "10k_unzipped"
LEGACY_YEARS = [str(y) for y in range(2016, 2021)]  # '2016'...'2020'

# New layout: 2021–2024 at the top level
NEW_YEARS = ["2021", "2022", "2023", "2024"]

# def collect_all_files():
#     all_files = []

#     # 2016–2020
#     for year in LEGACY_YEARS:
#         year_dir = LEGACY_ROOT / year
#         if not year_dir.is_dir():
#             continue
#         for path in year_dir.rglob("*.txt"):
#             all_files.append(path)

#     # 2021–2024
#     for year in NEW_YEARS:
#         year_dir = BASE_DIR / year
#         if not year_dir.is_dir():
#             continue
#         for path in year_dir.rglob("*.txt"):
#             all_files.append(path)

#     all_files = sorted(all_files)
#     print(f"Total txt filings found: {len(all_files)}")
#     return all_files

def is_10k(path):
    """
    Returns True if the file is likely a 10-K or 10-K/A based on filename.
    Adjust the logic if your filenames don't contain the form type.
    """
    # Convert filename to uppercase for case-insensitive check
    name = path.name.upper()
    
    # Common patterns: "10-K", "10-K-A", "10-K_...", etc.
    # Exclude "10-Q" explicitly or just look for "10-K"
    if "10-K" in name:
        return True
    return False

def collect_all_files():
    all_files = []

    # 2016–2020
    for year in LEGACY_YEARS:
        year_dir = LEGACY_ROOT / year
        if not year_dir.is_dir():
            continue
        # Iterate over all txt files, but only append if they pass the check
        for path in year_dir.rglob("*.txt"):
            if is_10k(path):
                all_files.append(path)

    # 2021–2024
    for year in NEW_YEARS:
        year_dir = BASE_DIR / year
        if not year_dir.is_dir():
            continue
        for path in year_dir.rglob("*.txt"):
            if is_10k(path):
                all_files.append(path)

    all_files = sorted(all_files)
    print(f"Total 10-K/10-K-A filings found: {len(all_files)}")
    return all_files

all_files = collect_all_files()


Total 10-K/10-K-A filings found: 73799


# 1A. NLP Workflow building (FinBert)

In [4]:
# 2. Helper: parse basic metadata from filename

FILENAME_REGEX = re.compile(
    r"(?P<date>\d{8})_(?P<form>[0-9A-Z\-]+)_edgar_data_(?P<cik>\d+)_",
    re.IGNORECASE
)

def parse_filename_meta(path: Path):
    """
    Best-effort parse of date, form, cik from the filename.
    Returns dict with keys: cik, form, filing_date (YYYY-MM-DD), year.
    """
    name = path.name
    m = FILENAME_REGEX.search(name)
    if not m:
        return {"cik": None, "form": None, "filing_date": None, "year": None}

    raw_date = m.group("date")
    filing_date = f"{raw_date[0:4]}-{raw_date[4:6]}-{raw_date[6:8]}"
    year = int(raw_date[0:4])

    return {
        "cik": m.group("cik"),
        "form": m.group("form"),
        "filing_date": filing_date,
        "year": year,
    }

In [5]:
# 3. Extract Item 1A – Risk Factors from text

ITEM_1A_REGEX = re.compile(
    r"ITEM\s+1A\.\s*RISK\s+FACTORS(.*?)(?=ITEM\s+1B\.)",
    re.IGNORECASE | re.DOTALL
)

def extract_item_1a(full_text: str):
    """
    Extract 'Item 1A. Risk Factors' section from a 10-K (or 10-Q).
    Returns the section text or None if not found.
    """
    m = ITEM_1A_REGEX.search(full_text)
    if not m:
        return None
    section = m.group(1)
    # Light cleanup
    section = section.strip()
    return section if section else None

In [6]:
# 4. FinBert sentiment scoring function

# Load FinBERT model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

finbert_model_name = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(finbert_model_name)
model = AutoModelForSequenceClassification.from_pretrained(finbert_model_name)
model.to(device)
model.eval()

LABELS = ["positive", "neutral", "negative"]  # FinBERT ordering

Using device: cpu


In [7]:
# 4.1 Scoring function 

def finbert_score(text: str, max_tokens=400, chunk_tokens=350):
    """
    Score text with FinBERT.
    - If very long, split into word chunks, score each, and average probabilities.
    Returns dict with positive/neutral/negative and dominant_label.
    """
    if not text or not text.strip():
        return {
            "fb_pos": None,
            "fb_neu": None,
            "fb_neg": None,
            "fb_label": None,
        }

    # crude word-based chunking to control length
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_tokens):
        chunk = " ".join(words[i:i + chunk_tokens])
        if chunk:
            chunks.append(chunk)

    probs_accum = torch.zeros(len(LABELS), device=device)

    with torch.no_grad():
        for chunk in chunks:
            inputs = tokenizer(
                chunk,
                return_tensors="pt",
                truncation=True,
                max_length=max_tokens,
                padding="max_length"
            ).to(device)

            outputs = model(**inputs)
            logits = outputs.logits.mean(dim=0)  # average over sequence
            probs = torch.softmax(logits, dim=-1)
            probs_accum += probs

    probs_mean = probs_accum / len(chunks)
    probs_mean = probs_mean.cpu().tolist()

    fb_pos, fb_neu, fb_neg = probs_mean
    idx = int(torch.tensor(probs_mean).argmax())
    fb_label = LABELS[idx]

    return {
        "fb_pos": fb_pos,
        "fb_neu": fb_neu,
        "fb_neg": fb_neg,
        "fb_label": fb_label,
    }

# 1B. NLP Workflow building (ClimateBert - Transition / Physical)

In [16]:
# Define climate-risk keyword dictionaries

# 1. Physical risk keywords (you can extend this list)
PHYSICAL_TERMS = [
    "flood", "flooding", "inundation", "storm surge",
    "hurricane", "cyclone", "typhoon", "tornado",
    "wildfire", "forest fire", "bushfire",
    "drought", "water stress", "water scarcity",
    "extreme heat", "heatwave", "heat wave",
    "extreme temperature", "cold spell",
    "storm", "severe storm", "hailstorm",
    "sea level rise", "coastal erosion",
]

# Transition + general climate terms (you can also use these later)
TRANSITION_TERMS = [
    "climate change", "global warming", "greenhouse gas", "ghg",
    "emissions", "carbon price", "carbon tax", "carbon credit",
    "net zero", "decarbonisation", "decarbonization",
    "energy transition", "low-carbon", "renewable energy",
    "stranded asset", "climate regulation", "paris agreement",
]

# 2. Define simple sentence splitter + climate-passage extractor

def split_into_sentences(text: str):
    if not text:
        return []
    # very simple splitter: split on . ! ? followed by whitespace
    parts = re.split(r'(?<=[\.\!\?])\s+', text)
    # strip and keep non-empty
    return [p.strip() for p in parts if p.strip()]


def is_climate_sentence(sentence: str):
    s = sentence.lower()
    return any(kw in s for kw in PHYSICAL_TERMS + TRANSITION_TERMS)


def extract_climate_passages(text: str, min_sentences: int = 1):
    """
    Returns (climate_text, n_climate_sentences, n_total_sentences)
    """
    sentences = split_into_sentences(text)
    if not sentences:
        return "", 0, 0

    climate_sentences = [s for s in sentences if is_climate_sentence(s)]
    climate_text = " ".join(climate_sentences)

    if len(climate_sentences) < min_sentences:
        # Consider it as 'no climate content'
        return "", len(climate_sentences), len(sentences)

    return climate_text, len(climate_sentences), len(sentences)

# 3. Keyword-based intensity scores

def keyword_hits(text: str, keywords):
    if not text:
        return 0
    t = text.lower()
    return sum(t.count(k.lower()) for k in keywords)


def keyword_intensity(text: str, keywords):
    """
    Raw hits and length-normalised hits (per 1,000 characters).
    """
    hits = keyword_hits(text, keywords)
    length = max(len(text), 1)
    norm = hits * 1000.0 / length
    return hits, norm

In [30]:
# 4. Load the climate-specific model: ClimateBERT risk (using pipeline for the test)

device = 0 if torch.cuda.is_available() else -1
print("Using device index:", device)

climate_tp_pipe = pipeline(
    "text-classification",
    model="climatebert/transition-physical",
    tokenizer="climatebert/distilroberta-base-climate-detector",
    device=device,
    return_all_scores=True,   # important: we want all 3 probs
)

Using device index: -1


Device set to use cpu


In [18]:
# 5. Helper to score a text with transition/physical probs

def transition_physical_scores(text: str, max_length: int = 256):
    """
    Run climatebert/transition-physical on a text and return:
        p_transition, p_none, p_physical

    If text is empty → returns (None, None, None).
    """
    if not text or not text.strip():
        return None, None, None

    # pipeline returns: [[{"label": "LABEL_0", "score": ...}, {...}, {...}]]
    outputs = climate_tp_pipe(
        text,
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )[0]

    scores_by_label = {d["label"]: d["score"] for d in outputs}

    p_transition = scores_by_label.get("LABEL_0", 0.0)
    p_none       = scores_by_label.get("LABEL_1", 0.0)
    p_physical   = scores_by_label.get("LABEL_2", 0.0)

    return p_transition, p_none, p_physical

# 1C. NLP Workflow building (ClimateBert: Climate Detector → Transition-Physical → Climate Specificity)
Step 1 — Extract Item 1A → split into sentences

Step 2 — Use keyword filter to find candidate climate sentences (very fast)

Step 3 — Pass only those sentences to Climate Detector

Step 4 — Pass filtered sentences to transition-physical

Step 5 — Compute specificity score

Step 6 — Aggregate to firm-year metrics

In [20]:
# 1. Parse CIK, form, filing date, year from file name

def parse_filename_meta(path: Path):
    fname = path.name
    m = re.match(
        r'(?P<date>\d{8})_(?P<form>[0-9A-Z\-]+)_edgar_data_(?P<cik>\d+)_',
        fname
    )
    if not m:
        return {"cik": None, "form": None, "filing_date": None, "year": None}
    
    date_str = m.group("date")  # YYYYMMDD
    filing_date = f"{date_str[:4]}-{date_str[4:6]}-{date_str[6:]}"
    return {
        "cik": m.group("cik"),
        "form": m.group("form"),
        "filing_date": filing_date,
        "year": int(date_str[:4]),
    }

# 2. Extract Item 1A (Risk Factors)

def extract_item_1a(text: str):
    if not text:
        return None
    
    # normalize spaces + case for matching
    pattern = re.compile(
        r'ITEM\s+1A\.(.*?)(ITEM\s+1B\.|ITEM\s+2\.)',
        flags=re.IGNORECASE | re.DOTALL
    )
    m = pattern.search(text)
    if m:
        return m.group(1)
    return None

# 3. Sentence splitting + climate keyword filter

PHYSICAL_TERMS = [
    # Floods & storms
    "flood", "flooding", "flash flood", "storm surge",
    "coastal flood", "inland flood", "river flood",
    "hurricane", "cyclone", "typhoon", "tornado",
    "severe storm", "winter storm", "ice storm",
    "hailstorm", "windstorm", "extreme wind", "gale force",

    # Wildfires
    "wildfire", "wild fire", "forest fire", "bushfire",
    "fire season", "smoke exposure", "wildfire risk",

    # Heat-related risks
    "extreme heat", "heatwave", "heat wave",
    "high temperature", "record heat", "temperature anomaly",
    "thermal stress", "heat stress",

    # Drought & water stress
    "drought", "extreme drought", "water stress",
    "water scarcity", "water shortage", "water depletion",
    "reduced rainfall", "precipitation deficit",

    # Sea-level rise & coastal risks
    "sea level rise", "coastal erosion", "coastal retreat",
    "coastal inundation", "tidal flooding", "saltwater intrusion",

    # Cold extremes
    "cold spell", "extreme cold", "polar vortex",
    "winter freeze", "deep freeze", "freezing event",

    # Chronic climate trends
    "changing rainfall", "changing precipitation",
    "temperature increase", "warming trend",
    "long-term climate trend", "long-term heat",

    # Secondary physical risks (financial-relevant)
    "infrastructure damage", "property damage",
    "facility damage", "operational disruption",
    "supply chain disruption", "business interruption",
    "power outage", "grid failure", "water availability",
    "crop failure", "agricultural loss",

    # FEMA / EM-DAT wording that appears in filings
    "natural disaster", "natural hazard", "catastrophic event",
    "extreme weather", "adverse weather",
    "weather-related disruption", "weather event",

    # Insurance / actuarial language
    "insured loss", "catastrophe risk", "catastrophic loss",
]

TRANSITION_TERMS = [
    # General climate transition
    "climate change", "global warming",
    "climate-related", "climate risk",

    # Emissions & carbon language
    "greenhouse gas", "ghg", "co2", "carbon dioxide",
    "carbon emissions", "emissions reduction",
    "emissions target", "emissions cap",

    # Carbon pricing & markets
    "carbon price", "carbon tax", "carbon levy",
    "carbon fee", "cap-and-trade", "cap and trade",
    "carbon credit", "carbon offset", "carbon trading",
    "emissions trading scheme", "ets",

    # Climate policy
    "climate regulation", "climate disclosure rule",
    "environmental regulation", "energy regulation",
    "climate legislation", "regulatory requirement",
    "regulatory risk",

    # Net-zero policies
    "net zero", "net-zero", "zero-carbon",
    "carbon-neutral", "carbon neutrality",
    "decarbonisation", "decarbonization",

    # Energy transition
    "energy transition", "energy efficiency",
    "low-carbon", "low carbon",
    "renewable energy", "clean energy",
    "solar", "wind power", "hydropower",

    # Stranded assets & financial risks
    "stranded asset", "asset stranding",
    "transition cost", "transition risk",
    "compliance cost", "carbon cost",

    # Climate governance / reporting
    "tcfd", "task force on climate-related financial disclosures",
    "esg reporting", "climate reporting", "sustainability report",

    # Litigation
    "climate litigation", "environmental litigation",

    # Reputation & market
    "climate reputation", "climate perception",
    "sustainability expectations", "investor pressure",

    # Paris agreement
    "paris agreement", "paris-aligned",
]

# Split into sentences as ML models work better on coherent chunks
def split_into_sentences(text: str):
    if not text:
        return []
    parts = re.split(r'(?<=[\.\!\?])\s+', text)
    return [p.strip() for p in parts if p.strip()]

# Climate keyword detection (from the lists)
def is_climate_sentence(s: str):
    s_low = s.lower()
    return any(kw in s_low for kw in PHYSICAL_TERMS + TRANSITION_TERMS)

def extract_climate_passages(text: str, min_sentences: int = 1):
    sents = split_into_sentences(text)
    if not sents:
        return "", 0, 0
    climate_sents = [s for s in sents if is_climate_sentence(s)]
    climate_text = " ".join(climate_sents)
    if len(climate_sents) < min_sentences:
        return "", len(climate_sents), len(sents)
    return climate_text, len(climate_sents), len(sents)

def keyword_hits(text: str, keywords):
    if not text:
        return 0
    t = text.lower()
    return sum(t.count(k.lower()) for k in keywords)

def keyword_intensity(text: str, keywords):
    hits = keyword_hits(text, keywords)
    length = max(len(text), 1)
    norm = hits * 1000.0 / length   # hits per 1000 chars
    return hits, norm

In [21]:
# 4. Load ClimateBERT models with pipeline

device = 0 if torch.cuda.is_available() else -1
print("Using device index:", device)

# Climate detector -> returns if the text is climate-related or not climate-related
detector_pipe = pipeline(
    "text-classification",
    model="climatebert/distilroberta-base-climate-detector",
    return_all_scores=True
)

# Transition-physical -> tells if a passage discusses transition / physical risks (or neither)
tp_pipe = pipeline(
    "text-classification",
    model="climatebert/transition-physical",
    tokenizer="climatebert/distilroberta-base-climate-detector",
    device=device,
    return_all_scores=True,   # important: we want all 3 probs
)

# Climate-specificity -> predict whether text is specific (quantified risk, named scenario, named hazard) vs general (vague, boilerplate)
spec_pipe = pipeline(
    "text-classification",
    model="climatebert/distilroberta-base-climate-specificity",
    return_all_scores=True
)

Using device index: -1


Device set to use cpu
Device set to use cpu
Device set to use cpu


In [22]:
# 5. Helpers to turn outputs into probabilities (auto-adapt if labels are 2-way or 3-way)

def scores_from_pipeline(pipe, text: str, max_length: int = 256):
    """
    Generic helper: returns {label: score} for a text.
    """
    if not text or not text.strip():
        return {}
    outputs = pipe(
        text,
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )[0]  # first (and only) sample
    return {d["label"]: d["score"] for d in outputs}

def detector_scores(text: str):
    d = scores_from_pipeline(detector_pipe, text)
    # climate-detector: LABEL_0 = not climate, LABEL_1 = climate
    p_climate = d.get("LABEL_1", None)
    p_non = d.get("LABEL_0", None)
    return p_climate, p_non

def transition_physical_scores(text: str):
    d = scores_from_pipeline(tp_pipe, text)
    # model might be 2-label or 3-label; handle both
    if "LABEL_2" in d:
        p_trans = d.get("LABEL_0", None)
        p_none  = d.get("LABEL_1", None)
        p_phys  = d.get("LABEL_2", None)
    else:
        # 2-label variant: LABEL_0 = Transition, LABEL_1 = Physical
        p_trans = d.get("LABEL_0", None)
        p_phys  = d.get("LABEL_1", None)
        p_none  = None
    return p_trans, p_phys, p_none

def specificity_scores(text: str):
    d = scores_from_pipeline(spec_pipe, text)
    # LABEL_0 = low, LABEL_1 = medium, LABEL_2 = high specificity
    p_low  = d.get("LABEL_0", None)
    p_med  = d.get("LABEL_1", None)
    p_high = d.get("LABEL_2", None)
    return p_low, p_med, p_high

# 1D. NLP Workflow building (ClimateBert: Climate Detector → Transition-Physical → Climate Specificit) - no keyword filtering

In [58]:
# --------------------------
# 1) ClimateBERT pipelines
# --------------------------

device = 0 if torch.cuda.is_available() else -1
print("Using device index:", device)

# Climate detector -> returns if the text is climate-related or not climate-related
detector_pipe = pipeline(
    "text-classification",
    model="climatebert/distilroberta-base-climate-detector",
    return_all_scores=True
)

# Transition-physical -> tells if a passage discusses transition / physical risks (or neither)
tp_pipe = pipeline(
    "text-classification",
    model="climatebert/transition-physical",
    tokenizer="climatebert/distilroberta-base-climate-detector",
    device=device,
    return_all_scores=True,   # important: we want all 3 probs
)

# map labels outputs from tp_pipe for clarity (based on a test)
label_map = {
    "LABEL_0": "transition", 
    "LABEL_1": "none",       
    "LABEL_2": "physical"
}

# Climate-specificity -> predict whether text is specific (quantified risk, named scenario, named hazard) vs general (vague, boilerplate)
spec_pipe = pipeline(
    "text-classification",
    model="climatebert/distilroberta-base-climate-specificity",
    return_all_scores=True
)

Using device index: -1


Device set to use cpu
Device set to use cpu
Device set to use cpu


In [59]:
# --------------------------
# 2) Utility: parse filename
# --------------------------

def parse_filename_meta(path: Path):
    fname = path.name
    m = re.match(
        r'(?P<date>\d{8})_(?P<form>[0-9A-Z\-]+)_edgar_data_(?P<cik>\d+)_',
        fname
    )
    if not m:
        return {"cik": None, "form": None, "filing_date": None, "year": None}
    date_str = m.group("date")
    filing_date = f"{date_str[:4]}-{date_str[4:6]}-{date_str[6:]}"
    return {
        "cik": m.group("cik"),
        "form": m.group("form"),
        "filing_date": filing_date,
        "year": int(date_str[:4]),
    }

# --------------------------
# 3) Extract full Item 1A
# --------------------------

# def extract_item_1a(text: str):
#     pattern = re.compile(
#         r'ITEM\s+1A\.?(.*?)(ITEM\s+1B\.|ITEM\s+2\.)',
#         flags=re.IGNORECASE | re.DOTALL
#     )
#     m = pattern.search(text)
#     return m.group(1) if m else None

def extract_item_1a(text: str):
    """
    Robust extraction of Item 1A (Risk Factors).
    Strategies used:
    1. Case-insensitive search.
    2. Mandatory 'Risk Factors' title to avoid TOC matches.
    3. Finding ALL matches and returning the longest one (to bypass TOC entries).
    """
    
    # IMPROVEMENT 1: Look for "Item 1A" followed optionally by punctuation, 
    # then explicitly "Risk Factors". This filters out many TOC entries.
    # We use [\.\:\-\s]* to allow for separators like "Item 1A: Risk Factors" or "Item 1A. Risk Factors"
    # We use (?i) for case insensitivity inside the pattern.
    start_pattern = r'ITEM\s+1A[\.\:\-\s]*Risk\s+Factors'
    
    # IMPROVEMENT 2: The end pattern looks for the next likely headers (1B or 2).
    # We include word boundaries (\b) to avoid matching "Item 20" as "Item 2".
    end_pattern = r'(ITEM\s+1B|ITEM\s+2)\b'
    
    # Combine into a single regex.
    # Note: We use DOTALL (re.S) so the dot (.) matches newlines.
    regex = re.compile(
        f"({start_pattern})(.*?)({end_pattern})", 
        re.IGNORECASE | re.DOTALL
    )
    
    matches = regex.findall(text)
    
    if not matches:
        return None

    # IMPROVEMENT 3: The "Longest Match" Heuristic.
    # A TOC entry is usually short (< 500 chars). The real section is long.
    # matches is a list of tuples: [(start_match, content, end_match), ...]
    # We want the 'content' group (index 1).
    candidates = [m[1] for m in matches]
    
    # Sort by length, descending, and take the longest.
    best_candidate = max(candidates, key=len)
    
    # Optional: Logic to reject if even the best candidate is too short to be real
    if len(best_candidate) < 1000:
        return None

    return best_candidate.strip()

In [63]:
N_TEST = 10  

if len(all_files) <= N_TEST:
    sample_files = all_files
else:
    sample_files = random.sample(all_files, N_TEST)

len(sample_files)

10

In [64]:

# --------------------------
# 4) Process small subsample
# --------------------------

# Configuration for sliding window
# 512 is the hard limit for BERT-based models. 
# We use slightly less (500) to leave room for special tokens [CLS], [SEP] added by the pipeline.
CHUNK_SIZE = 500  
STRIDE = 100       # Overlap between chunks to ensure context isn't lost at the edges

def get_sliding_windows(text, tokenizer):
    """
    Tokenizes text and slices it into overlapping windows.
    Returns a list of decoded string chunks safe for the model.
    """
    # 1. Tokenize the entire document at once (no truncation yet)
    encodings = tokenizer(text, add_special_tokens=False, return_tensors="pt")
    input_ids = encodings["input_ids"][0]
    
    # 2. If text is short, return it as a single chunk
    if len(input_ids) <= CHUNK_SIZE:
        return [text]
    
    # 3. Create windows
    windows = []
    total_tokens = len(input_ids)
    
    for i in range(0, total_tokens, CHUNK_SIZE - STRIDE):
        # Define the window
        chunk_ids = input_ids[i : i + CHUNK_SIZE]
        
        # Decode back to string so the pipeline can consume it naturally
        # skip_special_tokens=True ensures we don't end up with weird [UNK] artifacts
        chunk_text = tokenizer.decode(chunk_ids, skip_special_tokens=True)
        windows.append(chunk_text)
        
        # Stop if we've reached the end
        if i + CHUNK_SIZE >= total_tokens:
            break
            
    return windows

# --------------------------
# 4) Process with Sliding Window
# --------------------------

rows = []

# Access the tokenizer from one of your pipes (assuming they share similar base arch like DistilRoBERTa)
# If they are different architectures, use the specific tokenizer for each pipe.
main_tokenizer = detector_pipe.tokenizer 

for path in tqdm(sample_files, desc="Testing ClimateBERT (full 1A)"):
    meta = parse_filename_meta(path)
    try:
        text = path.read_text(errors="ignore")
        item_1a = extract_item_1a(text)
        
        # Skip empty files immediately
        if not item_1a:
            rows.append({**meta, "item_1a_len": 0, "p_det": None, "p_phys": None, "p_trans": None, "p_spec_high": None})
            continue

        # --- STEP 1: CHUNK THE TEXT ---
        # Instead of item_1a[:4500], we get a list of valid token-safe strings
        chunks = get_sliding_windows(item_1a, main_tokenizer)

        # Storage for scores across all chunks in this document
        doc_scores = {
            "p_det": [],
            "p_phys": [],
            "p_trans": [],
            "p_spec_high": []
        }

        # --- STEP 2: BATCH INFERENCE ---
        # Passing the list 'chunks' directly to pipe() allows it to batch process if on GPU
        # We process all chunks for the document at once.
        
        # 1. Detector
        det_results = detector_pipe(chunks)
        for res in det_results:
            # Handle potential list wrapper if pipe returns list of lists
            r = res if isinstance(res, dict) else res[0] 
            label = r["label"].lower()
            score = r["score"]
            
            # Map label to probability
            # If label is 'climate-related', prob is score. If 'not', prob is 1-score (or just ignore if using max)
            if label in ["climate_related", "climate-related", "climate"]:
                doc_scores["p_det"].append(score)
            else:
                doc_scores["p_det"].append(1 - score) # Invert score if it's "not climate"



        # 2. Transition vs Physical
        # --- Transition vs physical ---
        tp_results = tp_pipe(chunks, top_k=None)
    
        for chunk_res in tp_results:
            # chunk_res is list of dicts: [{'label': 'LABEL_0', 'score': 0.99}, ...]
        
            # Create a score lookup using the mapped names
            scores = {label_map[d["label"]]: d["score"] for d in chunk_res}
        
            # Now you can safely extract
            p_trans = scores.get("transition", 0.0)
            p_phys  = scores.get("physical", 0.0)
        
            doc_scores["p_trans"].append(p_trans)
            doc_scores["p_phys"].append(p_phys)
        
        # 3. Specificity
        spec_results = spec_pipe(chunks)
        for res in spec_results:
            r = res if isinstance(res, dict) else res[0]
            if r["label"].lower() == "high":
                doc_scores["p_spec_high"].append(r["score"])
            else:
                doc_scores["p_spec_high"].append(1 - r["score"])

        # --- STEP 3: AGGREGATION (MAX POOLING) ---
        # We take the MAXIMUM signal found in any chunk as the document signal.
        # Use np.max with a default of 0.0 if the list is empty
        
        final_p_det = np.max(doc_scores["p_det"]) if doc_scores["p_det"] else 0.0
        final_p_phys = np.max(doc_scores["p_phys"]) if doc_scores["p_phys"] else 0.0
        final_p_trans = np.max(doc_scores["p_trans"]) if doc_scores["p_trans"] else 0.0
        final_p_spec = np.max(doc_scores["p_spec_high"]) if doc_scores["p_spec_high"] else 0.0

        rows.append({
            **meta,
            "item_1a_len": len(item_1a), # Original length
            "chunks_processed": len(chunks), # Useful metadata
            "p_det": final_p_det,
            "p_phys": final_p_phys,
            "p_trans": final_p_trans,
            "p_spec_high": final_p_spec,
        })

    except Exception as e:
        print(f"Error on {path}: {e}")

Testing ClimateBERT (full 1A):   0%|          | 0/10 [00:00<?, ?it/s]

In [67]:
df_test = pd.DataFrame(rows)

print(df_test[["p_det", "p_phys", "p_trans", "p_spec_high"]].describe())

df_test

          p_det    p_phys   p_trans  p_spec_high
count  6.000000  6.000000  6.000000     6.000000
mean   0.831035  0.783958  0.831944     0.747444
std    0.404638  0.401360  0.407483     0.131271
min    0.005074  0.000283  0.000190     0.548578
25%    0.995172  0.779362  0.994707     0.661879
50%    0.995859  0.998607  0.999564     0.802555
75%    0.996269  0.999747  0.999612     0.840237
max    0.998073  0.999841  0.999632     0.863191


Unnamed: 0,cik,form,filing_date,year,item_1a_len,p_det,p_phys,p_trans,p_spec_high,chunks_processed
0,1852536,10-K,2024-05-02,2024,0,,,,,
1,1121788,10-K,2021-02-17,2021,93238,0.998073,0.999841,0.999628,0.61874,46.0
2,66570,10-K,2016-02-29,2016,26524,0.996144,0.70665,0.999564,0.548578,13.0
3,1866692,10-K,2024-02-20,2024,161067,0.995573,0.999717,0.993089,0.791294,79.0
4,1766367,10-K,2020-03-18,2020,0,,,,,
5,1714899,10-K,2019-03-12,2019,223699,0.995038,0.999757,0.999563,0.863191,108.0
6,1711933,10-K,2022-03-15,2022,211703,0.996311,0.997498,0.999632,0.813816,100.0
7,1566053,10-K,2019-03-25,2019,0,,,,,
8,1422222,10-K-A,2018-01-04,2018,11140,0.005074,0.000283,0.00019,0.849044,6.0
9,1580013,10-K,2018-03-20,2018,0,,,,,


# 1E. NLP Workflow building (ClimateBert: Climate Detector → Transition-Physical → Climate Specificit) - no keyword filtering - models loading

In [71]:
# --------------------------
# 1) Setup Device & Load Models
# --------------------------

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --- Helper to load model & tokenizer ---
def load_climatebert_model(model_name, tokenizer_name=None):
    """
    Loads model and tokenizer. 
    If tokenizer_name is NOT provided, it defaults to using model_name.
    """
    if tokenizer_name is None:
        tokenizer_name = model_name
        
    print(f"Loading Model: {model_name}")
    print(f"Loading Tokenizer: {tokenizer_name}")
    
    # Load tokenizer from the specific source (which might differ from model_name)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    
    # Load model weights
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    model.to(device)
    model.eval()
    return tokenizer, model

# A. Climate Detector
# Both model and tokenizer exist in this repo
det_name = "climatebert/distilroberta-base-climate-detector"
det_tokenizer, det_model = load_climatebert_model(det_name)

# B. Transition-Physical
# ⚠️ THE FIX: We use the TP repo for the model, but the Detector repo for the tokenizer
tp_model_name = "climatebert/transition-physical"
tp_tokenizer_name = "climatebert/distilroberta-base-climate-detector" 

tp_tokenizer, tp_model = load_climatebert_model(
    model_name=tp_model_name, 
    tokenizer_name=tp_tokenizer_name
)

# Label Map for Transition-Physical
tp_label_map = {
    0: "transition",
    1: "none",
    2: "physical"
}

# C. Climate Specificity
spec_name = "climatebert/distilroberta-base-climate-specificity"
spec_tokenizer, spec_model = load_climatebert_model(spec_name)

print("✅ All models loaded successfully.")

Using device: cpu
Loading Model: climatebert/distilroberta-base-climate-detector
Loading Tokenizer: climatebert/distilroberta-base-climate-detector
Loading Model: climatebert/transition-physical
Loading Tokenizer: climatebert/distilroberta-base-climate-detector
Loading Model: climatebert/distilroberta-base-climate-specificity
Loading Tokenizer: climatebert/distilroberta-base-climate-specificity
✅ All models loaded successfully.


In [74]:
# --------------------------
# 2) Helper function for inference (the pipline was doing that before)
# --------------------------

def predict_batch(texts, tokenizer, model):
    """
    Runs inference on a list of texts (chunks) using the direct model.
    Returns a list of dictionaries containing label probabilities.
    """
    # 1. Tokenize
    # padding=True ensures all sequences in the batch have the same length
    # truncation=True ensures we don't crash on >512 tokens
    inputs = tokenizer(
        texts, 
        padding=True, 
        truncation=True, 
        max_length=512, 
        return_tensors="pt"
    )
    
    # 2. Move inputs to the same device as the model
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    # 3. Inference (No Gradient Calculation for speed)
    with torch.no_grad():
        outputs = model(**inputs)
    
    # 4. Convert Logits to Probabilities (Softmax)
    probs = F.softmax(outputs.logits, dim=-1)
    
    # 5. Move back to CPU and convert to numpy/list
    return probs.cpu().numpy()

# --------------------------
# 3) Utility: parse filename
# --------------------------

def parse_filename_meta(path: Path):
    fname = path.name
    m = re.match(
        r'(?P<date>\d{8})_(?P<form>[0-9A-Z\-]+)_edgar_data_(?P<cik>\d+)_',
        fname
    )
    if not m:
        return {"cik": None, "form": None, "filing_date": None, "year": None}
    date_str = m.group("date")
    filing_date = f"{date_str[:4]}-{date_str[4:6]}-{date_str[6:]}"
    return {
        "cik": m.group("cik"),
        "form": m.group("form"),
        "filing_date": filing_date,
        "year": int(date_str[:4]),
    }

# --------------------------
# 4) Extract full Item 1A
# --------------------------

# def extract_item_1a(text: str):
#     pattern = re.compile(
#         r'ITEM\s+1A\.?(.*?)(ITEM\s+1B\.|ITEM\s+2\.)',
#         flags=re.IGNORECASE | re.DOTALL
#     )
#     m = pattern.search(text)
#     return m.group(1) if m else None

def extract_item_1a(text: str):
    """
    Robust extraction of Item 1A (Risk Factors).
    Strategies used:
    1. Case-insensitive search.
    2. Mandatory 'Risk Factors' title to avoid TOC matches.
    3. Finding ALL matches and returning the longest one (to bypass TOC entries).
    """
    
    # IMPROVEMENT 1: Look for "Item 1A" followed optionally by punctuation, 
    # then explicitly "Risk Factors". This filters out many TOC entries.
    # We use [\.\:\-\s]* to allow for separators like "Item 1A: Risk Factors" or "Item 1A. Risk Factors"
    # We use (?i) for case insensitivity inside the pattern.
    start_pattern = r'ITEM\s+1A[\.\:\-\s]*Risk\s+Factors'
    
    # IMPROVEMENT 2: The end pattern looks for the next likely headers (1B or 2).
    # We include word boundaries (\b) to avoid matching "Item 20" as "Item 2".
    end_pattern = r'(ITEM\s+1B|ITEM\s+2)\b'
    
    # Combine into a single regex.
    # Note: We use DOTALL (re.S) so the dot (.) matches newlines.
    regex = re.compile(
        f"({start_pattern})(.*?)({end_pattern})", 
        re.IGNORECASE | re.DOTALL
    )
    
    matches = regex.findall(text)
    
    if not matches:
        return None

    # IMPROVEMENT 3: The "Longest Match" Heuristic.
    # A TOC entry is usually short (< 500 chars). The real section is long.
    # matches is a list of tuples: [(start_match, content, end_match), ...]
    # We want the 'content' group (index 1).
    candidates = [m[1] for m in matches]
    
    # Sort by length, descending, and take the longest.
    best_candidate = max(candidates, key=len)
    
    # Optional: Logic to reject if even the best candidate is too short to be real
    if len(best_candidate) < 1000:
        return None

    return best_candidate.strip()

In [75]:
N_TEST = 10  

if len(all_files) <= N_TEST:
    sample_files = all_files
else:
    sample_files = random.sample(all_files, N_TEST)

len(sample_files)

10

In [76]:
# --------------------------
# 5) Process all files
# --------------------------

# Configuration for sliding window
# 512 is the hard limit for BERT-based models. 
# We use slightly less (500) to leave room for special tokens [CLS], [SEP] added by the pipeline.
CHUNK_SIZE = 500  
STRIDE = 100       # Overlap between chunks to ensure context isn't lost at the edges

def get_sliding_windows(text, tokenizer):
    """
    Tokenizes text and slices it into overlapping windows.
    Returns a list of decoded string chunks safe for the model.
    """
    # 1. Tokenize the entire document at once (no truncation yet)
    encodings = tokenizer(text, add_special_tokens=False, return_tensors="pt")
    input_ids = encodings["input_ids"][0]
    
    # 2. If text is short, return it as a single chunk
    if len(input_ids) <= CHUNK_SIZE:
        return [text]
    
    # 3. Create windows
    windows = []
    total_tokens = len(input_ids)
    
    for i in range(0, total_tokens, CHUNK_SIZE - STRIDE):
        # Define the window
        chunk_ids = input_ids[i : i + CHUNK_SIZE]
        
        # Decode back to string so the pipeline can consume it naturally
        # skip_special_tokens=True ensures we don't end up with weird [UNK] artifacts
        chunk_text = tokenizer.decode(chunk_ids, skip_special_tokens=True)
        windows.append(chunk_text)
        
        # Stop if we've reached the end
        if i + CHUNK_SIZE >= total_tokens:
            break
            
    return windows

# --------------------------
# 6) Process with Sliding Window
# --------------------------

rows = []

# Access the tokenizer from one of your pipes (assuming they share similar base arch like DistilRoBERTa)
# If they are different architectures, use the specific tokenizer for each pipe.
main_tokenizer = detector_pipe.tokenizer 

for path in tqdm(all_files, desc="Testing ClimateBERT (full 1A)"):
    meta = parse_filename_meta(path)
    try:
        text = path.read_text(errors="ignore")
        item_1a = extract_item_1a(text)
        
        # Skip empty files immediately
        if not item_1a:
            rows.append({**meta, "item_1a_len": 0, "p_det": None, "p_phys": None, "p_trans": None, "p_spec_high": None})
            continue

        # --- STEP 1: CHUNK THE TEXT ---
        # Instead of item_1a[:4500], we get a list of valid token-safe strings
        chunks = get_sliding_windows(item_1a, main_tokenizer)

        # Storage for scores across all chunks in this document
        doc_scores = {
            "p_det": [],
            "p_phys": [],
            "p_trans": [],
            "p_spec_high": []
        }

        # --- STEP 2: BATCH INFERENCE ---
        # Passing the list 'chunks' directly to pipe() allows it to batch process if on GPU
        # We process all chunks for the document at once.
        
        # 1. Detector
        # Returns array of shape (n_chunks, 2) -> [prob_not_climate, prob_climate]
        det_probs = predict_batch(chunks, det_tokenizer, det_model)
        
        # Assuming label 1 is "yes" (check model card if unsure, usually 1=climate)
        # We take the probability of class 1 for every chunk
        chunk_det_scores = det_probs[:, 1] 
        doc_scores["p_det"].extend(chunk_det_scores.tolist())

        # 2. Transition vs Physical
        # Returns array of shape (n_chunks, 3) -> [prob_trans, prob_none, prob_phys]
        tp_probs = predict_batch(chunks, tp_tokenizer, tp_model)
        
        # Extract columns based on your label map: 0=trans, 1=none, 2=phys
        # We interpret the columns directly
        chunk_trans_scores = tp_probs[:, 0]
        chunk_phys_scores  = tp_probs[:, 2]
        
        doc_scores["p_trans"].extend(chunk_trans_scores.tolist())
        doc_scores["p_phys"].extend(chunk_phys_scores.tolist())

        # 3. Specificity
        # Returns array of shape (n_chunks, 2) -> [prob_non_specific, prob_specific]
        spec_probs = predict_batch(chunks, spec_tokenizer, spec_model)
        
        # Assuming label 1 is "specific" (high specificity)
        chunk_spec_scores = spec_probs[:, 1]
        doc_scores["p_spec_high"].extend(chunk_spec_scores.tolist())

        # --- STEP 3: AGGREGATION (MAX POOLING) ---
        # We take the MAXIMUM signal found in any chunk as the document signal.
        # Use np.max with a default of 0.0 if the list is empty
        
        final_p_det = np.max(doc_scores["p_det"]) if doc_scores["p_det"] else 0.0
        final_p_phys = np.max(doc_scores["p_phys"]) if doc_scores["p_phys"] else 0.0
        final_p_trans = np.max(doc_scores["p_trans"]) if doc_scores["p_trans"] else 0.0
        final_p_spec = np.max(doc_scores["p_spec_high"]) if doc_scores["p_spec_high"] else 0.0

        rows.append({
            **meta,
            "item_1a_len": len(item_1a), # Original length
            "chunks_processed": len(chunks), # Useful metadata
            "p_det": final_p_det,
            "p_phys": final_p_phys,
            "p_trans": final_p_trans,
            "p_spec_high": final_p_spec,
        })

    except Exception as e:
        print(f"Error on {path}: {e}")

Testing ClimateBERT (full 1A):   0%|          | 0/10 [00:00<?, ?it/s]

In [78]:
df_final = pd.DataFrame(rows)

# 1. Basic Stats
print("\n--- Statistical Summary ---")
print(df_final[["p_det", "p_phys", "p_trans", "p_spec_high"]].describe())

# 2. Senior Dev Logic Checks
print("\n--- Logic Checks ---")

# CHECK A: Did we actually process chunks?
# If mean chunks_processed is ~1.0, your extraction logic failed (you are scanning titles, not text).
avg_chunks = df_final["chunks_processed"].mean()
print(f"Average chunks per file: {avg_chunks:.1f} (Should be > 20 for 10-Ks)")

# CHECK B: Do we have ANY Physical Risk detected?
# If max(p_phys) is 0.0, your label mapping (LABEL_0/1/2) is likely wrong.
max_phys = df_final["p_phys"].max()
print(f"Max Physical Risk Score: {max_phys:.4f} (Should be > 0.9)")

# CHECK C: How many 'empty' files did we skip?
# Count rows where item_1a_len is 0
empty_count = len(df_final[df_final["item_1a_len"] == 0])
print(f"Empty/Skipped Files: {empty_count} ({(empty_count/len(df_final))*100:.1f}%)")

df_final


--- Statistical Summary ---
          p_det    p_phys   p_trans  p_spec_high
count  8.000000  8.000000  8.000000     8.000000
mean   0.506997  0.628909  0.282909     0.699845
std    0.441254  0.511877  0.451222     0.125633
min    0.005358  0.000089  0.000113     0.490366
25%    0.032214  0.024349  0.000533     0.605845
50%    0.568285  0.999633  0.001620     0.734516
75%    0.904260  0.999771  0.444992     0.790204
max    0.998411  0.999824  0.999645     0.853039

--- Logic Checks ---
Average chunks per file: 31.0 (Should be > 20 for 10-Ks)
Max Physical Risk Score: 0.9998 (Should be > 0.9)
Empty/Skipped Files: 2 (20.0%)


Unnamed: 0,cik,form,filing_date,year,item_1a_len,chunks_processed,p_det,p_phys,p_trans,p_spec_high
0,728535,10-K,2021-02-23,2021,18564,9.0,0.998411,0.999764,0.999645,0.581702
1,1423774,10-K,2020-03-31,2020,154176,76.0,0.461224,0.9997,0.001047,0.756016
2,1749273,10-K,2021-03-26,2021,0,,,,,
3,34903,10-K,2023-02-08,2023,5121,3.0,0.005358,8.9e-05,0.000113,0.613892
4,1785705,10-K,2021-03-30,2021,0,,,,,
5,1719881,10-K,2019-09-09,2019,69170,36.0,0.675346,0.999566,0.260241,0.785043
6,1267332,10-K,2016-03-22,2016,10523,6.0,0.006926,0.000111,0.00012,0.490366
7,1403802,10-K,2016-03-30,2016,44009,21.0,0.040643,0.032428,0.000671,0.713016
8,1705843,10-K,2023-03-02,2023,99755,48.0,0.993582,0.999824,0.999244,0.853039
9,1373715,10-K,2016-02-25,2016,97683,49.0,0.874486,0.999789,0.002194,0.805686


In [79]:
BUCKET_NAME = "tenk-bucket" 
GCS_PATH = f"gs://{BUCKET_NAME}/climatebert/output/FINAL_RESULTS.parquet"

try:
    df_final.to_parquet(GCS_PATH, index=False)
    print(f"✅ Backup saved to GCS: {GCS_PATH}")
except Exception as e:
    print(f"⚠️ Could not save to GCS (check permissions/bucket name): {e}")

✅ Backup saved to GCS: gs://tenk-bucket/climatebert/output/TEST_RESULTS.parquet
