In [None]:
import os
import re
from pathlib import Path

import pandas as pd
from tqdm.auto import tqdm

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

import random

from transformers import pipeline

import numpy as np

import torch.nn.functional as F

from google.colab import auth

from datetime import datetime

from google.cloud import storage


In [None]:
# Authenticate the Colab user account (required for GCS access by the
# storage client created in the download cell below).
auth.authenticate_user()

In [None]:
# Download the filings archive from the GCS bucket to the local working directory.
# The cell is safe to re-run: if a local copy with the same byte size as the
# remote object already exists, the (multi-GB) transfer is skipped.

bucket_name = "tenk-bucket"
zip_name = "tenk_filings.zip"

client = storage.Client()

bucket = client.bucket(bucket_name)
blob = bucket.blob(zip_name)

# Fetch the object's metadata from the server so blob.size is not None.
blob.reload()
file_size = blob.size

# Stream the object in fixed-size chunks instead of loading it all into RAM.
chunk_size = 1024 * 1024  # 1 MB

local_zip = Path(zip_name)
if local_zip.exists() and local_zip.stat().st_size == file_size:
    # A complete copy is already on disk (a partial file from an interrupted
    # run has a different size and falls through to the download branch).
    print(f"{zip_name} already present ({file_size} bytes); skipping download.")
else:
    with open(zip_name, "wb") as f:
        with tqdm(total=file_size, unit="B", unit_scale=True, unit_divisor=1024, desc="Downloading") as pbar:
            # Open the blob as a binary stream
            with blob.open("rb") as gcs_file:
                while True:
                    # Read a chunk from the stream; an empty read marks the end
                    chunk = gcs_file.read(chunk_size)
                    if not chunk:
                        break
                    # Write to local file and update progress
                    f.write(chunk)
                    pbar.update(len(chunk))

    print("Download complete!")

Downloading:   0%|          | 0.00/14.6G [00:00<?, ?B/s]

Download complete!


In [None]:
!unzip tenk_filings.zip -d tenk_filings

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
  inflating: tenk_filings/2024/QTR4/20241010_10-Q_edgar_data_1517389_0001477932-24-006326.txt  
  inflating: tenk_filings/2024/QTR4/20241112_10-Q_edgar_data_1403528_0001403528-24-000054.txt  
  inflating: tenk_filings/2024/QTR4/20241210_10-Q_edgar_data_1441816_0001441816-24-000260.txt  
  inflating: tenk_filings/2024/QTR4/20241106_10-Q_edgar_data_94049_0000950170-24-121883.txt  
  inflating: tenk_filings/2024/QTR4/20241108_10-Q_edgar_data_12239_0001213900-24-095881.txt  
  inflating: tenk_filings/2024/QTR4/20241107_10-K-A_edgar_data_1505155_0001505155-24-000103.txt  
  inflating: tenk_filings/2024/QTR4/20241119_10-Q_edgar_data_1590715_0001477932-24-007457.txt  
  inflating: tenk_filings/2024/QTR4/20241022_10-Q_edgar_data_66740_0000066740-24-000101.txt  
  inflating: tenk_filings/2024/QTR4/20241114_10-Q_edgar_data_1502377_0000950170-24-126724.txt  
  inflating: tenk_filings/2024/QTR4/20241129_10-

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Tue Dec  9 15:25:34 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   36C    P8              9W /   70W |       2MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# # Mount the Bucket with 10Ks
# BUCKET_NAME = "tenk-bucket"
# MOUNT_DIR = "gcs_10k_data"

# # 1. Create a local directory where the bucket will be mounted
# ! mkdir -p {MOUNT_DIR}

# # 2. Mount the bucket using gcsfuse
# # --implicit-dirs ensures all folders/files are visible
# ! gcsfuse --implicit-dirs {BUCKET_NAME} {MOUNT_DIR}

# print(f"Bucket {BUCKET_NAME} mounted to /{MOUNT_DIR}")

# # You can now use standard Python file operations (like open, Path)
# # on paths starting with '/content/gcs_10k_data/'

{"timestamp":{"seconds":1764925500,"nanos":288479355},"severity":"INFO","message":"Start gcsfuse/3.5.4 (Go version go1.24.11) for app \"\" using mount point: /content/gcs_10k_data\n","mount-id":"tenk-bucket-d22a85c0"}
{"timestamp":{"seconds":1764925500,"nanos":288515582},"severity":"INFO","message":"GCSFuse Config","mount-id":"tenk-bucket-d22a85c0","CLI Flags":{"implicit-dirs":"true"}}
{"timestamp":{"seconds":1764925500,"nanos":288540789},"severity":"INFO","message":"GCSFuse Config","mount-id":"tenk-bucket-d22a85c0","Full Config":{"AppName":"","CacheDir":"","CloudProfiler":{"AllocatedHeap":true,"Cpu":true,"Enabled":false,"Goroutines":false,"Heap":true,"Label":"gcsfuse-0.0.0","Mutex":false},"Debug":{"ExitOnInvariantViolation":false,"Fuse":false,"Gcs":false,"LogMutex":false},"DisableAutoconfig":false,"EnableAtomicRenameObject":true,"EnableGoogleLibAuth":true,"EnableHns":true,"EnableNewReader":true,"EnableUnsupportedPathSupport":false,"FileCache":{"CacheFileForRangeRead":false,"DownloadCh

In [None]:
from pathlib import Path

# Base directory where the extracted filings are stored on the local disk.
# collect_all_files() looks for one sub-directory per year directly under it.
# NOTE(review): assumes the archive was unzipped from /content (Colab's usual
# working directory) into "tenk_filings" — confirm if the notebook runs elsewhere.
BASE_DIR = Path("/content/tenk_filings/")

def is_10k(path):
    """
    Tell whether a filing path looks like a 10-K family document.

    Only the file name is inspected: it is upper-cased and searched for the
    substring "10-K", so variants such as "10-K-A" are accepted as well.
    """
    upper_name = path.name.upper()
    return upper_name.find("10-K") >= 0

def collect_all_files():
    """
    Gather every 10-K style filing under BASE_DIR for the years 2016-2024.

    Each year is expected as a sub-directory of BASE_DIR named after the year;
    a missing year directory is reported and skipped. Inside a year directory
    all "*.txt" files are searched recursively and kept when is_10k() accepts
    them.

    Returns:
        A sorted list of matching paths (the total is also printed).
    """
    found = []

    for year in range(2016, 2025):
        # Path joining needs a string, hence str(year).
        year_dir = BASE_DIR / str(year)

        if year_dir.is_dir():
            found.extend(p for p in year_dir.rglob("*.txt") if is_10k(p))
        else:
            print(f"Directory not found: {year_dir}")

    found.sort()
    print(f"Total 10-K/10-K-A filings found: {len(found)}")
    return found

# Sorted master list of filings. The inference cell resumes by slicing this list
# by position, so it must come out in the same order in every session.
all_files = collect_all_files()

Total 10-K/10-K-A filings found: 73799


# 1E. NLP workflow (ClimateBERT: Climate Detector → Transition-Physical → Climate Specificity) - no keyword filtering - model loading

In [None]:

# --------------------------
# 1) Setup Device & Load Models
# --------------------------

# Use CUDA when PyTorch reports it as available, otherwise fall back to the CPU.
# `device` is a notebook-level global read by the model-loading and inference helpers.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --- Helper to load model & tokenizer ---
def load_climatebert_model(model_name, tokenizer_name=None):
    """
    Load a sequence-classification model and its tokenizer, ready for inference.

    Args:
        model_name: identifier passed to AutoModelForSequenceClassification.from_pretrained.
        tokenizer_name: identifier passed to AutoTokenizer.from_pretrained;
            when None, `model_name` is used for the tokenizer too.

    Returns:
        (tokenizer, model) — the model has been moved to the global `device`
        and switched to eval mode.
    """
    tokenizer_name = model_name if tokenizer_name is None else tokenizer_name

    print(f"Loading Model: {model_name}")
    print(f"Loading Tokenizer: {tokenizer_name}")

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    # Place the weights on the selected device right after loading.
    model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
    model.eval()
    return tokenizer, model

# --- Helper for Inference (CRITICAL for Manual Loading) ---
def predict_batch(texts, tokenizer, model):
    """
    Classify a list of texts and return their class probabilities.

    The texts are tokenized together (truncated to 512 tokens, padded to the
    longest sequence in the batch), moved to the global `device`, run through
    `model` without gradient tracking, and the logits are turned into
    probabilities with a softmax over the last axis.

    Returns:
        A numpy array of probabilities, one row per input text.

    NOTE: a later cell of this notebook defines `predict_batch` again with the
    same behaviour; when the notebook runs top to bottom, that later
    definition is the one in effect.
    """
    encoded = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )

    # The model already lives on `device`; its inputs have to follow.
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        logits = model(**on_device).logits

    # numpy needs the data back on the CPU.
    return F.softmax(logits, dim=-1).cpu().numpy()

# --------------------------
# 2) Initialize Models
# --------------------------

# A. Climate Detector (model and tokenizer come from the same checkpoint name)
det_name = "climatebert/distilroberta-base-climate-detector"
det_tokenizer, det_model = load_climatebert_model(det_name)

# B. Transition-Physical
# The TP model is deliberately paired with the detector's tokenizer instead of
# one loaded under its own name (the original note calls this "the fix";
# presumably the TP checkpoint's own tokenizer could not be used — TODO confirm).
tp_model_name = "climatebert/transition-physical"
tp_tokenizer_name = "climatebert/distilroberta-base-climate-detector"
tp_tokenizer, tp_model = load_climatebert_model(tp_model_name, tp_tokenizer_name)

# Label Map
# NOTE(review): assumed meaning of the TP model's output columns — verify against
# the model's own config. The inference loop reads columns 0 and 2 directly and
# does not reference this dict in the cells visible here.
tp_label_map = {0: "transition", 1: "none", 2: "physical"}

# C. Climate Specificity (model and tokenizer come from the same checkpoint name)
spec_name = "climatebert/distilroberta-base-climate-specificity"
spec_tokenizer, spec_model = load_climatebert_model(spec_name)

# NOTE(review): the message says "GPU" unconditionally, although `device`
# falls back to the CPU when CUDA is unavailable.
print("✅ All models loaded on GPU. Ready for loop.")

Using device: cuda
Loading Model: climatebert/distilroberta-base-climate-detector
Loading Tokenizer: climatebert/distilroberta-base-climate-detector


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/887 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/329M [00:00<?, ?B/s]

Loading Model: climatebert/transition-physical
Loading Tokenizer: climatebert/distilroberta-base-climate-detector


config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

Loading Model: climatebert/distilroberta-base-climate-specificity
Loading Tokenizer: climatebert/distilroberta-base-climate-specificity


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/329M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/895 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/329M [00:00<?, ?B/s]

✅ All models loaded on GPU. Ready for loop.


In [None]:
# --------------------------
# 2) Helper function for inference (the pipeline was doing that before)
# --------------------------

def predict_batch(texts, tokenizer, model, batch_size=32):
    """
    Run classification inference on a list of texts (chunks) using the model directly.

    Args:
        texts: a list of strings (a single string is treated as a one-item list).
        tokenizer: callable tokenizer; called with padding=True, truncation=True,
            max_length=512 and return_tensors="pt", and expected to return a
            mapping of tensors.
        model: model whose forward output exposes `.logits`.
        batch_size: maximum number of texts per forward pass, so that a section
            split into hundreds of chunks does not have to fit on the device
            at once. None or a value <= 0 sends everything in a single pass.

    Returns:
        numpy.ndarray with one row of softmax probabilities per input text
        (rows in input order). For empty input an array with zero rows is
        returned.
    """
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)

    if not texts:
        # Nothing to score: keep the 2-D shape so callers can still test len().
        num_labels = getattr(getattr(model, "config", None), "num_labels", 0)
        return np.zeros((0, num_labels), dtype=np.float32)

    # Send inputs to wherever the model's weights actually are, instead of
    # relying on a notebook-level global.
    try:
        target_device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        target_device = torch.device("cpu")

    step = len(texts) if not batch_size or batch_size <= 0 else int(batch_size)

    parts = []
    for start in range(0, len(texts), step):
        # padding=True pads to the longest sequence of this mini-batch;
        # truncation=True keeps inputs within the 512-token limit.
        inputs = tokenizer(
            texts[start:start + step],
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )
        inputs = {k: v.to(target_device) for k, v in inputs.items()}

        # No gradient tracking is needed for inference.
        with torch.no_grad():
            outputs = model(**inputs)

        # Logits -> probabilities, then back to the CPU for numpy.
        parts.append(F.softmax(outputs.logits, dim=-1).cpu().numpy())

    return np.concatenate(parts, axis=0)

# --------------------------
# 3) Utility: parse filename
# --------------------------

def parse_filename_meta(path: Path):
    """
    Read filing metadata out of a file name.

    The name must start with "<YYYYMMDD>_<FORM>_edgar_data_<CIK>_", where FORM
    consists of digits, upper-case letters and hyphens.

    Returns:
        A dict with "cik" (str), "form" (str), "filing_date" ("YYYY-MM-DD")
        and "year" (int). All four values are None when the name does not
        follow the expected layout.
    """
    match = re.match(
        r'(?P<date>\d{8})_(?P<form>[0-9A-Z\-]+)_edgar_data_(?P<cik>\d+)_',
        path.name,
    )
    if match is None:
        return dict.fromkeys(("cik", "form", "filing_date", "year"))

    raw_date = match["date"]
    yyyy, mm, dd = raw_date[:4], raw_date[4:6], raw_date[6:]
    return {
        "cik": match["cik"],
        "form": match["form"],
        "filing_date": f"{yyyy}-{mm}-{dd}",
        "year": int(yyyy),
    }

def extract_item_1a(text: str):
    """
    Pull the Item 1A ("Risk Factors") section out of a filing's full text.

    How it works:
    - A candidate starts at "Item 1A", optional separators (. : - whitespace)
      and the words "Risk Factors"; requiring the title filters out many bare
      references. It runs, non-greedily, up to the next "Item 1B" or "Item 2"
      header (the trailing \\b keeps "Item 20" from counting as "Item 2").
    - Matching is case-insensitive and spans newlines (re.IGNORECASE | re.DOTALL flags).
    - All non-overlapping candidates are collected and the longest one is kept,
      since a table-of-contents entry is short while the real section is long.
    - A longest candidate under 1000 characters is considered not to be a real
      section.

    Returns:
        The section body with surrounding whitespace stripped, or None when no
        candidate is found or the best one is too short.
    """
    heading = r'ITEM\s+1A[\.\:\-\s]*Risk\s+Factors'
    next_heading = r'(ITEM\s+1B|ITEM\s+2)\b'

    section_re = re.compile(
        f"({heading})(.*?)({next_heading})",
        re.IGNORECASE | re.DOTALL
    )

    # Group 2 is the text between the heading and the next header.
    bodies = [hit.group(2) for hit in section_re.finditer(text)]
    if not bodies:
        return None

    longest = max(bodies, key=len)
    return longest.strip() if len(longest) >= 1000 else None

In [None]:
# N_TEST = 15

# if len(all_files) <= N_TEST:
#     sample_files = all_files
# else:
#     sample_files = random.sample(all_files, N_TEST)

# len(sample_files)

15

In [None]:

# --------------------------
# Configuration & Setup
# --------------------------
BUCKET_NAME = "tenk-bucket"   # GCS bucket that receives the result files
CHECKPOINT_SIZE = 1000        # rows buffered before a checkpoint file is written

# Run folder to continue: use the run_id of the folder that already holds your
# checkpoints (e.g. "20241207_143000"), or None to begin a brand-new run.
RESUME_RUN_ID = "20251205_091311"

# A new run gets a timestamp-based id; a resumed run reuses the given one.
run_id = RESUME_RUN_ID or datetime.now().strftime("%Y%m%d_%H%M%S")
print(f"🔄 Attempting to resume run: {run_id}" if RESUME_RUN_ID else f"🚀 Starting NEW run: {run_id}")

OUTPUT_DIR = f"gs://{BUCKET_NAME}/climatebert_results"
save_path_base = f"{OUTPUT_DIR}/{run_id}"
print(f"📂 Results path: {save_path_base}")

# --------------------------
# Sliding Window Helper
# --------------------------
CHUNK_SIZE = 500  # window length, in tokens (special tokens not counted)
STRIDE = 100      # tokens shared by two consecutive windows (step = CHUNK_SIZE - STRIDE)

def get_sliding_windows(text, tokenizer):
    """
    Cut a long text into overlapping token windows and return them as strings.

    The text is tokenized once without special tokens. When it has at most
    CHUNK_SIZE tokens the original text is returned unchanged as the only
    window. Otherwise windows of CHUNK_SIZE tokens are taken every
    CHUNK_SIZE - STRIDE tokens and decoded back to text; the window that
    reaches the end of the token sequence is the last one.

    Returns:
        A list of strings (always at least one element).
    """
    token_ids = tokenizer(text, add_special_tokens=False, return_tensors="pt")["input_ids"][0]
    n_tokens = len(token_ids)
    if n_tokens <= CHUNK_SIZE:
        return [text]

    step = CHUNK_SIZE - STRIDE
    pieces = []
    for start in range(0, n_tokens, step):
        stop = start + CHUNK_SIZE
        pieces.append(tokenizer.decode(token_ids[start:stop], skip_special_tokens=True))
        # This window already covers the tail; a further one would only repeat it.
        if stop >= n_tokens:
            break
    return pieces

def resume_analysis(all_files, checkpoint_files=None):
    """
    Work out where a run should resume, based on the checkpoints already written.

    Checkpoints are named ``batch_<N>.parquet`` or ``batch_final_<N>.parquet``.
    The largest <N> found is taken as the number of leading entries of
    `all_files` that are already covered, and the rest of the list is returned.

    Args:
        all_files: ordered sequence of input files. It must be in the same
            order as in the run that wrote the checkpoints, because resuming
            is a positional slice.
        checkpoint_files: optional iterable of checkpoint names/URLs. When
            None (default) the names are listed with
            ``gsutil ls {save_path_base}/batch_*.parquet``.

    Returns:
        (files_remaining, last_processed_count). When no usable checkpoint is
        found this is (all_files, 0).
    """
    import subprocess  # local import: only this helper needs it

    if checkpoint_files is None:
        print("Checking GCS for checkpoints...")
        pattern = f"{save_path_base}/batch_*.parquet"
        try:
            # Argument list (no shell), so the wildcard reaches gsutil untouched.
            listing = subprocess.run(
                ["gsutil", "ls", pattern], capture_output=True, text=True
            )
        except OSError as e:
            # gsutil missing or not executable: best effort, treat as "no checkpoints".
            print(f"   Could not run gsutil: {e}")
            checkpoint_files = []
        else:
            checkpoint_files = listing.stdout.splitlines()
            if listing.returncode != 0:
                # A non-zero exit is expected when nothing matches (new run), but it
                # can also hide a real failure, so surface gsutil's message.
                print(f"   gsutil ls exited with {listing.returncode}: {listing.stderr.strip()}")

    # Drop blank lines left over from the listing.
    checkpoint_files = [f.strip() for f in checkpoint_files if f.strip() != '']

    if len(checkpoint_files) == 0:
        print("   No checkpoints found. Starting from scratch.")
        return all_files, 0

    # .../batch_1000.parquet -> 1000 ; .../batch_final_73799.parquet -> 73799
    name_re = re.compile(r'batch_(?:final_)?(\d+)\.parquet$')
    batch_sizes = []
    for f in checkpoint_files:
        hit = name_re.search(f)
        if hit:
            batch_sizes.append(int(hit.group(1)))

    if not batch_sizes:
        print("   No valid batch numbers found. Starting from scratch.")
        return all_files, 0

    # The highest count marks how far into all_files the previous sessions got.
    last_processed_count = max(batch_sizes)

    if last_processed_count > len(all_files):
        print(f"⚠️ Checkpoint count {last_processed_count} exceeds the {len(all_files)} files supplied; "
              "the file list may differ from the one used by the original run.")

    files_remaining = all_files[last_processed_count:]

    print(f"✅ Found checkpoints up to file count: {last_processed_count}")
    print(f"   Resuming. Files remaining to process: {len(files_remaining)}")

    return files_remaining, last_processed_count

# --------------------------
# Main Processing Loop
# --------------------------

# Work out which files are still pending. `all_files` must be the same ordered
# list used by the run being resumed, because resuming slices it by position.
files_to_process, total_processed_count = resume_analysis(all_files)

rows = []          # result rows accumulated since the last successful checkpoint
failed_paths = []  # files whose processing raised; kept for inspection after the run
main_tokenizer = det_tokenizer  # tokenizer used to cut Item 1A into windows

# Position in `all_files` at which this session starts. Checkpoint files are
# labelled with the number of input files consumed so far (offset + files seen
# in this session), which is exactly what resume_analysis() slices on. Labelling
# by row count instead drifts as soon as a file errors out (no row is written
# for it) or a save attempt fails.
resume_offset = total_processed_count


def _max_prob(probs, column):
    """Highest probability in `column` over all chunks, as a Python float (0.0 if there are no chunks)."""
    # float() keeps the stored values double precision, independent of the model's dtype.
    return float(probs[:, column].max()) if len(probs) > 0 else 0.0


for i, path in enumerate(tqdm(files_to_process, desc="ClimateBERT Inference")):
    meta = parse_filename_meta(path)
    try:
        text = path.read_text(errors="ignore")
        item_1a = extract_item_1a(text)

        if not item_1a:
            # No usable Item 1A section: keep a placeholder row for the filing.
            rows.append({**meta, "item_1a_len": 0, "chunks_processed": 0,
                         "p_det": None, "p_phys": None, "p_trans": None, "p_spec_high": None})
        else:
            # --- STEP 1: CHUNK ---
            chunks = get_sliding_windows(item_1a, main_tokenizer)

            # --- STEP 2: BATCH INFERENCE ---
            # NOTE(review): the column indices below assume each model's label
            # order (detector: 1 = climate-related; TP: 0 = transition,
            # 2 = physical; specificity: 1 = specific) — verify against the
            # models' configs.
            det_probs = predict_batch(chunks, det_tokenizer, det_model)
            tp_probs = predict_batch(chunks, tp_tokenizer, tp_model)
            spec_probs = predict_batch(chunks, spec_tokenizer, spec_model)

            # --- STEP 3: AGGREGATE (max over chunks) ---
            rows.append({
                **meta,
                "item_1a_len": len(item_1a),
                "chunks_processed": len(chunks),
                "p_det": _max_prob(det_probs, 1),
                "p_phys": _max_prob(tp_probs, 2),
                "p_trans": _max_prob(tp_probs, 0),
                "p_spec_high": _max_prob(spec_probs, 1),
            })

    except Exception as e:
        # Best effort: report, remember the file, and move on to the next one.
        print(f"Error on {path}: {e}")
        failed_paths.append(path)

    # Number of entries of `all_files` handled so far, including failed ones.
    files_consumed = resume_offset + i + 1

    # --------------------------
    # CHECKPOINTING LOGIC
    # --------------------------
    if len(rows) >= CHECKPOINT_SIZE:
        df_batch = pd.DataFrame(rows)
        filename = f"{save_path_base}/batch_{files_consumed}.parquet"

        try:
            df_batch.to_parquet(filename, index=False)
        except Exception as e:
            # Rows are kept and the save is retried after the next file.
            print(f"⚠️ Failed to save checkpoint: {e}")
        else:
            print(f"✅ Checkpoint saved: {filename}")
            # Advance the counter only once the data is safely written.
            total_processed_count = files_consumed
            rows = []

# --------------------------
# Final Save
# --------------------------
total_files_seen = resume_offset + len(files_to_process)
if rows:
    df_final = pd.DataFrame(rows)
    filename = f"{save_path_base}/batch_final_{total_files_seen}.parquet"
    try:
        df_final.to_parquet(filename, index=False)
    except Exception as e:
        print(f"⚠️ Failed to save final batch: {e}")
    else:
        total_processed_count = total_files_seen
        print(f"🏁 Final batch saved: {filename}")

if failed_paths:
    print(f"⚠️ {len(failed_paths)} file(s) raised errors and were skipped; see `failed_paths`.")

print("Processing complete.")

🔄 Attempting to resume run: 20251205_091311
📂 Results path: gs://tenk-bucket/climatebert_results/20251205_091311
Checking GCS for checkpoints...
✅ Found checkpoints up to file count: 71000
   Resuming. Files remaining to process: 2799


ClimateBERT Inference:   0%|          | 0/2799 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (31535 > 512). Running this sequence through the model will result in indexing errors


✅ Checkpoint saved: gs://tenk-bucket/climatebert_results/20251205_091311/batch_72000.parquet
✅ Checkpoint saved: gs://tenk-bucket/climatebert_results/20251205_091311/batch_73000.parquet
🏁 Final batch saved: gs://tenk-bucket/climatebert_results/20251205_091311/batch_final_73799.parquet
Processing complete.
