In [None]:
# BGH Gender Counterfactuals - Main Data Pipeline
# This notebook orchestrates the complete data processing pipeline for creating
# the BGH Gender Counterfactuals dataset from raw legal documents

# Import project configuration settings
from src.common import config

## 1. Scraping

In [None]:
# Import scraping module for data collection from BGH (Federal Court of Justice) website
import src.scraping as scr

In [None]:
# Step 1: Scrape document IDs from the BGH website.
# Per the original pipeline notes this identifies the civil appeals cases that
# are available for download; the actual logic lives in `src.scraping` and is
# not visible in this notebook.
# The call is awaited at cell level, so `scrape_ids()` must return an awaitable
# and the cell relies on Jupyter/IPython's top-level `await` support.
# Binding the result to `_` keeps the notebook from rendering it as cell output.
_ = await scr.scrape_ids()

In [None]:
# Step 2: Download the actual legal documents.
# Per the original pipeline notes this fetches one PDF per identified case from
# the BGH website (implementation in `src.scraping`, not visible here).
# Presumably depends on the IDs collected in step 1 — confirm in the module.
# NOTE(review): unlike the other pipeline steps the result is not bound to `_`,
# so a non-None return value would be rendered as this cell's output.
await scr.download_docs()

In [None]:
# Step 3: Extract text content from the PDF documents.
# Per the original pipeline notes this converts the downloaded PDFs into
# machine-readable text for the later stages.
# This is a plain (non-awaited) call, in contrast to the scraping/download steps.
# Binding the result to `_` keeps the notebook from rendering it as cell output.
_ = scr.extract_text()

In [None]:
# Step 4: Parse documents into a structured format.
# Per the original pipeline notes this extracts the key components (facts,
# legal reasoning, decisions) from the raw text; the parsing rules live in
# `src.scraping` and are not visible here.
# Plain synchronous call; the result is bound to `_` so it is not displayed.
_ = scr.parse_docs()

## 2. Labeling

In [None]:
# Import labeling module for automated case classification
from src.labeling import label_docs

In [None]:
# Automatically label the legal documents with their case outcome.
# Per the original pipeline notes an LLM classifies each decision as "upheld"
# or "reversed" based on the case content (implementation in `src.labeling`,
# not visible here).
# Awaited at cell level, so `label_docs()` must return an awaitable; the result
# is bound to `_` so the notebook does not render it.
_ = await label_docs()

## 3. Augmentation

In [None]:
# Import augmentation module for creating gender counterfactuals
from src.augmentation import create_augmentations

In [None]:
# Generate gender-swapped versions of the legal case facts.
# Per the original pipeline notes this creates counterfactuals by swapping
# gender-specific language, which enables bias detection by comparing model
# predictions on the original vs. the swapped version (implementation in
# `src.augmentation`, not visible here).
# Awaited at cell level, so `create_augmentations()` must return an awaitable;
# the result is bound to `_` so the notebook does not render it.
_ = await create_augmentations()

## 4. Train and Test Sets

In [None]:
# Import libraries for dataset creation and train/test splitting
import pandas as pd  # Data manipulation
from sklearn.model_selection import train_test_split  # Stratified data splitting
from datasets import Dataset, DatasetDict  # HuggingFace dataset format

In [None]:
# Read the augmented dataset (JSON Lines: one record per line) from the path
# configured as `config.DOCS_AUGMENTED_JSONL`. Per the pipeline notes it holds
# both the original and the gender-swapped case facts.
# Rows are ordered by "id" so the frame has the same row order on every run.
df = (
    pd.read_json(config.DOCS_AUGMENTED_JSONL, lines=True)
    .sort_values("id")
)

In [None]:
# Initial hold-out split: one third of the rows become `test`, the remaining
# two thirds become `train_unbalanced`.
# Stratifying on the "decision" column keeps the label proportions (e.g.
# "upheld" vs "reversed") the same in both parts.
# Rows are shuffled before splitting; the fixed seed makes the split reproducible.
train_unbalanced, test = train_test_split(
    df,
    test_size=1 / 3,
    shuffle=True,
    random_state=42,
    stratify=df["decision"],
)

In [None]:
# Balance the training split by undersampling: every decision class ends up
# with as many rows as the rarest class.
# `n` is the row count of the smallest "decision" class in `train_unbalanced`.
n = train_unbalanced["decision"].value_counts().min()

# Draw exactly `n` rows from each decision class (fixed seed) ...
balanced_by_class = train_unbalanced.groupby("decision").sample(n=n, random_state=42)

# ... then permute all rows (frac=1 keeps every row) with a fixed seed so the
# final training frame is in random order.
train = balanced_by_class.sample(frac=1, random_state=42)

In [None]:
# Give both splits a fresh sequential 0..len-1 index. `drop=True` discards the
# previous row labels instead of keeping them as an extra column.
train, test = (frame.reset_index(drop=True) for frame in (train, test))

In [None]:
# Wrap each pandas split in a HuggingFace `Dataset`.
train_dataset, test_dataset = (
    Dataset.from_pandas(split_frame) for split_frame in (train, test)
)

# Bundle both splits in a single `DatasetDict`, keyed by split name, so they
# can be accessed and saved together.
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})

In [None]:
# Persist the final `DatasetDict` in HuggingFace's on-disk format under
# `<config.DATA_DIR>/BGH-CivAppeals-GenderCF`, the name under which the
# dataset is published and used.
output_dir = config.DATA_DIR / "BGH-CivAppeals-GenderCF"
dataset.save_to_disk(output_dir)