# PFΔ Dataset Quick Start Guide
This notebook demonstrates how to initialize and explore the `PFDeltaDataset`.

In [None]:
import sys
# NOTE(review): machine-specific absolute path. Point PATH at your own
# checkout of the repository before running; presumably this is the directory
# that contains the `core` package imported below — confirm.
PATH = '/mnt/home/donti-group-shared/pfdelta_neurips'
# Extend the module search path so the project-local imports below can resolve.
sys.path.append(PATH)

# Import required libraries
# NOTE(review): InMemoryDataset, the canos_* helpers, the *_stats helpers and
# `partial` are not referenced by any cell visible in this notebook; they are
# kept in case later cells rely on them.
from torch_geometric.data import InMemoryDataset
from core.datasets.pfdelta_dataset import PFDeltaDataset
from core.datasets.dataset_utils import (
    canos_pf_data_mean0_var1,
    canos_pf_slack_mean0_var1,
)
from core.datasets.data_stats import canos_pfdelta_stats, pfnet_pfdata_stats
from functools import partial

### 1. Initialize Dataset Class & Extract HuggingFace Data

In [7]:
# Build the case14 training split for benchmark Task 1.3.
# On a first run this downloads and processes the raw files, which takes
# several minutes (see the recorded log below); later runs load the cached
# processed file instead.
dataset = PFDeltaDataset(
    root_dir="data",              # Root directory for dataset storage
    case_name="case14",           # Power system case (14-bus system)
    perturbation="n",             # Grid contingency: "n", "n-1", or "n-2"
    feasibility_type="feasible",  # "feasible", "near infeasible", or "approaching infeasible"
    n_samples=-1,                 # -1 loads all available samples
    split="train",                # "train", "val", "test", or "all"
    model="CANOS",                # Model identifier for processed file naming
    task=1.3,                     # Benchmark task (1.1-4.3) or "analysis"
    add_bus_type=False,           # Include bus-type-specific node sets
    force_reload=False,           # Force reprocessing of data
)

# Summarise what was loaded, one fact per output line.
print(
    "Dataset initialized successfully!",
    f"Number of samples: {len(dataset)}",
    f"Task: {dataset.task}",
    f"Case: {dataset.case_name}",
    f"Split: {dataset.split}",
    sep="\n",
)


Downloading files for task 1.3...
Downloading shuffle files...


Downloading https://huggingface.co/datasets/pfdelta/pfdelta/resolve/main/shuffle_files.tar
Extracting data/shuffle_files.tar
Downloading https://huggingface.co/datasets/pfdelta/pfdelta/resolve/main/case14.tar.gz


Downloading case14 data from https://huggingface.co/datasets/pfdelta/pfdelta/resolve/main/case14.tar.gz ...


Extracting data/case14.tar.gz
Processing...


Extracted case14 data to data
Processing combined data for task 1.3
Processing split: CANOS 1.3 n train (16200 files)


Building train data: 100%|██████████| 16200/16200 [01:56<00:00, 139.37it/s]


Processing split: CANOS 1.3 n val (1800 files)


Building val data: 100%|██████████| 1800/1800 [00:13<00:00, 135.80it/s]


Processing split: CANOS 1.3 n test (2000 files)


Building test data: 100%|██████████| 2000/2000 [00:11<00:00, 174.70it/s]


Processing split: CANOS 1.3 n-1 train (16200 files)


Building train data: 100%|██████████| 16200/16200 [02:12<00:00, 122.39it/s]


Processing split: CANOS 1.3 n-1 val (1800 files)


Building val data: 100%|██████████| 1800/1800 [00:14<00:00, 121.92it/s]


Processing split: CANOS 1.3 n-1 test (2000 files)


Building test data: 100%|██████████| 2000/2000 [00:17<00:00, 111.47it/s]


Processing split: CANOS 1.3 n-2 train (16200 files)


Building train data: 100%|██████████| 16200/16200 [02:31<00:00, 107.15it/s]


Processing split: CANOS 1.3 n-2 val (1800 files)


Building val data: 100%|██████████| 1800/1800 [00:16<00:00, 110.68it/s]


Processing split: CANOS 1.3 n-2 test (2000 files)


Building test data: 100%|██████████| 2000/2000 [00:19<00:00, 102.97it/s]


Processing split: CANOS 1.3 n near infeasible test (200 files)


Building test data: 100%|██████████| 200/200 [00:01<00:00, 153.16it/s]


Processing split: CANOS 1.3 n-1 near infeasible test (200 files)


Building test data: 100%|██████████| 200/200 [00:01<00:00, 135.01it/s]


Processing split: CANOS 1.3 n-2 near infeasible test (200 files)


Building test data: 100%|██████████| 200/200 [00:01<00:00, 114.74it/s]


Collating combined train data with 48600 samples
Saved combined train data with 48600 samples
Collating combined val data with 5400 samples
Saved combined val data with 5400 samples
Collating combined test data with 6600 samples
Saved combined test data with 6600 samples


Done!


Loading train dataset from data/processed/combined_task_1.3_CANOS_case14/train.pt
Dataset initialized successfully!
Number of samples: 48600
Task: 1.3
Case: case14
Split: train


### 2. Access and Print a Data Sample

In [9]:
# Index the dataset to pull out its first graph. Leaving the bare name as the
# last expression makes Jupyter render the object's repr as the cell output.
sample = dataset[0]
sample

HeteroData(
  bus={
    x=[14, 2],
    y=[14, 2],
    bus_gen=[14, 2],
    bus_demand=[14, 2],
    bus_voltages=[14, 2],
    bus_type=[14],
    shunt=[14, 2],
    limits=[14, 2],
  },
  gen={
    limits=[5, 4],
    generation=[5, 2],
    slack_gen=[5],
  },
  load={ demand=[11, 2] },
  (bus, branch, bus)={
    edge_index=[2, 20],
    edge_attr=[20, 8],
    edge_label=[20, 4],
    edge_limits=[20, 1],
  },
  (gen, gen_link, bus)={ edge_index=[2, 5] },
  (bus, gen_link, gen)={ edge_index=[2, 5] },
  (load, load_link, bus)={ edge_index=[2, 11] },
  (bus, load_link, load)={ edge_index=[2, 11] }
)

In [8]:
sample = dataset[0]

# Section banner: a 70-character rule above and below the title
banner = "=" * 70
print("\n" + banner)
print("SAMPLE DATA STRUCTURE")
print(banner)

# Concrete Python class of the sample
print(f"\nData type: {type(sample)}")

# Only heterogeneous graphs expose node/edge type listings; skip otherwise
if hasattr(sample, "node_types"):
    print(f"\nNode types: {sample.node_types}")
    print(f"Edge types: {sample.edge_types}")

    # Shape of the feature matrix `x`, for every node type that carries one
    print("\n--- Node Features ---")
    for node_type in sample.node_types:
        node_store = sample[node_type]
        if hasattr(node_store, "x"):
            print(f"{node_type}: {node_store.x.shape}")

    # Shape of the connectivity tensor per edge type, plus the edge
    # attribute shape where the edge type has attributes
    print("\n--- Edge Information ---")
    for edge_type in sample.edge_types:
        edge_store = sample[edge_type]
        edge_index = edge_store.edge_index
        print(f"{edge_type}: {edge_index.shape} edges")
        if hasattr(edge_store, "edge_attr"):
            print(f"  Edge attributes: {edge_store.edge_attr.shape}")

# Attribute names reported by the sample, one per line
print("\n--- Available Attributes ---")
if hasattr(sample, "keys"):
    for key in sample.keys():
        print(f"- {key}")


SAMPLE DATA STRUCTURE

Data type: <class 'torch_geometric.data.hetero_data.HeteroData'>

Node types: ['bus', 'gen', 'load']
Edge types: [('bus', 'branch', 'bus'), ('gen', 'gen_link', 'bus'), ('bus', 'gen_link', 'gen'), ('load', 'load_link', 'bus'), ('bus', 'load_link', 'load')]

--- Node Features ---
bus: torch.Size([14, 2])

--- Edge Information ---
('bus', 'branch', 'bus'): torch.Size([2, 20]) edges
  Edge attributes: torch.Size([20, 8])
('gen', 'gen_link', 'bus'): torch.Size([2, 5]) edges
('bus', 'gen_link', 'gen'): torch.Size([2, 5]) edges
('load', 'load_link', 'bus'): torch.Size([2, 11]) edges
('bus', 'load_link', 'load'): torch.Size([2, 11]) edges

--- Available Attributes ---
- y
- limits
- bus_gen
- edge_index
- edge_limits
- bus_demand
- bus_type
- edge_label
- generation
- slack_gen
- shunt
- demand
- x
- bus_voltages
- edge_attr


### 3. Custom PFDeltaDataset Preprocessing

`PFDeltaDataset` can be subclassed and tailored for model-specific data preprocessing. To create a custom dataset variant, you can:

- Override `__init__` to add custom initialization logic (e.g., model-specific transforms or normalization)

- Override `build_heterodata` to implement custom graph construction or preprocessing steps

Below is a simplified example for the CANOS architecture. Note that the actual CANOS implementation includes additional normalization transforms using case-specific statistics—this example focuses on the core graph pruning logic for clarity.


In [10]:
class PFDeltaCANOS(PFDeltaDataset):
    """
    Simplified PFDelta dataset variant for the CANOS model.

    Every graph is first built by the parent class and then pruned so that
    only the node types bus, PV, PQ, and slack (and the edges whose two
    endpoints are among them) remain.

    NOTE(review): judging by the recorded outputs in this notebook, the
    PV/PQ/slack node types are only present when the dataset is built with
    ``add_bus_type=True`` (the default here). With ``add_bus_type=False`` the
    pruning would presumably leave only the ``bus`` nodes — confirm before
    changing that flag.
    """

    def __init__(
        self,
        root_dir="data",
        case_name="",
        split="train",
        model="CANOS",
        task=1.1,
        add_bus_type=True,
        force_reload=False,
        **kwargs,
    ):
        """
        Forward the configuration to ``PFDeltaDataset`` using CANOS defaults.

        Parameters
        ----------
        root_dir : str
            Root directory for dataset storage
        case_name : str
            Power system case name (e.g. "case14")
        split : str
            Dataset split to load
        model : str
            Model identifier
        task : float or str
            Benchmark task
        add_bus_type : bool
            Whether to include bus-type-specific node sets
        force_reload : bool
            Whether to force reprocessing of data
        **kwargs
            Any further keyword arguments accepted by ``PFDeltaDataset``
            (for example ``perturbation``, ``feasibility_type`` or
            ``n_samples``). They are forwarded unchanged, so this subclass
            does not hide options that the parent class supports.
        """
        # Initialize parent class with CANOS defaults
        super().__init__(
            root_dir=root_dir,
            case_name=case_name,
            split=split,
            model=model,
            task=task,
            add_bus_type=add_bus_type,
            force_reload=force_reload,
            **kwargs,
        )

    def build_heterodata(self, pm_case: dict, is_cpf_sample: bool = False):
        """
        Build a CANOS-compatible HeteroData graph with pruned node types.

        Parameters
        ----------
        pm_case : dict
            PowerModels.jl case dictionary with bus, branch, gen, and load data
        is_cpf_sample : bool
            Whether this is a continuation power flow sample; passed through
            unchanged to the parent implementation

        Returns
        -------
        data : HeteroData
            Processed graph with only bus, PV, PQ, and slack nodes
        """
        # Build the full heterogeneous graph using parent method
        data = super().build_heterodata(pm_case, is_cpf_sample=is_cpf_sample)

        # Prune to keep only CANOS-required node types
        keep_nodes = {"bus", "PV", "PQ", "slack"}

        # Remove unwanted node types. Iterate over a list() snapshot because
        # deleting entries changes the collection being listed.
        for node_type in list(data.node_types):
            if node_type not in keep_nodes:
                del data[node_type]

        # Remove edges that touch a deleted node type on either end; they are
        # dropped explicitly here rather than relying on the node deletion
        # above to do it (same snapshot reasoning as above).
        for edge_type in list(data.edge_types):
            src, _, dst = edge_type
            if src not in keep_nodes or dst not in keep_nodes:
                del data[edge_type]

        return data

In [11]:
# Build the CANOS variant of the dataset for benchmark Task 1.1
dataset_canos = PFDeltaCANOS(
    root_dir="data",      # Root directory for dataset storage
    case_name="case14",   # Power system case (14-bus system)
    split="train",        # "train", "val", or "test"
    model="CANOS",        # Model identifier
    task=1.1,             # Benchmark task
    add_bus_type=True,    # Include bus type encodings
    force_reload=False,   # Force reprocessing of data
)

# Summarise what was loaded, one fact per output line
print(
    "CANOS Dataset initialized successfully!",
    f"Number of samples: {len(dataset_canos)}",
    f"Task: {dataset_canos.task}",
    f"Case: {dataset_canos.case_name}",
    f"Split: {dataset_canos.split}",
    sep="\n",
)

# Take the first graph and look at what survived the pruning
sample = dataset_canos[0]

print("\nHeteroData Structure (CANOS-pruned):")
print(f"Node types: {sample.node_types}")
print(f"Edge types: {sample.edge_types}")

# Per node type: number of nodes and feature width, read from the shape of `x`
print("\n--- Node Information ---")
for node_type in sample.node_types:
    node_store = sample[node_type]
    if hasattr(node_store, "x"):
        x_shape = node_store.x.shape
        num_nodes = x_shape[0]
        num_features = x_shape[1]
        print(f"  {node_type:10s}: {num_nodes:3d} nodes, {num_features:3d} features")

# Per edge type: source, relation, destination and the number of edges
# (second dimension of edge_index)
print("\n--- Edge Connectivity ---")
for edge_type in sample.edge_types:
    src, relation, dst = edge_type
    edge_store = sample[edge_type]
    num_edges = edge_store.edge_index.shape[1]
    print(f"  {src:10s} --({relation})--> {dst:10s}: {num_edges} edges")

Processing...


Downloading files for task 1.1...
Shuffle files already exist. Skipping download.
case14 data already exists. Skipping download.
Processing combined data for task 1.1
Processing split: CANOS 1.1 n train (48600 files)


Building train data: 100%|██████████| 48600/48600 [06:54<00:00, 117.23it/s]


Processing split: CANOS 1.1 n val (5400 files)


Building val data: 100%|██████████| 5400/5400 [00:50<00:00, 107.71it/s]


Processing split: CANOS 1.1 n test (2000 files)


Building test data: 100%|██████████| 2000/2000 [00:14<00:00, 134.51it/s]


Processing split: CANOS 1.1 n-1 train (0 files)


Building train data: 0it [00:00, ?it/s]


Processing split: CANOS 1.1 n-1 val (0 files)


Building val data: 0it [00:00, ?it/s]


Processing split: CANOS 1.1 n-1 test (2000 files)


Building test data: 100%|██████████| 2000/2000 [00:21<00:00, 94.53it/s] 


Processing split: CANOS 1.1 n-2 train (0 files)


Building train data: 0it [00:00, ?it/s]


Processing split: CANOS 1.1 n-2 val (0 files)


Building val data: 0it [00:00, ?it/s]


Processing split: CANOS 1.1 n-2 test (2000 files)


Building test data: 100%|██████████| 2000/2000 [00:13<00:00, 153.73it/s]


Processing split: CANOS 1.1 n near infeasible test (200 files)


Building test data: 100%|██████████| 200/200 [00:01<00:00, 144.81it/s]


Processing split: CANOS 1.1 n-1 near infeasible test (200 files)


Building test data: 100%|██████████| 200/200 [00:01<00:00, 121.74it/s]


Processing split: CANOS 1.1 n-2 near infeasible test (200 files)


Building test data: 100%|██████████| 200/200 [00:01<00:00, 160.27it/s]


Collating combined train data with 48600 samples
Saved combined train data with 48600 samples
Collating combined val data with 5400 samples
Saved combined val data with 5400 samples
Collating combined test data with 6600 samples
Saved combined test data with 6600 samples
Loading train dataset from data/processed/combined_task_1.1_CANOS_case14/train.pt
CANOS Dataset initialized successfully!
Number of samples: 48600
Task: 1.1
Case: case14
Split: train

HeteroData Structure (CANOS-pruned):
Node types: ['bus', 'PQ', 'PV', 'slack']
Edge types: [('bus', 'branch', 'bus'), ('PV', 'PV_link', 'bus'), ('bus', 'PV_link', 'PV'), ('PQ', 'PQ_link', 'bus'), ('bus', 'PQ_link', 'PQ'), ('slack', 'slack_link', 'bus'), ('bus', 'slack_link', 'slack')]

--- Node Information ---
  bus       :  14 nodes,   2 features
  PQ        :   9 nodes,   2 features
  PV        :   4 nodes,   2 features
  slack     :   1 nodes,   2 features

--- Edge Connectivity ---
  bus        --(branch)--> bus       : 20 edges
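  PV         --(PV_link)--> bus       : 4 edges
  bus        --(PV_link)--> PV        : 4 edges
  PQ         --(PQ_link)--> bus       : 9 edges
  bus        --(PQ_link)--> PQ        : 9 edges
  slack      --(slack_link)--> bus       : 1 edges
  bus        --(slack_link)--> slack     : 1 edges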

Done!


In [12]:
# Display the full repr of `sample`, which the previous cell set to
# dataset_canos[0] (the CANOS-pruned graph).
sample

HeteroData(
  bus={
    x=[14, 2],
    y=[14, 2],
    bus_gen=[14, 2],
    bus_demand=[14, 2],
    bus_voltages=[14, 2],
    bus_type=[14],
    shunt=[14, 2],
    limits=[14, 2],
  },
  PQ={
    x=[9, 2],
    y=[9, 2],
  },
  PV={
    x=[4, 2],
    y=[4, 2],
    generation=[4, 2],
    demand=[4, 2],
  },
  slack={
    x=[1, 2],
    y=[1, 2],
    generation=[1, 2],
    demand=[1, 2],
  },
  (bus, branch, bus)={
    edge_index=[2, 20],
    edge_attr=[20, 8],
    edge_label=[20, 4],
    edge_limits=[20, 1],
  },
  (PV, PV_link, bus)={ edge_index=[2, 4] },
  (bus, PV_link, PV)={ edge_index=[2, 4] },
  (PQ, PQ_link, bus)={ edge_index=[2, 9] },
  (bus, PQ_link, PQ)={ edge_index=[2, 9] },
  (slack, slack_link, bus)={ edge_index=[2, 1] },
  (bus, slack_link, slack)={ edge_index=[2, 1] }
)