# Create the failure dataset at hourly resolution

In [1]:
import importlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import itertools
import sys, os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '../src')))
import data_processing as dp
from zoneinfo import ZoneInfo


from collections import defaultdict

## Load dataset

In [2]:
# Load the raw event records through the project's data_processing module
# (the loader prints one progress line per year and a final row count).
events_df = dp.load_events()

Loading Events : 2013
Loading Events : 2014
Loading Events : 2015
Loading Events : 2016
Loading Events : 2017
Loading Events : 2018
Loading Events : 2019
Loading Events : 2020
Loading Events : 2021
Loading Events : 2022
Loading Events : 2023
Loading Events : 2024
Loaded 6005103 events


In [3]:
# Re-import data_processing so that edits made to the module since the first
# import are picked up without restarting the kernel.
importlib.reload(dp)

# Keep only the events whose cause code is listed below and drop the units
# located in the excluded regions; the fuel-related options are switched off.
filtered_events_df = dp.filter_events(
    events_df,
    CauseCodes=['U1', 'U2', 'U3', 'D1', 'D2', 'D3', 'SF'],
    filter_fuel=False,
    exclude_states=['Other', 'Mexico', 'South America'],
    include_states=None,
    add_fuel_failure=False,
)

Filtering events by cause codes...
Kept 839971 events out of 6005103 (13.99%) after filtering by cause codes.


Merging duplicated events: 100%|██████████| 839971/839971 [00:11<00:00, 72687.29it/s] 
Adding state and region: 100%|██████████| 834847/834847 [01:07<00:00, 12421.82it/s]


In [4]:
# Table of units with their operation window; the cells below read its
# 'UnitID', 'First_Operation_Date' and 'Last_Operation_Date' columns.
units_start_end = dp.get_units_start_end()

Loading Performances : 2013
Loading Performances : 2014
Loading Performances : 2015
Loading Performances : 2016
Loading Performances : 2017
Loading Performances : 2018
Loading Performances : 2019
Loading Performances : 2020
Loading Performances : 2021
Loading Performances : 2022
Loading Performances : 2023
Loading Performances : 2024
Loaded 1035652 events


100%|██████████| 9305/9305 [00:07<00:00, 1195.07it/s]


## Format transition data

In [5]:
# Mapping from U.S. state / Canadian province abbreviations to representative IANA time zones
STATE_TIMEZONE = {
    # --- U.S. ---
    "AL": "America/Chicago", "AK": "America/Anchorage", "AZ": "America/Phoenix",
    "AR": "America/Chicago", "CA": "America/Los_Angeles", "CO": "America/Denver",
    "CT": "America/New_York", "DE": "America/New_York", "DC": "America/New_York",
    "FL": "America/New_York", "GA": "America/New_York", "HI": "Pacific/Honolulu",
    "ID": "America/Boise", "IL": "America/Chicago", "IN": "America/Indiana/Indianapolis",
    "IA": "America/Chicago", "KS": "America/Chicago", "KY": "America/New_York",
    "LA": "America/Chicago", "ME": "America/New_York", "MD": "America/New_York",
    "MA": "America/New_York", "MI": "America/Detroit", "MN": "America/Chicago",
    "MS": "America/Chicago", "MO": "America/Chicago", "MT": "America/Denver",
    "NE": "America/Chicago", "NV": "America/Los_Angeles", "NH": "America/New_York",
    "NJ": "America/New_York", "NM": "America/Denver", "NY": "America/New_York",
    "NC": "America/New_York", "ND": "America/Chicago", "OH": "America/New_York",
    "OK": "America/Chicago", "OR": "America/Los_Angeles", "PA": "America/New_York",
    "RI": "America/New_York", "SC": "America/New_York", "SD": "America/Chicago",
    "TN": "America/Chicago", "TX": "America/Chicago", "UT": "America/Denver",
    "VT": "America/New_York", "VA": "America/New_York", "WA": "America/Los_Angeles",
    "WV": "America/New_York", "WI": "America/Chicago", "WY": "America/Denver",

    # --- Canada ---
    "AB": "America/Edmonton", "BC": "America/Vancouver", "MB": "America/Winnipeg",
    "NB": "America/Moncton", "NL": "America/St_Johns", "NS": "America/Halifax",
    "NT": "America/Yellowknife", "NU": "America/Iqaluit", "ON": "America/Toronto",
    "PE": "America/Halifax", "QC": "America/Montreal", "SK": "America/Regina",
    "YT": "America/Whitehorse",
}

# Load state abbreviations.
# Each non-empty line of the file is read as "<code> <name ...>": the first
# whitespace-separated token is the abbreviation, the rest is the state name.
# The resulting dict maps the state name to its abbreviation.
state_files = "../data/ghcn-states.txt"
state2abv = {}
with open(state_files, "r") as f:
    for line in f:
        fields = line.split()
        if not fields:
            # Blank line (e.g. a trailing newline at the end of the file):
            # nothing to parse, and indexing fields[0] would raise IndexError.
            continue
        # Names are stored upper-cased because the lookup in the transition
        # cells upper-cases the unit's state name before calling .get().
        state2abv[' '.join(fields[1:]).upper()] = fields[0]

In [6]:
def _record_hour(records, timestamp, previous_state, new_state, hours_so_far):
    """Append one hourly row to ``records`` and return the updated run length.

    Parameters
    ----------
    records : dict of lists with the keys "Datetime_local", "Initial_gen_state",
        "Final_gen_state" and "hours_in_state"; it is modified in place.
    timestamp : start of the hour being recorded.
    previous_state : state recorded for the previous hour.
    new_state : state of the unit during this hour.
    hours_so_far : run length recorded for the previous hour (may be NaN).

    Returns
    -------
    The number of consecutive hours spent in ``new_state`` up to and including
    this hour. It restarts at 1 whenever the state changes and otherwise grows
    by one, so back-to-back events of the same simplified state keep counting.
    A NaN run length stays NaN for as long as the state does not change.
    """
    hours = hours_so_far + 1 if new_state == previous_state else 1
    records["Datetime_local"].append(timestamp)
    records["Initial_gen_state"].append(previous_state)
    records["Final_gen_state"].append(new_state)
    records["hours_in_state"].append(hours)
    return hours


# Build one hourly transition table per unit. Each row is stamped with the
# start of an hour and holds the state in the previous hour, the state in this
# hour and the number of consecutive hours already spent in the current state.
transition_data_compressed = defaultdict(pd.DataFrame)
one_hour = pd.Timedelta(hours=1)

for unit_id, events_unit in tqdm(filtered_events_df.groupby('UnitID')):
    # Store the transitions for this unit
    unit_transitions_data = {"Datetime_local": [],  # at the start of each hour
                             "Initial_gen_state": [],
                             "Final_gen_state": [],
                             "hours_in_state": [],
                             }

    # Unit metadata
    unit_geo_state = events_unit["State"].iloc[0]
    unit_technology = events_unit["UnitTypeCodeName"].iloc[0]
    # str() keeps a missing (non-string) state from raising on .upper();
    # such a unit simply fails the lookup and is skipped below.
    state_abv = state2abv.get(str(unit_geo_state).upper(), None)
    tz_name = STATE_TIMEZONE.get(state_abv)

    if tz_name is None:
        print(f"⚠️ Unknown timezone for {unit_geo_state}, skipping unit {unit_id}")
        continue

    tz = ZoneInfo(tz_name)

    # Get start and end time for this unit (single mask, reused for both dates)
    unit_dates = units_start_end.loc[units_start_end['UnitID'] == unit_id]
    if unit_dates.empty:
        # Without an operation window there is nothing to fill; skip the unit
        # instead of failing on .values[0].
        print(f"⚠️ No operation dates for unit {unit_id}, skipping")
        continue
    start_time_unit_local = pd.Timestamp(unit_dates['First_Operation_Date'].values[0])
    end_time_unit_local = pd.Timestamp(unit_dates['Last_Operation_Date'].values[0])

    current_time_local = start_time_unit_local
    events_unit = events_unit.sort_values('EventStartDT')
    last_state = 'A'
    # Unknown until the first state change: it is not visible here how long
    # the unit had been in 'A' before its first recorded hour.
    hours_in_current_state = np.nan

    for _, event in events_unit.iterrows():
        # The event covers every hour it touches: start floored to the hour,
        # end floored to the hour plus one hour (exclusive bound).
        event_start_local = event['EventStartDT'].replace(minute=0, second=0, microsecond=0)
        event_end_local = event['EventEndDT'].replace(minute=0, second=0, microsecond=0) + one_hour
        state_during_event = event["EventTypeCode"]

        # Simplify event type
        if state_during_event.startswith('D'):
            state_during_event = 'D'
        elif state_during_event.startswith('U'):
            state_during_event = 'U'
        elif state_during_event == 'SF':
            if last_state != 'A':
                state_during_event = 'U'  # Startup failure treated as outage if the unit was not available before
            else:
                continue  # skip startup failure if the unit was available before

        # Record 'A' states until the event starts
        while current_time_local < event_start_local:
            hours_in_current_state = _record_hour(
                unit_transitions_data, current_time_local, last_state, 'A', hours_in_current_state
            )
            last_state = 'A'
            current_time_local += one_hour

        # Record states during the event. Hours already written for an earlier,
        # overlapping event are not rewritten because current_time_local only
        # moves forward.
        while current_time_local < event_end_local:
            hours_in_current_state = _record_hour(
                unit_transitions_data, current_time_local, last_state, state_during_event,
                hours_in_current_state
            )
            last_state = state_during_event
            current_time_local += one_hour

    # Fill 'A' states until end of operation (end bound is inclusive)
    while current_time_local <= end_time_unit_local:
        hours_in_current_state = _record_hour(
            unit_transitions_data, current_time_local, last_state, 'A', hours_in_current_state
        )
        last_state = 'A'
        current_time_local += one_hour

    # --- Create DataFrame and convert local → UTC ---
    unit_transitions_df = pd.DataFrame(unit_transitions_data)

    # Localize to the state timezone. Local hours that do not exist
    # (spring-forward) and local hours that are ambiguous (fall-back) both
    # become NaT and are dropped from the table.
    unit_transitions_df["Datetime_UTC"] = (
        pd.to_datetime(unit_transitions_df["Datetime_local"])
        .dt.tz_localize(tz, nonexistent="NaT", ambiguous="NaT")   # use .dt accessor
        .dt.tz_convert("UTC")
    )
    unit_transitions_df = unit_transitions_df.dropna(subset=["Datetime_UTC"])

    transition_data_compressed[unit_id] = {
        "metadata": {
            "UnitID": unit_id,
            "State": unit_geo_state,
            "Technology": unit_technology
        },
        "transitions": unit_transitions_df.reset_index(drop=True)
    }

100%|██████████| 7166/7166 [31:36<00:00,  3.78it/s]


In [None]:
# Build one hourly transition table per unit.
# This cell assigns the same `transition_data_compressed` name as the cell
# above, so it also records "hours_in_state": the per-technology export and the
# first test-split cell further down read that column and would raise KeyError
# if this cell replaced the tables with frames that lack it.
transition_data_compressed = defaultdict(pd.DataFrame)

for unit_id, events_unit in tqdm(filtered_events_df.groupby('UnitID')):
    # Store the transitions for this unit
    unit_transitions_data = {"Datetime_local": [],  # at the start of each hour
                             "Initial_gen_state": [],
                             "Final_gen_state": [],
                             "hours_in_state": [],
                             }

    # Unit metadata
    unit_geo_state = events_unit["State"].iloc[0]
    unit_technology = events_unit["UnitTypeCodeName"].iloc[0]
    # str() keeps a missing (non-string) state from raising on .upper();
    # such a unit simply fails the lookup and is skipped below.
    state_abv = state2abv.get(str(unit_geo_state).upper(), None)
    tz_name = STATE_TIMEZONE.get(state_abv)

    if tz_name is None:
        print(f"⚠️ Unknown timezone for {unit_geo_state}, skipping unit {unit_id}")
        continue

    tz = ZoneInfo(tz_name)

    # Get start and end time for this unit (single mask, reused for both dates)
    unit_dates = units_start_end.loc[units_start_end['UnitID'] == unit_id]
    if unit_dates.empty:
        # Without an operation window there is nothing to fill; skip the unit
        # instead of failing on .values[0].
        print(f"⚠️ No operation dates for unit {unit_id}, skipping")
        continue
    start_time_unit_local = pd.Timestamp(unit_dates['First_Operation_Date'].values[0])
    end_time_unit_local = pd.Timestamp(unit_dates['Last_Operation_Date'].values[0])

    current_time_local = start_time_unit_local
    events_unit = events_unit.sort_values('EventStartDT')
    last_state = 'A'
    # Consecutive hours spent in the current state. NaN until the first state
    # change, because the time already spent in 'A' before the first recorded
    # hour is not known here (NaN + 1 stays NaN).
    hours_in_current_state = np.nan

    for _, event in events_unit.iterrows():
        # The event covers every hour it touches: start floored to the hour,
        # end floored to the hour plus one hour (exclusive bound).
        event_start_local = event['EventStartDT'].replace(minute=0, second=0, microsecond=0)
        event_end_local = event['EventEndDT'].replace(minute=0, second=0, microsecond=0) + pd.Timedelta(hours=1)
        state_during_event = event["EventTypeCode"]

        # Simplify event type
        if state_during_event.startswith('D'):
            state_during_event = 'D'
        elif state_during_event.startswith('U'):
            state_during_event = 'U'
        elif state_during_event == 'SF':
            if last_state != 'A':
                state_during_event = 'U'  # Startup failure treated as outage if the unit was not available before
            else:
                continue  # skip startup failure if the unit was available before

        # Record 'A' states until the event starts
        while current_time_local < event_start_local:
            # Restart the counter on a state change, otherwise keep counting
            hours_in_current_state = hours_in_current_state + 1 if last_state == 'A' else 1
            unit_transitions_data["Datetime_local"].append(current_time_local)
            unit_transitions_data["Initial_gen_state"].append(last_state)
            unit_transitions_data["Final_gen_state"].append('A')
            unit_transitions_data["hours_in_state"].append(hours_in_current_state)
            current_time_local += pd.Timedelta(hours=1)
            last_state = 'A'

        # Record states during the event
        while current_time_local < event_end_local:
            # Back-to-back events of the same simplified state continue the run
            hours_in_current_state = (
                hours_in_current_state + 1 if last_state == state_during_event else 1
            )
            unit_transitions_data["Datetime_local"].append(current_time_local)
            unit_transitions_data["Initial_gen_state"].append(last_state)
            unit_transitions_data["Final_gen_state"].append(state_during_event)
            unit_transitions_data["hours_in_state"].append(hours_in_current_state)
            current_time_local += pd.Timedelta(hours=1)
            last_state = state_during_event

    # Fill 'A' states until end of operation (end bound is inclusive)
    while current_time_local <= end_time_unit_local:
        hours_in_current_state = hours_in_current_state + 1 if last_state == 'A' else 1
        unit_transitions_data["Datetime_local"].append(current_time_local)
        unit_transitions_data["Initial_gen_state"].append(last_state)
        unit_transitions_data["Final_gen_state"].append('A')
        unit_transitions_data["hours_in_state"].append(hours_in_current_state)
        current_time_local += pd.Timedelta(hours=1)
        last_state = 'A'

    # --- Create DataFrame and convert local → UTC ---
    unit_transitions_df = pd.DataFrame(unit_transitions_data)

    # Localize to the state timezone. Local hours that do not exist
    # (spring-forward) and local hours that are ambiguous (fall-back) both
    # become NaT and are dropped from the table.
    unit_transitions_df["Datetime_UTC"] = (
        pd.to_datetime(unit_transitions_df["Datetime_local"])
        .dt.tz_localize(tz, nonexistent="NaT", ambiguous="NaT")   # use .dt accessor
        .dt.tz_convert("UTC")
    )
    unit_transitions_df = unit_transitions_df.dropna(subset=["Datetime_UTC"])

    transition_data_compressed[unit_id] = {
        "metadata": {
            "UnitID": unit_id,
            "State": unit_geo_state,
            "Technology": unit_technology
        },
        "transitions": unit_transitions_df.reset_index(drop=True)
    }

100%|██████████| 7166/7166 [30:28<00:00,  3.92it/s]


## Export 1 file

### Data with $\Delta$

In [12]:
# Export one aggregated CSV per technology.
# Rows that share the same UTC hour, geographical state, initial/final state,
# technology and (log-binned) hours_in_state are merged into a single row whose
# Data_weight is the number of merged unit-hours.

# Make sure the destination folder exists before the first to_csv call.
output_dir = "../Data/hourly/by_technology"
os.makedirs(output_dir, exist_ok=True)

all_data_by_technology = defaultdict(list)

for unit_id, unit_data in tqdm(transition_data_compressed.items()):
    unit_geo_state = unit_data["metadata"]["State"]
    unit_technology = unit_data["metadata"]["Technology"]

    # Work on a copy so the per-unit tables stay untouched for later cells
    transitions_df = unit_data["transitions"].copy()
    transitions_df["Geographical State"] = unit_geo_state
    transitions_df["Technology"] = unit_technology
    all_data_by_technology[unit_technology].append(transitions_df)

discrete_keys = ["Datetime_UTC", "Geographical State", "Initial_gen_state", "Final_gen_state", "Technology", "hours_in_state"]

for technology, df_list in all_data_by_technology.items():
    data_df = pd.concat(df_list, ignore_index=True)
    print(f"Technology: {technology}, Total Hours: {len(data_df)}, D Hours: {(data_df['Final_gen_state'] == 'D').sum()}, U Hours: {(data_df['Final_gen_state'] == 'U').sum()}")
    # File-name friendly technology label (spaces and slashes -> underscores)
    tech_print = technology.replace(" ", "_").replace("/", "_")
    data_df['Data_weight'] = 1.0

    # Bin the run length on a log10 scale, rounded down to one decimal
    data_df['hours_in_state'] = np.floor(np.log10(data_df["hours_in_state"].values) * 10) / 10

    # Rows with an unknown (NaN) run length cannot be grouped: groupby drops
    # NaN keys by default, so this only makes the existing behaviour explicit.
    # Those rows are still counted in the "Total Hours" printed above.
    data_df = data_df.dropna(subset=discrete_keys)

    # Every remaining column keeps its first value; the weights are summed
    agg_dict = {col: "first" for col in data_df.columns if col not in ["Data_weight"] + discrete_keys}
    agg_dict["Data_weight"] = "sum"

    data_df = (
        data_df.groupby(discrete_keys, as_index=False)
        .agg(agg_dict)
        .reset_index(drop=True)
    )
    data_df.to_csv(f"{output_dir}/hourly_failure_deltaTime_dataset_{tech_print}.csv", index=False)

100%|██████████| 7166/7166 [00:33<00:00, 212.49it/s]


Technology: Fossil-Steam, Total Hours: 102802822, D Hours: 13009774, U Hours: 5017043
Technology: Combined Cycle Block, Total Hours: 30692826, D Hours: 1249830, U Hours: 780216
Technology: Fluidized Bed, Total Hours: 3334733, D Hours: 368548, U Hours: 147542
Technology: Gas Turbine/Jet Engine (Simple Cycle Operation), Total Hours: 197294723, D Hours: 5311579, U Hours: 7185922
Technology: Nuclear, Total Hours: 10322842, D Hours: 1356159, U Hours: 168664
Technology: Pumped Storage/Hydro, Total Hours: 142306412, D Hours: 4519464, U Hours: 5473204
Technology: Co-generator Block , Total Hours: 3060246, D Hours: 106008, U Hours: 70627
Technology: CC steam units, Total Hours: 29834050, D Hours: 1420680, U Hours: 778298
Technology: CC GT units , Total Hours: 61164928, D Hours: 986461, U Hours: 1735637
Technology: Miscellaneous, Total Hours: 4022297, D Hours: 355229, U Hours: 167111
Technology: Multi-boiler/Multi-turbine, Total Hours: 2023969, D Hours: 291572, U Hours: 73223
Technology: CoG ste

In [None]:
# Switch between the aggregated export (True) and the raw hourly export (False)
compressed = True

# Stack the hourly table of every unit, tagged with the unit's state and technology
all_data = []
for unit_id, unit_data in tqdm(transition_data_compressed.items()):
    unit_metadata = unit_data["metadata"]
    tagged_transitions = unit_data["transitions"].assign(**{
        "Geographical State": unit_metadata["State"],
        "Technology": unit_metadata["Technology"],
    })
    all_data.append(tagged_transitions)

data_df = pd.concat(all_data, ignore_index=True)

if not compressed:
    # One row per unit and hour
    data_df.to_csv(f"../Data/hourly_failure_dataset.csv", index=False)
else:
    # One row per (hour, transition, state, technology) with the number of units in 'Count'
    group_columns = ['Datetime_UTC', 'Initial_gen_state', 'Final_gen_state', 'Geographical State', 'Technology']
    compressed_df = data_df.groupby(group_columns).size().reset_index(name='Count')
    compressed_df.to_csv(f"../Data/hourly_failure_dataset_compressed.csv", index=False)

100%|██████████| 7166/7166 [00:12<00:00, 569.90it/s] 


### Export uncompressed test events (2022-2023)

In [None]:
# Test window, both bounds inclusive (UTC)
test_start = pd.Timestamp('2022-01-01 00:00:00', tz='UTC')
test_end = pd.Timestamp('2023-12-31 23:00:00', tz='UTC')

# Per-unit slices of the test window, including the run-length column.
# NOTE(review): the following cell rebuilds `test_dataset` without
# 'hours_in_state', so this result is replaced when both cells are run.
kept_columns = ['Datetime_UTC', 'Final_gen_state', 'hours_in_state']
test_dataset = []
for unit_id, unit_data in tqdm(transition_data_compressed.items()):
    unit_transitions_df = unit_data["transitions"]
    in_test_period = unit_transitions_df['Datetime_UTC'].between(test_start, test_end)
    test_transitions_df = unit_transitions_df.loc[in_test_period, kept_columns].assign(
        Technology=unit_data["metadata"]["Technology"],
        State=unit_data["metadata"]["State"],
        UnitID=unit_id,
    )
    test_dataset.append(test_transitions_df)

100%|██████████| 7166/7166 [00:46<00:00, 153.89it/s]


: 

In [7]:
# Test window, both bounds inclusive (UTC)
test_start = pd.Timestamp('2022-01-01 00:00:00', tz='UTC')
test_end = pd.Timestamp('2023-12-31 23:00:00', tz='UTC')

# One frame per unit: timestamp and state for every hour inside the window,
# labelled with the unit's technology, state and id.
test_dataset = []
for unit_id, unit_data in tqdm(transition_data_compressed.items()):
    unit_transitions_df = unit_data["transitions"]
    hour_stamps = unit_transitions_df['Datetime_UTC']
    inside_window = hour_stamps.ge(test_start) & hour_stamps.le(test_end)

    test_transitions_df = unit_transitions_df.loc[inside_window, ['Datetime_UTC', 'Final_gen_state']].copy()
    unit_labels = {
        'Technology': unit_data["metadata"]["Technology"],
        'State': unit_data["metadata"]["State"],
        'UnitID': unit_id,
    }
    for label_name, label_value in unit_labels.items():
        test_transitions_df[label_name] = label_value
    test_dataset.append(test_transitions_df)

100%|██████████| 7166/7166 [00:06<00:00, 1133.14it/s]


In [8]:
# Stack the per-unit slices into one table and expose the hourly state
# under the name 'Gen_state'.
test_dataset_df = (
    pd.concat(test_dataset, ignore_index=True)
    .rename(columns={'Final_gen_state': 'Gen_state'})
)
test_dataset_df

Unnamed: 0,Datetime_UTC,Gen_state,Technology,State,UnitID
0,2022-01-01 00:00:00+00:00,A,Combined Cycle Block,New Jersey,30
1,2022-01-01 01:00:00+00:00,A,Combined Cycle Block,New Jersey,30
2,2022-01-01 02:00:00+00:00,A,Combined Cycle Block,New Jersey,30
3,2022-01-01 03:00:00+00:00,A,Combined Cycle Block,New Jersey,30
4,2022-01-01 04:00:00+00:00,A,Combined Cycle Block,New Jersey,30
...,...,...,...,...,...
95993802,2023-12-31 19:00:00+00:00,A,Gas Turbine/Jet Engine (Simple Cycle Operation),Texas,12871
95993803,2023-12-31 20:00:00+00:00,A,Gas Turbine/Jet Engine (Simple Cycle Operation),Texas,12871
95993804,2023-12-31 21:00:00+00:00,A,Gas Turbine/Jet Engine (Simple Cycle Operation),Texas,12871
95993805,2023-12-31 22:00:00+00:00,A,Gas Turbine/Jet Engine (Simple Cycle Operation),Texas,12871


In [9]:
# Write the 2022-2023 test table to CSV; index=False leaves out the row index.
test_dataset_df.to_csv(f"../Data/hourly_failure_test_dataset_2022_2023.csv", index=False)