# Create the failure dataset at hourly resolution

In [1]:
import importlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import itertools
import sys, os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '../src')))
import data_processing as dp

from collections import defaultdict

## Load dataset

In [2]:
# Load the raw event records through the project helper module `dp`
# (made importable from ../src in the import cell). The unfiltered result is
# kept in `events_df`; `filtered_events_df` is derived from it further down.
events_df = dp.load_events()

Loading Events : 2013
Loading Events : 2014
Loading Events : 2015
Loading Events : 2016
Loading Events : 2017
Loading Events : 2018
Loading Events : 2019
Loading Events : 2020
Loading Events : 2021
Loading Events : 2022
Loading Events : 2023
Loading Events : 2024
Loaded 6005103 events


In [3]:
# Re-import the helper module so that edits made to it after the kernel
# started are picked up before filtering.
importlib.reload(dp)

# Codes handed to dp.filter_events as `CauseCodes` (the events to keep).
outage_cause_codes = ['U1', 'U2', 'U3', 'D1', 'D2', 'D3', 'SF']
# Labels handed to dp.filter_events as `exclude_states`.
excluded_state_labels = ['Other', 'Mexico', 'South America']

filtered_events_df = dp.filter_events(
    events_df,
    CauseCodes=outage_cause_codes,
    filter_fuel=False,
    exclude_states=excluded_state_labels,
    include_states=None,
    add_fuel_failure=False,
)

Filtering events by cause codes...
Kept 839971 events out of 6005103 (13.99%) after filtering by cause codes.


Merging duplicated events: 100%|██████████| 839971/839971 [00:11<00:00, 73947.19it/s] 
Adding state and region: 100%|██████████| 834847/834847 [01:08<00:00, 12177.04it/s]


In [4]:
# Load the per-unit operation window table. The transition-building cell reads
# its 'UnitID', 'First_Operation_Date' and 'Last_Operation_Date' columns to
# decide where each unit's hourly timeline starts and ends.
units_start_end = dp.get_units_start_end()

Loading Performances : 2013
Loading Performances : 2014
Loading Performances : 2015
Loading Performances : 2016
Loading Performances : 2017
Loading Performances : 2018
Loading Performances : 2019
Loading Performances : 2020
Loading Performances : 2021
Loading Performances : 2022
Loading Performances : 2023
Loading Performances : 2024
Loaded 1035652 events


100%|██████████| 9305/9305 [00:07<00:00, 1173.11it/s]


## Format transition data

In [5]:
# Size of one step on the hourly timeline (built once instead of per step).
ONE_HOUR = pd.Timedelta(hours=1)


def _state_during_event(event_type_code, last_state):
    """Map a raw EventTypeCode to the state recorded while the event lasts.

    Returns 'D' for codes starting with 'D', 'U' for codes starting with 'U'.
    For 'SF' (startup failure) returns 'U' when `last_state` is not 'A', and
    None when it is 'A' to signal that the event must be skipped. Any other
    code is returned unchanged.
    """
    if event_type_code.startswith('D'):
        return 'D'
    if event_type_code.startswith('U'):
        return 'U'
    if event_type_code == 'SF':
        # Startup failure is treated as an outage only if the unit was not
        # available before; otherwise the event is skipped.
        return 'U' if last_state != 'A' else None
    return event_type_code


def _build_unit_transitions(events_unit, start_time_unit, end_time_unit):
    """Expand one unit's events into an hourly table of state transitions.

    Parameters
    ----------
    events_unit : DataFrame with 'EventStartDT', 'EventEndDT' and
        'EventTypeCode' columns (the events of a single unit).
    start_time_unit, end_time_unit : first and last timestamp of the unit's
        timeline; the end is inclusive.

    Returns
    -------
    DataFrame with columns 'Datetime' (start of each hour),
    'Initial_gen_state' (state of the previous row, 'A' for the first row)
    and 'Final_gen_state' (state for that hour). Hours not covered by an
    event are recorded as 'A'.
    """
    datetimes, initial_states, final_states = [], [], []
    current_time = start_time_unit
    last_state = 'A'

    events_sorted = events_unit.sort_values('EventStartDT')
    event_fields = zip(events_sorted['EventStartDT'],
                       events_sorted['EventEndDT'],
                       events_sorted['EventTypeCode'])

    for raw_start, raw_end, raw_code in event_fields:
        # Snap the event to whole hours: the start is floored, and the end is
        # floored then pushed one hour so the hour containing the end is covered.
        event_start = raw_start.replace(minute=0, second=0, microsecond=0)
        event_end = raw_end.replace(minute=0, second=0, microsecond=0) + ONE_HOUR

        # NOTE(review): `last_state` here is the state left by the previously
        # recorded event; the 'A' hours between that event and this one have
        # not been filled in yet. The SF rule therefore looks at the previous
        # event, not at the hour just before this one - confirm this is intended.
        event_state = _state_during_event(raw_code, last_state)
        if event_state is None:
            continue

        # Record 'A' states until the event starts
        while current_time < event_start:
            datetimes.append(current_time)
            initial_states.append(last_state)
            final_states.append('A')
            current_time += ONE_HOUR
            last_state = 'A'

        # Record states during the event. If an earlier event already moved
        # current_time past event_end, nothing is recorded for this event.
        while current_time < event_end:
            datetimes.append(current_time)
            initial_states.append(last_state)
            final_states.append(event_state)
            current_time += ONE_HOUR
            last_state = event_state

    # Record 'A' states until the end of the unit's operation (inclusive)
    while current_time <= end_time_unit:
        datetimes.append(current_time)
        initial_states.append(last_state)
        final_states.append('A')
        current_time += ONE_HOUR
        last_state = 'A'

    unit_transitions_df = pd.DataFrame({
        "Datetime": datetimes,  # at the start of each hour
        "Initial_gen_state": initial_states,
        "Final_gen_state": final_states,
    })
    # An empty timeline would otherwise give an object-dtype column, which
    # breaks the `.dt` accessor used when exporting per year.
    unit_transitions_df["Datetime"] = pd.to_datetime(unit_transitions_df["Datetime"])
    return unit_transitions_df


# Index the operation windows once instead of scanning the table with a
# boolean mask twice per unit. The first row per UnitID is kept, which is the
# row a positional `[0]` lookup on the masked table would select.
unit_windows = units_start_end.drop_duplicates(subset='UnitID').set_index('UnitID')

# Plain dict keyed by UnitID; each value holds the unit's metadata and its
# hourly transitions table.
transition_data_compressed = {}

for unit_id, events_unit in tqdm(filtered_events_df.groupby('UnitID')):
    # A unit missing from the operation-window table raises KeyError here.
    start_time_unit = unit_windows.loc[unit_id, 'First_Operation_Date']
    end_time_unit = unit_windows.loc[unit_id, 'Last_Operation_Date']

    transition_data_compressed[unit_id] = {
        "metadata": {
            "UnitID": unit_id,
            # Metadata is read from the unit's first event row.
            "State": events_unit["State"].iloc[0],
            "Technology": events_unit["UnitTypeCodeName"].iloc[0],
        },
        "transitions": _build_unit_transitions(events_unit, start_time_unit, end_time_unit),
    }


100%|██████████| 7166/7166 [18:56<00:00,  6.30it/s]


## Export yearly files

In [6]:
# When True, write one aggregated row (with a 'Count') per distinct
# combination of the grouping columns; when False, write one row per unit-hour.
compressed = True

# Columns that define one aggregated row of the compressed export.
group_columns = ['Datetime', 'Initial_gen_state', 'Final_gen_state', 'Geographical State', 'Technology']

for year in range(2013, 2024 + 1):
    print(f"Exporting year {year}...")
    yearly_data = []
    for unit_id, unit_data in tqdm(transition_data_compressed.items()):
        unit_transitions_df = unit_data["transitions"]
        if unit_transitions_df.empty:
            # Nothing to export, and an empty 'Datetime' column may not be
            # datetime-typed, which would make the `.dt` accessor fail.
            continue
        unit_metadata = unit_data["metadata"]

        # Keep only this year's hours and attach the unit metadata;
        # `.assign` returns a new frame, so the stored table is not modified.
        in_year = unit_transitions_df["Datetime"].dt.year == year
        yearly_data.append(
            unit_transitions_df[in_year].assign(**{
                "Geographical State": unit_metadata["State"],
                "Technology": unit_metadata["Technology"],
            })
        )
    yearly_data_df = pd.concat(yearly_data, ignore_index=True)

    if compressed:
        # dropna=False keeps rows whose State or Technology is missing; the
        # default would silently drop them from the compressed file only.
        compressed_df = (
            yearly_data_df
            .groupby(group_columns, dropna=False)
            .size()
            .reset_index(name='Count')
        )
        compressed_df.to_csv(f"../Data/hourly_failure_dataset_compressed_{year}.csv", index=False)
    else:
        yearly_data_df.to_csv(f"../Data/hourly_failure_dataset_{year}.csv", index=False)

Exporting year 2013...


100%|██████████| 7166/7166 [00:07<00:00, 948.65it/s] 


Exporting year 2014...


100%|██████████| 7166/7166 [00:07<00:00, 971.16it/s] 


Exporting year 2015...


100%|██████████| 7166/7166 [00:07<00:00, 961.66it/s] 


Exporting year 2016...


100%|██████████| 7166/7166 [00:07<00:00, 982.54it/s] 


Exporting year 2017...


100%|██████████| 7166/7166 [00:07<00:00, 983.45it/s] 


Exporting year 2018...


100%|██████████| 7166/7166 [00:07<00:00, 983.93it/s] 


Exporting year 2019...


100%|██████████| 7166/7166 [00:07<00:00, 972.20it/s] 


Exporting year 2020...


100%|██████████| 7166/7166 [00:07<00:00, 959.19it/s] 


Exporting year 2021...


100%|██████████| 7166/7166 [00:07<00:00, 977.04it/s] 


Exporting year 2022...


100%|██████████| 7166/7166 [00:07<00:00, 981.87it/s] 


Exporting year 2023...


100%|██████████| 7166/7166 [00:07<00:00, 976.20it/s] 


Exporting year 2024...


100%|██████████| 7166/7166 [00:07<00:00, 982.58it/s] 
