Analyzing Power Outages

**Name(s)**: Nimisha Mishra

**Website Link**: [Power Outage Analysis](https://nimishamishra95.github.io/power-outage-analysis/)

In [89]:
import pandas as pd
import numpy as np
from pathlib import Path
import os

os.makedirs("html-views", exist_ok=True)

import plotly.express as px
pd.options.plotting.backend = 'plotly'

from dsc80_utils import * # Feel free to uncomment and use this.

In [90]:
# Helper Function to Save Plotly Figures
def save_plot(fig, name):
    path = f"assets/{name}.html"
    fig.write_html(path, include_plotlyjs="cdn")
    print(f"Saved to {path}")

## Step 1: Introduction

In [91]:
# QUESTION:
#    How do climate region and the categorical cause of an outage influence the 
#    duration of major power outages?

## Step 2: Data Cleaning and Exploratory Data Analysis

##### Import Data

In [92]:
# Import data
df = pd.read_csv("data/outage_cleaned_csv.csv")
df

Unnamed: 0,OBS,YEAR,MONTH,U.S._STATE,...,AREAPCT_UC,PCT_LAND,PCT_WATER_TOT,PCT_WATER_INLAND
0,1,2011,7.0,Minnesota,...,0.60,91.59,8.41,5.48
1,2,2014,5.0,Minnesota,...,0.60,91.59,8.41,5.48
2,3,2010,10.0,Minnesota,...,0.60,91.59,8.41,5.48
...,...,...,...,...,...,...,...,...,...
1531,1532,2009,8.0,South Dakota,...,0.15,98.31,1.69,1.69
1532,1533,2009,8.0,South Dakota,...,0.15,98.31,1.69,1.69
1533,1534,2000,,Alaska,...,0.02,85.76,14.24,2.90


##### Fix Column Names

In [93]:
# Fix column names
new_cols = []
for col in df.columns:
    col = col.replace('U.S.','US')
    col = col.lower().split('.')
    if (len(col) > 1):
        col = ' '.join(col)
    else:
        col = col[0]
    col = col.replace('_', ' ')
    new_cols.append(col)
df.columns = new_cols
df.columns

Index(['obs', 'year', 'month', 'us state', 'postal code', 'nerc region',
       'climate region', 'anomaly level', 'climate category',
       'outage start date', 'outage start time', 'outage restoration date',
       'outage restoration time', 'cause category', 'cause category detail',
       'hurricane names', 'outage duration', 'demand loss mw',
       'customers affected', 'res price', 'com price', 'ind price',
       'total price', 'res sales', 'com sales', 'ind sales', 'total sales',
       'res percen', 'com percen', 'ind percen', 'res customers',
       'com customers', 'ind customers', 'total customers', 'res cust pct',
       'com cust pct', 'ind cust pct', 'pc realgsp state', 'pc realgsp usa',
       'pc realgsp rel', 'pc realgsp change', 'util realgsp', 'total realgsp',
       'util contri', 'pi util ofusa', 'population', 'poppct urban',
       'poppct uc', 'popden urban', 'popden uc', 'popden rural',
       'areapct urban', 'areapct uc', 'pct land', 'pct water tot',
      

##### Drop All Unnecessary Data

In [94]:
# Drop all unnecessary columns
df = df[['year', 'month', 'us state', 'nerc region', 'climate region', 
         'climate category', 'outage start date', 'outage start time', 'outage restoration date',
         'outage restoration time', 'cause category', 'outage duration']]
df

Unnamed: 0,year,month,us state,nerc region,...,outage restoration date,outage restoration time,cause category,outage duration
0,2011,7.0,Minnesota,MRO,...,"Sunday, July 3, 2011",8:00:00 PM,severe weather,3060.0
1,2014,5.0,Minnesota,MRO,...,"Sunday, May 11, 2014",6:39:00 PM,intentional attack,1.0
2,2010,10.0,Minnesota,MRO,...,"Thursday, October 28, 2010",10:00:00 PM,severe weather,3000.0
...,...,...,...,...,...,...,...,...,...
1531,2009,8.0,South Dakota,RFC,...,"Saturday, August 29, 2009",11:53:00 PM,islanding,59.0
1532,2009,8.0,South Dakota,MRO,...,"Saturday, August 29, 2009",2:01:00 PM,islanding,181.0
1533,2000,,Alaska,ASCC,...,,,equipment failure,


##### Data Manipulation

In [95]:
# Merge date and time columns to make one cohesive timestamp column
temp = pd.DataFrame()

def convert_date(series):
    return pd.to_datetime(series, format="%A, %B %d, %Y").dt.strftime("%Y-%m-%d")
temp['formatted outage start date'] = convert_date(df['outage start date'])
temp['formatted outage restoration date'] = convert_date(df['outage restoration date'])

def convert_time(series):
    return pd.to_datetime(series, format="%I:%M:%S %p").dt.strftime("%H:%M:%S")
temp['formatted outage start time'] = convert_time(df['outage start time'])
temp['formatted outage restoration time'] = convert_time(df['outage restoration time'])

df['outage start datetime'] = pd.to_datetime(temp['formatted outage start date'] + " " + temp['formatted outage start time'])
df['outage restoration datetime'] = pd.to_datetime(temp['formatted outage restoration date'] + " " + temp['formatted outage restoration time'])
df[['outage start datetime', 'outage restoration datetime']]

Unnamed: 0,outage start datetime,outage restoration datetime
0,2011-07-01 17:00:00,2011-07-03 20:00:00
1,2014-05-11 18:38:00,2014-05-11 18:39:00
2,2010-10-26 20:00:00,2010-10-28 22:00:00
...,...,...
1531,2009-08-29 22:54:00,2009-08-29 23:53:00
1532,2009-08-29 11:00:00,2009-08-29 14:01:00
1533,NaT,NaT


In [96]:
# Remove object type date and time columns
df = df.drop(['outage start date', 'outage restoration date', 'outage start time', 'outage restoration time'], axis=1)
df

Unnamed: 0,year,month,us state,nerc region,...,cause category,outage duration,outage start datetime,outage restoration datetime
0,2011,7.0,Minnesota,MRO,...,severe weather,3060.0,2011-07-01 17:00:00,2011-07-03 20:00:00
1,2014,5.0,Minnesota,MRO,...,intentional attack,1.0,2014-05-11 18:38:00,2014-05-11 18:39:00
2,2010,10.0,Minnesota,MRO,...,severe weather,3000.0,2010-10-26 20:00:00,2010-10-28 22:00:00
...,...,...,...,...,...,...,...,...,...
1531,2009,8.0,South Dakota,RFC,...,islanding,59.0,2009-08-29 22:54:00,2009-08-29 23:53:00
1532,2009,8.0,South Dakota,MRO,...,islanding,181.0,2009-08-29 11:00:00,2009-08-29 14:01:00
1533,2000,,Alaska,ASCC,...,equipment failure,,NaT,NaT


In [97]:
# Replace all 0s in numeric columns with np.nan
for col in df.columns:
    if pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].replace(0, np.nan)
df

Unnamed: 0,year,month,us state,nerc region,...,cause category,outage duration,outage start datetime,outage restoration datetime
0,2011,7.0,Minnesota,MRO,...,severe weather,3060.0,2011-07-01 17:00:00,2011-07-03 20:00:00
1,2014,5.0,Minnesota,MRO,...,intentional attack,1.0,2014-05-11 18:38:00,2014-05-11 18:39:00
2,2010,10.0,Minnesota,MRO,...,severe weather,3000.0,2010-10-26 20:00:00,2010-10-28 22:00:00
...,...,...,...,...,...,...,...,...,...
1531,2009,8.0,South Dakota,RFC,...,islanding,59.0,2009-08-29 22:54:00,2009-08-29 23:53:00
1532,2009,8.0,South Dakota,MRO,...,islanding,181.0,2009-08-29 11:00:00,2009-08-29 14:01:00
1533,2000,,Alaska,ASCC,...,equipment failure,,NaT,NaT


In [98]:
# Replace all empty strings in string columns with np.nan
for col in df.columns:
    if pd.api.types.is_string_dtype(df[col]):
        df[col] = df[col].replace('', np.nan)
df

Unnamed: 0,year,month,us state,nerc region,...,cause category,outage duration,outage start datetime,outage restoration datetime
0,2011,7.0,Minnesota,MRO,...,severe weather,3060.0,2011-07-01 17:00:00,2011-07-03 20:00:00
1,2014,5.0,Minnesota,MRO,...,intentional attack,1.0,2014-05-11 18:38:00,2014-05-11 18:39:00
2,2010,10.0,Minnesota,MRO,...,severe weather,3000.0,2010-10-26 20:00:00,2010-10-28 22:00:00
...,...,...,...,...,...,...,...,...,...
1531,2009,8.0,South Dakota,RFC,...,islanding,59.0,2009-08-29 22:54:00,2009-08-29 23:53:00
1532,2009,8.0,South Dakota,MRO,...,islanding,181.0,2009-08-29 11:00:00,2009-08-29 14:01:00
1533,2000,,Alaska,ASCC,...,equipment failure,,NaT,NaT


In [99]:
# View Columns
df.columns

Index(['year', 'month', 'us state', 'nerc region', 'climate region',
       'climate category', 'cause category', 'outage duration',
       'outage start datetime', 'outage restoration datetime'],
      dtype='object')

In [100]:
# View in HTML
df.to_html("html-views/cleaned_data.html")

In [101]:
# CSV Output to Generate Markdown Table for df.head
print(df.head().to_csv(index=False))

year,month,us state,nerc region,climate region,climate category,cause category,outage duration,outage start datetime,outage restoration datetime
2011,7.0,Minnesota,MRO,East North Central,normal,severe weather,3060.0,2011-07-01 17:00:00,2011-07-03 20:00:00
2014,5.0,Minnesota,MRO,East North Central,normal,intentional attack,1.0,2014-05-11 18:38:00,2014-05-11 18:39:00
2010,10.0,Minnesota,MRO,East North Central,cold,severe weather,3000.0,2010-10-26 20:00:00,2010-10-28 22:00:00
2012,6.0,Minnesota,MRO,East North Central,normal,severe weather,2550.0,2012-06-19 04:30:00,2012-06-20 23:00:00
2015,7.0,Minnesota,MRO,East North Central,warm,severe weather,1740.0,2015-07-18 02:00:00,2015-07-19 07:00:00



##### Univariate Data Analysis

In [102]:
# Histogram for Distribution of Cause Category
fig = px.histogram(
    df,
    x='cause category',
    nbins=20,
    title="Distribution of Cause Category",
    labels={'cause category':'Cause Category'},
    color_discrete_sequence=['skyblue']
)

fig.update_layout(
    xaxis_title="Cause Category",
    yaxis_title="Frequency",
    title_font_size=20,
    xaxis_tickfont_size=12,
    yaxis_tickfont_size=12
)

fig.update_layout(
    width=800,
    height=500,
)

fig.update_layout(
    margin=dict(l=50, r=50, t=50, b=50)
)

fig.show()

save_plot(fig, "outages-by-cause-category")

Saved to assets/outages-by-cause-category.html


In [103]:
# Pie Chart for Climate Category of Power Outages
climate_category = df['climate category']

cc_counts = climate_category.value_counts().reset_index()
cc_counts.columns = ['Climate Category', 'Count']

fig = px.pie(
    cc_counts,
    names='Climate Category',
    values='Count',
    title="Pie Chart of Categories",
    color_discrete_sequence=px.colors.qualitative.Pastel
)

fig.update_layout(
    width=700,
    height=500,
    title=dict(
        text="Pie Chart for Climate Categories of Power Outages",
        font=dict(size=20),
        x=0.5
    )
)

fig.update_layout(
    margin=dict(l=50, r=50, t=50, b=20)
)

fig.show()

In [104]:
# Pie Chart for Climate Region of Power Outages
climate_category = df['climate region']

cc_counts = climate_category.value_counts().reset_index()
cc_counts.columns = ['Climate Region', 'Count']

fig = px.pie(
    cc_counts,
    names='Climate Region',
    values='Count',
    title="Pie Chart of Climate Region",
    color_discrete_sequence=px.colors.qualitative.Pastel
)

fig.update_layout(
    width=700,
    height=500,
    title=dict(
        text="Pie Chart for Climate Region of Power Outages",
        font=dict(size=20),
        x=0.5
    )
)

fig.update_layout(
    margin=dict(l=50, r=50, t=50, b=20)
)

fig.show()

save_plot(fig, "outages-by-climate-region")

Saved to assets/outages-by-climate-region.html


In [105]:
# Boxplot for Outage Duration
fig = px.box(
    df,
    x='outage duration',
    points='all',
    title="Boxplot of Outage Duration",
    color_discrete_sequence=['skyblue']
)

fig.update_layout(
    width=700,
    height=500,
    title=dict(
        text="Boxplot of Outage Duration (minutes)",
        font=dict(size=20),
        x=0.5
    ),
    xaxis_title="Outage Duration (minutes)"
)

fig.update_layout(
    width=1500,
    height=400,
)

fig.update_layout(
    margin=dict(l=50, r=50, t=50, b=50)
)

fig.show()

fig = px.box(
    df,
    x='outage duration',
    points=False,
    title="Boxplot of Outage Duration (minutes)",
    color_discrete_sequence=['skyblue']
)

fig.update_layout(
    width=700,
    height=500,
    title=dict(
        text="Boxplot of Outage Duration (minutes) WITHOUT Points and Outliers",
        font=dict(size=20),
        x=0.5
    ),
    xaxis_title="Outage Duration (minutes)"
)

fig.update_layout(
    width=900,
    height=400,
)

fig.update_layout(
    margin=dict(l=50, r=50, t=50, b=50)
)

fig.show()


In [106]:
# Line Graph of Counts per Year
year_counts = df['year'].value_counts().sort_index()

fig = px.line(
    x=year_counts.index,
    y=year_counts.values,
    markers=True,
    title="Count of Outages By Year"
)

fig.update_layout(
    xaxis_title="Year",
    yaxis_title="Outages",
    width=800,
    height=450
)

fig.update_layout(
    margin=dict(l=50, r=50, t=50, b=50)
)

fig.show()

save_plot(fig, "outages-by-year")

Saved to assets/outages-by-year.html


In [107]:
# Line Graph of Counts per Hour
hour = df['outage start datetime'].dt.hour
hour_counts = hour.value_counts().sort_index()

fig = px.line(
    x=hour_counts.index,
    y=hour_counts.values,
    markers=True,
    title="Count of Outages By Hour of the Day"
)

fig.update_layout(
    width=800,
    height=450,
    xaxis_title="Hour of Day (0–23)",
    yaxis_title="Outages",
    title=dict(x=0.5)
)

fig.update_layout(
    margin=dict(l=50, r=50, t=50, b=50)
)

fig.show()

save_plot(fig, "outages-by-hour")

Saved to assets/outages-by-hour.html


##### Bivariate Analysis

In [108]:
# Scatter Plot of Month (1-12) VS Outage Duration (min)
fig = px.scatter(
    df,
    x="month",
    y="outage duration",
    title="Outage Duration (min) by Month (1-12)",
    opacity=0.7
)

fig.update_layout(
    width=800,
    height=500,
    xaxis_title="Month (1-12)",
    yaxis_title="Outage Duration (min)",
    title=dict(x=0.5)
)

fig.update_layout(
    margin=dict(l=50, r=50, t=50, b=50)
)

fig.show()

save_plot(fig, "outage-duration-by-month")

Saved to assets/outage-duration-by-month.html


In [109]:
# Scatter Plot of Hour (0-23) VS Outage Duration (min)
temp['hour'] = df['outage start datetime'].dt.hour
temp['outage duration'] = df['outage duration']

fig = px.scatter(
    temp,
    x="hour",
    y="outage duration",
    title="Hour (0-23) VS Outage Duration (min)",
    opacity=0.7
)

fig.update_layout(
    width=800,
    height=500,
    xaxis_title="Hour (0-23)",
    yaxis_title="Outage Duration (min)",
    title=dict(x=0.5)
)

fig.update_layout(
    margin=dict(l=50, r=50, t=50, b=50)
)

fig.show()


In [110]:
# Scatter Plot of Cause Category VS Outage Duration (min)
import plotly.express as px

fig = px.scatter(
    df,
    x="cause category",
    y="outage duration",
    title="Outage Duration (min) by Cause Category"
)

fig.update_layout(
    xaxis_title="Cause Category",
    yaxis_title="Outage Duration (min)",
    width=800,
    height=500,
    title=dict(x=0.5)
)

fig.update_layout(
    margin=dict(l=80, r=50, t=50, b=50)
)

fig.show()

save_plot(fig, "outage-duration-by-cause-category")

Saved to assets/outage-duration-by-cause-category.html


In [111]:
# Stacked Bar Chart of NERC Region VS Power Outages by Cause Category
import plotly.express as px

fig = px.bar(
    df,
    x="nerc region",
    color="cause category",
    title="Distribution of Outage Causes by NERC Region",
)

fig.update_layout(
    barmode="stack",
    xaxis_title="NERC Region",
    yaxis_title="Power Outages",
    width=900,
    height=500,
    title=dict(x=0.5)
)

fig.update_layout(
    margin=dict(l=80, r=50, t=50, b=50)
)

fig.show()

save_plot(fig, "outage-cause-by-nerc-region")


Saved to assets/outage-cause-by-nerc-region.html


##### Interesting Aggregates

In [112]:
# Median Outage Duration for Climate Region VS Category Cause
pivot = df.pivot_table(
    index="climate region",
    columns="cause category",
    values="outage duration",
    aggfunc="median"
)

pivot.to_html("html-views/interesting-agg-pivot.html")

pivot


cause category,equipment failure,fuel supply emergency,intentional attack,islanding,public appeal,severe weather,system operability disruption
climate region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Central,149.0,7500.5,198.0,96.0,1410.0,1695.0,65.0
East North Central,761.0,13564.0,1046.0,1.0,733.0,4005.0,2694.0
Northeast,267.5,12240.0,30.0,881.0,2760.0,3189.0,234.5
...,...,...,...,...,...,...,...
Southwest,35.0,76.0,57.0,2.0,2275.0,2425.0,337.5
West,269.0,882.5,118.0,128.5,420.0,962.0,199.0
West North Central,61.0,,47.0,56.0,439.5,83.0,


In [113]:
# Average Month for Climate Category VS Cause Category
pivot = df.pivot_table(
    index="climate category",
    columns="cause category",
    values="month",
    aggfunc="mean"
)
pivot

cause category,equipment failure,fuel supply emergency,intentional attack,islanding,public appeal,severe weather,system operability disruption
climate category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
cold,5.37,4.26,5.13,7.53,6.95,5.86,5.19
normal,5.71,6.77,6.29,6.24,5.59,6.73,6.12
warm,8.3,4.6,4.89,5.79,7.15,7.4,6.43


## Step 3: Assessment of Missingness

##### Identify Columns with Missing Values

In [122]:
# Identify columns with missing values
og_total_rows = df.shape[0]
def check_cols(df):
    missing_col_data = {}
    for col in df.columns:
        missing_col_data[col] = int(df[col].isna().sum())
    missing_col_data = {key: val for key, val in missing_col_data.items() if val > 0}
    return missing_col_data
check_cols(df)

{'month': 9,
 'climate region': 6,
 'climate category': 9,
 'outage duration': 136,
 'outage start datetime': 9,
 'outage restoration datetime': 58}

##### NMAR Analysis

The column `OUTAGE.RESTORATION.DATE`, which contains 58 missing values, can be classified as Not Missing At Random (NMAR) because the missingness could be inherently linked to the nature of the outage event itself. For example, if the power was not restored at the time the data was reported, there would be no date or time to record, which directly ties the missing value to the ongoing status of the event. Similarly, for outages that were minor or very short, restoration details could have been deemed unnecessary to log, which means that the absence of this data is again related to the characteristics of the outage rather than occurring randomly. In both cases, the probability of a missing value is dependent on the unobserved value or the specific circumstances of the event rather than other observed data, making this column likely NMAR.

##### Missingness Dependency

In [202]:
# Set columns and number of permutation tests
target_col = "outage duration"
dependent_col = "cause category"
independent_col = "nerc region"
n_permutations = 1000

In [None]:
# Compute TVD
def tvd_between_groups(df, col, missing_indicator):
    counts_missing = df.loc[missing_indicator == 1, col].value_counts(normalize=True)
    counts_not_missing = df.loc[missing_indicator == 0, col].value_counts(normalize=True)
    all_categories = counts_missing.index.union(counts_not_missing.index)
    p = counts_missing.reindex(all_categories, fill_value=0)
    q = counts_not_missing.reindex(all_categories, fill_value=0)
    return 0.5 * np.sum(np.abs(p - q))

In [186]:
# Permutation test with TVD
def permutation_test_tvd(df, target_col, other_col, n_permutations=1000):
    missing_indicator = df[target_col].isna().astype(int)
    observed_tvd = tvd_between_groups(df, other_col, missing_indicator)
    
    perm_tvd = []
    for _ in range(n_permutations):
        shuffled = np.random.permutation(df[other_col])
        perm_tvd.append(tvd_between_groups(df.assign(temp=shuffled), 'temp', missing_indicator))
    
    perm_tvd = np.array(perm_tvd)
    p_value = (perm_tvd >= observed_tvd).mean()  # one-sided test
    return observed_tvd, perm_tvd, p_value

In [187]:
# Run permutation test for dependent column
obs_dep, perm_dep, p_dep = permutation_test_tvd(df, target_col, dependent_col, n_permutations)

print(f"permutation test for dependency on {dependent_col}:")
print(f"observed TVD: {obs_dep:.3f}, p-value: {p_dep:.3f}\n")

permutation test for dependency on cause category:
observed TVD: 0.469, p-value: 0.000



In [219]:
# Distribution of Cause Category by Missingness of Outage Duration
df['missing_flag'] = df[target_col].isna()
fig = px.histogram(df, x=dependent_col, color='missing_flag', barmode='group', histnorm='probability',
                   title="Distribution of Cause Category by Missingness of Outage Duration", width=800, height=600)
fig.update_layout(xaxis_title="Cause Category", yaxis_title="Proportion", legend_title="Missing Duration")
fig.update_layout(margin=dict(l=70, r=50, t=50, b=50))
fig.show()

save_plot(fig, 'missingness-cause-category')


Saved to assets/missingness-cause-category.html


In [220]:
# Empirical Distribution of TVD for Cause Category by Missingness of Outage Duration
perm_df = pd.DataFrame({'perm_tvd': perm_dep})

fig_perm = px.histogram(
    perm_df, 
    x='perm_tvd', 
    nbins=30, 
    histnorm='probability',
    title="Empirical Distribution of TVD for Cause Category by Missingness of Outage Duration",
    width=800,
    height=500
)

fig_perm.add_shape(
    type="line",
    x0=obs_dep, x1=obs_dep,
    y0=0, y1=.2,
    line=dict(color="red", width=3, dash="dash"),
)

fig_perm.add_annotation(
    x=obs_dep,
    y=.17,
    text=f"Observed TVD = {obs_dep:.3f}",
    showarrow=True,
    arrowhead=3,
    ax=80,
    ay=-20
)

fig_perm.update_layout(
    xaxis_title="TVD",
    yaxis_title="Probability",
)

fig_perm.update_layout(margin=dict(l=70, r=50, t=50, b=50))

fig_perm.show()

save_plot(fig_perm, 'tvd-missingness-cause-category')


Saved to assets/tvd-missingness-cause-category.html


In [203]:
# Run permutation test for independent column
obs_indep, perm_indep, p_indep = permutation_test_tvd(df, target_col, independent_col, n_permutations)

print(f"permutation test for dependency on {independent_col}:")
print(f"observed TVD: {obs_indep:.3f}, p-value: {p_indep:.3f}\n")

permutation test for dependency on nerc region:
observed TVD: 0.143, p-value: 0.045



In [221]:
# Distribution of Cause Category by Missingness of Outage Duration
df['missing_flag'] = df[target_col].isna()
fig = px.histogram(df, x=independent_col, color='missing_flag', barmode='group', histnorm='probability',
                   title="Distribution of NERC Region by Missingness of Outage Duration", width=800, height=600)
fig.update_layout(xaxis_title="NERC Region", yaxis_title="Proportion", legend_title="Missing Duration")
fig.update_layout(margin=dict(l=70, r=50, t=50, b=50))
fig.show()

save_plot(fig, 'missingness-nerc-region')

Saved to assets/missingness-nerc-region.html


In [222]:
# Empirical Distribution of TVD for NERC Region by Missingness of Outage Duration
perm_df = pd.DataFrame({'perm_tvd': perm_indep})

fig_perm = px.histogram(
    perm_df, 
    x='perm_tvd', 
    nbins=30, 
    histnorm='probability',
    title="Empirical Distribution of TVD for NERC Region by Missingness of Outage Duration",
    width=800,
    height=500
)

fig_perm.add_shape(
    type="line",
    x0=obs_indep, x1=obs_indep,
    y0=0, y1=.2,
    line=dict(color="red", width=3, dash="dash"),
)

fig_perm.add_annotation(
    x=obs_indep,
    y=.17,
    text=f"Observed TVD = {obs_indep:.3f}",
    showarrow=True,
    arrowhead=3,
    ax=80,
    ay=-20
)

fig_perm.update_layout(
    xaxis_title="TVD",
    yaxis_title="Probability",
)

fig_perm.update_layout(margin=dict(l=70, r=50, t=50, b=50))

fig_perm.show()

save_plot(fig_perm, 'tvd-missingness-nerc-region')


Saved to assets/tvd-missingness-nerc-region.html


# DO NOT TOUCH OR RUN FOLLOWING CODE

In [87]:
# Fix 'month' column
df[df['month'].isna()]
df = df.dropna(subset=['month'])
check_cols(df)
# This fixed 'month' along with 'anomaly level', 'climate category', 
# 'outage start date', and 'outage start time'.

{'climate region': 5,
 'outage duration': 127,
 'outage restoration datetime': 49}

In [88]:
# Fix Outage Restoration Date
df[df['outage restoration date'].isna()]
df = df.dropna(subset=['outage restoration date'])
check_cols(df)
# This fixed 'outage restoration date' along with 
# 'outage restoration time', and 'outage duration'.

KeyError: 'outage restoration date'

In [None]:
# Fix Climate Region
df[df['climate region'].isna()]
df = df.dropna(subset=['climate region'])
check_cols(df)
# All missing data fixed.

{}

In [None]:
rows_dropped =  og_total_rows - df.shape[0]
pct_rows_dropped = round((rows_dropped / og_total_rows) * 100, 2)
f'Retained approximately {100-pct_rows_dropped}% of original data as only {rows_dropped} of the {og_total_rows} rows were dropped!'

'Retained approximately 95.89% of original data as only 63 of the 1534 rows were dropped!'

## Step 4: Hypothesis Testing

In [None]:
# TODO

## Step 5: Framing a Prediction Problem

In [None]:
# TODO

## Step 6: Baseline Model

In [None]:
# TODO

## Step 7: Final Model

In [None]:
# TODO

## Step 8: Fairness Analysis

In [None]:
# TODO