# Your Title Here

**Name(s)**: (your name(s) here)

**Website Link**: (your website link)

## Code

In [1]:
import pandas as pd
import numpy as np
import os

import plotly.express as px
pd.options.plotting.backend = 'plotly'

In [2]:
# Read the outage spreadsheet, keeping only the columns used in this notebook.
outage_columns = [
    'U.S._STATE', 'CAUSE.CATEGORY', 'OUTAGE.DURATION', 'DEMAND.LOSS.MW',
    'RES.PERCEN', 'COM.PERCEN', 'IND.PERCEN',
    'RES.PRICE', 'COM.PRICE', 'IND.PRICE',
    'RES.SALES', 'COM.SALES', 'IND.SALES',
    'RES.CUSTOMERS', 'COM.CUSTOMERS', 'IND.CUSTOMERS',
    'OUTAGE.START.DATE', 'OUTAGE.START.TIME',
]
# The first five sheet rows are skipped; the row labelled 0 is then dropped
# (presumably a non-data row such as units -- TODO confirm against the sheet).
outage = pd.read_excel('outage.xlsx', skiprows=5, usecols=outage_columns).drop(index=0)
outage

Unnamed: 0,U.S._STATE,OUTAGE.START.DATE,OUTAGE.START.TIME,CAUSE.CATEGORY,OUTAGE.DURATION,DEMAND.LOSS.MW,RES.PRICE,COM.PRICE,IND.PRICE,RES.SALES,COM.SALES,IND.SALES,RES.PERCEN,COM.PERCEN,IND.PERCEN,RES.CUSTOMERS,COM.CUSTOMERS,IND.CUSTOMERS
1,Minnesota,2011-07-01 00:00:00,17:00:00,severe weather,3060,,11.6,9.18,6.81,2332915,2114774,2113291,35.549073,32.225029,32.202431,2308736.0,276286.0,10673.0
2,Minnesota,2014-05-11 00:00:00,18:38:00,intentional attack,1,,12.12,9.71,6.49,1586986,1807756,1887927,30.032487,34.210389,35.727564,2345860.0,284978.0,9898.0
3,Minnesota,2010-10-26 00:00:00,20:00:00,severe weather,3000,,10.87,8.19,6.07,1467293,1801683,1951295,28.097672,34.501015,37.365983,2300291.0,276463.0,10150.0
4,Minnesota,2012-06-19 00:00:00,04:30:00,severe weather,2550,,11.79,9.25,6.71,1851519,1941174,1993026,31.994099,33.54333,34.439329,2317336.0,278466.0,11010.0
5,Minnesota,2015-07-18 00:00:00,02:00:00,severe weather,1740,250,13.07,10.16,7.74,2028875,2161612,1777937,33.982576,36.20585,29.779498,2374674.0,289044.0,9812.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1530,North Dakota,2011-12-06 00:00:00,08:00:00,public appeal,720,155,8.41,7.8,6.2,488853,438133,386693,37.212544,33.351628,29.435904,330738.0,60017.0,3639.0
1531,North Dakota,,,fuel supply emergency,,1650,,,,,,,,,,309997.0,53709.0,2331.0
1532,South Dakota,2009-08-29 00:00:00,22:54:00,islanding,59,84,9.25,7.47,5.53,337874,370771,215406,36.564432,40.124517,23.311051,367206.0,65971.0,3052.0
1533,South Dakota,2009-08-29 00:00:00,11:00:00,islanding,181,373,9.25,7.47,5.53,337874,370771,215406,36.564432,40.124517,23.311051,367206.0,65971.0,3052.0


### Framing the Problem

### Data Cleaning for Outages dataset

In [3]:
#perform probabilistic imputation on multiple columns together
def multi_prob_impute(df, cols):
    """Jointly impute several columns of ``df`` in place by sampling donor rows.

    Rows where ``cols[-1]`` is missing are treated as the rows to fill. For each
    of them one donor row (a row where ``cols[-1]`` is present) is drawn at
    random with replacement, and the donor's values for *all* of ``cols`` are
    copied over together, so the imputed columns stay mutually consistent.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    cols : list of str
        Columns to fill together; missingness is decided by the last one.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        Raised by ``np.random.choice`` when values are missing but there is no
        donor row to sample from.
    """
    # Work on the frame that was passed in, not on a module-level variable.
    missingness = df[cols[-1]].isna()
    # Sample donors by position so duplicate index labels cannot fan out into
    # extra rows when the donor values are looked up.
    donor_positions = np.flatnonzero(~missingness.to_numpy())
    chosen = np.random.choice(donor_positions, missingness.sum())
    fill_values = df[cols].iloc[chosen]
    for col in cols:
        df.loc[missingness, col] = fill_values[col].to_numpy()

# perform probabilistic imputation on single column
def single_prob_impute(df, col):
    """Fill the missing entries of ``df[col]`` in place with random draws.

    Replacement values are sampled (with replacement, via ``np.random.choice``)
    from the non-missing entries of the same column. Returns ``None``.
    """
    is_missing = df[col].isna()
    observed = df[col].dropna()
    draws = np.random.choice(observed, is_missing.sum())
    df.loc[is_missing, col] = draws

In [7]:
def prob_imputate(df, col_category, sectors):
    """Return a copy of ``df`` with every ``<sector>.<category>`` column imputed.

    Each column named by joining a sector and a category with ``'.'`` is passed
    to ``single_prob_impute``; the input frame itself is left untouched.
    """
    imputed = df.copy()
    # Same visiting order as nested loops: categories outer, sectors inner.
    targets = [
        '.'.join([sector, category])
        for category in col_category
        for sector in sectors
    ]
    for name in targets:
        single_prob_impute(imputed, name)
    return imputed

In [4]:
def merge_sector(df, col_category, sectors):
    """Collapse per-sector columns into one list-valued column per category.

    For every category, the columns ``<sector>.<category>`` (in ``sectors``
    order) are packed row-wise into a list stored under ``category``, and the
    per-sector columns are dropped. A new frame is returned; ``df`` is not
    modified.
    """
    def row_to_list(row):
        return list(row.values)

    merged = df.copy()
    for category in col_category:
        sector_cols = []
        for sector in sectors:
            sector_cols.append('.'.join([sector, category]))
        packed = merged[sector_cols].apply(row_to_list, axis=1)
        merged[category] = packed
        merged = merged.drop(columns=sector_cols)
    return merged

In [8]:
def conditional_mean_impute(df, target, dependent):
    """Return ``df[target]`` with missing values replaced by their group mean.

    Rows are grouped by ``dependent``; within each group, missing ``target``
    values are filled with that group's mean of ``target``. A group whose
    ``target`` values are all missing has no mean, so its entries stay missing.

    Parameters
    ----------
    df : pandas.DataFrame
    target : str
        Column to impute.
    dependent : str or list of str
        Column(s) defining the groups.

    Returns
    -------
    pandas.Series
        Indexed like ``df`` so it can be assigned straight back as a column.
    """
    # transform (unlike apply) always yields a result indexed like the input;
    # apply prepends the group keys to the index on newer pandas, which breaks
    # assigning the result back to the frame.
    return df.groupby(dependent)[target].transform(lambda grp: grp.fillna(grp.mean()))

In [25]:
non_numeric_col = ['U.S._STATE', 'OUTAGE.START.DATE', 'OUTAGE.START.TIME', 'CAUSE.CATEGORY']
sector_related = ['PRICE', 'SALES', 'CUSTOMERS', 'MONEY.LOST']
sectors = ['RES', 'COM', 'IND']

# The imputation below draws random samples; fix the seed so a fresh
# "Restart & Run All" reproduces the same imputed values.
np.random.seed(42)

# Keep only outages with a recorded start date. `outage` is rebound to the
# filtered frame so anything else in the notebook that reads `outage` sees the
# same rows as `outage_imputed`.
outage = outage.loc[outage['OUTAGE.START.DATE'].notna()].copy()

# Cast every remaining column to float. astype() returns new float columns;
# assigning through `.loc[:, mask] = ...` writes into the existing columns and
# can leave them as object dtype on newer pandas.
numeric_col = [col for col in outage.columns if col not in non_numeric_col]
outage = outage.astype({col: float for col in numeric_col})

# use probabilistic imputation to fill missing values in sector related columns
# (MONEY.LOST is excluded here because it is only computed further down)
outage_imputed = prob_imputate(outage, sector_related[:-1], sectors)
multi_prob_impute(outage_imputed, ['RES.PERCEN', 'IND.PERCEN', 'COM.PERCEN'])

# use within-group mean imputation (grouped by cause) to fill missing values
for target in ['OUTAGE.DURATION', 'DEMAND.LOSS.MW']:
    outage_imputed[target] = conditional_mean_impute(outage_imputed, target, 'CAUSE.CATEGORY')

# Per-sector money lost = duration * sector share * demand loss * sector price,
# scaled by 60000 and rounded to 2 decimals.
# NOTE(review): the 60000 divisor presumably folds in unit conversions
# (e.g. minutes -> hours and a price/energy scale) -- TODO confirm.
for sector in sectors:
    outage_imputed[f'{sector}.MONEY.LOST'] = round(
        outage_imputed['OUTAGE.DURATION']
        * outage_imputed[f'{sector}.PERCEN']
        * outage_imputed['DEMAND.LOSS.MW']
        * outage_imputed[f'{sector}.PRICE']
        / 60000,
        2,
    )

# These inputs are now folded into MONEY.LOST and are not kept as features.
outage_imputed = outage_imputed.drop(columns=['DEMAND.LOSS.MW', 'RES.PERCEN', 'COM.PERCEN', 'IND.PERCEN'])

# Reshape to one row per (outage, sector): pack the per-sector columns into
# lists, explode them together, then label each exploded row with its sector.
outage_merge = merge_sector(outage_imputed, sector_related, sectors)
outage_explode = outage_merge.explode(sector_related)
outage_explode['Sector'] = sectors*len(outage_merge)
print(outage_explode)

        U.S._STATE    OUTAGE.START.DATE OUTAGE.START.TIME      CAUSE.CATEGORY  \
1        Minnesota  2011-07-01 00:00:00          17:00:00      severe weather   
1        Minnesota  2011-07-01 00:00:00          17:00:00      severe weather   
1        Minnesota  2011-07-01 00:00:00          17:00:00      severe weather   
2        Minnesota  2014-05-11 00:00:00          18:38:00  intentional attack   
2        Minnesota  2014-05-11 00:00:00          18:38:00  intentional attack   
...            ...                  ...               ...                 ...   
1532  South Dakota  2009-08-29 00:00:00          22:54:00           islanding   
1532  South Dakota  2009-08-29 00:00:00          22:54:00           islanding   
1533  South Dakota  2009-08-29 00:00:00          11:00:00           islanding   
1533  South Dakota  2009-08-29 00:00:00          11:00:00           islanding   
1533  South Dakota  2009-08-29 00:00:00          11:00:00           islanding   

      OUTAGE.DURATION  PRIC

# TODO
## Baseline model
### Categorical features
1. Cause category
2. NERC region
3. Sector
### Numerical features
1. Average electricity consumption (sales)
2. Duration
3. Price
4. Customers served


## Final model
### Categorical features
1. Cause category
2. Time of day (Morning, Evening, Night)
3. Day type (Workday or weekend)
4. Geographical region
5. NERC region
6. Sector

### Numerical features
1. Average electricity consumption (sales)
2. Duration
3. Price
4. Customers served

### Baseline Model

In [None]:
# TODO

### Final Model

In [None]:
# TODO

### Fairness Analysis

In [None]:
# TODO