Analyzing Power Outages

**Name(s)**: Nimisha Mishra

**Website Link**: [Power Outage Analysis](https://nimishamishra95.github.io/power-outage-analysis/)

In [125]:
import pandas as pd
import numpy as np
from pathlib import Path

import plotly.express as px
pd.options.plotting.backend = 'plotly'

from dsc80_utils import * # Feel free to uncomment and use this.

## Step 1: Introduction

In [126]:
# QUESTION:
#    How do climate region and the categorical cause of an outage influence the 
#    duration of major power outages?

## Step 2: Data Cleaning and Exploratory Data Analysis

##### Import Data

In [127]:
# Read the outage CSV into a DataFrame and display it as the cell output.
df = pd.read_csv(filepath_or_buffer="data/outage_cleaned_csv.csv")
df

Unnamed: 0,OBS,YEAR,MONTH,U.S._STATE,...,AREAPCT_UC,PCT_LAND,PCT_WATER_TOT,PCT_WATER_INLAND
0,1,2011,7.0,Minnesota,...,0.60,91.59,8.41,5.48
1,2,2014,5.0,Minnesota,...,0.60,91.59,8.41,5.48
2,3,2010,10.0,Minnesota,...,0.60,91.59,8.41,5.48
...,...,...,...,...,...,...,...,...,...
1531,1532,2009,8.0,South Dakota,...,0.15,98.31,1.69,1.69
1532,1533,2009,8.0,South Dakota,...,0.15,98.31,1.69,1.69
1533,1534,2000,,Alaska,...,0.02,85.76,14.24,2.90


##### Fix Column Names

In [128]:
# Normalize column names: shorten 'U.S.' to 'US', lowercase everything, and
# turn both '.' and '_' separators into single spaces.
new_cols = [
    name.replace('U.S.', 'US').lower().replace('.', ' ').replace('_', ' ')
    for name in df.columns
]
df.columns = new_cols
df.columns

Index(['obs', 'year', 'month', 'us state', 'postal code', 'nerc region',
       'climate region', 'anomaly level', 'climate category',
       'outage start date', 'outage start time', 'outage restoration date',
       'outage restoration time', 'cause category', 'cause category detail',
       'hurricane names', 'outage duration', 'demand loss mw',
       'customers affected', 'res price', 'com price', 'ind price',
       'total price', 'res sales', 'com sales', 'ind sales', 'total sales',
       'res percen', 'com percen', 'ind percen', 'res customers',
       'com customers', 'ind customers', 'total customers', 'res cust pct',
       'com cust pct', 'ind cust pct', 'pc realgsp state', 'pc realgsp usa',
       'pc realgsp rel', 'pc realgsp change', 'util realgsp', 'total realgsp',
       'util contri', 'pi util ofusa', 'population', 'poppct urban',
       'poppct uc', 'popden urban', 'popden uc', 'popden rural',
       'areapct urban', 'areapct uc', 'pct land', 'pct water tot',
      

In [129]:
# Remove all unnecessary columns by keeping only the ones listed here
# (the listed order becomes the column order of the result).
keep_cols = [
    'year',
    'month',
    'us state',
    'nerc region',
    'climate region',
    'anomaly level',
    'climate category',
    'outage start date',
    'outage start time',
    'outage restoration date',
    'outage restoration time',
    'cause category',
    'outage duration',
]
df = df[keep_cols]
df

Unnamed: 0,year,month,us state,nerc region,...,outage restoration date,outage restoration time,cause category,outage duration
0,2011,7.0,Minnesota,MRO,...,"Sunday, July 3, 2011",8:00:00 PM,severe weather,3060.0
1,2014,5.0,Minnesota,MRO,...,"Sunday, May 11, 2014",6:39:00 PM,intentional attack,1.0
2,2010,10.0,Minnesota,MRO,...,"Thursday, October 28, 2010",10:00:00 PM,severe weather,3000.0
...,...,...,...,...,...,...,...,...,...
1531,2009,8.0,South Dakota,RFC,...,"Saturday, August 29, 2009",11:53:00 PM,islanding,59.0
1532,2009,8.0,South Dakota,MRO,...,"Saturday, August 29, 2009",2:01:00 PM,islanding,181.0
1533,2000,,Alaska,ASCC,...,,,equipment failure,


## Step 3: Assessment of Missingness

In [130]:
# Identify Columns with Missing Values
# Remember the row count before any rows are dropped; the retention summary
# further down compares against it.
og_total_rows = len(df)
def check_cols(df):
    """Report how many values are missing in each incomplete column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    dict
        Maps each column that has at least one missing value to its number of
        missing values (as a plain ``int``), in column order. Columns without
        missing values are left out, so an empty dict means nothing is missing.
    """
    na_counts = df.isna().sum()
    return {name: int(count) for name, count in na_counts.items() if count > 0}
# Iterating the returned dict yields its keys, i.e. the names of the columns
# that still contain missing values.
list(check_cols(df))

['month',
 'climate region',
 'anomaly level',
 'climate category',
 'outage start date',
 'outage start time',
 'outage restoration date',
 'outage restoration time',
 'outage duration']

In [131]:
# Fix 'month' column: drop every row whose month is missing, then re-check
# which columns still contain missing values.
df = df.dropna(subset=['month'])
check_cols(df)
# Dropping those rows also cleared 'anomaly level', 'climate category',
# 'outage start date', and 'outage start time' (none appear in the result).

{'climate region': 5,
 'outage restoration date': 49,
 'outage restoration time': 49,
 'outage duration': 49}

In [132]:
# Fix Outage Restoration Date: drop every row whose restoration date is
# missing, then re-check which columns still contain missing values.
df = df.dropna(subset=['outage restoration date'])
check_cols(df)
# Dropping those rows also cleared 'outage restoration time' and
# 'outage duration' (neither appears in the result).

{'climate region': 5}

In [133]:
# Fix Climate Region: drop every row whose climate region is missing, then
# re-check which columns still contain missing values.
df = df.dropna(subset=['climate region'])
check_cols(df)
# An empty result means no column has missing values left.

{}

In [134]:
# Summarize how many rows the missing-value cleanup removed.
rows_dropped = og_total_rows - df.shape[0]
pct_rows_dropped = round((rows_dropped / og_total_rows) * 100, 2)
# Round again after subtracting: a bare float subtraction can print artifacts
# (e.g. 100 - 99.9 evaluates to 0.09999999999999432).
pct_rows_retained = round(100 - pct_rows_dropped, 2)
f'Retained approximately {pct_rows_retained}% of original data as only {rows_dropped} of the {og_total_rows} rows were dropped!'

'Retained approximately 95.89% of original data as only 63 of the 1534 rows were dropped!'

## Step 4: Hypothesis Testing

In [135]:
# TODO

## Step 5: Framing a Prediction Problem

In [None]:
# TODO

## Step 6: Baseline Model

In [None]:
# TODO

## Step 7: Final Model

In [None]:
# TODO

## Step 8: Fairness Analysis

In [None]:
# TODO