In [2]:
import pandas as pd
import numpy as np
import os
import plotly.express as px


In [3]:
# Load every OE-417 workbook (.xls) in `directory` whose file name contains a
# year from 2002 through 2023, stack them into one frame, and fold the
# differently named columns used across years into a single set of
# snake_case columns.
directory = '../../data/power_outages/oe-417'

# The year test is a substring match on the file name.
files = [f for f in os.listdir(directory) if f.endswith('.xls') and 
         any(str(year) in f for year in range(2002, 2024))]

# header=1 takes the second spreadsheet row as the column header; .drop(0)
# then removes the row labelled 0 from each sheet -- presumably a non-data row
# directly under the header (TODO confirm against the workbooks).
dfs = [pd.read_excel(os.path.join(directory, f), header=1).drop(0) for f in files]

df = pd.concat(dfs, ignore_index=True)
columns_to_drop = ['Unnamed: 0', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5' , 'Unnamed: 6', 'Unnamed: 7', 
                   'Event Month', 'Month']
df = df.drop(columns=columns_to_drop)


# combine duplicate cols with different names
# combine_first keeps the left value and falls back to the right one where the
# left is missing, so the column listed first wins when both are populated.
df['customers_affected'] = df['Number of Customers Affected 1'].combine_first(df['Number of Customers Affected 1[1]'])
df['customers_affected'] = df['customers_affected'].combine_first(df['Number of Customers Affected'])
# Values that cannot be parsed as numbers become NaN ...
df['customers_affected'] = pd.to_numeric(df['customers_affected'], errors='coerce')
# ... and are dropped here along with events below 1,000 customers, because
# NaN >= 1000 evaluates to False.
# NOTE(review): `df` is now the result of a filter; the column assignments
# below may emit SettingWithCopyWarning -- consider adding .copy() here.
df = df.loc[df['customers_affected'] >= 1000]
# Disabled check for rows where all three source columns are populated:
#non_null_rows_1 = df[df['Number of Customers Affected 1'].notnull() & df['Number of Customers Affected 1[1]'].notnull() & df['Number of Customers Affected'].notnull()]
#print(non_null_rows_1)

# Two spellings of the region column exist; one has a leading space.
df['nerc_region'] = df['NERC Region'].combine_first(df[' NERC Region'])
# Rows where both spellings are populated; computed but not used in this cell.
non_null_rows_2 = df[df['NERC Region'].notnull() & df[' NERC Region'].notnull()]

# Unparseable dates become NaT (errors='coerce').
df['date_start'] = pd.to_datetime(df['Date Event Began'].combine_first(df['Date']), errors='coerce')
# non_null_rows_3 = df[df['Date Event Began'].notnull() & df['Date'].notnull()]


# Start time is kept as read from the sheets; no parsing is applied here.
df['time_start'] = df['Time'].combine_first(df['Time Event Began'])
# non_null_rows_4 = df[df['Time'].notnull() & df['Time Event Began'].notnull()]

# Demand loss is kept as read from the sheets; no numeric conversion is applied here.
df['demand_loss_mw'] = df['Loss (megawatts)'].combine_first(df['Demand Loss (MW)'])
# non_null_rows_5 = df[df['Loss (megawatts)'].notnull() & df['Demand Loss (MW)'].notnull()]

df['date_restoration'] = pd.to_datetime(df['Restoration'].combine_first(df['Date of Restoration']), errors='coerce')
# non_null_rows_6 = df[df['Restoration'].notnull() & df['Date of Restoration'].notnull()]
# #print(non_null_rows_6)

df['event_type'] = df['Type of Disturbance'].combine_first(df['Event Type'])
# Strip any leading '=' and '-' characters from the event description.
df['event_type'] = df['event_type'].str.lstrip("=-")

# non_null_rows_7 = df[df['Type of Disturbance'].notnull() & df['Event Type'].notnull()]
# #print(non_null_rows_7)

# Remove the source columns now that their values live in the merged columns.
# 'Time of Restoration' and 'Alert Criteria' are dropped without being carried
# into any merged column.
df.drop(columns=['Number of Customers Affected 1[1]', 'Number of Customers Affected 1', 'Number of Customers Affected', 
    'NERC Region', ' NERC Region', 'Date', 'Date Event Began', 'Time', 'Time Event Began', 'Loss (megawatts)', 'Demand Loss (MW)',
    'Date of Restoration', 'Time of Restoration', 'Restoration', 'Event Type', 'Type of Disturbance', 'Alert Criteria'], 
    inplace=True)

# Discard rows in which every remaining column is empty.
df = df.dropna(how='all')



In [85]:
# Persist the cleaned OE-417 table (without the index) for the cells below.
# NOTE(review): `df` is rebound to an EAGLE-I frame by a later cell in this
# notebook, so run this cell directly after the OE-417 cleaning cell.
df.to_csv('../../data/power_outages/power_outages_cleaned.csv', index=False)

In [7]:
# Reload the cleaned OE-417 table, parsing the start and restoration columns
# as datetimes so the `.dt` accessors used below work.
df_clean = pd.read_csv('../../data/power_outages/power_outages_cleaned.csv', parse_dates=['date_start', 'date_restoration'])

In [8]:
# Keep OE-417 events that (a) are reported for the SERC region, (b) began in
# June through November, and (c) have a weather-like event description.
in_serc = df_clean['nerc_region'].str.strip().eq('SERC')
in_season = df_clean['date_start'].dt.month.between(6, 11)
storm_related = df_clean['event_type'].str.contains(
    'hurricane|cyclone|weather|wind', case=False, na=False)

hurricane_df = df_clean[in_serc & in_season & storm_related]

# Disabled alternative: exclude known non-weather causes instead of matching
# weather keywords.
# hurricane_df = hurricane_df[~hurricane_df['event_type'].str.contains(
#   'attack|vandalism|suspicious activity|cyber|load reduction|system operations|sabotage|general inadequacy|public appeal|fuel supply deficiency',
#   case=False, na=False)]
hurricane_df

Unnamed: 0,Area Affected,customers_affected,nerc_region,date_start,time_start,demand_loss_mw,date_restoration,event_type
24,Southern Company Territory,165798.0,SERC,2013-06-13,20:47:00,550,2013-06-14,Severe Weather - Thunderstorms
26,"Richmond Metro area, Virginia",283000.0,SERC,2013-06-13,16:08:00,900,2013-06-14,Severe Weather - Thunderstorms
27,Western Piedmont North Carolina,175000.0,SERC,2013-06-13,13:17:00,1000,2013-06-14,Severe Weather - Thunderstorms
28,Central and Eastern North Carolina,53000.0,SERC,2013-06-13,17:45:00,Unknown,2013-06-14,Severe Weather - Thunderstorms
60,"Central Missouri, Central Illinois",200000.0,SERC,2013-11-17,12:35:00,Unknown,2013-11-20,Severe Weather - Tornadoes
...,...,...,...,...,...,...,...,...
1824,North Carolina: South Carolina:,154100.0,SERC,2022-09-30,18:30:00,Unknown,2022-10-01,Severe Weather
1825,South Carolina:,108930.0,SERC,2022-09-30,13:46:00,Unknown,2022-09-30,Severe Weather
1826,North Carolina:,48323.0,SERC,2022-09-30,02:45:00,Unknown,2022-10-01,Severe Weather
1835,"Florida: Alachua County, Bay County, Citrus Co...",160000.0,SERC,2022-11-10,06:00:00,Unknown,2022-11-11,Severe Weather


In [9]:
# Tag every event with the calendar year in which it began (`date_start` is a
# datetime column). `hurricane_df` comes from boolean filtering, so a new frame
# is built with `assign` rather than writing a column into the slice, which
# would raise pandas' SettingWithCopyWarning.
hurricane_df = hurricane_df.assign(year=hurricane_df['date_start'].dt.year)

# Count the number of records per year
records_per_year = hurricane_df.groupby('year').size()

# Bar chart of event counts by start year.
fig = px.bar(records_per_year, x=records_per_year.index, y=records_per_year.values, labels={'x':'Year', 'y':'Number of Records'})
fig.show()

In [10]:
def remove_small_eaglei_outages(eaglei_raw, min_customers=500, column='sum', rename_to=None):
    """Keep only the EAGLE-I rows whose outage size reaches a minimum.

    Parameters
    ----------
    eaglei_raw : pandas.DataFrame
        Raw EAGLE-I records; must contain ``column``.
    min_customers : int, default 500
        Smallest value of ``column`` that is kept (inclusive).
    column : str, default 'sum'
        Name of the column holding the number of customers without power.
    rename_to : str or None, default None
        When given, ``column`` is renamed to this name in the result. This
        replaces the previous "uncomment the rename line" step for files that
        need a different column name (e.g. ``'customers_out'``).

    Returns
    -------
    pandas.DataFrame
        The rows with ``column >= min_customers``; the input is not modified.
    """
    kept = eaglei_raw[eaglei_raw[column] >= min_customers]
    if rename_to is not None:
        kept = kept.rename(columns={column: rename_to})
    return kept

In [72]:
# Load the cleaned 2023 EAGLE-I extract and display the smallest
# `customers_out` value as a sanity check on the size filter.
# NOTE(review): this rebinds `df`, which earlier held the OE-417 table, and it
# reads from '../../../data' while clean_eaglei reads the same file family
# from '../../data' -- confirm which relative root is correct.
df = pd.read_csv('../../../data/power_outages/eaglei/eaglei_outages_2023_cleaned.csv', parse_dates = ['run_start_time'])
# Disabled: re-apply the size filter and overwrite the cleaned file.
#df = remove_small_eaglei_outages(df)
df['customers_out'].min()
#df.to_csv('../../data/power_outages/eaglei/eaglei_outages_2023_cleaned.csv', index=False)

1000

In [70]:
def merge_oe417_eaglei(eaglei, oe417):
    """Attach OE-417 event details to the EAGLE-I county-day records they cover.

    Every OE-417 event is paired with every EAGLE-I county-day record. A pair
    is kept when the EAGLE-I date lies within the event's start/restoration
    dates (inclusive) and the record's state name occurs, case-insensitively,
    in the event's ``area_affected`` text.

    Parameters
    ----------
    eaglei : pandas.DataFrame
        Needs ``eaglei_date``, ``fips_code``, ``county``, ``state`` and
        ``customers_out``.
    oe417 : pandas.DataFrame
        Needs ``area_affected``, ``date_start``, ``date_restoration``,
        ``event_type`` and ``total_customers_affected``.

    Returns
    -------
    pandas.DataFrame
        At most one row per (``eaglei_date``, ``fips_code``) -- the first
        matching event in ``oe417`` order -- with the columns ``eaglei_date``,
        ``fips_code``, ``county``, ``state``, ``customers_out``,
        ``total_customers_affected``, ``area_affected`` and ``event_type``.
        Neither input frame is modified.
    """
    # how='cross' builds the cartesian product directly, so no temporary join
    # key has to be written into (and left behind in) the caller's frames.
    cross_joined_df = pd.merge(oe417, eaglei, how='cross')

    within_event_window = (
        (cross_joined_df['eaglei_date'] >= cross_joined_df['date_start']) &
        (cross_joined_df['eaglei_date'] <= cross_joined_df['date_restoration'])
    )

    # Plain-Python substring test per pair. Missing or non-text values count as
    # "no match" instead of raising on `.lower()`.
    # A county-level check (county name in area_affected) is deliberately not
    # applied; only the state has to be mentioned.
    state_in_area = pd.Series(
        [
            isinstance(state, str) and isinstance(area, str) and state.lower() in area.lower()
            for state, area in zip(cross_joined_df['state'], cross_joined_df['area_affected'])
        ],
        index=cross_joined_df.index,
        dtype=bool,
    )

    filtered_df = cross_joined_df[within_event_window & state_in_area]

    # Select the output columns from the de-duplicated frame; selecting from
    # `filtered_df` here would silently throw the de-duplication away.
    filtered_deduped_df = filtered_df.drop_duplicates(subset=['eaglei_date', 'fips_code'])
    filtered_deduped_df = filtered_deduped_df[['eaglei_date', 'fips_code', 'county', 'state', 'customers_out', 'total_customers_affected', 'area_affected', 'event_type']]

    return filtered_deduped_df


In [71]:
def clean_eaglei(year):
    """Load one year of cleaned EAGLE-I data and reduce it to county-day peaks.

    Reads ``eaglei_outages_{year}_cleaned.csv``, derives a calendar date from
    ``run_start_time`` and keeps, per (``fips_code``, ``eaglei_date``), the
    maximum ``customers_out`` together with the first ``county`` and ``state``
    seen. The result is ordered by ``customers_out`` descending; ``eaglei_id``
    holds each row's position before that sort.
    """
    source_path = f'../../data/power_outages/eaglei/eaglei_outages_{year}_cleaned.csv'
    readings = pd.read_csv(source_path, parse_dates = ['run_start_time'])
    readings['eaglei_date'] = readings['run_start_time'].dt.date

    peak_spec = {
        'customers_out': 'max',
        'county': 'first',
        'state': 'first'
    }
    county_days = (
        readings
        .groupby(['fips_code', 'eaglei_date'])
        .agg(peak_spec)
        .reset_index()
        .sort_values(by='customers_out', ascending = False)
        .reset_index()
        .rename(columns={'index': 'eaglei_id'})
    )
    return county_days.drop_duplicates(subset=['eaglei_date', 'fips_code'])

def clean_oe417(year):
    """Load the cleaned OE-417 events that began in ``year``.

    Reads ``power_outages_cleaned.csv``, keeps the events whose ``date_start``
    falls in ``year``, converts the start and restoration timestamps to plain
    dates, and renames the columns to the names the merge step expects.

    Returns
    -------
    pandas.DataFrame
        Columns ``area_affected``, ``date_start``, ``date_restoration``,
        ``event_type`` and ``total_customers_affected``, with exact duplicate
        rows removed.
    """
    events = pd.read_csv('../../data/power_outages/power_outages_cleaned.csv', parse_dates = ['date_start', 'date_restoration'])

    # .loc + assign builds a fresh frame, so no column is written into a
    # filtered slice (avoids SettingWithCopyWarning).
    oe417 = (
        events
        .loc[events['date_start'].dt.year == year]
        .assign(
            date_start=lambda frame: frame['date_start'].dt.date,
            date_restoration=lambda frame: frame['date_restoration'].dt.date,
        )
        .rename(columns={'Area Affected': 'area_affected', 'customers_affected': 'total_customers_affected'})
    )
    oe417 = oe417[['area_affected', 'date_start', 'date_restoration', 'event_type', 'total_customers_affected']]

    # The de-duplicated frame is what gets returned; it was previously bound
    # to a misspelled name (`eo417`) and discarded.
    return oe417.drop_duplicates()

In [88]:
# Match EAGLE-I county-day records to OE-417 events one year at a time
# (2015 through 2023), then stack the yearly results and write them out.
# The per-year frames are collected in a list and concatenated once, rather
# than growing a DataFrame inside the loop starting from an empty frame.
yearly_matches = []

for year in range(2015, 2024):
    print(f'Processing year {year}')
    cleaned_eaglei = clean_eaglei(year)
    cleaned_oe417 = clean_oe417(year)
    yearly_matches.append(merge_oe417_eaglei(cleaned_eaglei, cleaned_oe417))

merged = pd.concat(yearly_matches, ignore_index=True)

# Keep one row per county and date across all years.
merged_deduped = merged.drop_duplicates(subset=['eaglei_date', 'fips_code'])

merged_deduped.to_csv('../../data/power_outages/oe417_eaglei_merge.csv', index=False)

Processing year 2015
Processing year 2016
Processing year 2017
Processing year 2018
Processing year 2019
Processing year 2020
Processing year 2021
Processing year 2022
Processing year 2023


In [6]:
# Load the county-day / OE-417 match table and keep the weather-related rows
# that fall in hurricane season (June through November).
# NOTE(review): this reads from '../../../data' while the cell that writes
# oe417_eaglei_merge.csv uses '../../data' -- confirm the relative root.
power_outages_by_county = pd.read_csv('../../../data/power_outages/oe417_eaglei_merge.csv', parse_dates=['eaglei_date'])

power_outages_by_county['year'] = power_outages_by_county['eaglei_date'].dt.year
power_outages_hurricane_season = power_outages_by_county[(power_outages_by_county['eaglei_date'].dt.month >= 6) & (power_outages_by_county['eaglei_date'].dt.month <= 11)]

search_strings = ['weather', 'natural disaster', 'wind']

# Case-insensitive match on any search string. na=False makes rows without an
# event_type count as "no match"; a NaN inside the mask would make the boolean
# indexing below raise.
mask = power_outages_hurricane_season['event_type'].str.contains('|'.join(search_strings), case=False, na=False)

# Per county and year, sum every numeric column. The earlier call passed a
# column list positionally, which pandas read as a truthy `numeric_only`; the
# flag is now spelled out, with the same result.
weather_outage_stats_annual = power_outages_hurricane_season[mask].groupby(['fips_code', 'year', 'state', 'county']).sum(numeric_only=True).reset_index()
# reset_index() keeps the old row labels in an 'index' column.
weather_outage_stats = power_outages_hurricane_season[mask].reset_index()
print(weather_outage_stats)

       index eaglei_date  fips_code      county       state  customers_out  \
0        590  2015-06-24      34007      Camden  New Jersey        93824.0   
1        591  2015-06-23      34007      Camden  New Jersey        86758.0   
2        592  2015-06-24      34015  Gloucester  New Jersey        81657.0   
3        593  2015-06-25      34015  Gloucester  New Jersey        76830.0   
4        594  2015-06-23      34009    Cape May  New Jersey        76550.0   
...      ...         ...        ...         ...         ...            ...   
14007  28724  2023-10-05      48499        Wood       Texas         1029.0   
14008  28725  2023-10-04      48037       Bowie       Texas         1019.0   
14009  28726  2023-10-04      48053      Burnet       Texas         1016.0   
14010  28727  2023-10-05      48053      Burnet       Texas         1016.0   
14011  28728  2023-10-06      48289        Leon       Texas         1002.0   

       total_customers_affected  \
0                      26300

In [20]:
# County-level stats for hurricane-season, weather-related power outages.
# Produces the CSV behind the Flourish map.

# Year span of the data; only needed by the disabled annual-average column below.
first_year = weather_outage_stats['year'].min()
last_year = weather_outage_stats['year'].max()

# relevant states
coastal_states = ['Texas', 'Louisiana', 'Florida', 'Alabama', 'Mississippi', 'Georgia', 'South Carolina', 'North Carolina', 'Virginia', 'Maryland', 'Delaware', 'New Jersey', 'New York', 'Connecticut', 'Rhode Island', 'Massachusetts', 'New Hampshire', 'Maine']
coastal_bordering_states = ['Arkansas', 'Oklahoma', 'Tennessee', 'Kentucky', 'West Virginia', 'Pennsylvania', 'Vermont', 'New Mexico']
#outage_stats_coastal = weather_outage_stats[weather_outage_stats['state'].isin(coastal_states)]

# add in population
# combined_fips = 2-digit state code + 3-digit county code, both zero-padded.
population_df = pd.read_csv('../../../data/us_census/county_populations.csv')
population_df['combined_fips'] = population_df['state_fips'].astype(str).str.zfill(2) + population_df['county_fips'].astype(str).str.zfill(3)
# Same 5-character text form on the outage side so the two tables can be joined.
weather_outage_stats['fips_code'] = weather_outage_stats['fips_code'].astype(str).str.zfill(5)

# Count the rows per county; used as the number of days with an outage.
days_with_outages_by_county = weather_outage_stats.groupby(['fips_code']).size().reset_index(name='count')

# Outer merge with population keeps counties that appear on only one side.
days_with_outage_all_counties = population_df.merge(days_with_outages_by_county[['fips_code', 'count']], left_on='combined_fips', right_on='fips_code', how='outer')

# Counties without outage rows get 0; counties outside the coastal and
# coastal-bordering states get NaN so they are left out of the ranking.
# This is done in one float-valued step: casting to int first and then writing
# NaN into the column forces a silent int -> float upcast, which newer pandas
# versions deprecate.
states_on_map = coastal_states + coastal_bordering_states
days_with_outage_all_counties['count'] = (
    days_with_outage_all_counties['count']
    .fillna(0)
    .where(days_with_outage_all_counties['state_name'].isin(states_on_map))
)

# # add in number of outages per year
#days_with_outage_all_counties['average_annual_count'] = round(days_with_outage_all_counties['count'] / (last_year - first_year + 1), 1)
# Percentile rank (0-100) of each county's count; ties share the lowest rank.
days_with_outage_all_counties['count_percentile'] = round(days_with_outage_all_counties['count'].rank(pct=True, method='min') * 100, 0)
days_with_outage_all_counties.to_csv('../../../data/power_outages/final/county_outages_stats_for_plot.csv', index=False)


In [12]:
# # look at percent changes over time

# # Step 1: Filter the data for the two time periods
# outages_2015_2018 = weather_outage_stats_pop[(weather_outage_stats_pop['year'] >= 2015) & (weather_outage_stats_pop['year'] <= 2018)]
# outages_2020_2023 = weather_outage_stats_pop[(weather_outage_stats_pop['year'] >= 2020) & (weather_outage_stats_pop['year'] <= 2023)]

# # # Step 2: Group by county and count outages for each period
# counts_2015_2018 = outages_2015_2018.groupby(['combined_fips']).size().reset_index(name='count_2015_2018')
# counts_2020_2023 = outages_2020_2023.groupby(['combined_fips']).size().reset_index(name='count_2020_2023')

# counts_2015_2018['average_annual_count_2015_2018'] = round(counts_2015_2018['count_2015_2018'] / 4)
# counts_2020_2023['average_annual_count_2020_2023'] = round(counts_2020_2023['count_2020_2023'] / 4)

# # Extreme outages
# extreme_outages_2015_2018 = outages_2015_2018[outages_2015_2018['percent_population_affected'] > 25]
# extreme_outages_2020_2023 = outages_2020_2023[outages_2020_2023['percent_population_affected'] > 25]
# extreme_outages_counts_2015_2018 = extreme_outages_2015_2018.groupby(['combined_fips']).size().reset_index(name='extreme_count_2015_2018')
# extreme_outages_counts_2020_2023 = extreme_outages_2020_2023.groupby(['combined_fips']).size().reset_index(name='extreme_count_2020_2023')

# # # Step 3: Merge the counts for the two periods
# merged_counts = pd.merge(counts_2015_2018, counts_2020_2023, on=['combined_fips'], how='outer').fillna(0)
# extreme_merged_counts = pd.merge(extreme_outages_counts_2015_2018, extreme_outages_counts_2020_2023, on=['combined_fips'], how='outer').fillna(0)

# # # Step 4: Calculate change
# merged_counts['change'] = (merged_counts['count_2020_2023'] - merged_counts['count_2015_2018'])
# merged_counts['percent_increase'] = (merged_counts['change'] / merged_counts['count_2015_2018']) * 100
# merged_counts['average_annual_change'] = (merged_counts['average_annual_count_2020_2023'] - merged_counts['average_annual_count_2015_2018'])
# extreme_merged_counts['extreme_change'] = (extreme_merged_counts['extreme_count_2020_2023'] - extreme_merged_counts['extreme_count_2015_2018'])

# # merge both counts dfs
# combined_df = merged_counts.merge(extreme_merged_counts, on=['combined_fips'], how='outer').fillna(0)
# combined_df.head()

Unnamed: 0,combined_fips,count_2015_2018,average_annual_count_2015_2018,count_2020_2023,average_annual_count_2020_2023,change,percent_increase,average_annual_change,extreme_count_2015_2018,extreme_count_2020_2023,extreme_change
0,1001,2.0,0.0,7.0,2.0,5.0,250.0,2.0,0.0,0.0,0.0
1,1003,3.0,1.0,11.0,3.0,8.0,266.666667,2.0,0.0,5.0,5.0
2,1005,0.0,0.0,1.0,0.0,1.0,inf,0.0,0.0,0.0,0.0
3,1007,2.0,0.0,3.0,1.0,1.0,50.0,1.0,0.0,1.0,1.0
4,1009,1.0,0.0,3.0,1.0,2.0,200.0,1.0,0.0,0.0,0.0


In [13]:
# df_for_map_plot = county_outages_stats.merge(combined_df, on=['combined_fips'], how='outer').fillna(0)
# df_for_map_plot = df_for_map_plot[[
#     'combined_fips', 'state_name', 'county_name', 'count', 'count_2015_2018', 'count_2020_2023',  'change',  'count_percentile', 'percent_increase'
#     ]]
# df_for_map_plot = df_for_map_plot.astype({'change': 'int'})

# #df_for_map_plot.head()
# df_for_map_plot.to_csv('../../data/power_outages/county_outages_stats_for_plot.csv', index=False)

In [None]:
# Table of hand-picked counties (by 5-digit FIPS) to call out next to the map.
top_10_counties_fips = ['48201', '48339', '48113', '48439', '48245', '48029', '48453', '12086', '25017', '25027']

# NOTE(review): 'percent_increase' is not created by the cell that builds
# `days_with_outage_all_counties`; in this notebook it only appears in
# commented-out cells. On a fresh kernel this selection raises KeyError until
# that column is restored or removed from the list.
highlight_columns = ['combined_fips', 'state_name', 'county_name', 'count', 'percent_increase', 'count_percentile']

# Filter, select and sort in one chain so nothing is sorted in place on a
# slice of the source frame (avoids SettingWithCopyWarning).
counties_to_highlight_table = (
    days_with_outage_all_counties
    .loc[days_with_outage_all_counties['combined_fips'].isin(top_10_counties_fips), highlight_columns]
    .sort_values(by='count', ascending=False)
)

print(counties_to_highlight_table)

     combined_fips     state_name        county_name  count  percent_increase  \
924          48201          Texas      Harris County   62.0        900.000000   
993          48339          Texas  Montgomery County   62.0         77.777778   
880          48113          Texas      Dallas County   58.0       3600.000000   
1043         48439          Texas     Tarrant County   53.0       3600.000000   
946          48245          Texas   Jefferson County   45.0         10.000000   
838          48029          Texas       Bexar County   44.0         12.500000   
1050         48453          Texas      Travis County   41.0         58.333333   
129          12086        Florida  Miami-Dade County   38.0        -19.047619   
425          25017  Massachusetts   Middlesex County   38.0         20.000000   
430          25027  Massachusetts   Worcester County   37.0         66.666667   

      count_percentile  
924              100.0  
993              100.0  
880              100.0  
1043    

In [42]:

# Compare, per coastal state, the total customers_out in the first four years
# of data with the total in the last four years.
coastal_outage_stats = weather_outage_stats.loc[weather_outage_stats['state'].isin(coastal_states)]

unique_years = sorted(coastal_outage_stats['year'].unique())

# First 4 and last 4 years present in the data
first_4_years, last_4_years = unique_years[:4], unique_years[-4:]
print(first_4_years, last_4_years)


def _customers_out_total(frame, years):
    """Sum `customers_out` over the rows of `frame` whose year is in `years`."""
    return frame.loc[frame['year'].isin(years), 'customers_out'].sum()


data = []
for state in coastal_states:
    state_rows = coastal_outage_stats.loc[coastal_outage_stats['state'] == state]
    early_total = _customers_out_total(state_rows, first_4_years)
    late_total = _customers_out_total(state_rows, last_4_years)

    # No percentage when the early period has no outages (division by zero).
    percentage_increase = (
        ((late_total - early_total) / early_total) * 100 if early_total > 0 else None
    )

    # The 'first_5_mean' / 'last_5_mean' keys hold 4-year sums.
    data.append({
        'state': state,
        'first_5_mean': early_total,
        'last_5_mean': late_total,
        'percentage_increase': percentage_increase
    })

state_outage_summary = pd.DataFrame(data)
state_outage_summary_ordered = state_outage_summary.sort_values(by='percentage_increase', ascending=False)

# Display the DataFrame
print(state_outage_summary_ordered)

[2015, 2016, 2017, 2018] [2020, 2021, 2022, 2023]
             state  first_5_mean  last_5_mean  percentage_increase
1        Louisiana       70793.0   11612585.0         16303.578037
4      Mississippi       37826.0     850724.0          2149.045630
3          Alabama      437891.0    2548439.0           481.980219
0            Texas     1004924.0    5710374.0           468.239389
14     Connecticut      609247.0    3011579.0           394.311667
11      New Jersey     1156551.0    4648067.0           301.890362
8         Virginia      398737.0    1569710.0           293.670515
16   Massachusetts     1064206.0    3152792.0           196.257679
15    Rhode Island      620883.0     824596.0            32.810207
10        Delaware       98938.0     108128.0             9.288645
6   South Carolina     1297980.0    1117989.0           -13.867009
5          Georgia     5406456.0    2762443.0           -48.904735
19           Maine     1719315.0     776251.0           -54.851147
7   North Ca

In [36]:
# texas_total = county_counts_sorted[county_counts_sorted['state'] == 'Texas']['Count'].sum()
# florida_total = county_counts_sorted[county_counts_sorted['state'] == 'Florida']['Count'].sum()
# louisiana_total = county_counts_sorted[county_counts_sorted['state'] == 'Louisiana']['Count'].sum()
# michigan_total = county_counts_sorted[county_counts_sorted['state'] == 'Michigan']['Count'].sum()
# georgia_total = county_counts_sorted[county_counts_sorted['state'] == 'Georgia']['Count'].sum()
# north_carolina_total = county_counts_sorted[county_counts_sorted['state'] == 'North Carolina']['Count'].sum()
# south_carolina_total = county_counts_sorted[county_counts_sorted['state'] == 'South Carolina']['Count'].sum()
# virginia_total = county_counts_sorted[county_counts_sorted['state'] == 'Virginia']['Count'].sum()
# california_total = county_counts_sorted[county_counts_sorted['state'] == 'California']['Count'].sum()
# alabama_total = county_counts_sorted[county_counts_sorted['state'] == 'Alabama']['Count'].sum()
# mississippi_total = county_counts_sorted[county_counts_sorted['state'] == 'Mississippi']['Count'].sum()
# total = county_counts_sorted['Count'].sum()


# texas_pct = texas_total / total
# florida_pct = florida_total / total
# louisiana_pct = louisiana_total / total
# michigan_pct = michigan_total / total
# georgia_pct = georgia_total / total
# north_carolina_pct = north_carolina_total / total
# south_carolina_pct = south_carolina_total / total
# virginia_pct = virginia_total / total
# california_pct = california_total / total
# alabama_pct = alabama_total / total
# mississippi_pct = mississippi_total / total


# print(f'Texas: {texas_total} ({texas_pct:.2%})')
# print(f'Florida: {florida_total} ({florida_pct:.2%})')
# print(f'Michigan: {michigan_total} ({michigan_pct:.2%})')
# print(f'Louisiana: {louisiana_total} ({louisiana_pct:.2%})')
# print(f'Georgia: {georgia_total} ({georgia_pct:.2%})')
# print(f'North Carolina: {north_carolina_total} ({north_carolina_pct:.2%})')
# print(f'South Carolina: {south_carolina_total} ({south_carolina_pct:.2%})')
# print(f'Virginia: {virginia_total} ({virginia_pct:.2%})')
# print(f'California: {california_total} ({california_pct:.2%})')
# print(f'Alabama: {alabama_total} ({alabama_pct:.2%})')
# print(f'Mississippi: {mississippi_total} ({mississippi_pct:.2%})')




Texas: 1350 (14.28%)
Florida: 380 (4.02%)
Michigan: 1155 (12.22%)
Louisiana: 688 (7.28%)
Georgia: 427 (4.52%)
North Carolina: 434 (4.59%)
South Carolina: 177 (1.87%)
Virginia: 345 (3.65%)
California: 876 (9.27%)
Alabama: 239 (2.53%)
Mississippi: 105 (1.11%)


In [67]:
# MDK coming back to this 8/30.
# Mean number of weather-outage rows per year (2019 onward) for each Texas
# county, written out for plotting.

outages_df = pd.read_csv('../../../data/power_outages/weather_outage_stats.csv', parse_dates=['eaglei_date'])
population_df = pd.read_csv('../../../data/us_census/county_populations.csv')
# 5-character FIPS text on both sides so the tables can be joined.
population_df['combined_fips'] = population_df['state_fips'].astype(str).str.zfill(2) + population_df['county_fips'].astype(str).str.zfill(3)
outages_df['fips_code'] = outages_df['fips_code'].astype(str).str.zfill(5)

last_5_years = outages_df[outages_df['year'] >= 2019]
yearly_counts = last_5_years.groupby(['county', 'fips_code', 'year']).size().reset_index(name='count')

# Outer merge keeps counties that have no outage rows at all.
yearly_counts_all_counties = population_df.merge(yearly_counts[['fips_code', 'count', 'year']], left_on='combined_fips', right_on='fips_code', how='outer')

# Counties without outage rows get 0 and non-Texas rows get NaN, in one
# float-valued step. Casting to int first and then writing NaN into the column
# forces a silent int -> float upcast, which newer pandas versions deprecate.
yearly_counts_all_counties['count'] = (
    yearly_counts_all_counties['count']
    .fillna(0)
    .where(yearly_counts_all_counties['state_name'] == 'Texas')
)

# NOTE(review): a county only has rows for years in which it had outages, so
# this mean is taken over those years rather than over every year since 2019.
# Confirm that is the intended statistic.
yearly_counts_mean = yearly_counts_all_counties.groupby(['state_name', 'county_name', 'combined_fips']).agg({'count': 'mean'}).reset_index()
yearly_counts_mean.to_csv('../../../data/power_outages/texas_outages.csv', index=False)


In [52]:
# Rows per year in `texas_outages`, and total customers_out per year in
# `houston_outages`.
# NOTE(review): neither `texas_outages` nor `houston_outages` is defined in the
# cells visible here; this cell depends on kernel state from elsewhere and will
# fail on a fresh Restart & Run All unless they are created first.
yearly_counts = texas_outages.groupby('year').size().reset_index(name='count')
# Computed but not used in this cell.
yearly_customers_out = houston_outages.groupby('year')['customers_out'].sum().reset_index()

# Create a bar plot with Plotly
fig = px.bar(yearly_counts, x='year', y='count', title='Count of Rows per Year')

# Show the plot
fig.show()

In [53]:
# Average yearly number of outage rows per county in `houston_outages`, using
# 2019 onward, saved for plotting.
# Disabled earlier exploration:
# last_5_years_mean = yearly_counts['count'].tail().mean()
# all_years_mean = yearly_counts['count'].mean()
# print(last_5_years_mean)
# print(all_years_mean)
# houston_outages.to_csv('../../../data/power_outages/houston_outages.csv', index=False)
last_5_years = houston_outages.loc[houston_outages['year'] >= 2019]

# Rows per county and year ...
yearly_counts = (
    last_5_years
    .groupby(['county', 'fips_code', 'year'])
    .size()
    .reset_index(name='count')
)

# ... averaged over the years in which the county appears.
per_county_counts = yearly_counts.groupby(['county', 'fips_code'])['count']
mean_counts = per_county_counts.mean().reset_index(name='mean_count')

mean_counts.to_csv('../../../data/power_outages/houston_outages_to_plot.csv', index=False)