<a href="https://colab.research.google.com/github/nosejohn/eagle-i/blob/main/EAGLE2019.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# importing necessary datasets
import pandas as pd
eagle2019 = pd.read_csv('eaglei_outages_2019.csv', index_col = False)

# Convert run_start_time to datetime
eagle2019['run_start_time'] = pd.to_datetime(eagle2019['run_start_time'])

# Sort the data
eagle2019.sort_values(by=['fips_code', 'run_start_time'], inplace=True)
eagle2019.head()

def process_group(group):
    # Ensure 'run_start_time' is in datetime format
    group['run_start_time'] = pd.to_datetime(group['run_start_time'])

    # Calculate time differences between consecutive rows in minutes
    group['time_diff'] = group['run_start_time'].diff().dt.total_seconds() / 60.0

    # Mark rows as new outage based on time difference and sum change
    group['new_outage'] = (group['time_diff'] > 16) | (group['sum'] != group['sum'].shift()) | group['time_diff'].isnull()
    group['outage_id'] = group['fips_code'].astype(str) + '_' + group['new_outage'].cumsum().astype(str)

    # Determine start and end times for each outage
    group['start_time'] = group.groupby('outage_id')['run_start_time'].transform('first')
    group['end_time'] = group.groupby('outage_id')['run_start_time'].transform('last')

    return group[['fips_code', 'county', 'state', 'sum', 'start_time', 'end_time']]


In [3]:
import re

date_pattern = re.compile(r"\d{4}-\d{2}-\d{2}")

# Find rows with invalid 'run_start_time' formats
invalid_dates = eagle2019[~eagle2019['run_start_time'].astype(str).str.match(date_pattern)]

# Inspect invalid dates to decide on correction
print(invalid_dates)

          fips_code  county     state   sum run_start_time
13110790      17133  Monroe  Illinois  14.0             20


In [8]:
print(eagle2019[(eagle2019['county'] == 'Monroe') & (eagle2019['state'] == 'Illinois') & (eagle2019['sum'] == 14)])

          fips_code  county     state   sum      run_start_time
3251849       17133  Monroe  Illinois  14.0 2019-02-26 23:45:00
3252681       17133  Monroe  Illinois  14.0 2019-02-27 00:00:00
8648803       17133  Monroe  Illinois  14.0 2019-05-23 12:00:00
8649553       17133  Monroe  Illinois  14.0 2019-05-23 12:15:00
8650328       17133  Monroe  Illinois  14.0 2019-05-23 12:30:00
8651171       17133  Monroe  Illinois  14.0 2019-05-23 12:45:00
8652067       17133  Monroe  Illinois  14.0 2019-05-23 13:00:00
8653002       17133  Monroe  Illinois  14.0 2019-05-23 13:15:00
8653982       17133  Monroe  Illinois  14.0 2019-05-23 13:30:00
8655036       17133  Monroe  Illinois  14.0 2019-05-23 13:45:00
8656132       17133  Monroe  Illinois  14.0 2019-05-23 14:00:00
8657274       17133  Monroe  Illinois  14.0 2019-05-23 14:15:00
8658437       17133  Monroe  Illinois  14.0 2019-05-23 14:30:00
8659610       17133  Monroe  Illinois  14.0 2019-05-23 14:45:00
8660799       17133  Monroe  Illinois  1

In [None]:
eagle2019 = eagle2019.drop(eagle2019['run_start_time'] == 20)


In [10]:
# Apply the function to each group
aggregated2019 = eagle2019.groupby(['fips_code', 'county', 'state', 'sum']).apply(process_group).reset_index(drop=True)

print(aggregated2019)

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  aggregated2019 = eagle2019.groupby(['fips_code', 'county', 'state', 'sum']).apply(process_group).reset_index(drop=True)


          fips_code      county              state   sum          start_time  \
0              1001     Autauga            Alabama  10.0 2019-01-01 00:00:00   
1              1001     Autauga            Alabama  10.0 2019-01-01 00:00:00   
2              1001     Autauga            Alabama  10.0 2019-01-01 00:00:00   
3              1001     Autauga            Alabama  10.0 2019-01-01 00:00:00   
4              1001     Autauga            Alabama  10.0 2019-01-01 00:00:00   
...             ...         ...                ...   ...                 ...   
18399553      78030  St. Thomas  US Virgin Islands  85.0 2019-10-16 17:30:00   
18399554      78030  St. Thomas  US Virgin Islands  85.0 2019-10-16 17:30:00   
18399555      78030  St. Thomas  US Virgin Islands  85.0 2019-10-16 17:30:00   
18399556      78030  St. Thomas  US Virgin Islands  85.0 2019-10-16 17:30:00   
18399557      78030  St. Thomas  US Virgin Islands  85.0 2019-10-16 17:30:00   

                    end_time  
0       

In [11]:
aggregated2019['duration'] = aggregated2019['end_time'] - aggregated2019['start_time'] + pd.Timedelta(minutes=15)
aggregated2019['duration'] = aggregated2019['duration'].dt.total_seconds() / 60


In [12]:
aggregated2019.to_csv('aggregated_eaglei_2019.csv')