<a href="https://colab.research.google.com/github/nosejohn/eagle-i/blob/main/EAGLE2020.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Load the 2020 EAGLE-I outage CSV, parse its timestamps and sort it.
import pandas as pd

eagle2020 = (
    pd.read_csv(
        '/content/drive/MyDrive/eaglei/eaglei_outages_2020.csv',
        index_col=False,
    )
    # Turn the run_start_time column into real datetimes
    .assign(run_start_time=lambda frame: pd.to_datetime(frame['run_start_time']))
    # Order rows by fips_code, then by run_start_time within each fips_code
    .sort_values(by=['fips_code', 'run_start_time'])
)
eagle2020.head()

def process_group(group, max_gap_minutes=16):
    """Label every row of ``group`` with the start/end time of its outage.

    Rows are walked in their existing order. A new outage starts at the
    first row, whenever the gap to the previous row's ``run_start_time``
    exceeds ``max_gap_minutes``, or whenever ``sum`` differs from the
    previous row's ``sum``.

    The input frame is NOT modified: pandas documents that functions passed
    to ``groupby(...).apply`` must not mutate the group they receive, so all
    intermediate values are kept in local Series and a new frame is returned.

    Parameters
    ----------
    group : pandas.DataFrame
        Must contain the columns ``fips_code``, ``county``, ``state``,
        ``sum`` and ``run_start_time`` (anything ``pd.to_datetime`` accepts).
    max_gap_minutes : float, default 16
        Largest gap, in minutes, between consecutive rows that is still
        treated as the same outage.

    Returns
    -------
    pandas.DataFrame
        One row per input row, indexed like ``group``, with the columns
        ``fips_code``, ``county``, ``state``, ``sum``, ``start_time`` and
        ``end_time``.
    """
    # Parse timestamps into a local Series instead of overwriting the column
    timestamps = pd.to_datetime(group['run_start_time'])

    # Minutes elapsed since the previous row (NaN for the first row)
    gap_minutes = timestamps.diff().dt.total_seconds() / 60.0

    # A row opens a new outage on a long gap, a change in `sum`, or when it
    # is the first row of the group (no previous row to diff against)
    customers = group['sum']
    starts_outage = (
        (gap_minutes > max_gap_minutes)
        | (customers != customers.shift())
        | gap_minutes.isnull()
    )

    # Running count of outage starts gives each outage a per-group number
    outage_id = group['fips_code'].astype(str) + '_' + starts_outage.cumsum().astype(str)

    # First and last timestamp of each outage, broadcast back to every row
    per_outage = timestamps.groupby(outage_id)
    labelled = group[['fips_code', 'county', 'state', 'sum']].copy()
    labelled['start_time'] = per_outage.transform('first')
    labelled['end_time'] = per_outage.transform('last')

    return labelled


In [None]:
# Number of rows loaded into eagle2020
eagle2020.shape[0]

25545517

In [None]:
# Run process_group over every (fips_code, county, state, sum) group,
# showing a tqdm progress bar while it works.
from tqdm.auto import tqdm
tqdm.pandas()

# group_keys=False is passed explicitly: process_group returns a frame indexed
# like its input, and pandas warns that the default handling of group keys for
# such results is changing. This keeps the result free of prepended group keys
# on every pandas version; the index is then discarded by reset_index anyway.
aggregated2020 = (
    eagle2020
    .groupby(['fips_code', 'county', 'state', 'sum'], group_keys=False)
    .progress_apply(process_group)
    .reset_index(drop=True)
)

print(aggregated2020)

  0%|          | 0/1117693 [00:00<?, ?it/s]

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  return getattr(df, df_function)(wrapper, **kwargs)


          fips_code      county              state     sum  \
0              1001     Autauga            Alabama     4.0   
1              1001     Autauga            Alabama     2.0   
2              1001     Autauga            Alabama     2.0   
3              1001     Autauga            Alabama     2.0   
4              1001     Autauga            Alabama     4.0   
...             ...         ...                ...     ...   
23443992      78030  St. Thomas  US Virgin Islands  1213.0   
23443993      78030  St. Thomas  US Virgin Islands  1213.0   
23443994      78030  St. Thomas  US Virgin Islands  1213.0   
23443995      78030  St. Thomas  US Virgin Islands  1213.0   
23443996      78030  St. Thomas  US Virgin Islands  1213.0   

                  start_time            end_time  
0        2020-01-02 15:45:00 2020-01-02 15:45:00  
1        2020-01-02 16:30:00 2020-01-02 17:00:00  
2        2020-01-02 16:30:00 2020-01-02 17:00:00  
3        2020-01-02 16:30:00 2020-01-02 17:00:00  


In [None]:
# Outage duration in minutes: the span from start_time to end_time plus 15
# minutes (presumably one sampling interval, so a single-reading outage is
# not counted as zero-length -- TODO confirm against the data's cadence).
outage_span = aggregated2020['end_time'] - aggregated2020['start_time']
padded_span = outage_span + pd.Timedelta(minutes=15)
aggregated2020['duration'] = padded_span.dt.total_seconds() / 60


In [None]:
# aggregated2020 holds one row per original reading, so every outage appears
# once per reading with identical values; collapse those to one row each.
# fips_code is part of the key because it is what the aggregation below
# groups on: without it, two different fips_code values that share a county
# and state name and have an identical (sum, start_time, end_time) row would
# be merged into one.
df_unique = aggregated2020.drop_duplicates(
    subset=['fips_code', 'county', 'state', 'sum', 'start_time', 'end_time']
)

# Per (fips_code, state, county): total of `sum` and total of `duration`
# over the de-duplicated rows. Requires the `duration` column to exist.
grouped = (
    df_unique
    .groupby(['fips_code', 'state', 'county'])
    .agg({'sum': 'sum', 'duration': 'sum'})
    .reset_index()
)


In [None]:
# Write the per-county totals to a CSV file in the current working directory
# (the DataFrame index is written as the first, unnamed column).
grouped.to_csv(path_or_buf='aggregated_eaglei_2020.csv')