In [31]:
import pandas as pd
from tqdm import tqdm

### Data Loading

In [19]:
# Folders that hold the raw eaglei outage CSVs and the NOAA storm-event CSVs.
elec_path = 'data/eaglei_data/'
storm_path = 'data/NOAA_StormEvents/'

# Read the 2015 file of each dataset into its own DataFrame.
eaglei_2015 = pd.read_csv(f'{elec_path}eaglei_outages_2015.csv')
Storms_2015 = pd.read_csv(f'{storm_path}StormEvents_details-ftp_v1.0_d2015_c20240716.csv')

### Concatenate 'STATE_FIPS' and 'CZ_FIPS' into "FIPS" in Storms

In [None]:
def keep_c_storm(Storms_2015):
    """Return only the storm events whose ``CZ_TYPE`` equals ``'C'``.

    Parameters
    ----------
    Storms_2015 : pandas.DataFrame
        Storm-event table containing a ``CZ_TYPE`` column.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame with the matching rows (original index
        labels are kept). The input frame is not modified.
    """
    is_type_c = Storms_2015['CZ_TYPE'] == 'C'
    # Return an explicit copy: later steps add columns to this result, and
    # assigning onto a bare boolean-mask slice raises SettingWithCopyWarning.
    return Storms_2015.loc[is_type_c].copy()

def make_fips_storm(storms):
    """Build a ``FIPS`` string column from ``STATE_FIPS`` and ``CZ_FIPS``.

    ``STATE_FIPS`` is rendered as text zero-padded to 2 characters and
    ``CZ_FIPS`` to 3 characters; ``FIPS`` is their concatenation.

    Parameters
    ----------
    storms : pandas.DataFrame
        Table containing ``STATE_FIPS`` and ``CZ_FIPS`` columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``storms`` in which ``STATE_FIPS`` and ``CZ_FIPS`` are
        zero-padded strings and a new ``FIPS`` column holds both joined.
        The input frame is left untouched.
    """
    # Work on a copy so the caller's frame (possibly a filtered slice) is not
    # modified as a side effect of building the new columns.
    storms = storms.copy()

    def _zero_padded(codes, width):
        # A float-typed code column renders 1 as "1.0", which would break the
        # zero padding. Strip that trailing ".0" before padding.
        as_text = codes.astype(str).str.replace(r'\.0$', '', regex=True)
        return as_text.str.zfill(width)

    storms['STATE_FIPS'] = _zero_padded(storms['STATE_FIPS'], 2)
    storms['CZ_FIPS'] = _zero_padded(storms['CZ_FIPS'], 3)
    storms['FIPS'] = storms['STATE_FIPS'] + storms['CZ_FIPS']
    return storms

# Storms_2015=keep_c_storm(Storms_2015)
# Storms_2015=make_fips_storm(Storms_2015)


### Combine 'BEGIN_YEARMONTH', 'BEGIN_DAY', 'BEGIN_TIME' into "BEGIN" (and the matching 'END_*' columns into "END")

In [33]:
def convert_to_datetime(df):
    """Add ``BEGIN`` and ``END`` datetime columns built from split date parts.

    ``BEGIN`` is assembled from ``BEGIN_YEARMONTH`` (YYYYMM), ``BEGIN_DAY`` and
    ``BEGIN_TIME`` (HHMM, zero-padded to four digits); ``END`` is assembled the
    same way from the ``END_*`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the six ``BEGIN_*`` / ``END_*`` part columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the two datetime columns added. The input frame
        is left untouched.

    Raises
    ------
    ValueError
        If an assembled value does not match ``YYYY-MM-DD HH:MM``.
    """
    def convert_datetime(yearmonth, day, time):
        # Stringify each part once instead of once per slice.
        yearmonth_text = yearmonth.astype(str)
        day_text = day.astype(str).str.zfill(2)
        hhmm_text = time.astype(str).str.zfill(4)

        stamp = (
            yearmonth_text.str[:4] + '-'      # Year
            + yearmonth_text.str[4:6] + '-'   # Month
            + day_text + ' '                  # Day
            + hhmm_text.str[:2] + ':'         # Hour
            + hhmm_text.str[2:]               # Minute
        )
        # An explicit format skips per-value format guessing and rejects
        # anything that is not exactly the layout assembled above.
        return pd.to_datetime(stamp, format='%Y-%m-%d %H:%M')

    # Work on a copy so the caller's frame is not modified as a side effect.
    df = df.copy()
    df['BEGIN'] = convert_datetime(df['BEGIN_YEARMONTH'], df['BEGIN_DAY'], df['BEGIN_TIME'])
    df['END'] = convert_datetime(df['END_YEARMONTH'], df['END_DAY'], df['END_TIME'])

    return df


### Matches storm events with outage records by time and location










In [34]:
def match_intervals_and_aggregate_full(df, eaglei_2015):
    """Attach aggregated outage figures to every storm event.

    For each row of ``df``, the rows of ``eaglei_2015`` that have the same
    county code (``fips_code`` equal to the event's ``FIPS``) and whose
    ``run_start_time`` lies in the closed interval ``[BEGIN, END]`` are
    aggregated.

    Parameters
    ----------
    df : pandas.DataFrame
        Storm events with ``FIPS``, ``BEGIN`` and ``END`` columns.
    eaglei_2015 : pandas.DataFrame
        Outage records with ``fips_code``, ``run_start_time`` and
        ``customers_out`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per input event, carrying the event's columns (``BEGIN`` and
        ``END`` as datetimes, ``FIPS`` as nullable integer) plus

        * ``customers_out_sum`` - sum of ``customers_out`` over matched rows
          (0 when nothing matched),
        * ``run_start_time_mean`` - mean ``run_start_time`` of matched rows
          (``NaT`` when nothing matched),
        * ``interval_count`` - number of matched rows.

        Neither input frame is modified. An empty ``df`` yields an empty
        frame that still has all of these columns.
    """
    # Normalise dtypes on a private copy so the caller's storm frame is not
    # changed as a side effect.
    events = df.copy()
    events['BEGIN'] = pd.to_datetime(events['BEGIN'])
    events['END'] = pd.to_datetime(events['END'])
    events['FIPS'] = pd.to_numeric(events['FIPS'], errors='coerce').astype('Int64')

    # Only three outage columns are needed. Selecting them first keeps the
    # working copy small and leaves the caller's outage frame untouched.
    outages = eaglei_2015[['fips_code', 'run_start_time', 'customers_out']].assign(
        fips_code=lambda o: pd.to_numeric(o['fips_code'], errors='coerce').astype('Int64'),
        run_start_time=lambda o: pd.to_datetime(o['run_start_time']),
    )

    # Split the outage records per county once, rather than per event.
    # Rows with a missing county code are dropped by groupby.
    outages_by_fips = {fips: part for fips, part in outages.groupby('fips_code')}

    records = []
    for _, event in events.iterrows():
        record = event.to_dict()
        # Defaults used when no outage record matches this event.
        record.update(
            customers_out_sum=0,
            run_start_time_mean=pd.NaT,
            interval_count=0,
        )

        county_outages = outages_by_fips.get(event['FIPS'])
        if county_outages is not None:
            # between() is inclusive on both ends, i.e. BEGIN <= t <= END.
            in_window = county_outages['run_start_time'].between(event['BEGIN'], event['END'])
            matched = county_outages[in_window]

            if not matched.empty:
                record['customers_out_sum'] = matched['customers_out'].sum()
                record['run_start_time_mean'] = matched['run_start_time'].mean()
                record['interval_count'] = matched.shape[0]

        records.append(record)

    if not records:
        # Keep the schema stable when there are no events to match.
        columns = list(dict.fromkeys(
            [*events.columns, 'customers_out_sum', 'run_start_time_mean', 'interval_count']
        ))
        return pd.DataFrame(columns=columns)

    return pd.DataFrame(records)
# results = match_intervals_and_aggregate_full(Storms_2015, eaglei_2015)

### Merge storm data and eaglei data year by year, then concatenate all years

In [None]:
# Yearly input files. Both lists are in the same order: entry i of one list
# belongs with entry i of the other, starting at 2015.
storm_names = ['StormEvents_details-ftp_v1.0_d2015_c20240716.csv', 'StormEvents_details-ftp_v1.0_d2016_c20220719.csv', 
               'StormEvents_details-ftp_v1.0_d2017_c20230317.csv', 'StormEvents_details-ftp_v1.0_d2018_c20240716.csv',
               'StormEvents_details-ftp_v1.0_d2019_c20240117.csv', 'StormEvents_details-ftp_v1.0_d2020_c20240620.csv',
               'StormEvents_details-ftp_v1.0_d2021_c20240716.csv', 'StormEvents_details-ftp_v1.0_d2022_c20241121.csv',
               'StormEvents_details-ftp_v1.0_d2023_c20241216.csv']

eaglei_names = ['eaglei_outages_2015.csv', 'eaglei_outages_2016.csv', 'eaglei_outages_2017.csv', 'eaglei_outages_2018.csv',
                'eaglei_outages_2019.csv', 'eaglei_outages_2020.csv', 'eaglei_outages_2021.csv', 'eaglei_outages_2022.csv',
                'eaglei_outages_2023.csv']

data_path_eaglei = 'data/eaglei_data/'
data_path_storms = 'data/NOAA_StormEvents/'

# Year covered by the first entry of both file lists. It is used to label the
# per-year output files.
FIRST_YEAR = 2015

if len(storm_names) != len(eaglei_names):
    raise ValueError("storm_names and eaglei_names must list the same number of files")

all_years_combined = []

for i in tqdm(range(len(storm_names)), desc='Merging years'):
    eaglei_year = pd.read_csv(data_path_eaglei + eaglei_names[i])
    storms_year = pd.read_csv(data_path_storms + storm_names[i])

    # Prepare the storm events: filter on CZ_TYPE, build FIPS, build BEGIN/END.
    storms_year = keep_c_storm(storms_year)
    storms_year = make_fips_storm(storms_year)
    storms_year = convert_to_datetime(storms_year)

    df_combined = match_intervals_and_aggregate_full(storms_year, eaglei_year)

    # Index 0 is the 2015 file pair, so the label is FIRST_YEAR + i. The old
    # "2014 + i" offset named every output file one year too early.
    filename = f"combined_{FIRST_YEAR + i}.csv"
    df_combined.to_csv(filename, index=False)
    all_years_combined.append(df_combined)

final_df = pd.concat(all_years_combined, ignore_index=True)

### Save "all_years_combined.csv"

In [37]:
# Write the concatenated multi-year table to disk, omitting the index column.
final_df.to_csv(path_or_buf='all_years_combined.csv', index=False)