In [1]:
import pandas as pd
from tqdm import tqdm

In [2]:
elec_path='../data/eaglei_data/'
storm_path='../data/NOAA_StormEvents/'
eaglei_2015=pd.read_csv(elec_path+'eaglei_outages_2015.csv')
Storms=pd.read_csv(storm_path+'StormEvents_details-ftp_v1.0_d2015_c20240716.csv')


In [3]:
def group_customers_out_by_date(df):
    """
    Group the DataFrame by county, state, and the date part of run_start_time, 
    and sum the customers_out values.

    Args:
        df (pd.DataFrame): Original data containing columns 
            ['fips_code', 'county', 'state', 'customers_out', 'run_start_time'].

    Returns:
        pd.DataFrame: Grouped data with columns 
            ['county', 'state', 'date', 'customers_out', 'run_start_time'].
    """
    # Make a copy of the DataFrame to avoid modifying the original
    df = df.copy()
    
    # Ensure 'run_start_time' is of datetime type
    df['run_start_time'] = pd.to_datetime(df['run_start_time'])

    # Extract the date part only
    df['date'] = df['run_start_time'].dt.date

    # Group by county, state, and date, and sum the customers_out
    grouped = df.groupby(['fips_code', 'date']).agg({
        'customers_out': 'sum'
    }).reset_index()

    # Add back a datetime version of the date if needed
    grouped['run_start_time'] = pd.to_datetime(grouped['date'])

    return grouped

In [4]:
df_outage=group_customers_out_by_date(eaglei_2015)

In [5]:
def change_date_storm(df_storm):
    df_storm['BEGIN_DATE_TIME'] = pd.to_datetime(df_storm['BEGIN_DATE_TIME']).dt.date
    df_storm['END_DATE_TIME'] = pd.to_datetime(df_storm['END_DATE_TIME']).dt.date
    return df_storm
df_storm=change_date_storm(Storms)

  df_storm['BEGIN_DATE_TIME'] = pd.to_datetime(df_storm['BEGIN_DATE_TIME']).dt.date
  df_storm['END_DATE_TIME'] = pd.to_datetime(df_storm['END_DATE_TIME']).dt.date


In [6]:
def keep_c_storm(df_storm):
    df_storm = df_storm[df_storm['CZ_TYPE'] == 'C']
    return df_storm

def make_fips_storm(storms):
    storms['STATE_FIPS'] = storms['STATE_FIPS'].astype(str).str.zfill(2)
    storms['CZ_FIPS'] = storms['CZ_FIPS'].astype(str).str.zfill(3)
    storms['FIPS'] = storms['STATE_FIPS'] + storms['CZ_FIPS']
    return storms

df_storm=keep_c_storm(df_storm)
df_storm=make_fips_storm(df_storm)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  storms['STATE_FIPS'] = storms['STATE_FIPS'].astype(str).str.zfill(2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  storms['CZ_FIPS'] = storms['CZ_FIPS'].astype(str).str.zfill(3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  storms['FIPS'] = storms['STATE_FIPS'] + storms['CZ_FIPS']


In [7]:
def update_customers_out(df_storm, df_example):
    """
    Update the 'customers_out' column in df_storm based on matching time intervals 
    and FIPS codes from df_example, and store an array of run_start_time values 
    for matching outage events.

    Args:
        df_storm (pd.DataFrame): DataFrame containing storm events with columns 
            ['FIPS', 'BEGIN_DATE_TIME', 'END_DATE_TIME', ...].
        df_example (pd.DataFrame): DataFrame containing customer outage events with columns 
            ['fips_code', 'customers_out', 'run_start_time', ...].

    Returns:
        pd.DataFrame: Updated df_storm with 'customers_out' and 'run_start_times' columns modified.
    """

    # Ensure correct data types
    df_storm = df_storm.copy()
    df_storm['FIPS'] = df_storm['FIPS'].astype('int64')
    df_storm['BEGIN_DATE_TIME'] = pd.to_datetime(df_storm['BEGIN_DATE_TIME'])
    df_storm['END_DATE_TIME'] = pd.to_datetime(df_storm['END_DATE_TIME'])
    
    df_example = df_example.copy()
    df_example['run_start_time'] = pd.to_datetime(df_example['run_start_time'])
    df_example['fips_code'] = df_example['fips_code'].astype('int64')

    # Initialize columns if they do not exist
    if 'customers_out' not in df_storm.columns:
        df_storm['customers_out'] = 0
    if 'run_start_times' not in df_storm.columns:
        df_storm['run_start_times'] = [[] for _ in range(len(df_storm))]

    # Iterate over each row in df_example
    for idx, row in tqdm(df_example.iterrows(), total=len(df_example), desc="Updating customers_out"):
        run_time = row['run_start_time']
        customers_out_value = row['customers_out']
        fips_code = row['fips_code']

        # Create a mask to find matching storm records
        mask = (
            (df_storm['BEGIN_DATE_TIME'] <= run_time) & 
            (df_storm['END_DATE_TIME'] >= run_time) & 
            (df_storm['FIPS'] == fips_code)
        )

        # Update customers_out and append run_start_time to the list
        df_storm.loc[mask, 'customers_out'] += customers_out_value
        df_storm.loc[mask, 'run_start_times'] = df_storm.loc[mask, 'run_start_times'].apply(
            lambda x: x + [run_time]
        )

    return df_storm


df_combined=update_customers_out(df_storm, df_outage)

Updating customers_out:   2%|▏         | 11162/485528 [00:44<31:36, 250.11it/s]


KeyboardInterrupt: 

In [None]:
def add_lead_time_column(df_storm):
    """
    Add a 'lead_time' column to df_storm, containing a list of time differences (in hours)
    between each run_start_time and BEGIN_DATE_TIME. If run_start_times is empty, set lead_time to 0.

    Args:
        df_storm (pd.DataFrame): DataFrame with columns ['BEGIN_DATE_TIME', 'run_start_times', ...].

    Returns:
        pd.DataFrame: Updated df_storm with a new 'lead_time' column.
    """
    # Ensure df_storm is a copy to avoid modifying the input
    df_storm = df_storm.copy()

    # Initialize the lead_time column
    df_storm['lead_time'] = df_storm.apply(
        lambda row: [
            (run_time - row['BEGIN_DATE_TIME']).total_seconds() / 86400  # Convert to hours
            for run_time in row['run_start_times']
        ] if row['run_start_times'] else [-1.0],
        axis=1
    )

    return df_storm

df_combined = add_lead_time_column(df_combined)

In [None]:
df_combined.to_csv('combined_data_2015.csv')