In [1]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap, Normalize
import seaborn as sns
from statsmodels.tsa.stattools import grangercausalitytests, ccf
import numpy as np
import plotly.express as px
from datetime import datetime
import geopandas as gpd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=RuntimeWarning)

sys.path.append(os.path.abspath("../scripts"))  # Adjust the path
os.chdir(os.path.dirname(os.path.abspath("__file__")))  # Set working directory

from utils import *

from utils import plot_outages_on_map_us

## 1.  Environment Setup

✔ Ensure all dependencies are installed using requirements.txt

In [2]:
pip install -r "../dynamic_rhythms/requirements.txt"

[31mERROR: Could not open requirements file: [Errno 2] No such file or directory: '../dynamic_rhythms/requirements.txt'[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


## 2. Data Loading & Exploration

### 2.1 Load Power Outage Data
- Read .csv files in ../dynamic_rhythm_env/eaglei_data/
- Inspect columns, data types, and missing values
- Identify key variables like state, run_start_time, customers_out


In [None]:
# fileURL = "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
# dest = "repdata_data_StormData.csv.bz2"
# if(!file.exists(dest))
#     download.file(fileURL,dest)
# storm = read.csv(dest)


# setwd(".")
# download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2", destfile = "./repdata-data-StormData.csv.bz2", method = "curl")
# storm <- read.csv("repdata-data-StormData.csv.bz2", header = TRUE)

In [3]:
# import kagglehub

# # Download latest  
# path = kagglehub.dataset_download("sobhanmoosavi/us-weather-events")

# print("Path to dataset files:", path)

In [4]:
# Define dataset directories
outages_dir = "../dynamic_rhythm_train_data/eaglei_data/"
storms_dir = "../dynamic_rhythm_train_data/NOAA_StormEvents/"

# List files
outage_files = os.listdir(outages_dir)
storm_files = os.listdir(storms_dir)

print("Power Outage Files:", outage_files)
print("Storm Event Files:", storm_files)



Power Outage Files: ['DQI_processing.R', 'DQI.csv', 'eaglei_outages_2019.csv', 'eaglei_outages_2018.csv', 'eaglei_outages_2023.csv', 'MCC.csv', 'eaglei_outages_2022.csv', 'eaglei_outages_2020.csv', 'eaglei_outages_2021.csv', 'coverage_history.csv', 'eaglei_outages_2016.csv', 'eaglei_outages_2017.csv', 'eaglei_outages_2015.csv', 'eaglei_outages_2014.csv', 'Uri_Map.R']
Storm Event Files: ['StormEvents_details-ftp_v1.0_d2014_c20231116.csv', 'StormEvents_details-ftp_v1.0_d2019_c20240117.csv', 'StormEvents_details-ftp_v1.0_d2021_c20240716.csv', 'StormEvents_details-ftp_v1.0_d2018_c20240716.csv', 'StormEvents_details-ftp_v1.0_d2024_c20241216.csv', 'StormEvents_details-ftp_v1.0_d2023_c20241216.csv', 'StormEvents_details-ftp_v1.0_d2016_c20220719.csv', 'StormEvents_details-ftp_v1.0_d2017_c20230317.csv', 'StormEvents_details-ftp_v1.0_d2015_c20240716.csv', 'StormEvents_details-ftp_v1.0_d2022_c20241121.csv', 'StormEvents_details-ftp_v1.0_d2020_c20240620.csv', 'StormEvents_2014_2024.csv']


In [5]:
print("\n Power Outage Files:")
for file in outage_files:
    print(f"   - {file} ({file.split('.')[-1]})")  # Show file extensions

print("\n Storm Event Files:")
for file in storm_files:
    print(f"   - {file} ({file.split('.')[-1]})")

# Function to preview CSV files
def preview_csv(file_path, num_rows=5):
    """Load and preview a CSV file."""
    try:
        df = pd.read_csv(file_path, nrows=num_rows)  
        print(f"\n🔍 Preview of {os.path.basename(file_path)}:")
        print(df.head())  
        print("\n📊 Column Info:")
        print(df.info())  
        print("\n❗ Missing Values:")
        print(df.isnull().sum())  
    except Exception as e:
        print(f" Error loading {file_path}: {e}")

# Preview power outage CSV datasets
print("\n **Power Outage Data Preview**")
for file in outage_files:
    if file.endswith(".csv"):
        preview_csv(os.path.join(outages_dir, file))

# Preview storm event CSV datasets
print("\n **Storm Event Data Preview**")
for file in storm_files:
    if file.endswith(".csv"):
        preview_csv(os.path.join(storms_dir, file))

# Highlight non-CSV files for manual review
non_csv_files = [f for f in outage_files + storm_files if not f.endswith(".csv")]
if non_csv_files:
    print("\n **Non-CSV Files Found:**")
    for file in non_csv_files:
        print(f"   - {file}")
    print(" Please check these manually—some may contain useful information.")



 Power Outage Files:
   - DQI_processing.R (R)
   - DQI.csv (csv)
   - eaglei_outages_2019.csv (csv)
   - eaglei_outages_2018.csv (csv)
   - eaglei_outages_2023.csv (csv)
   - MCC.csv (csv)
   - eaglei_outages_2022.csv (csv)
   - eaglei_outages_2020.csv (csv)
   - eaglei_outages_2021.csv (csv)
   - coverage_history.csv (csv)
   - eaglei_outages_2016.csv (csv)
   - eaglei_outages_2017.csv (csv)
   - eaglei_outages_2015.csv (csv)
   - eaglei_outages_2014.csv (csv)
   - Uri_Map.R (R)

 Storm Event Files:
   - StormEvents_details-ftp_v1.0_d2014_c20231116.csv (csv)
   - StormEvents_details-ftp_v1.0_d2019_c20240117.csv (csv)
   - StormEvents_details-ftp_v1.0_d2021_c20240716.csv (csv)
   - StormEvents_details-ftp_v1.0_d2018_c20240716.csv (csv)
   - StormEvents_details-ftp_v1.0_d2024_c20241216.csv (csv)
   - StormEvents_details-ftp_v1.0_d2023_c20241216.csv (csv)
   - StormEvents_details-ftp_v1.0_d2016_c20220719.csv (csv)
   - StormEvents_details-ftp_v1.0_d2017_c20230317.csv (csv)
   - StormEv


---

**Observations:**
1. **No Major Missing Data Issues**  
   - Most datasets have complete records.  
   - **Exception:** `eaglei_outages_2020.csv` has a missing value in `customers_out`.

2. **Date Format Consistency Check**  
   - The `run_start_time` column in outage datasets is stored as an **object (string)**.  
   - We need to **convert it to datetime** for time-based analysis.

3. **County-Level Consistency**  
   - Some datasets use `fips_code`, while others (like `MCC.csv`) use `County_FIPS`.  
   - We should ensure **consistent naming and datatype alignment** before merging.

4. **Power Outage Data Granularity**  
   - Outage datasets (`eaglei_outages_YYYY.csv`) contain **county-level** power outage records.  
   - `MCC.csv` provides **total customer counts per county**, which can be used for **normalization** (outages per 1000 customers).  
   - `DQI.csv` and `coverage_history.csv` contain **data quality indicators and coverage trends**, useful for filtering.

---

**Next Steps:**
**Step 1:** Convert `run_start_time` to datetime.  
**Step 2:** Standardize column names (e.g., `County_FIPS` → `fips_code`).  
**Step 3:** Handle missing values (`customers_out` in 2020).  
**Step 4:** Merge datasets (Outages + MCC + Coverage + DQI).  
**Step 5:** Compute new features, e.g., **outage rate per 1000 customers**.

---


In [6]:
from tqdm import tqdm

# Load all outage data files (2014-2023) with tqdm for progress tracking
outage_files_csv = sorted([f for f in os.listdir(outages_dir) if f.startswith("eaglei_outages_") and f.endswith(".csv")])

df_list = []
for f in tqdm(outage_files_csv, desc="Loading outage files", unit="file"):
    df_list.append(pd.read_csv(os.path.join(outages_dir, f)))

outages = pd.concat(df_list, ignore_index=True)


Loading outage files: 100%|██████████| 10/10 [01:10<00:00,  7.10s/file]


In [7]:
outages.info()
outages

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 191133068 entries, 0 to 191133067
Data columns (total 5 columns):
 #   Column          Dtype  
---  ------          -----  
 0   fips_code       int64  
 1   county          object 
 2   state           object 
 3   customers_out   float64
 4   run_start_time  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 7.1+ GB


Unnamed: 0,fips_code,county,state,customers_out,run_start_time
0,1037,Coosa,Alabama,12.0,2014-11-01 04:00:00
1,1051,Elmore,Alabama,7.0,2014-11-01 04:00:00
2,1109,Pike,Alabama,1.0,2014-11-01 04:00:00
3,1121,Talladega,Alabama,31.0,2014-11-01 04:00:00
4,4017,Navajo,Arizona,1.0,2014-11-01 04:00:00
...,...,...,...,...,...
191133063,55095,Polk,Wisconsin,0.0,2023-12-31 23:45:00
191133064,55105,Rock,Wisconsin,1.0,2023-12-31 23:45:00
191133065,55109,St. Croix,Wisconsin,0.0,2023-12-31 23:45:00
191133066,55129,Washburn,Wisconsin,0.0,2023-12-31 23:45:00


- too large for analysis and modelling

In [8]:
outages.columns

Index(['fips_code', 'county', 'state', 'customers_out', 'run_start_time'], dtype='object')

In [9]:

target_state = 'Texas'

# Columns you need (adjust if necessary)
columns_needed = ['STATE', 'UTILITY_ID', 'CUSTOMERS_OUT', 'OUTAGE_START_DATETIME', 'OUTAGE_RESTORATION_DATETIME', 'CUSTOMERS_SERV', 'DATE_EVENT_BEGAN']

# Initialize list to store filtered DataFrames
filtered_outages_list = []

# Loop through files with tqdm for progress bar
for f in tqdm(outage_files_csv, desc="Reading outage files"):
    file_path = os.path.join(outages_dir, f)
    
    df = pd.read_csv(file_path, usecols=lambda c: c in outages.columns)
 
    df_state = df[df['state'] == target_state]
    filtered_outages_list.append(df_state)

# Concatenate all filtered outages into one DataFrame
outages14_23_texas_df = pd.concat(filtered_outages_list, ignore_index=True)

print(f"Finished reading outages. Shape: {outages14_23_texas_df.shape}")


Reading outage files: 100%|██████████| 10/10 [01:21<00:00,  8.17s/it]


Finished reading outages. Shape: (16413490, 5)


In [10]:
# outages14_23_texas_df.info()
outages14_23_texas_df.head(2)

Unnamed: 0,fips_code,county,state,customers_out,run_start_time
0,48029,Bexar,Texas,5.0,2014-11-01 04:00:00
1,48071,Chambers,Texas,1.0,2014-11-01 04:00:00



---  
### 2.2 Load Storm Events Data

- Read .csv files in ../dynamic_rhythm_env/NOAA_StormEvents/
- Inspect structure, relevant columns like event_type, state, begin_date_time

In [11]:
storms_dir = "../dynamic_rhythm_train_data/NOAA_StormEvents/"



In [12]:
# Path to the storm events CSV
storm_events_path = '../dynamic_rhythm_train_data//NOAA_StormEvents/StormEvents_2014_2024.csv'

# Read storm events
storm_events = pd.read_csv(storm_events_path)

# Filter for the same state
storm_events_14_24_df = storm_events[storm_events['STATE'] == target_state.upper()]

print(f"Finished reading storm events. Shape: {storm_events_14_24_df.shape}")

storm_events_14_24_df.info()


Finished reading storm events. Shape: (52039, 51)
<class 'pandas.core.frame.DataFrame'>
Index: 52039 entries, 10 to 691428
Data columns (total 51 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   BEGIN_YEARMONTH     52039 non-null  int64  
 1   BEGIN_DAY           52039 non-null  int64  
 2   BEGIN_TIME          52039 non-null  int64  
 3   END_YEARMONTH       52039 non-null  int64  
 4   END_DAY             52039 non-null  int64  
 5   END_TIME            52039 non-null  int64  
 6   EPISODE_ID          52039 non-null  int64  
 7   EVENT_ID            52039 non-null  int64  
 8   STATE               52039 non-null  object 
 9   STATE_FIPS          52039 non-null  int64  
 10  YEAR                52039 non-null  int64  
 11  MONTH_NAME          52039 non-null  object 
 12  EVENT_TYPE          52039 non-null  object 
 13  CZ_TYPE             52039 non-null  object 
 14  CZ_FIPS             52039 non-null  int64  
 15  CZ_NAM

In [13]:
storm_events_14_24_df['STATE'].unique()

array(['TEXAS'], dtype=object)

In [14]:
outages14_23_texas_df.columns


Index(['fips_code', 'county', 'state', 'customers_out', 'run_start_time'], dtype='object')

In [15]:
def make_ts_power(state,
                  start_year,
                  start_month,
                  start_day,
                  end_year,
                  end_month,
                  end_day,
                  df_power):
    """
    Create a time series DataFrame for power outages for a specific state within a date range.

    Parameters:
    - state (str): State to filter.
    - start_year, start_month, start_day, end_year, end_month, end_day (int): Date range.
    - df_power (DataFrame): Pre-loaded and pre-filtered outage data.

    Returns:
    - DataFrame indexed by time with 'customers_out' field.
    """

    # Parse datetime
    df_power['run_start_time'] = pd.to_datetime(df_power['run_start_time'])
    
    # Filter by date range
    start_date = pd.Timestamp(year=start_year, month=start_month, day=start_day)
    end_date = pd.Timestamp(year=end_year, month=end_month, day=end_day)

    df_power_filtered = df_power[
        (df_power['run_start_time'] >= start_date) & (df_power['run_start_time'] <= end_date)
    ]

    # Create time series DataFrame
    df_power_filtered = df_power_filtered.copy()
    df_power_filtered['time'] = df_power_filtered['run_start_time']
    df_power_ts = df_power_filtered[['time', 'customers_out']]
    
    return df_power_ts


def make_ts_events(state,
                   event_types,
                   start_year,
                   start_month,
                   start_day,
                   end_year,
                   end_month,
                   end_day,
                   df):
    """
    Create a time series DataFrame for storm events for a specific state and time period.

    Parameters:
    - state (str): State to filter.
    - event_types (list): List of unique event types.
    - start_year, start_month, start_day, end_year, end_month, end_day (int): Date range.
    - df (DataFrame): Pre-loaded and pre-filtered storm events data.

    Returns:
    - DataFrame indexed by time with event type counts as columns.
    """

    # Parse datetime
    df['BEGIN_DATE_TIME'] = pd.to_datetime(df['BEGIN_DATE_TIME'])
    
    # Filter by date range
    start_date = pd.Timestamp(year=start_year, month=start_month, day=start_day)
    end_date = pd.Timestamp(year=end_year, month=end_month, day=end_day)

    df_filtered = df[
        (df['BEGIN_DATE_TIME'] >= start_date) & (df['BEGIN_DATE_TIME'] <= end_date)
    ].copy()

    # Initialize event type columns
    for event in event_types:
        df_filtered[event] = (df_filtered['EVENT_TYPE'] == event).astype(int)
    
    # Create time series
    df_filtered['time'] = df_filtered['BEGIN_DATE_TIME']
    event_columns = event_types
    df_events_ts = df_filtered[['time'] + event_columns]
    
    # Group by time (sum over same timestamp)
    df_events_ts = df_events_ts.groupby('time').sum()

    return df_events_ts.reset_index()


In [16]:
outages14_23_texas_df['run_start_time'].min(), outages14_23_texas_df['run_start_time'].max()

('2014-11-01 04:00:00', '2023-12-31 23:45:00')

In [17]:
# Now create timeseries
df_state_ts_power = make_ts_power(
    state=target_state,
    start_year=2019, start_month=1, start_day=1,
    end_year=2021, end_month=12, end_day=31,
    df_power=outages14_23_texas_df
)

event_types_state = list(storm_events_14_24_df['EVENT_TYPE'].unique())

df_state_ts_events = make_ts_events(
    state=target_state,
    event_types=event_types_state,
    start_year=2019, start_month=1, start_day=1,
    end_year=2021, end_month=12, end_day=31,
    df=storm_events_14_24_df
)


  df['BEGIN_DATE_TIME'] = pd.to_datetime(df['BEGIN_DATE_TIME'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['BEGIN_DATE_TIME'] = pd.to_datetime(df['BEGIN_DATE_TIME'])


In [18]:
df_state_ts_power.columns

Index(['time', 'customers_out'], dtype='object')

In [19]:
df_state_ts_events.columns

Index(['time', 'Thunderstorm Wind', 'Hail', 'Drought', 'Winter Weather',
       'High Wind', 'Winter Storm', 'Dust Storm', 'Cold/Wind Chill',
       'Freezing Fog', 'Rip Current', 'Strong Wind', 'Dense Fog', 'Sleet',
       'Frost/Freeze', 'Flash Flood', 'Lightning', 'Wildfire', 'Funnel Cloud',
       'Tornado', 'Heavy Rain', 'Flood', 'Ice Storm', 'Astronomical Low Tide',
       'Heavy Snow', 'Seiche', 'Heat', 'Tropical Depression', 'Blizzard',
       'Tropical Storm', 'Coastal Flood', 'Storm Surge/Tide', 'Excessive Heat',
       'High Surf', 'Dust Devil', 'Extreme Cold/Wind Chill', 'Debris Flow',
       'Hurricane', 'Sneakerwave'],
      dtype='object')

In [20]:
# df_combined_hr, df_combined_day = combine_agg_ts(...)


In [21]:
# Step 1: Copy the data
df_labels = outages14_23_texas_df.copy()

# Step 2: Parse run_start_time into date
df_labels['date'] = pd.to_datetime(df_labels['run_start_time']).dt.date

# Step 3: Group by fips_code, county, and date
df_labels_grouped = (
    df_labels
    .groupby(['fips_code', 'county', 'date'])
    .agg({'customers_out': 'sum'})
    .reset_index()
)

# Step 4: Create the binary major outage label
THRESHOLD = 1000  # customers
df_labels_grouped['major_outage'] = (df_labels_grouped['customers_out'] >= THRESHOLD).astype(int)

# Step 5: View a few rows
print(df_labels_grouped.head())


   fips_code    county        date  customers_out  major_outage
0      48001  Anderson  2014-11-01           25.0             0
1      48001  Anderson  2014-11-02           77.0             0
2      48001  Anderson  2014-11-03          137.0             0
3      48001  Anderson  2014-11-04            1.0             0
4      48001  Anderson  2014-11-05         6396.0             1


In [38]:
def parse_damage(damage_str):
    if pd.isnull(damage_str):
        return 0
    if isinstance(damage_str, (int, float)):
        return damage_str
    damage_str = damage_str.strip()
    multiplier = 1
    if damage_str.endswith('K'):
        multiplier = 1_000
        damage_str = damage_str[:-1]
    elif damage_str.endswith('M'):
        multiplier = 1_000_000
        damage_str = damage_str[:-1]
    elif damage_str.endswith('B'):
        multiplier = 1_000_000_000
        damage_str = damage_str[:-1]
    try:
        return float(damage_str) * multiplier
    except ValueError:
        return 0


In [126]:
df_storms = storm_events_14_24_df.copy()

# Fix the date
df_storms['date'] = pd.to_datetime(df_storms['BEGIN_DATE_TIME']).dt.date

# Create fips_code
df_storms['STATE_FIPS'] = df_storms['STATE_FIPS'].astype(str).str.zfill(2)
df_storms['CZ_FIPS'] = df_storms['CZ_FIPS'].astype(str).str.zfill(3)
df_storms['fips_code'] = df_storms['STATE_FIPS'] + df_storms['CZ_FIPS']

# --- Clean the damage columns here ---
df_storms['DAMAGE_PROPERTY'] = df_storms['DAMAGE_PROPERTY'].apply(parse_damage)
df_storms['DAMAGE_CROPS'] = df_storms['DAMAGE_CROPS'].apply(parse_damage)

# --- THEN group ---
df_storms_grouped = (
    df_storms
    .groupby(['fips_code', 'date'])
    .agg({
        'EVENT_TYPE': 'count',          # number of events
        'DAMAGE_PROPERTY': 'sum',        # total property damage (now numeric)
        'DAMAGE_CROPS': 'sum',            # total crop damage (now numeric)
        'TOR_F_SCALE': lambda x: x.nunique()  # number of unique tornado scales
    })
    .rename(columns={
        'EVENT_TYPE': 'num_events',
        'DAMAGE_PROPERTY': 'total_property_damage',
        'DAMAGE_CROPS': 'total_crop_damage',
        'TOR_F_SCALE': 'num_tornado_scales'
    })
    .reset_index()
)

#  fips_code should be string in both
df_labels_grouped['fips_code'] = df_labels_grouped['fips_code'].astype(str)
df_storms_grouped['fips_code'] = df_storms_grouped['fips_code'].astype(str)

# date should be datetime in both
df_labels_grouped['date'] = pd.to_datetime(df_labels_grouped['date'])
df_storms_grouped['date'] = pd.to_datetime(df_storms_grouped['date'])


In [127]:
storm_events_14_24_df[['DAMAGE_PROPERTY','DAMAGE_CROPS']].sample(5)

Unnamed: 0,DAMAGE_PROPERTY,DAMAGE_CROPS
616709,0.00K,0.00K
185740,1.00K,0.00K
130170,10.00K,0.00K
315289,0.00K,0.00K
299224,0.00K,0.00K


In [128]:
df_storms_grouped.sample(5)

Unnamed: 0,fips_code,date,num_events,total_property_damage,total_crop_damage,num_tornado_scales
28059,48341,2023-08-13,1,0.0,0.0,0
18051,48179,2019-05-26,1,0.0,0.0,0
20367,48205,2023-08-04,1,0.0,0.0,0
29698,48375,2016-04-20,1,0.0,0.0,0
21959,48228,2022-04-01,1,0.0,0.0,0


In [129]:
# Check for non-numeric values (should be none)
print("non-numerics:",df_storms_grouped['total_property_damage'].apply(lambda x: isinstance(x, str)).sum())
print("non-numerics:",df_storms_grouped['total_crop_damage'].apply(lambda x: isinstance(x, str)).sum())

# Property damage greater than 1 billion
gt_onebillion_property = df_storms_grouped[df_storms_grouped['total_property_damage'] > 1e9]['total_property_damage'].unique()

# Crop damage greater than 1 billion
gt_onebillion_crop = df_storms_grouped[df_storms_grouped['total_crop_damage'] > 1e9]['total_crop_damage'].unique()

print("Extreme property damage amount entries (over 1 billion):")
print(gt_onebillion_property)

print("\nExtreme crop damage amount amount entries (over 1 billion):")
print(gt_onebillion_crop)


non-numerics: 0
non-numerics: 0
Extreme property damage amount entries (over 1 billion):
[1.360000e+09 2.000000e+09 1.300200e+09 1.960025e+09 1.351270e+09
 8.002830e+09 1.000000e+10 1.000068e+10 1.100000e+09 1.300000e+09
 1.950000e+09 3.000000e+09 7.000000e+09 1.500000e+09]

Extreme crop damage amount amount entries (over 1 billion):
[]


In [130]:
df_storms_grouped = df_storms_grouped.rename(columns={
    'total_property_damage': 'total_property_damage_usd',
    'total_crop_damage': 'total_crop_damage_usd'
})
	

df_storms_grouped[['total_property_damage_usd','total_crop_damage_usd']].sample(2)

Unnamed: 0,total_property_damage_usd,total_crop_damage_usd
7304,0.0,0.0
33226,0.0,0.0


In [131]:
df_storms_grouped['total_crop_damage_usd'].unique()
# df_storms_grouped[['total_property_damage',
#        'total_crop_damage', 'num_tornado_scales']].sample(10)
# df_storms_grouped.columns

array([0.000e+00, 1.000e+04, 9.650e+04, 8.000e+03, 5.000e+04, 2.000e+05,
       1.000e+06, 2.500e+05, 5.000e+06, 3.000e+06, 2.850e+07, 3.000e+03,
       1.000e+05, 5.000e+02, 2.000e+03, 1.000e+03, 6.000e+03, 7.000e+03,
       4.000e+03, 1.700e+04, 5.140e+06, 2.000e+04, 4.400e+04, 2.710e+07,
       1.500e+05, 1.300e+04, 5.000e+03, 5.000e+05, 4.500e+04, 2.000e+06,
       3.000e+04, 2.500e+03, 1.100e+04, 1.500e+04, 5.000e+07, 8.347e+05,
       3.200e+04, 9.000e+03, 1.300e+05, 3.500e+04, 1.200e+04, 2.870e+06,
       3.370e+06, 1.930e+06, 2.000e+07, 9.000e+04, 2.270e+06, 8.890e+06,
       3.368e+07, 4.118e+07, 2.920e+07, 3.080e+08, 1.100e+08, 3.460e+07,
       4.240e+07, 1.235e+07, 2.590e+07, 4.600e+07, 3.360e+07, 3.500e+05,
       1.000e+08, 1.000e+07, 2.500e+06, 7.500e+04, 2.500e+08, 3.000e+05,
       6.000e+04, 4.000e+02, 3.300e+04, 7.100e+03, 1.363e+07, 3.000e+07,
       2.500e+04, 7.010e+06, 7.000e+05, 3.250e+07, 7.000e+04, 3.420e+05,
       5.700e+04, 2.500e+02, 3.610e+07])

In [132]:
df_storms_grouped
# .info()


Unnamed: 0,fips_code,date,num_events,total_property_damage_usd,total_crop_damage_usd,num_tornado_scales
0,48001,2014-01-01,1,0.0,0.0,0
1,48001,2014-01-04,1,0.0,0.0,0
2,48001,2014-01-23,1,0.0,0.0,0
3,48001,2014-02-01,2,0.0,0.0,0
4,48001,2014-02-04,1,0.0,0.0,0
...,...,...,...,...,...,...
33744,48615,2024-06-19,1,0.0,0.0,0
33745,48615,2024-07-08,2,2000.0,0.0,0
33746,48615,2024-09-11,1,1000.0,0.0,0
33747,48616,2023-08-01,1,0.0,0.0,0


In [133]:
df_labels_grouped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522485 entries, 0 to 522484
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   fips_code      522485 non-null  object        
 1   county         522485 non-null  object        
 2   date           522485 non-null  datetime64[ns]
 3   customers_out  522485 non-null  float64       
 4   major_outage   522485 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 19.9+ MB


In [134]:
# Merge storm data with outage labels
df_model = pd.merge(
    df_labels_grouped,
    df_storms_grouped,
    on=['fips_code', 'date'],
    how='left'
)

# Fill missing storm values with 0 (no storm that day)
df_model.fillna({
    'num_events': 0,
    'total_property_damage': 0,
    'total_crop_damage': 0,
    'num_tornado_scales': 0
}, inplace=True)

df_model.head()


Unnamed: 0,fips_code,county,date,customers_out,major_outage,num_events,total_property_damage_usd,total_crop_damage_usd,num_tornado_scales
0,48001,Anderson,2014-11-01,25.0,0,1.0,0.0,0.0,0.0
1,48001,Anderson,2014-11-02,77.0,0,0.0,,,0.0
2,48001,Anderson,2014-11-03,137.0,0,0.0,,,0.0
3,48001,Anderson,2014-11-04,1.0,0,0.0,,,0.0
4,48001,Anderson,2014-11-05,6396.0,1,0.0,,,0.0


In [135]:
df_model.isnull().mean() * 100

fips_code                     0.000000
county                        0.000000
date                          0.000000
customers_out                 0.000000
major_outage                  0.000000
num_events                    0.000000
total_property_damage_usd    97.304037
total_crop_damage_usd        97.304037
num_tornado_scales            0.000000
dtype: float64