# Feature Engineering
## Wildfire-Induced Power Outages: A Data Mining Analysis

This notebook creates features from the processed datasets and merges them for analysis.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os
import warnings
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 100)

# Define paths
PROCESSED_DATA_PATH = '../data/processed/'
FEATURES_DATA_PATH = '../data/features/'

# Create features directory if it doesn't exist
os.makedirs(FEATURES_DATA_PATH, exist_ok=True)

print('Libraries imported successfully!')

---
## 1. Load Processed Datasets

In [2]:
# Load all processed datasets
power_outages = pd.read_csv(PROCESSED_DATA_PATH + 'california_power_outages.csv')
grid_disruptions = pd.read_csv(PROCESSED_DATA_PATH + 'california_grid_disruptions.csv')
grid_2023 = pd.read_csv(PROCESSED_DATA_PATH + 'california_grid_2023.csv')
calfire = pd.read_csv(PROCESSED_DATA_PATH + 'california_calfire_incidents.csv')
wildfires_fpa = pd.read_csv(PROCESSED_DATA_PATH + 'california_wildfires_fpa.csv')
nasa_firms = pd.read_csv(PROCESSED_DATA_PATH + 'california_nasa_firms.csv')

print('Datasets loaded:')
print(f'  Power Outages (Purdue): {len(power_outages):,} records')
print(f'  Grid Disruptions (DOE): {len(grid_disruptions):,} records')
print(f'  Grid 2023: {len(grid_2023):,} records')
print(f'  CAL FIRE: {len(calfire):,} records')
print(f'  Wildfires FPA: {len(wildfires_fpa):,} records')
print(f'  NASA FIRMS: {len(nasa_firms):,} records')

Datasets loaded:
  Power Outages (Purdue): 210 records
  Grid Disruptions (DOE): 300 records
  Grid 2023: 15 records
  CAL FIRE: 1,636 records
  Wildfires FPA: 189,550 records
  NASA FIRMS: 221,183 records


---
## 2. Power Outage Feature Engineering

In [3]:
# Convert date columns
power_outages['OUTAGE.START.DATE'] = pd.to_datetime(power_outages['OUTAGE.START.DATE'])
power_outages['OUTAGE.RESTORATION.DATE'] = pd.to_datetime(power_outages['OUTAGE.RESTORATION.DATE'])

# Create temporal features
power_outages['outage_date'] = power_outages['OUTAGE.START.DATE'].dt.date
power_outages['outage_year'] = power_outages['OUTAGE.START.DATE'].dt.year
power_outages['outage_month'] = power_outages['OUTAGE.START.DATE'].dt.month
power_outages['outage_day'] = power_outages['OUTAGE.START.DATE'].dt.day
power_outages['outage_dayofweek'] = power_outages['OUTAGE.START.DATE'].dt.dayofweek
power_outages['outage_quarter'] = power_outages['OUTAGE.START.DATE'].dt.quarter

# Create season feature
def get_season(month):
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    else:
        return 'Fall'

power_outages['season'] = power_outages['outage_month'].apply(get_season)

# California wildfire season indicator (typically June - November)
power_outages['is_wildfire_season'] = power_outages['outage_month'].isin([6, 7, 8, 9, 10, 11]).astype(int)

# Weekend indicator
power_outages['is_weekend'] = power_outages['outage_dayofweek'].isin([5, 6]).astype(int)

print('Temporal features created for power outages')
power_outages[['OUTAGE.START.DATE', 'outage_year', 'outage_month', 'season', 'is_wildfire_season', 'outage_dayofweek']].head(10)

Temporal features created for power outages


Unnamed: 0,OUTAGE.START.DATE,outage_year,outage_month,season,is_wildfire_season,outage_dayofweek
0,2007-09-04,2007,9,Fall,1,1
1,2008-05-08,2008,5,Spring,0,3
2,2006-05-19,2006,5,Spring,0,4
3,2015-10-13,2015,10,Fall,1,1
4,2014-02-06,2014,2,Winter,0,3
5,2013-04-25,2013,4,Spring,0,3
6,2008-01-29,2008,1,Winter,0,1
7,2006-07-24,2006,7,Summer,1,0
8,2005-07-21,2005,7,Summer,1,3
9,2013-08-19,2013,8,Summer,1,0


In [4]:
# Create outage severity categories
def categorize_duration(duration):
    if pd.isna(duration):
        return 'Unknown'
    elif duration < 60:  # Less than 1 hour
        return 'Short'
    elif duration < 480:  # 1-8 hours
        return 'Medium'
    elif duration < 1440:  # 8-24 hours
        return 'Long'
    else:  # More than 24 hours
        return 'Extended'

power_outages['duration_category'] = power_outages['OUTAGE.DURATION'].apply(categorize_duration)

# Customer impact severity
def categorize_customer_impact(customers):
    if pd.isna(customers):
        return 'Unknown'
    elif customers < 10000:
        return 'Low'
    elif customers < 50000:
        return 'Medium'
    elif customers < 100000:
        return 'High'
    else:
        return 'Severe'

power_outages['impact_category'] = power_outages['CUSTOMERS.AFFECTED'].apply(categorize_customer_impact)

# Wildfire-related outage indicator (from cause category)
power_outages['is_wildfire_related'] = (
    power_outages['CAUSE.CATEGORY.DETAIL'].str.contains('wildfire|fire', case=False, na=False) |
    power_outages['CAUSE.CATEGORY'].str.contains('wildfire|fire', case=False, na=False)
).astype(int)

print('Severity and category features created')
print(f"\nDuration Categories:\n{power_outages['duration_category'].value_counts()}")
print(f"\nImpact Categories:\n{power_outages['impact_category'].value_counts()}")
print(f"\nWildfire-related outages: {power_outages['is_wildfire_related'].sum()}")

Severity and category features created

Duration Categories:
duration_category
Medium      82
Extended    43
Short       37
Long        36
Unknown     12
Name: count, dtype: int64

Impact Categories:
impact_category
Unknown    83
Severe     48
Low        45
High       24
Medium     10
Name: count, dtype: int64

Wildfire-related outages: 16


---
## 3. Wildfire Feature Engineering

In [5]:
# Convert date columns for FPA wildfires
wildfires_fpa['DISCOVERY_DATE'] = pd.to_datetime(wildfires_fpa['DISCOVERY_DATE'])
wildfires_fpa['fire_date'] = wildfires_fpa['DISCOVERY_DATE'].dt.date
wildfires_fpa['fire_year'] = wildfires_fpa['DISCOVERY_DATE'].dt.year
wildfires_fpa['fire_month'] = wildfires_fpa['DISCOVERY_DATE'].dt.month

# Fire size categories
fire_size_map = {
    'A': '0-0.25 acres',
    'B': '0.26-9.9 acres',
    'C': '10-99.9 acres',
    'D': '100-299 acres',
    'E': '300-999 acres',
    'F': '1000-4999 acres',
    'G': '5000+ acres'
}
wildfires_fpa['fire_size_desc'] = wildfires_fpa['FIRE_SIZE_CLASS'].map(fire_size_map)

print('Wildfire FPA temporal features created')
print(f"\nFire Size Distribution:\n{wildfires_fpa['FIRE_SIZE_CLASS'].value_counts().sort_index()}")

Wildfire FPA temporal features created

Fire Size Distribution:
FIRE_SIZE_CLASS
A    98309
B    76942
C     9825
D     2137
E     1187
F      756
G      394
Name: count, dtype: int64


In [6]:
# Create daily wildfire aggregations
daily_fires = wildfires_fpa.groupby('fire_date').agg({
    'FIRE_SIZE': ['count', 'sum', 'mean', 'max'],
    'LATITUDE': 'mean',
    'LONGITUDE': 'mean'
}).reset_index()

# Flatten column names
daily_fires.columns = ['fire_date', 'daily_fire_count', 'daily_acres_burned', 
                       'avg_fire_size', 'max_fire_size', 'avg_lat', 'avg_lon']

# Convert fire_date to datetime for merging
daily_fires['fire_date'] = pd.to_datetime(daily_fires['fire_date'])

print(f'Daily wildfire aggregations: {len(daily_fires):,} days')
daily_fires.head(10)

Daily wildfire aggregations: 8,324 days


Unnamed: 0,fire_date,daily_fire_count,daily_acres_burned,avg_fire_size,max_fire_size,avg_lat,avg_lon
0,1992-01-01,3,5.3,1.766667,5.0,35.011863,-117.713715
1,1992-01-02,3,0.8,0.266667,0.5,33.534722,-116.66463
2,1992-01-03,1,0.1,0.1,0.1,37.266111,-122.321944
3,1992-01-04,2,0.2,0.1,0.1,33.9025,-117.3475
4,1992-01-06,1,0.1,0.1,0.1,34.341667,-118.106667
5,1992-01-07,1,0.1,0.1,0.1,33.865,-117.373889
6,1992-01-10,2,3.3,1.65,3.0,32.7334,-114.6508
7,1992-01-11,1,0.2,0.2,0.2,32.7334,-114.6175
8,1992-01-12,3,62.5,20.833333,60.0,35.753611,-118.068333
9,1992-01-13,2,0.4,0.2,0.3,33.213472,-117.018056


In [7]:
# Create monthly wildfire aggregations
wildfires_fpa['year_month'] = wildfires_fpa['DISCOVERY_DATE'].dt.to_period('M')

monthly_fires = wildfires_fpa.groupby('year_month').agg({
    'FIRE_SIZE': ['count', 'sum', 'mean', 'max'],
    'STAT_CAUSE_DESCR': lambda x: x.value_counts().index[0] if len(x) > 0 else 'Unknown'  # Most common cause
}).reset_index()

monthly_fires.columns = ['year_month', 'monthly_fire_count', 'monthly_acres_burned', 
                         'avg_fire_size', 'max_fire_size', 'most_common_cause']

print(f'Monthly wildfire aggregations: {len(monthly_fires):,} months')
monthly_fires.head(10)

Monthly wildfire aggregations: 288 months


Unnamed: 0,year_month,monthly_fire_count,monthly_acres_burned,avg_fire_size,max_fire_size,most_common_cause
0,1992-01,92,222.3,2.416304,60.0,Debris Burning
1,1992-02,54,225.5,4.175926,120.0,Debris Burning
2,1992-03,48,1885.4,39.279167,1800.0,Debris Burning
3,1992-04,259,7364.5,28.434363,4200.0,Miscellaneous
4,1992-05,1339,12041.2,8.992681,1350.0,Equipment Use
5,1992-06,2185,29892.0,13.680549,2150.0,Lightning
6,1992-07,2064,23378.3,11.326696,2080.0,Miscellaneous
7,1992-08,2284,155138.0,67.923818,64000.0,Lightning
8,1992-09,1361,60516.0,44.464364,24580.0,Equipment Use
9,1992-10,753,3897.1,5.175432,430.0,Miscellaneous


In [8]:
# Large fire indicator (Class F or G - 1000+ acres)
large_fires = wildfires_fpa[wildfires_fpa['FIRE_SIZE_CLASS'].isin(['F', 'G'])].copy()

daily_large_fires = large_fires.groupby('fire_date').agg({
    'FIRE_SIZE': ['count', 'sum']
}).reset_index()

daily_large_fires.columns = ['fire_date', 'large_fire_count', 'large_fire_acres']
daily_large_fires['fire_date'] = pd.to_datetime(daily_large_fires['fire_date'])

print(f'Days with large fires (1000+ acres): {len(daily_large_fires):,}')
print(f'Total large fires: {large_fires.shape[0]:,}')

Days with large fires (1000+ acres): 769
Total large fires: 1,150


---
## 4. NASA FIRMS Satellite Data Features

In [9]:
# Convert date
nasa_firms['acq_date'] = pd.to_datetime(nasa_firms['acq_date'])

# Daily satellite fire detections
daily_satellite = nasa_firms.groupby('acq_date').agg({
    'frp': ['count', 'sum', 'mean', 'max'],  # Fire Radiative Power
    'confidence': 'mean',
    'latitude': 'mean',
    'longitude': 'mean'
}).reset_index()

daily_satellite.columns = ['date', 'satellite_detections', 'total_frp', 'avg_frp', 
                           'max_frp', 'avg_confidence', 'avg_lat', 'avg_lon']

print(f'Daily satellite fire detections: {len(daily_satellite):,} days')
daily_satellite.head(10)

Daily satellite fire detections: 6,352 days


Unnamed: 0,date,satellite_detections,total_frp,avg_frp,max_frp,avg_confidence,avg_lat,avg_lon
0,2000-11-01,1,11.3,11.3,11.3,52.0,38.4464,-120.3885
1,2000-11-02,15,463.2,30.88,95.5,67.466667,38.179747,-120.714507
2,2000-11-03,24,463.0,19.291667,53.0,72.916667,39.628475,-122.167221
3,2000-11-05,6,109.9,18.316667,44.7,70.0,38.894133,-121.564867
4,2000-11-06,9,645.9,71.766667,200.9,66.555556,40.3631,-122.813378
5,2000-11-07,59,1286.8,21.810169,163.7,71.644068,38.086729,-120.929854
6,2000-11-08,20,553.7,27.685,51.9,65.0,39.4063,-121.81474
7,2000-11-09,25,1362.0,54.48,534.2,68.36,37.575648,-120.443392
8,2000-11-10,2,82.1,41.05,64.9,70.0,38.70105,-121.1325
9,2000-11-11,3,33.9,11.3,16.4,60.666667,35.868767,-119.281367


---
## 5. Merge Datasets

In [10]:
# Prepare power outages for merging
power_outages['outage_date'] = pd.to_datetime(power_outages['outage_date'])

# Merge power outages with daily wildfire data
merged_df = power_outages.merge(
    daily_fires,
    left_on='outage_date',
    right_on='fire_date',
    how='left'
)

# Merge with large fires data
merged_df = merged_df.merge(
    daily_large_fires,
    left_on='outage_date',
    right_on='fire_date',
    how='left',
    suffixes=('', '_large')
)

# Merge with satellite data
merged_df = merged_df.merge(
    daily_satellite,
    left_on='outage_date',
    right_on='date',
    how='left'
)

# Fill NaN values for fire-related columns (no fires on that day)
fire_columns = ['daily_fire_count', 'daily_acres_burned', 'avg_fire_size', 'max_fire_size',
                'large_fire_count', 'large_fire_acres', 'satellite_detections', 'total_frp',
                'avg_frp', 'max_frp', 'avg_confidence']

for col in fire_columns:
    if col in merged_df.columns:
        merged_df[col] = merged_df[col].fillna(0)

print(f'Merged dataset shape: {merged_df.shape}')
merged_df.head()

Merged dataset shape: (210, 45)


Unnamed: 0,YEAR,MONTH,U.S._STATE,POSTAL.CODE,NERC.REGION,CLIMATE.REGION,OUTAGE.START.DATE,OUTAGE.START.TIME,OUTAGE.RESTORATION.DATE,OUTAGE.RESTORATION.TIME,CAUSE.CATEGORY,CAUSE.CATEGORY.DETAIL,OUTAGE.DURATION,DEMAND.LOSS.MW,CUSTOMERS.AFFECTED,outage_date,outage_year,outage_month,outage_day,outage_dayofweek,outage_quarter,season,is_wildfire_season,is_weekend,duration_category,impact_category,is_wildfire_related,fire_date,daily_fire_count,daily_acres_burned,avg_fire_size,max_fire_size,avg_lat_x,avg_lon_x,fire_date_large,large_fire_count,large_fire_acres,date,satellite_detections,total_frp,avg_frp,max_frp,avg_confidence,avg_lat_y,avg_lon_y
0,2007,9,California,CA,WECC,West,2007-09-04,08:30:00,2007-09-04,15:30:00,severe weather,heatwave,420.0,,,2007-09-04,2007,9,4,1,3,Fall,1,0,Medium,Unknown,0,2007-09-04,48.0,24.55,0.511458,5.0,39.178522,-121.107644,NaT,0.0,0.0,2007-09-04,165.0,34776.4,210.766061,2500.9,89.836364,38.501007,-121.013092
1,2008,5,California,CA,WECC,West,2008-05-08,10:21:00,2008-05-08,12:56:00,system operability disruption,,155.0,483.0,,2008-05-08,2008,5,8,3,2,Spring,0,0,Medium,Unknown,0,2008-05-08,16.0,76.7,4.79375,52.0,38.182292,-121.094687,NaT,0.0,0.0,2008-05-08,17.0,520.6,30.623529,175.7,70.529412,36.851729,-119.619753
2,2006,5,California,CA,WECC,West,2006-05-19,15:13:00,2006-05-19,22:30:00,severe weather,thunderstorm,437.0,133.0,,2006-05-19,2006,5,19,4,2,Spring,0,0,Medium,Unknown,0,2006-05-19,15.0,14.1,0.94,10.0,35.874541,-119.055952,NaT,0.0,0.0,NaT,0.0,0.0,0.0,0.0,0.0,,
3,2015,10,California,CA,WECC,West,2015-10-13,16:32:00,2015-10-13,20:39:00,public appeal,,247.0,41788.0,,2015-10-13,2015,10,13,1,4,Fall,1,0,Medium,Unknown,0,2015-10-13,20.0,255.25,12.7625,250.0,38.396808,-121.249141,NaT,0.0,0.0,2015-10-13,13.0,1615.2,124.246154,418.1,79.615385,37.401808,-121.283192
4,2014,2,California,CA,WECC,West,2014-02-06,13:00:00,2014-02-06,22:00:00,fuel supply emergency,Natural Gas,540.0,4000.0,,2014-02-06,2014,2,6,3,1,Winter,0,0,Long,Unknown,0,2014-02-06,3.0,1.2,0.4,1.0,37.507109,-120.534354,NaT,0.0,0.0,NaT,0.0,0.0,0.0,0.0,0.0,,


In [11]:
# Add rolling window features (fires in past 7 days, 30 days)
# First, create a complete date range
date_range = pd.date_range(
    start=daily_fires['fire_date'].min(),
    end=daily_fires['fire_date'].max(),
    freq='D'
)

# Create complete daily fire dataset
complete_daily = pd.DataFrame({'fire_date': date_range})
complete_daily = complete_daily.merge(daily_fires, on='fire_date', how='left')
complete_daily = complete_daily.fillna(0)

# Calculate rolling sums
complete_daily['fires_7day'] = complete_daily['daily_fire_count'].rolling(window=7, min_periods=1).sum()
complete_daily['fires_30day'] = complete_daily['daily_fire_count'].rolling(window=30, min_periods=1).sum()
complete_daily['acres_7day'] = complete_daily['daily_acres_burned'].rolling(window=7, min_periods=1).sum()
complete_daily['acres_30day'] = complete_daily['daily_acres_burned'].rolling(window=30, min_periods=1).sum()

# Merge rolling features
merged_df = merged_df.merge(
    complete_daily[['fire_date', 'fires_7day', 'fires_30day', 'acres_7day', 'acres_30day']],
    left_on='outage_date',
    right_on='fire_date',
    how='left',
    suffixes=('', '_rolling')
)

print('Rolling window features added')
merged_df[['outage_date', 'daily_fire_count', 'fires_7day', 'fires_30day', 'acres_7day', 'acres_30day']].head(10)

Rolling window features added


Unnamed: 0,outage_date,daily_fire_count,fires_7day,fires_30day,acres_7day,acres_30day
0,2007-09-04,48.0,480.0,1473.0,119713.43,134779.78
1,2008-05-08,16.0,133.0,693.0,2752.66,5495.77
2,2006-05-19,15.0,194.0,399.0,2759.14,3000.94
3,2015-10-13,20.0,126.0,572.0,1100.93,3807.79
4,2014-02-06,3.0,61.0,445.0,43.52,4157.61
5,2013-04-25,30.0,226.0,537.0,573.93,1642.63
6,2008-01-29,1.0,10.0,88.0,25.4,121.9
7,2006-07-24,161.0,664.0,2312.0,218728.31,403733.82
8,2005-07-21,45.0,298.0,1355.0,19625.72,117483.2
9,2013-08-19,93.0,316.0,1242.0,261734.14,388940.411


In [12]:
# Create binary target variables for classification

# High severity outage (>50,000 customers OR >1000 MW demand loss)
merged_df['is_high_severity'] = (
    (merged_df['CUSTOMERS.AFFECTED'] >= 50000) | 
    (merged_df['DEMAND.LOSS.MW'] >= 1000)
).astype(int)

# Long duration outage (>8 hours / 480 minutes)
merged_df['is_long_duration'] = (merged_df['OUTAGE.DURATION'] >= 480).astype(int)

# Active fire day (any fire reported on that day)
merged_df['has_active_fire'] = (merged_df['daily_fire_count'] > 0).astype(int)

# High fire activity day (above median fire count)
median_fires = merged_df[merged_df['daily_fire_count'] > 0]['daily_fire_count'].median()
merged_df['is_high_fire_activity'] = (merged_df['daily_fire_count'] > median_fires).astype(int)

print('Target variables created:')
print(f"  High severity outages: {merged_df['is_high_severity'].sum()} ({merged_df['is_high_severity'].mean()*100:.1f}%)")
print(f"  Long duration outages: {merged_df['is_long_duration'].sum()} ({merged_df['is_long_duration'].mean()*100:.1f}%)")
print(f"  Days with active fires: {merged_df['has_active_fire'].sum()} ({merged_df['has_active_fire'].mean()*100:.1f}%)")

Target variables created:
  High severity outages: 76 (36.2%)
  Long duration outages: 79 (37.6%)
  Days with active fires: 193 (91.9%)


---
## 6. Final Dataset Summary

In [13]:
# Select final features for analysis
final_features = [
    # Identifiers
    'outage_date', 'outage_year', 'outage_month',
    
    # Temporal features
    'outage_dayofweek', 'outage_quarter', 'season', 
    'is_wildfire_season', 'is_weekend',
    
    # Outage characteristics
    'CAUSE.CATEGORY', 'CAUSE.CATEGORY.DETAIL',
    'OUTAGE.DURATION', 'DEMAND.LOSS.MW', 'CUSTOMERS.AFFECTED',
    'duration_category', 'impact_category', 'is_wildfire_related',
    
    # Daily fire metrics
    'daily_fire_count', 'daily_acres_burned', 'avg_fire_size', 'max_fire_size',
    'large_fire_count', 'large_fire_acres',
    
    # Satellite metrics
    'satellite_detections', 'total_frp', 'avg_frp', 'max_frp', 'avg_confidence',
    
    # Rolling features
    'fires_7day', 'fires_30day', 'acres_7day', 'acres_30day',
    
    # Target variables
    'is_high_severity', 'is_long_duration', 'has_active_fire', 'is_high_fire_activity'
]

# Keep only columns that exist
final_features = [col for col in final_features if col in merged_df.columns]

final_df = merged_df[final_features].copy()

print(f'Final dataset shape: {final_df.shape}')
print(f'\nFeature columns: {len(final_features)}')
final_df.info()

Final dataset shape: (210, 35)

Feature columns: 35
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210 entries, 0 to 209
Data columns (total 35 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   outage_date            210 non-null    datetime64[ns]
 1   outage_year            210 non-null    int32         
 2   outage_month           210 non-null    int32         
 3   outage_dayofweek       210 non-null    int32         
 4   outage_quarter         210 non-null    int32         
 5   season                 210 non-null    object        
 6   is_wildfire_season     210 non-null    int64         
 7   is_weekend             210 non-null    int64         
 8   CAUSE.CATEGORY         210 non-null    object        
 9   CAUSE.CATEGORY.DETAIL  110 non-null    object        
 10  OUTAGE.DURATION        198 non-null    float64       
 11  DEMAND.LOSS.MW         158 non-null    float64       
 12  CUSTOMERS.AF

In [14]:
# Summary statistics
print('=' * 60)
print('FINAL DATASET SUMMARY')
print('=' * 60)
print(f"\nTotal records: {len(final_df):,}")
print(f"Date range: {final_df['outage_date'].min()} to {final_df['outage_date'].max()}")
print(f"\nNumerical Features Summary:")
final_df.describe()

FINAL DATASET SUMMARY

Total records: 210
Date range: 2000-06-14 00:00:00 to 2016-04-02 00:00:00

Numerical Features Summary:


Unnamed: 0,outage_date,outage_year,outage_month,outage_dayofweek,outage_quarter,is_wildfire_season,is_weekend,OUTAGE.DURATION,DEMAND.LOSS.MW,CUSTOMERS.AFFECTED,is_wildfire_related,daily_fire_count,daily_acres_burned,avg_fire_size,max_fire_size,large_fire_count,large_fire_acres,satellite_detections,total_frp,avg_frp,max_frp,avg_confidence,fires_7day,fires_30day,acres_7day,acres_30day,is_high_severity,is_long_duration,has_active_fire,is_high_fire_activity
count,210,210.0,210.0,210.0,210.0,210.0,210.0,198.0,158.0,127.0,210.0,210.0,210.0,210.0,210.0,210.0,210.0,210.0,210.0,210.0,210.0,210.0,206.0,206.0,206.0,206.0,210.0,210.0,210.0,210.0
mean,2009-10-09 22:10:17.142857216,2009.271429,6.585714,2.590476,2.533333,0.490476,0.190476,1666.338384,667.594937,201365.7,0.07619,26.495238,6900.503619,83.069204,3275.070619,0.566667,6617.040238,46.128571,8498.151905,63.712726,462.934762,61.604223,159.961165,657.73301,30034.722044,80010.66,0.361905,0.37619,0.919048,0.452381
min,2000-06-14 00:00:00,2000.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,40.0,0.7,5.66,0.0,0.0,0.0,0.0
25%,2007-05-11 00:00:00,2007.0,4.0,1.0,2.0,0.0,0.0,111.5,76.25,1487.5,0.0,4.0,1.2875,0.297656,1.0,0.0,0.0,1.0,16.75,10.7,12.8,61.083333,36.0,199.0,45.655,551.58,0.0,0.0,1.0,0.0
50%,2010-01-19 00:00:00,2010.0,7.0,3.0,3.0,0.0,0.0,294.5,188.5,65000.0,0.0,18.0,18.19,1.172785,7.55,0.0,0.0,6.0,201.25,32.0,80.65,72.861111,139.0,536.0,808.12,7479.275,0.0,0.0,1.0,0.0
75%,2013-02-28 00:00:00,2013.0,10.0,4.0,4.0,1.0,0.0,1049.25,324.0,149000.0,0.0,36.0,381.5575,12.048682,281.0,0.0,0.0,22.75,1337.5,75.732675,303.425,82.975124,244.75,1054.0,5701.3475,35721.61,1.0,1.0,1.0,1.0
max,2016-04-02 00:00:00,2016.0,12.0,6.0,4.0,1.0,1.0,49427.0,41788.0,2606931.0,1.0,552.0,612702.1,3603.025,162818.0,57.0,597621.0,1105.0,480173.0,542.706897,11800.8,99.0,901.0,2312.0,749509.17,1131300.0,1.0,1.0,1.0,1.0
std,,4.043737,3.443986,1.902882,1.137164,0.501104,0.393615,4785.495501,3512.537804,408357.5,0.265937,45.213925,45336.670448,347.777101,14628.120328,4.015262,44327.650233,145.390442,40138.824483,85.545498,1219.731699,31.361867,143.777329,550.058109,105299.574235,216456.9,0.4817,0.485586,0.273414,0.498917


In [15]:
# Categorical features distribution
print('Cause Category Distribution:')
print(final_df['CAUSE.CATEGORY'].value_counts())
print('\nSeason Distribution:')
print(final_df['season'].value_counts())
print('\nDuration Category Distribution:')
print(final_df['duration_category'].value_counts())

Cause Category Distribution:
CAUSE.CATEGORY
severe weather                   70
system operability disruption    41
islanding                        28
intentional attack               24
equipment failure                21
fuel supply emergency            17
public appeal                     9
Name: count, dtype: int64

Season Distribution:
season
Summer    60
Winter    59
Spring    48
Fall      43
Name: count, dtype: int64

Duration Category Distribution:
duration_category
Medium      82
Extended    43
Short       37
Long        36
Unknown     12
Name: count, dtype: int64


---
## 7. Save Final Dataset

In [None]:
# Save the final merged dataset to features folder
final_df.to_csv(FEATURES_DATA_PATH + 'california_outages_with_fire_features.csv', index=False)

# Also save the monthly aggregations for time series analysis
monthly_fires.to_csv(FEATURES_DATA_PATH + 'monthly_wildfire_aggregations.csv', index=False)

# Save daily aggregations
complete_daily.to_csv(FEATURES_DATA_PATH + 'daily_wildfire_aggregations.csv', index=False)

print('Feature datasets saved to data/features/')
print(f'  - california_outages_with_fire_features.csv ({len(final_df):,} records)')
print(f'  - monthly_wildfire_aggregations.csv ({len(monthly_fires):,} records)')
print(f'  - daily_wildfire_aggregations.csv ({len(complete_daily):,} records)')

In [17]:
# Preview final dataset
print('Final Dataset Preview:')
final_df.head(10)

Final Dataset Preview:


Unnamed: 0,outage_date,outage_year,outage_month,outage_dayofweek,outage_quarter,season,is_wildfire_season,is_weekend,CAUSE.CATEGORY,CAUSE.CATEGORY.DETAIL,OUTAGE.DURATION,DEMAND.LOSS.MW,CUSTOMERS.AFFECTED,duration_category,impact_category,is_wildfire_related,daily_fire_count,daily_acres_burned,avg_fire_size,max_fire_size,large_fire_count,large_fire_acres,satellite_detections,total_frp,avg_frp,max_frp,avg_confidence,fires_7day,fires_30day,acres_7day,acres_30day,is_high_severity,is_long_duration,has_active_fire,is_high_fire_activity
0,2007-09-04,2007,9,1,3,Fall,1,0,severe weather,heatwave,420.0,,,Medium,Unknown,0,48.0,24.55,0.511458,5.0,0.0,0.0,165.0,34776.4,210.766061,2500.9,89.836364,480.0,1473.0,119713.43,134779.78,0,0,1,1
1,2008-05-08,2008,5,3,2,Spring,0,0,system operability disruption,,155.0,483.0,,Medium,Unknown,0,16.0,76.7,4.79375,52.0,0.0,0.0,17.0,520.6,30.623529,175.7,70.529412,133.0,693.0,2752.66,5495.77,0,0,1,0
2,2006-05-19,2006,5,4,2,Spring,0,0,severe weather,thunderstorm,437.0,133.0,,Medium,Unknown,0,15.0,14.1,0.94,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,194.0,399.0,2759.14,3000.94,0,0,1,0
3,2015-10-13,2015,10,1,4,Fall,1,0,public appeal,,247.0,41788.0,,Medium,Unknown,0,20.0,255.25,12.7625,250.0,0.0,0.0,13.0,1615.2,124.246154,418.1,79.615385,126.0,572.0,1100.93,3807.79,1,0,1,1
4,2014-02-06,2014,2,3,1,Winter,0,0,fuel supply emergency,Natural Gas,540.0,4000.0,,Long,Unknown,0,3.0,1.2,0.4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,61.0,445.0,43.52,4157.61,1,1,1,0
5,2013-04-25,2013,4,3,2,Spring,0,0,intentional attack,vandalism,1135.0,0.0,0.0,Long,Low,0,30.0,19.14,0.638,7.0,0.0,0.0,7.0,174.7,24.957143,46.9,72.571429,226.0,537.0,573.93,1642.63,0,1,1,1
6,2008-01-29,2008,1,1,1,Winter,0,0,equipment failure,,437.0,,,Medium,Unknown,0,1.0,1.0,1.0,1.0,0.0,0.0,2.0,43.0,21.5,22.8,69.0,10.0,88.0,25.4,121.9,0,0,1,0
7,2006-07-24,2006,7,0,3,Summer,1,0,severe weather,heatwave,180.0,695.0,,Medium,Unknown,0,161.0,124358.95,772.415839,66113.0,6.0,123019.0,94.0,11231.2,119.480851,1104.9,87.12766,664.0,2312.0,218728.31,403733.82,0,0,1,1
8,2005-07-21,2005,7,3,3,Summer,1,0,system operability disruption,,171.0,197.0,128050.0,Medium,Severe,0,45.0,495.75,11.016667,181.0,0.0,0.0,8.0,611.2,76.4,304.1,83.625,298.0,1355.0,19625.72,117483.2,1,0,1,1
9,2013-08-19,2013,8,0,3,Summer,1,0,severe weather,lightning,656.0,685.0,124000.0,Long,Severe,0,93.0,1010.89,10.869785,413.0,0.0,0.0,114.0,8405.3,73.730702,1067.2,85.166667,316.0,1242.0,261734.14,388940.411,1,1,1,1
