In [12]:
import pandas as pd
import numpy as np
from geopy.distance import geodesic
from datetime import datetime, timedelta
from sklearn.preprocessing import LabelEncoder

In [5]:
df = pd.read_csv("../data/our_data/merged_2015.csv",encoding='latin1')

In [6]:
df['BEGIN_DATE'] = pd.to_datetime(df['BEGIN_DATE_TIME']).dt.date
df['END_DATE'] = pd.to_datetime(df['END_DATE_TIME']).dt.date
df['event_duration'] = (pd.to_datetime(df['END_DATE']) - pd.to_datetime(df['BEGIN_DATE'])).dt.days

event_duration_map = df.groupby('EVENT_TYPE')['event_duration'].mean().to_dict()
df['avg_event_duration_by_type'] = df['EVENT_TYPE'].map(event_duration_map)

In [7]:
season_map = {
    'DEC': 'Winter', 'JAN': 'Winter', 'FEB': 'Winter',
    'MAR': 'Spring', 'APR': 'Spring', 'MAY': 'Spring',
    'JUN': 'Summer', 'JUL': 'Summer', 'AUG': 'Summer',
    'SEP': 'Fall', 'OCT': 'Fall', 'NOV': 'Fall'
}
df['season'] = df['MONTH_NAME'].map(season_map)

In [8]:
outage_freq = df[df['customers_out'] > 0].groupby('STATE').size()
df['region_outage_freq'] = df['STATE'].map(outage_freq)
df['region_outage_freq'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['region_outage_freq'].fillna(0, inplace=True)


In [9]:
scale_map = {
    'EF0': 0.5, 'EF1': 1, 'EF2': 2, 'EF3': 3,
    'EF4': 4, 'EF5': 5, 'F0': 0.5, 'F1': 1, 'F2': 2,
    'F3': 3, 'F4': 4, 'F5': 5
}
df['TOR_F_SCALE_NUM'] = df['TOR_F_SCALE'].map(scale_map)
df['TOR_F_SCALE_NUM'].fillna(0, inplace=True)
df['event_severity'] = df['MAGNITUDE'].fillna(0) + df['TOR_F_SCALE_NUM']

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TOR_F_SCALE_NUM'].fillna(0, inplace=True)


In [10]:
state_centers = df.groupby('STATE')[['BEGIN_LAT', 'BEGIN_LON']].mean()
df = df.merge(state_centers, on='STATE', suffixes=('', '_CENTER'))

def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # km
    phi1, phi2 = np.radians(lat1), np.radians(lat2)
    dphi = phi2 - phi1
    dlambda = np.radians(lon2 - lon1)
    a = np.sin(dphi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(dlambda/2)**2
    return 2 * R * np.arcsin(np.sqrt(a))

df['dist_to_state_center'] = haversine(
    df['BEGIN_LAT'], df['BEGIN_LON'],
    df['BEGIN_LAT_CENTER'], df['BEGIN_LON_CENTER']
)

In [13]:
monthly_outage_risk = df.groupby(['STATE', 'MONTH_NAME'])['customers_out'].mean().to_dict()
df['monthly_outage_risk_index'] = df.apply(
    lambda row: monthly_outage_risk.get((row['STATE'], row['MONTH_NAME']), 0), axis=1)

event_freq = df['EVENT_TYPE'].value_counts().to_dict()
df['event_type_freq'] = df['EVENT_TYPE'].map(event_freq)

region_demand_proxy = df.groupby('STATE')['customers_out'].mean().to_dict()
df['region_demand_proxy'] = df['STATE'].map(region_demand_proxy)

df['BEGIN_HOUR'] = df['BEGIN_TIME'].astype(str).str.zfill(4).str[:2].astype(int)
df['event_hour_norm'] = df['BEGIN_HOUR'] / 24.0

min_year = df['YEAR'].min()
df['year_trend'] = df['YEAR'] - min_year

df['flood_cause_cat'] = df['FLOOD_CAUSE'].fillna('Unknown')
df['flood_cause_cat'] = df['flood_cause_cat'].where(df['EVENT_TYPE'] == 'Flood', 'None')

avg_length_map = df[df['TOR_LENGTH'].notna()].groupby('EVENT_TYPE')['TOR_LENGTH'].mean().to_dict()
df['tornado_length_proxy'] = df.apply(
    lambda row: avg_length_map.get(row['EVENT_TYPE'], 0) if 'Tornado' in row['EVENT_TYPE'] else 0, axis=1)

avg_width_map = df[df['TOR_WIDTH'].notna()].groupby('EVENT_TYPE')['TOR_WIDTH'].mean().to_dict()
df['tornado_width_proxy'] = df.apply(
    lambda row: avg_width_map.get(row['EVENT_TYPE'], 0) if 'Tornado' in row['EVENT_TYPE'] else 0, axis=1)

df['TOR_F_SCALE_NUM'] = df['TOR_F_SCALE'].map(scale_map).fillna(0)
severity_by_region = df.groupby('STATE')[['MAGNITUDE', 'TOR_F_SCALE_NUM']].mean().sum(axis=1).to_dict()
df['region_avg_severity'] = df['STATE'].map(severity_by_region)

df['BEGIN_DATE'] = pd.to_datetime(df['BEGIN_DATE_TIME'])
df['is_weekday'] = df['BEGIN_DATE'].dt.weekday < 5
df['is_weekday'] = df['is_weekday'].astype(int)

neighbor_impact = df.groupby('TOR_OTHER_CZ_STATE')['customers_out'].mean().to_dict()
df['neighbor_outage_impact'] = df['TOR_OTHER_CZ_STATE'].map(neighbor_impact).fillna(0)

wfo_freq = df['WFO'].value_counts().to_dict()
df['wfo_influence'] = df['WFO'].map(wfo_freq).fillna(0)

df['lat_grid'] = df['BEGIN_LAT'].round()
df['lon_grid'] = df['BEGIN_LON'].round()
grid_density = df.groupby(['lat_grid', 'lon_grid']).size().to_dict()
df['grid_density'] = df.apply(lambda row: grid_density.get((row['lat_grid'], row['lon_grid']), 0), axis=1)

df['BEGIN_DATE'] = pd.to_datetime(df['BEGIN_DATE_TIME'])

df = df.sort_values('BEGIN_DATE')
df['event_7day_density'] = 0

for idx, row in df.iterrows():
    end_date = row['BEGIN_DATE']
    start_date = end_date - timedelta(days=7)
    same_region = (df['STATE'] == row['STATE']) & (df['BEGIN_DATE'] >= start_date) & (df['BEGIN_DATE'] < end_date)
    df.at[idx, 'event_7day_density'] = same_region.sum()

df['event_season_combo'] = df['EVENT_TYPE'].astype(str) + '_' + df['season'].astype(str)

source_validity = df.groupby('DATA_SOURCE').apply(lambda g: 1 - g.isnull().mean().mean()).to_dict()
df['data_source_reliability'] = df['DATA_SOURCE'].map(source_validity).fillna(0)

df['month_sin'] = np.sin(2 * np.pi * df['BEGIN_DATE'].dt.month / 12)

load_proxy = df.groupby('STATE')['customers_out'].mean().to_dict()
df['grid_load_proxy'] = df['STATE'].map(load_proxy)

  source_validity = df.groupby('DATA_SOURCE').apply(lambda g: 1 - g.isnull().mean().mean()).to_dict()


JORDAN PART

In [14]:
# df = pd.read_csv("chatgpt_features_full.csv",encoding='latin1')
# print(df.columns.tolist())

df['FIPS'] = df['STATE_FIPS'].astype(str).str.zfill(2) + df['CZ_FIPS'].astype(str).str.zfill(3)
df['FIPS'] = df['FIPS'].astype(int)


In [15]:
features = df[['customers_out', 'BEGIN_DATE_TIME', 'END_DATE_TIME',
    'BEGIN_LAT', 'BEGIN_LON', 'grid_density', 'grid_load_proxy', 'month_sin', 'is_weekday', 'region_avg_severity',
    'event_severity', 'EVENT_TYPE', 'avg_event_duration_by_type', 'region_outage_freq', 'FIPS']]

In [16]:
#change begin and end to datetime
features['BEGIN_DATE_TIME'] = pd.to_datetime(features['BEGIN_DATE_TIME'])
features['END_DATE_TIME'] = pd.to_datetime(features['END_DATE_TIME'])

#extract features
features['begin_month'] = features['BEGIN_DATE_TIME'].dt.month
features['begin_weekday'] = features['BEGIN_DATE_TIME'].dt.dayofweek  # Monday=0

#damage features are given in weird string format 
#features['DAMAGE_PROPERTY'] = features['DAMAGE_PROPERTY'].apply(parse_k).astype(int)
#features['DAMAGE_CROPS'] = features['DAMAGE_CROPS'].apply(parse_k).astype(int)

event_mapping = {event: idx for idx, event in enumerate(sorted(features['EVENT_TYPE'].unique()))}

def map_event_to_num(event):
    return event_mapping.get(event, -1)

features['event_type_num'] = features['EVENT_TYPE'].apply(map_event_to_num)

features = features.drop(columns=['BEGIN_DATE_TIME', 'END_DATE_TIME', 'EVENT_TYPE'])
print(features.columns.tolist())
features = features.dropna() #only one row contains NaNs

['customers_out', 'BEGIN_LAT', 'BEGIN_LON', 'grid_density', 'grid_load_proxy', 'month_sin', 'is_weekday', 'region_avg_severity', 'event_severity', 'avg_event_duration_by_type', 'region_outage_freq', 'FIPS', 'begin_month', 'begin_weekday', 'event_type_num']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['BEGIN_DATE_TIME'] = pd.to_datetime(features['BEGIN_DATE_TIME'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['END_DATE_TIME'] = pd.to_datetime(features['END_DATE_TIME'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['begin_month'] = features['BEGIN_DATE_TIME'].dt.mont

In [None]:
features.to_csv("featuresXGBoost.csv", index=False)