In [54]:
import pandas as pd
import numpy as np
from datetime import timedelta
from sklearn.model_selection import train_test_split

### Loading data

In [66]:
# data=pd.read_csv('data_leadtime.csv')
data=pd.read_csv('all_years_combined.csv')
train, test = train_test_split(data, test_size=0.2, random_state=42)

### Convert Property and Crops into numerical data

In [68]:
def parse_damage(damage_str):
    if pd.isna(damage_str):
        return 0
    damage_str = damage_str.upper().strip()
    if damage_str[-1] == 'K':
        return float(damage_str[:-1]) * 1_000
    elif damage_str[-1] == 'M':
        return float(damage_str[:-1]) * 1_000_000
    elif damage_str[-1] == 'B':
        return float(damage_str[:-1]) * 1_000_000_000
    else:
        # 如果没有单位，尝试直接转换
        try:
            return float(damage_str)
        except ValueError:
            return 0  # 非法字符串默认返回 0

# 应用函数转换
train['DAMAGE_PROPERTY'] = train['DAMAGE_PROPERTY'].apply(parse_damage).astype(int)
train['DAMAGE_CROPS'] = train['DAMAGE_CROPS'].apply(parse_damage).astype(int) 

test['DAMAGE_PROPERTY'] = test['DAMAGE_PROPERTY'].apply(parse_damage).astype(int)
test['DAMAGE_CROPS'] = test['DAMAGE_CROPS'].apply(parse_damage).astype(int) 



### Data Feature Engineering

In [None]:
# Convert basic fields first
train['BEGIN_DATE'] = pd.to_datetime(train['BEGIN_DATE_TIME']).dt.date
train['END_DATE'] = pd.to_datetime(train['END_DATE_TIME']).dt.date
# train['BEGIN_HOUR'] = train['BEGIN_TIME'].astype(str).str.zfill(4).str[:2].astype(int)

train['event_duration'] = (pd.to_datetime(train['END_DATE']) - pd.to_datetime(train['BEGIN_DATE'])).dt.days

# Average event duration by event type
event_duration_map = train.groupby('EVENT_TYPE')['event_duration'].mean().to_dict()
train['avg_event_duration_by_type'] = train['EVENT_TYPE'].map(event_duration_map)

# Historical outage frequency by region
outage_counts = train[train['customers_out_sum'] > 0].groupby('STATE').size().to_dict()
train['region_outage_freq'] = train['STATE'].map(outage_counts).fillna(0)

# Distance to state center (approximated using mean latitude and longitude)
state_centers = train.groupby('STATE')[['BEGIN_LAT', 'BEGIN_LON']].mean().to_dict('index')
train['dist_to_state_center'] = train.apply(
    lambda row: np.sqrt(
        (row['BEGIN_LAT'] - state_centers.get(row['STATE'], {'BEGIN_LAT':0})['BEGIN_LAT'])**2 +
        (row['BEGIN_LON'] - state_centers.get(row['STATE'], {'BEGIN_LON':0})['BEGIN_LON'])**2
    ), axis=1
)

# Monthly outage risk index
monthly_outage_risk = train.groupby(['STATE', 'MONTH_NAME'])['customers_out_sum'].mean().to_dict()
train['monthly_outage_risk_index'] = train.apply(
    lambda row: monthly_outage_risk.get((row['STATE'], row['MONTH_NAME']), 0), axis=1)

# Frequency of each event type
event_freq = train['EVENT_TYPE'].value_counts().to_dict()
train['event_type_freq'] = train['EVENT_TYPE'].map(event_freq)

# Proxy for regional population demand
region_demand_proxy = train.groupby('STATE')['customers_out_sum'].mean().to_dict()
train['region_demand_proxy'] = train['STATE'].map(region_demand_proxy)

# Yearly trend factor
min_year = train['YEAR'].min()
train['year_trend'] = train['YEAR'] - min_year

# Whether it's a weekday
train['BEGIN_DATE'] = pd.to_datetime(train['BEGIN_DATE_TIME'])
train['is_weekday'] = train['BEGIN_DATE'].dt.weekday < 5
train['is_weekday'] = train['is_weekday'].astype(int)

# Neighboring region impact factor
neighbor_impact = train.groupby('TOR_OTHER_CZ_STATE')['customers_out_sum'].mean().to_dict()
train['neighbor_outage_impact'] = train['TOR_OTHER_CZ_STATE'].map(neighbor_impact).fillna(0)

# Weather forecast office influence
wfo_freq = train['WFO'].value_counts().to_dict()
train['wfo_influence'] = train['WFO'].map(wfo_freq).fillna(0)

# Latitude and longitude grid density
train['lat_grid'] = train['BEGIN_LAT'].round()
train['lon_grid'] = train['BEGIN_LON'].round()
grid_density = train.groupby(['lat_grid', 'lon_grid']).size().to_dict()
train['grid_density'] = train.apply(lambda row: grid_density.get((row['lat_grid'], row['lon_grid']), 0), axis=1)

# Event density in the past 7 days (may be inefficient for large datasets, consider optimization)
train = train.sort_values('BEGIN_DATE')
train['event_7day_density'] = 0
for idx, row in train.iterrows():
    end_date = row['BEGIN_DATE']
    start_date = end_date - timedelta(days=7)
    same_region = (train['STATE'] == row['STATE']) & (train['BEGIN_DATE'] >= start_date) & (train['BEGIN_DATE'] < end_date)
    train.at[idx, 'event_7day_density'] = same_region.sum()

# Reliability of historical data sources
source_validity = train.groupby('DATA_SOURCE').apply(lambda g: 1 - g.isnull().mean().mean()).to_dict()
train['data_source_reliability'] = train['DATA_SOURCE'].map(source_validity).fillna(0)

# Month as a cyclic feature
train['month_sin'] = np.sin(2 * np.pi * train['BEGIN_DATE'].dt.month / 12)

# Proxy for regional power grid load
load_proxy = train.groupby('STATE')['customers_out_sum'].mean().to_dict()
train['grid_load_proxy'] = train['STATE'].map(load_proxy)

# Calculate average damage per FIPS region
train['DAMAGE_PROPERTY_MEAN'] = train.groupby('FIPS')['DAMAGE_PROPERTY'].transform('mean')
train['DAMAGE_CROPS_MEAN'] = train.groupby('FIPS')['DAMAGE_CROPS'].transform('mean')



  train['BEGIN_DATE'] = pd.to_datetime(train['BEGIN_DATE_TIME']).dt.date
  train['END_DATE'] = pd.to_datetime(train['END_DATE_TIME']).dt.date
  train['BEGIN_DATE'] = pd.to_datetime(train['BEGIN_DATE_TIME'])
  source_validity = train.groupby('DATA_SOURCE').apply(lambda g: 1 - g.isnull().mean().mean()).to_dict()


### Mapping Train Set Features data into Test Dataset

In [None]:
# Assume test is already loaded
test = test.copy()

# Date conversion
test['BEGIN_DATE'] = pd.to_datetime(test['BEGIN_DATE_TIME']).dt.date
test['END_DATE'] = pd.to_datetime(test['END_DATE_TIME']).dt.date
test['event_duration'] = (pd.to_datetime(test['END_DATE']) - pd.to_datetime(test['BEGIN_DATE'])).dt.days

# Average event duration by event type
test['avg_event_duration_by_type'] = test['EVENT_TYPE'].map(event_duration_map)

# Historical outage frequency by region
test['region_outage_freq'] = test['STATE'].map(outage_counts).fillna(0)

# Distance to state center
test['dist_to_state_center'] = test.apply(
    lambda row: np.sqrt(
        (row['BEGIN_LAT'] - state_centers.get(row['STATE'], {'BEGIN_LAT': 0})['BEGIN_LAT'])**2 +
        (row['BEGIN_LON'] - state_centers.get(row['STATE'], {'BEGIN_LON': 0})['BEGIN_LON'])**2
    ), axis=1
)

# Monthly outage risk index
test['monthly_outage_risk_index'] = test.apply(
    lambda row: monthly_outage_risk.get((row['STATE'], row['MONTH_NAME']), 0), axis=1)

# Event type frequency
test['event_type_freq'] = test['EVENT_TYPE'].map(event_freq)

# Proxy for regional population demand
test['region_demand_proxy'] = test['STATE'].map(region_demand_proxy)

# Yearly trend factor
test['year_trend'] = test['YEAR'] - min_year

# Whether it is a weekday
test['BEGIN_DATE'] = pd.to_datetime(test['BEGIN_DATE_TIME'])
test['is_weekday'] = (test['BEGIN_DATE'].dt.weekday < 5).astype(int)

# Neighboring region impact factor
test['neighbor_outage_impact'] = test['TOR_OTHER_CZ_STATE'].map(neighbor_impact).fillna(0)

# Weather forecast office influence
test['wfo_influence'] = test['WFO'].map(wfo_freq).fillna(0)

# Latitude and longitude grid density
test['lat_grid'] = test['BEGIN_LAT'].round()
test['lon_grid'] = test['BEGIN_LON'].round()
test['grid_density'] = test.apply(
    lambda row: grid_density.get((row['lat_grid'], row['lon_grid']), 0), axis=1
)

# Reliability of historical data sources
test['data_source_reliability'] = test['DATA_SOURCE'].map(source_validity).fillna(0)

# Month as a cyclic feature
test['month_sin'] = np.sin(2 * np.pi * test['BEGIN_DATE'].dt.month / 12)

# Proxy for regional power grid load
test['grid_load_proxy'] = test['STATE'].map(load_proxy)

# FIPS-related averages (use merge for safety)
fips_property_mean = train.groupby('FIPS')['DAMAGE_PROPERTY'].mean().reset_index().rename(columns={'DAMAGE_PROPERTY': 'DAMAGE_PROPERTY_MEAN'})
fips_crops_mean = train.groupby('FIPS')['DAMAGE_CROPS'].mean().reset_index().rename(columns={'DAMAGE_CROPS': 'DAMAGE_CROPS_MEAN'})
test = test.merge(fips_property_mean, on='FIPS', how='left')
test = test.merge(fips_crops_mean, on='FIPS', how='left')



  test['BEGIN_DATE'] = pd.to_datetime(test['BEGIN_DATE_TIME']).dt.date
  test['END_DATE'] = pd.to_datetime(test['END_DATE_TIME']).dt.date
  test['BEGIN_DATE'] = pd.to_datetime(test['BEGIN_DATE_TIME'])


### Keep Columns The Same

In [None]:
test_columns = test.columns

train = train[test_columns]

### Check NAN values and INF values

In [None]:
# Import necessary module
import numpy as np

# Check for missing and infinite values in the training set
print("=== Train NaN Summary ===")
print(train.isna().sum()[train.isna().sum() > 0].sort_values(ascending=False))

print("\n=== Train Inf Summary ===")
print((~np.isfinite(train.select_dtypes(include=[np.number]))).sum().sort_values(ascending=False))

# Check for missing and infinite values in the test set
print("\n=== Test NaN Summary ===")
print(test.isna().sum()[test.isna().sum() > 0].sort_values(ascending=False))

print("\n=== Test Inf Summary ===")
print((~np.isfinite(test.select_dtypes(include=[np.number]))).sum().sort_values(ascending=False))


In [73]:
train.to_csv('final_leadtimeData_train.csv', index=False)
test.to_csv('final_leadtimeData_test.csv', index=False)