In [2]:
import pandas as pd

state_data = pd.read_csv('datasets/merged_state_data.csv')
weather_data = pd.read_csv('datasets/weather_monthly_state_aggregates.csv')
wildfire_data = pd.read_csv('datasets/wildfire_sizes_before_2010.csv')

print(len(state_data))
print(len(weather_data))
print(len(wildfire_data))

50
11775
9632


In [3]:
state_data = state_data.rename(columns={'State': 'STATE', 'mean_elevation': 'MEAN_ELEVATION', 'Land Area (sq mi)': 'LAND_AREA', 'Water Area (sq mi)': 'WATER_AREA', 'Total Area (sq mi)': 'TOTAL_AREA', 'Percentage of Federal Land': 'PERCENTAGE_FEDERAL_LAND', 'Urbanization Rate (%)': 'URBANIZATION_RATE'})
state_data.head()

Unnamed: 0,STATE,MEAN_ELEVATION,LAND_AREA,WATER_AREA,TOTAL_AREA,PERCENTAGE_FEDERAL_LAND,URBANIZATION_RATE
0,AK,580,570641,94743,665384,60.9%,64.9
1,AL,150,50645,1775,52420,2.7%,57.7
2,AR,200,52035,1143,53179,9.4%,55.5
3,AZ,1250,113594,396,113990,38.6%,89.3
4,CA,880,155779,7916,163695,45.4%,94.2


In [4]:
weather_data = weather_data.rename(columns={'State': 'STATE', 'year_month': 'PERIOD'})
weather_data.head()

Unnamed: 0,STATE,PERIOD,PRCP,EVAP,TMIN,TMAX
0,AK,1992-05,5.36,44.17,-56.0,233.0
1,AK,1992-06,14.03,39.27,-17.0,322.0
2,AK,1992-07,16.78,30.91,28.0,300.0
3,AK,1992-08,10.46,20.14,-33.0,267.0
4,AK,1992-09,17.25,15.27,-61.0,183.0


In [5]:
weather_data.describe()

Unnamed: 0,PRCP,EVAP,TMIN,TMAX
count,11775.0,11775.0,11775.0,11775.0
mean,22.792403,62.285918,11.948875,318.659788
std,18.153277,185.920998,82.541805,71.463314
min,0.0,-96.87,-456.0,-167.0
25%,9.57,31.28,-39.0,283.0
50%,19.82,48.4,6.0,328.0
75%,32.0,64.895,67.0,361.0
max,259.0,15445.7,233.0,539.0


In [6]:
wildfire_data = wildfire_data.rename(columns={'month': 'PERIOD', 'total_fire_size': 'TOTAL_FIRE_SIZE'})
wildfire_data.head()

Unnamed: 0,STATE,PERIOD,TOTAL_FIRE_SIZE
0,AK,1992-04,10.1
1,AK,1992-05,4309.5
2,AK,1992-06,86460.6
3,AK,1992-07,48578.5
4,AK,1992-08,3321.8


In [7]:
wildfire_data.describe()

Unnamed: 0,TOTAL_FIRE_SIZE
count,9632.0
mean,10649.28
std,82483.42
min,0.01
25%,24.0
50%,299.765
75%,2439.59
max,4779145.0


In [8]:
weather_wildfire_data = pd.merge(wildfire_data, weather_data, on=['STATE', 'PERIOD'], how='left')
print(len(weather_wildfire_data))
# weather_wildfire_data.head()
weather_wildfire_data.describe()

9632


Unnamed: 0,TOTAL_FIRE_SIZE,PRCP,EVAP,TMIN,TMAX
count,9632.0,6583.0,6583.0,6583.0,6583.0
mean,10649.28,21.221559,50.931341,10.677199,328.046939
std,82483.42,15.636771,23.654458,73.950807,61.07945
min,0.01,0.0,-6.86,-456.0,-83.0
25%,24.0,9.775,34.37,-39.0,289.0
50%,299.765,18.79,49.32,-6.0,328.0
75%,2439.59,29.48,63.15,56.0,367.0
max,4779145.0,213.0,392.33,217.0,539.0


In [9]:
state_weather_wildfire_data = pd.merge(weather_wildfire_data, state_data, on=['STATE'], how='left')
print(len(state_weather_wildfire_data))
state_weather_wildfire_data.head()

9632


Unnamed: 0,STATE,PERIOD,TOTAL_FIRE_SIZE,PRCP,EVAP,TMIN,TMAX,MEAN_ELEVATION,LAND_AREA,WATER_AREA,TOTAL_AREA,PERCENTAGE_FEDERAL_LAND,URBANIZATION_RATE
0,AK,1992-04,10.1,,,,,580,570641,94743,665384,60.9%,64.9
1,AK,1992-05,4309.5,5.36,44.17,-56.0,233.0,580,570641,94743,665384,60.9%,64.9
2,AK,1992-06,86460.6,14.03,39.27,-17.0,322.0,580,570641,94743,665384,60.9%,64.9
3,AK,1992-07,48578.5,16.78,30.91,28.0,300.0,580,570641,94743,665384,60.9%,64.9
4,AK,1992-08,3321.8,10.46,20.14,-33.0,267.0,580,570641,94743,665384,60.9%,64.9


#### More data cleaning

In [10]:
import numpy as np

dataset_loaded = state_weather_wildfire_data.copy()
dataset_loaded = dataset_loaded.sort_values(by='PERIOD')
dataset_loaded.head()

# convert date to sin encoded month / convert strings to numeric
dataset_loaded['PERIOD'] = pd.to_datetime(dataset_loaded['PERIOD'], format='%Y-%m')
dataset_loaded['MONTH'] = dataset_loaded['PERIOD'].dt.month
dataset_loaded['YEAR'] = dataset_loaded['PERIOD'].dt.year
dataset_loaded['MONTH_SIN'] = np.sin(2 * np.pi * dataset_loaded['MONTH'] / 12)
dataset_loaded['PERCENTAGE_FEDERAL_LAND'] = pd.to_numeric(
    dataset_loaded['PERCENTAGE_FEDERAL_LAND'].astype(str).str.rstrip('%').astype(float),
    errors='coerce'
)


dataset_loaded.head()
dataset_loaded.describe()

target_feature = 'MONTH'
features_to_check = ['PRCP', 'EVAP', 'TMIN', 'TMAX']

null_counts_by_target = dataset_loaded.groupby(target_feature)[features_to_check].apply(lambda group: group.isnull().sum())
print("\nNull counts in defined features for each target value:")
print(null_counts_by_target)



Null counts in defined features for each target value:
       PRCP  EVAP  TMIN  TMAX
MONTH                        
1       438   438   438   438
2       494   494   494   494
3       548   548   548   548
4       232   232   232   232
5        85    85    85    85
6        70    70    70    70
7        77    77    77    77
8        71    71    71    71
9        72    72    72    72
10      127   127   127   127
11      406   406   406   406
12      429   429   429   429


In [11]:
for feature in features_to_check:
    # First, fill missing values using averages computed by month and state
    group_avg_ms = dataset_loaded.groupby(['MONTH', 'STATE'])[feature].transform('mean')
    dataset_loaded[feature].fillna(group_avg_ms, inplace=True)

    # If there are still missing values, fill them using monthly averages
    if dataset_loaded[feature].isna().sum() > 0:
        group_avg_month = dataset_loaded.groupby('MONTH')[feature].transform('mean')
        dataset_loaded[feature].fillna(group_avg_month, inplace=True)
        print(f"Missing values for {feature} filled with monthly averages as fallback.")
    else:
        print(f"Missing values for {feature} filled with monthly-state averages.")

Missing values for PRCP filled with monthly averages as fallback.
Missing values for EVAP filled with monthly averages as fallback.
Missing values for TMIN filled with monthly averages as fallback.
Missing values for TMAX filled with monthly averages as fallback.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset_loaded[feature].fillna(group_avg_ms, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset_loaded[feature].fillna(group_avg_month, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on 

In [12]:
null_counts_by_target = dataset_loaded.groupby(target_feature)[features_to_check].apply(lambda group: group.isnull().sum())
print("\nNull counts in defined features for each target value:")
print(null_counts_by_target)


Null counts in defined features for each target value:
       PRCP  EVAP  TMIN  TMAX
MONTH                        
1         0     0     0     0
2         0     0     0     0
3         0     0     0     0
4         0     0     0     0
5         0     0     0     0
6         0     0     0     0
7         0     0     0     0
8         0     0     0     0
9         0     0     0     0
10        0     0     0     0
11        0     0     0     0
12        0     0     0     0


In [13]:
# for feature in features_to_check:
#     # Count the missing values per target group, calculate mean per group
#     missing_counts = dataset_loaded.groupby(target_feature)[feature].apply(lambda x: x.isna().sum())
#     group_means = dataset_loaded.groupby(target_feature)[feature].mean()
#
#     # Consider only groups that actually have missing values
#     groups_with_missing = missing_counts[missing_counts > 0]
#
#     if groups_with_missing.sum() > 0:
#         weighted_avg = (group_means[groups_with_missing.index] * groups_with_missing).sum() / groups_with_missing.sum()
#         print(f"Weighted average for {feature}: {weighted_avg}")
#
#         # Fill missing values with the computed weighted average
#         dataset_loaded[feature].fillna(weighted_avg, inplace=True)
#     else:
#         print(f"No missing values for {feature}.")
#
#
# null_counts_by_target = dataset_loaded.groupby(target_feature)[features_to_check].apply(lambda group: group.isnull().sum())
# print("\nNull counts in defined features for each target value:")
# print(null_counts_by_target)


In [14]:
dataset_loaded.head()

Unnamed: 0,STATE,PERIOD,TOTAL_FIRE_SIZE,PRCP,EVAP,TMIN,TMAX,MEAN_ELEVATION,LAND_AREA,WATER_AREA,TOTAL_AREA,PERCENTAGE_FEDERAL_LAND,URBANIZATION_RATE,MONTH,YEAR,MONTH_SIN
3226,LA,1992-01-01,1047.4,48.61,18.3,-39.0,211.0,30,43204,9174,52378,4.7,71.5,1,1992,0.5
2276,ID,1992-01-01,10.0,6.26,0.0,-306.0,39.0,1520,82643,926,83569,61.9,69.2,1,1992,0.5
4961,NC,1992-01-01,2095.0,21.6,10.73,-33.0,189.0,210,48618,5201,53819,7.8,66.7,1,1992,0.5
5189,ND,1992-01-01,0.1,24.118084,25.327012,-75.203212,237.337856,580,69001,1698,70698,3.9,61.0,1,1992,0.5
5587,NH,1992-01-01,2.5,24.118084,25.327012,-75.203212,237.337856,300,8953,397,9349,14.0,58.3,1,1992,0.5


In [15]:
dataset_loaded = dataset_loaded.drop(columns=['MONTH', 'YEAR', 'PERIOD'])
dataset_loaded.describe()

Unnamed: 0,TOTAL_FIRE_SIZE,PRCP,EVAP,TMIN,TMAX,MEAN_ELEVATION,LAND_AREA,WATER_AREA,TOTAL_AREA,PERCENTAGE_FEDERAL_LAND,URBANIZATION_RATE,MONTH_SIN
count,9632.0,9632.0,9632.0,9632.0,9632.0,9632.0,9632.0,9632.0,9632.0,9632.0,9632.0,9632.0
mean,10649.28,21.295254,45.52612,-4.571491,304.384016,546.721034,74150.715947,5075.553156,79226.212313,16.331676,72.499201,-0.002842887
std,82483.42,14.051364,23.359645,71.600384,69.467577,538.988922,78942.638002,12927.298639,88882.864369,20.535084,13.565915,0.7162742
min,0.01,0.0,-6.86,-456.0,-83.0,20.0,1034.0,192.0,1545.0,0.3,35.1,-1.0
25%,24.0,11.5575,29.2775,-49.43966,261.0,180.0,40861.0,791.0,42775.0,2.4,63.2,-0.8660254
50%,299.765,20.494181,41.885,-22.0,306.0,300.0,55519.0,1509.0,65496.0,5.4,71.9,1.224647e-16
75%,2439.59,27.145,58.1325,33.0,356.0,670.0,82170.0,4537.0,86936.0,28.6,85.6,0.8660254
max,4779145.0,213.0,392.33,217.0,539.0,2070.0,570641.0,94743.0,665384.0,80.1,94.2,1.0


In [16]:
dataset_loaded.head()

Unnamed: 0,STATE,TOTAL_FIRE_SIZE,PRCP,EVAP,TMIN,TMAX,MEAN_ELEVATION,LAND_AREA,WATER_AREA,TOTAL_AREA,PERCENTAGE_FEDERAL_LAND,URBANIZATION_RATE,MONTH_SIN
3226,LA,1047.4,48.61,18.3,-39.0,211.0,30,43204,9174,52378,4.7,71.5,0.5
2276,ID,10.0,6.26,0.0,-306.0,39.0,1520,82643,926,83569,61.9,69.2,0.5
4961,NC,2095.0,21.6,10.73,-33.0,189.0,210,48618,5201,53819,7.8,66.7,0.5
5189,ND,0.1,24.118084,25.327012,-75.203212,237.337856,580,69001,1698,70698,3.9,61.0,0.5
5587,NH,2.5,24.118084,25.327012,-75.203212,237.337856,300,8953,397,9349,14.0,58.3,0.5


In [18]:
dataset_loaded.to_csv('datasets/dataset_cleaned_3.csv', index=False)

In [19]:
from sklearn.preprocessing import TargetEncoder

encoded_state = TargetEncoder(smooth='auto').fit_transform(pd.DataFrame(dataset_loaded['STATE']), dataset_loaded['TOTAL_FIRE_SIZE'])
print(len(encoded_state))
pd.DataFrame(encoded_state).head()


9632


Unnamed: 0,0
0,2321.179683
1,39261.344783
2,2164.012662
3,1767.534205
4,23.50701


In [20]:
dataset_loaded['STATE'] = encoded_state
dataset_loaded.head()

Unnamed: 0,STATE,TOTAL_FIRE_SIZE,PRCP,EVAP,TMIN,TMAX,MEAN_ELEVATION,LAND_AREA,WATER_AREA,TOTAL_AREA,PERCENTAGE_FEDERAL_LAND,URBANIZATION_RATE,MONTH_SIN
3226,2321.179683,1047.4,48.61,18.3,-39.0,211.0,30,43204,9174,52378,4.7,71.5,0.5
2276,39261.344783,10.0,6.26,0.0,-306.0,39.0,1520,82643,926,83569,61.9,69.2,0.5
4961,2164.012662,2095.0,21.6,10.73,-33.0,189.0,210,48618,5201,53819,7.8,66.7,0.5
5189,1767.534205,0.1,24.118084,25.327012,-75.203212,237.337856,580,69001,1698,70698,3.9,61.0,0.5
5587,23.50701,2.5,24.118084,25.327012,-75.203212,237.337856,300,8953,397,9349,14.0,58.3,0.5


In [22]:
dataset_loaded.to_csv('datasets/dataset_final_3_target_encoding.csv', index=False)