### View 4

Train a standard regression model to predict the next-day active fire count, then check whether its predictions are worse for fires close to cities.

In [18]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score


# --- Load, build the target, and tag each row with a 2-degree spatial bin ---
# Rows are ordered by (fire_id, date) so that shift(-1) inside a fire picks up
# the following row's active_fire_count. The last row of every fire has no
# successor and is dropped.
df = (
    pd.read_csv("with_wui.csv")
    .sort_values(["fire_id", "date"])
    .assign(
        next_day_fire=lambda d: d.groupby("fire_id")["active_fire_count"].shift(-1)
    )
    .dropna(subset=["next_day_fire"])
    .astype({"next_day_fire": float})
    .assign(
        lat_bin=lambda d: (d["lat"] // 2).astype(int),
        lon_bin=lambda d: (d["lon"] // 2).astype(int),
        spatial_bin=lambda d: d["lat_bin"].astype(str) + "_" + d["lon_bin"].astype(str),
    )
)

# --- Hold out whole spatial bins ---
# Bins are ranked by the earliest date seen in each one; the first 80% of bins
# go to training and the remaining bins form the test set.
bin_start_date = df.groupby("spatial_bin")["date"].min()
ordered_bins = bin_start_date.sort_values().index

cut = int(len(ordered_bins) * 0.8)
train_bins, test_bins = set(ordered_bins[:cut]), set(ordered_bins[cut:])

train_idx = df["spatial_bin"].isin(train_bins)
test_idx = df["spatial_bin"].isin(test_bins)

# Every column that is not an identifier, the target, or a split helper is
# used as a model input.
non_feature_cols = [
    "fire_id", "date", "next_day_fire", "spatial_bin", "lat_bin", "lon_bin"
]
feature_cols = df.columns.difference(non_feature_cols)

X_train = df.loc[train_idx, feature_cols]
y_train = df.loc[train_idx, "next_day_fire"]
X_test = df.loc[test_idx, feature_cols]
y_test = df.loc[test_idx, "next_day_fire"]

print(f"Train rows: {len(X_train)}, Test rows: {len(X_test)}")
print(f"Train bins: {len(train_bins)}, Test bins: {len(test_bins)}")

# --- Fit and score ---
rf_params = {
    "n_estimators": 500,
    "min_samples_leaf": 3,
    "random_state": 0,  # fixed seed so reruns give the same forest
    "n_jobs": -1,
}
model = RandomForestRegressor(**rf_params)
model.fit(X_train, y_train)
preds = model.predict(X_test)

# NOTE(review): only overall test metrics are reported here; the near-city
# comparison described in the section text is not computed in this cell.
print("\nOverall performance:")
print("MAE:", mean_absolute_error(y_test, preds))
print("R^2:", r2_score(y_test, preds))

# --- Feature importances, largest first ---
imps = model.feature_importances_
order = np.argsort(imps)[::-1]

print("\nTop 20 features:")
top = order[:20]
for feat_name, feat_imp in zip(feature_cols[top], imps[top]):
    print(f"{feat_name}: {feat_imp:.4f}")


Train rows: 21742, Test rows: 1716
Train bins: 104, Test bins: 27

Overall performance:
MAE: 48.769939348127394
R^2: 0.22947643752442837

Top 20 features:
active_fire_count: 0.6933
wind_dir: 0.0338
ERC: 0.0223
wind_speed: 0.0191
forecast_specific_humidity: 0.0160
forecast_wind_speed: 0.0154
slope: 0.0147
M11: 0.0140
temp_max: 0.0138
PDSI: 0.0124
NDVI: 0.0122
temp_min: 0.0118
elevation: 0.0116
forecast_temp: 0.0114
lon: 0.0107
aspect: 0.0106
precip: 0.0101
I2: 0.0094
lat: 0.0086
EVI2: 0.0081


In [17]:
# Baseline for comparison: same target and model settings, but evaluated with
# a plain random row-level 80/20 split instead of holding out spatial bins.

# Imported in this cell so it does not depend on a name left over from an
# earlier kernel session (it is not imported anywhere else in this section).
from sklearn.model_selection import train_test_split

df = pd.read_csv("with_wui.csv")
# Order rows by fire and date so shift(-1) below refers to the following row
# of the same fire.
df = df.sort_values(["fire_id", "date"])

# Target: the next row's active_fire_count within each fire. The last row of
# a fire has no successor (NaN) and is dropped.
df["next_day_fire"] = df.groupby("fire_id")["active_fire_count"].shift(-1)
df = df.dropna(subset=["next_day_fire"])
df["next_day_fire"] = df["next_day_fire"].astype(float)

# All columns except the identifiers and the target are used as inputs.
feature_cols = df.columns.difference(
    ["fire_id", "date", "next_day_fire"]
)

X = df[feature_cols]
y = df["next_day_fire"]

# NOTE(review): the shuffle is row-level, so rows from one fire_id can land in
# both train and test. Scores from this split are therefore likely optimistic
# compared with a grouped (per-fire or per-region) hold-out.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, shuffle=True
)

model = RandomForestRegressor(
    n_estimators=500,
    min_samples_leaf=3,
    random_state=0,  # fixed seed so reruns give the same forest
    n_jobs=-1
)

model.fit(X_train, y_train)
preds = model.predict(X_test)

print("MAE:", mean_absolute_error(y_test, preds))
print("R^2:", r2_score(y_test, preds))

# Rank features by impurity-based importance, largest first.
imps = model.feature_importances_
order = np.argsort(imps)[::-1]

print("\nTop 20 features:")
for i in order[:20]:
    print(f"{feature_cols[i]}: {imps[i]:.4f}")



MAE: 55.78501264424611
R^2: 0.7134890015333826

Top 20 features:
active_fire_count: 0.6855
wind_dir: 0.0332
ERC: 0.0212
forecast_specific_humidity: 0.0175
temp_max: 0.0168
forecast_wind_speed: 0.0160
wind_speed: 0.0151
M11: 0.0142
slope: 0.0140
precip: 0.0139
temp_min: 0.0139
lon: 0.0138
forecast_temp: 0.0123
elevation: 0.0119
NDVI: 0.0119
EVI2: 0.0102
I2: 0.0098
aspect: 0.0096
PDSI: 0.0095
forecast_wind_dir: 0.0081
