### View 4
Train a standard regression model and check whether it performs worse on fires close to cities.

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Load the fire table and order the rows by fire, then by date within each fire.
df = pd.read_csv("with_wui.csv").sort_values(["fire_id", "date"])

# Coarse spatial grid: floor-divide each coordinate by 2 to get an integer bin
# (presumably 2-degree cells -- confirm the units of `lat` / `lon`).
for coord in ("lat", "lon"):
    df[f"{coord}_bin"] = (df[coord] // 2).astype(int)

# One label per grid cell, formatted "<lat_bin>_<lon_bin>".
df["spatial_bin"] = df["lat_bin"].astype(str).str.cat(df["lon_bin"].astype(str), sep="_")

# Earliest date seen in each spatial bin, then the bins ordered by that date.
bin_start_date = df["date"].groupby(df["spatial_bin"]).min()
ordered_bins = bin_start_date.sort_values().index

# The first 80% of bins (by earliest date) go to training, the rest to testing,
# so a given spatial bin never contributes rows to both sides.
cut = int(0.8 * len(ordered_bins))
train_bins, test_bins = set(ordered_bins[:cut]), set(ordered_bins[cut:])

# Boolean row masks over `df` for each side of the split.
bin_of_row = df["spatial_bin"]
train_idx = bin_of_row.isin(train_bins)
test_idx = bin_of_row.isin(test_bins)

# Columns that must not be fed to the model: identifiers, the date, the target,
# and the helper columns created for the spatial split.
non_feature_cols = ["fire_id", "date", "next_day_fire", "spatial_bin", "lat_bin", "lon_bin"]
feature_cols = df.columns.difference(non_feature_cols)

# Design matrices and targets for each side of the split.
X_train = df.loc[train_idx, feature_cols]
y_train = df.loc[train_idx, "next_day_fire"]
X_test = df.loc[test_idx, feature_cols]
y_test = df.loc[test_idx, "next_day_fire"]

print(f"Train rows: {len(X_train)}, Test rows: {len(X_test)}")
print(f"Train bins: {len(train_bins)}, Test bins: {len(test_bins)}")

# Random-forest settings; random_state is pinned to 0 and n_jobs=-1 uses all cores.
rf_params = dict(
    n_estimators=300,
    min_samples_leaf=2,
    random_state=0,
    n_jobs=-1,
)

# Fit on the training bins, then predict the held-out bins.
model = RandomForestRegressor(**rf_params).fit(X_train, y_train)
preds = model.predict(X_test)

# Aggregate metrics over every held-out row.
print("\nOverall performance:")
overall_mae = mean_absolute_error(y_test, preds)
print("MAE:", overall_mae)
overall_r2 = r2_score(y_test, preds)
print("R²:", overall_r2)

# Importances from the fitted forest, with indices sorted from largest to smallest.
imps = model.feature_importances_
order = np.argsort(imps)[::-1]

print("\nTop 20 features:")
top = order[:20]
# `feature_cols` and `imps` are positionally aligned, so index both with `top`.
for name, weight in zip(feature_cols[top], imps[top]):
    print(f"{name}: {weight:.4f}")

# Per-fire evaluation: attach the predictions to the held-out rows and score
# each fire separately.
test_df = df.loc[test_idx].copy()
test_df["pred"] = preds

fire_r2 = {}
# A single groupby pass replaces re-filtering the whole frame once per fire.
# sort=False keeps fires in first-appearance order, matching `.unique()`.
for fire, sub in test_df.groupby("fire_id", sort=False):
    # Same size threshold as before: only fires with more than 3 held-out rows.
    if len(sub) <= 3:
        continue
    # R² is undefined when the true target never varies; scikit-learn then
    # returns a placeholder (0.0, or 1.0 for a perfect prediction) that would
    # pollute the worst/best ranking, so skip those fires.
    if sub["next_day_fire"].nunique(dropna=False) < 2:
        continue
    fire_r2[fire] = r2_score(sub["next_day_fire"], sub["pred"])

# Explicit dtype keeps the Series numeric even when no fire qualifies.
# NOTE(review): R² on a handful of rows can be extremely negative when the
# target barely varies -- read the "worst" list with that in mind.
fire_r2 = pd.Series(fire_r2, dtype=float).sort_values()

print("\nWorst 10 fires:")
print(fire_r2.head(10))

print("\nBest 10 fires:")
print(fire_r2.tail(10))


Train rows: 21742, Test rows: 1716
Train bins: 104, Test bins: 27

Overall performance:
MAE: 52.023936925356644
R²: 0.22113513995510314

Top 20 features:
active_fire_count: 0.6673
wind_dir: 0.0391
ERC: 0.0229
wind_speed: 0.0206
forecast_wind_speed: 0.0170
forecast_specific_humidity: 0.0167
slope: 0.0158
M11: 0.0156
temp_max: 0.0142
NDVI: 0.0137
aspect: 0.0128
temp_min: 0.0126
forecast_temp: 0.0125
PDSI: 0.0124
elevation: 0.0120
lon: 0.0111
I2: 0.0106
precip: 0.0100
specific_humidity: 0.0099
lat: 0.0091

Worst 10 fires:
fire_24604832                -507.360045
fire_SD4411109928220221010   -166.608793
fire_MI4514808434720220513   -154.108856
fire_24604731                 -82.475961
fire_TX3072910023420220516    -20.600774
fire_25295966                 -17.590839
fire_23757522                 -15.905999
fire_24103572                  -8.606516
fire_25088018                  -7.661421
fire_KY3727108427720221106     -6.045683
dtype: float64

Best 10 fires:
fire_TX3219909910320220317    0.04