### View 4
Train a standard regression model and check whether its predictions are worse for fires close to cities.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Read the gap-filled dataset once; the later cells restart from this
# untouched frame via df_filled.copy().
df_filled = pd.read_csv("../filled_output.csv")

# Target: for every row, the active_fire_count found on the following row of
# the same fire (rows ordered by date). The last row of each fire has no
# successor, gets NaN from shift(-1), and is dropped.
df = (
    df_filled.copy()
    .sort_values(["fire_id", "date"])
    .assign(next_day_fire=lambda d: d.groupby("fire_id")["active_fire_count"].shift(-1))
    .dropna(subset=["next_day_fire"])
    .astype({"next_day_fire": float})
)

# Everything except the identifiers and the target is used as a feature.
# Index.difference returns the remaining names in sorted order.
non_feature_cols = ["fire_id", "date", "next_day_fire"]
feature_cols = df.columns.difference(non_feature_cols)
X, y = df[feature_cols], df["next_day_fire"]

# Split on fire ids rather than on rows, so all rows of a given fire land on
# the same side of the split.
fires = df["fire_id"].unique()
train_fires, test_fires = train_test_split(fires, test_size=0.2, random_state=42)
train_idx = df["fire_id"].isin(train_fires)
test_idx = df["fire_id"].isin(test_fires)

X_train, X_test = X.loc[train_idx], X.loc[test_idx]
y_train, y_test = y.loc[train_idx], y.loc[test_idx]

rf_params = dict(
    n_estimators=300,
    max_depth=None,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
)
model = RandomForestRegressor(**rf_params).fit(X_train, y_train)
preds = model.predict(X_test)

print("MAE:", mean_absolute_error(y_test, preds))
print("R²:", r2_score(y_test, preds))

# Rank features from most to least important and list the first 20.
importances = model.feature_importances_
idx = np.argsort(importances)[::-1]
print("\nTop 20 features:")
for i in idx[:20]:
    print(f"{feature_cols[i]}: {importances[i]:.4f}")


MAE: 62.371646480254896
R²: 0.6956379769613534

Top 20 features:
active_fire_count: 0.6658
wind_dir: 0.0329
ERC: 0.0234
forecast_wind_speed: 0.0193
forecast_specific_humidity: 0.0191
wind_speed: 0.0174
M11: 0.0172
forecast_temp: 0.0169
temp_max: 0.0165
NDVI: 0.0146
PDSI: 0.0144
aspect: 0.0142
lon: 0.0139
slope: 0.0137
elevation: 0.0131
specific_humidity: 0.0123
precip: 0.0111
temp_min: 0.0110
EVI2: 0.0108
I2: 0.0102


In [2]:
# Summary statistics of the regression target built in the previous cell
# (relies on `df` still being in memory).
df["next_day_fire"].describe()


count    23458.000000
mean       111.737957
std        312.479667
min          0.000000
25%          0.000000
50%          4.000000
75%         81.000000
max       9134.000000
Name: next_day_fire, dtype: float64

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
# Rebuild the supervised frame from the untouched df_filled: the target is the
# next row's active_fire_count within each fire (rows ordered by date), and
# rows without a successor are dropped.
df = (
    df_filled.copy()
    .sort_values(["fire_id", "date"])
    .assign(next_day_fire=lambda d: d.groupby("fire_id")["active_fire_count"].shift(-1))
    .dropna(subset=["next_day_fire"])
    .astype({"next_day_fire": float})
)

# The model is fitted on log1p(target); predictions are mapped back with
# expm1 before scoring, so MAE / R² below are on the original count scale.
df["target_log"] = np.log1p(df["next_day_fire"])

# Features: every column except identifiers and both forms of the target.
feature_cols = df.columns.difference(
    ["fire_id", "date", "next_day_fire", "target_log"]
)
X, y = df[feature_cols], df["target_log"]

# Hold out 20% of the fires (whole fires, not individual rows).
fires = df["fire_id"].unique()
train_fires, test_fires = train_test_split(fires, test_size=0.2, random_state=42)
train_idx = df["fire_id"].isin(train_fires)
test_idx = df["fire_id"].isin(test_fires)

X_train, X_test = X.loc[train_idx], X.loc[test_idx]
# y_test is in log space; the metrics below use y_true instead.
y_train, y_test = y.loc[train_idx], y.loc[test_idx]

model = RandomForestRegressor(
    n_estimators=300,
    max_depth=None,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
)
model.fit(X_train, y_train)

# Log-space predictions, then back to counts.
preds_log = model.predict(X_test)
preds = np.expm1(preds_log)

# Ground truth on the original scale for the same held-out rows.
y_true = df.loc[test_idx, "next_day_fire"]

print("MAE:", mean_absolute_error(y_true, preds))
print("R²:", r2_score(y_true, preds))

# Features ranked by importance, highest first; show the first 20.
importances = model.feature_importances_
idx = np.argsort(importances)[::-1]
print("\nTop 20 features:")
for i in idx[:20]:
    print(f"{feature_cols[i]}: {importances[i]:.4f}")

MAE: 53.37723470878723
R²: 0.6497351279521258

Top 20 features:
active_fire_count: 0.6344
precip: 0.0336
NDVI: 0.0203
forecast_wind_speed: 0.0196
forecast_specific_humidity: 0.0190
slope: 0.0186
wind_speed: 0.0172
wind_dir: 0.0172
lon: 0.0160
PDSI: 0.0160
I2: 0.0155
ERC: 0.0155
forecast_temp: 0.0149
temp_max: 0.0148
forecast_wind_dir: 0.0148
M11: 0.0144
elevation: 0.0133
lat: 0.0124
I1: 0.0122
temp_min: 0.0120


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# NOTE(review): this cell runs the same log-target pipeline as the preceding
# cell and prints the same metrics; consider removing one of the two.

# --- data prep: next-row active_fire_count per fire as the target ---------
df = df_filled.copy().sort_values(["fire_id", "date"])
df = df.assign(
    next_day_fire=df.groupby("fire_id")["active_fire_count"].shift(-1)
)
df = df.dropna(subset=["next_day_fire"]).astype({"next_day_fire": float})

# --- log1p-transformed target used for fitting ----------------------------
df = df.assign(target_log=np.log1p(df["next_day_fire"]))

# --- feature matrix: all columns except identifiers and targets -----------
excluded_cols = ["fire_id", "date", "next_day_fire", "target_log"]
feature_cols = df.columns.difference(excluded_cols)
X = df[feature_cols]
y = df["target_log"]

# --- fire-level 80/20 split ------------------------------------------------
fires = df["fire_id"].unique()
train_fires, test_fires = train_test_split(fires, test_size=0.2, random_state=42)
train_idx = df["fire_id"].isin(train_fires)
test_idx = df["fire_id"].isin(test_fires)

X_train, y_train = X.loc[train_idx], y.loc[train_idx]
X_test, y_test = X.loc[test_idx], y.loc[test_idx]

# --- model -----------------------------------------------------------------
rf_params = {
    "n_estimators": 300,
    "max_depth": None,
    "min_samples_leaf": 2,
    "random_state": 42,
    "n_jobs": -1,
}
model = RandomForestRegressor(**rf_params)
model.fit(X_train, y_train)

# --- predict in log space, convert back to counts with expm1 ---------------
preds_log = model.predict(X_test)
preds = np.expm1(preds_log)

# Scores are computed against the untransformed target of the test rows.
y_true = df.loc[test_idx, "next_day_fire"]

for label, metric in (("MAE:", mean_absolute_error), ("R²:", r2_score)):
    print(label, metric(y_true, preds))

# --- feature importance, highest first -------------------------------------
importances = model.feature_importances_
idx = np.argsort(importances)[::-1]
print("\nTop 20 features:")
for i in idx[:20]:
    print(f"{feature_cols[i]}: {importances[i]:.4f}")

MAE: 53.37723470878723
R²: 0.649735127952126

Top 20 features:
active_fire_count: 0.6344
precip: 0.0336
NDVI: 0.0203
forecast_wind_speed: 0.0196
forecast_specific_humidity: 0.0190
slope: 0.0186
wind_speed: 0.0172
wind_dir: 0.0172
lon: 0.0160
PDSI: 0.0160
I2: 0.0155
ERC: 0.0155
forecast_temp: 0.0149
temp_max: 0.0148
forecast_wind_dir: 0.0148
M11: 0.0144
elevation: 0.0133
lat: 0.0124
I1: 0.0122
temp_min: 0.0120


In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# ----------------------------------------------------------
# 0. Tunables used below
# ----------------------------------------------------------
BIN_SIZE_DEG = 2          # edge length of a spatial bin, in degrees of lat / lon
TRAIN_BIN_FRACTION = 0.8  # share of bins (ordered by first date) used for training
TOP_K_FEATURES = 20       # number of feature importances to print
MIN_ROWS_PER_FIRE = 3     # a fire needs MORE than this many test rows for its own R²

# ----------------------------------------------------------
# 1. Supervised frame (df_filled must already be in memory)
#    Target = next row's active_fire_count within each fire,
#    rows ordered by date; rows without a successor are dropped.
# 2. Spatial binning: floor(lat / 2), floor(lon / 2) -> "latbin_lonbin"
# ----------------------------------------------------------
df = (
    df_filled.copy()
    .sort_values(["fire_id", "date"])
    .assign(next_day_fire=lambda d: d.groupby("fire_id")["active_fire_count"].shift(-1))
    .dropna(subset=["next_day_fire"])
    .astype({"next_day_fire": float})
    .assign(
        lat_bin=lambda d: (d["lat"] // BIN_SIZE_DEG).astype(int),
        lon_bin=lambda d: (d["lon"] // BIN_SIZE_DEG).astype(int),
        spatial_bin=lambda d: d["lat_bin"].astype(str) + "_" + d["lon_bin"].astype(str),
    )
)

# ----------------------------------------------------------
# 3. Order bins by the earliest date seen in each, then cut:
#    the first 80% of bins train, the remaining bins test.
# ----------------------------------------------------------
bin_start_date = df.groupby("spatial_bin")["date"].min()
ordered_bins = bin_start_date.sort_values().index

cut = int(len(ordered_bins) * TRAIN_BIN_FRACTION)
train_bins, test_bins = set(ordered_bins[:cut]), set(ordered_bins[cut:])

train_idx = df["spatial_bin"].isin(train_bins)
test_idx = df["spatial_bin"].isin(test_bins)

# ----------------------------------------------------------
# 4. Feature matrix: drop identifiers, the target and the
#    helper bin columns (raw lat / lon stay in as features)
# ----------------------------------------------------------
feature_cols = df.columns.difference(
    ["fire_id", "date", "next_day_fire", "spatial_bin", "lat_bin", "lon_bin"]
)

X_train, X_test = df.loc[train_idx, feature_cols], df.loc[test_idx, feature_cols]
y_train, y_test = df.loc[train_idx, "next_day_fire"], df.loc[test_idx, "next_day_fire"]

print(f"Train rows: {len(X_train)}, Test rows: {len(X_test)}")
print(f"Train bins: {len(train_bins)}, Test bins: {len(test_bins)}")

# ----------------------------------------------------------
# 5. Fit the forest and predict the held-out bins
# ----------------------------------------------------------
model = RandomForestRegressor(
    n_estimators=300,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
)
model.fit(X_train, y_train)
preds = model.predict(X_test)

# ----------------------------------------------------------
# 6. Overall metrics on the held-out bins
# ----------------------------------------------------------
print("\nOverall performance:")
print("MAE:", mean_absolute_error(y_test, preds))
print("R²:", r2_score(y_test, preds))

# ----------------------------------------------------------
# 7. Feature importances, highest first
# ----------------------------------------------------------
imps = model.feature_importances_
order = np.argsort(imps)[::-1]

print("\nTop 20 features:")
for i in order[:TOP_K_FEATURES]:
    print(f"{feature_cols[i]}: {imps[i]:.4f}")

# ----------------------------------------------------------
# 8. Per-fire R² on the test rows
#    NOTE(review): R² on a handful of rows, or on a fire whose
#    target barely varies, can be extremely negative or
#    uninformative - read the ranking with that in mind.
# ----------------------------------------------------------
test_df = df.loc[test_idx].assign(pred=preds)

fire_r2 = {}
# sort=False keeps fires in order of first appearance in test_df.
for fire, sub in test_df.groupby("fire_id", sort=False):
    if len(sub) <= MIN_ROWS_PER_FIRE:
        continue
    fire_r2[fire] = r2_score(sub["next_day_fire"], sub["pred"])

fire_r2 = pd.Series(fire_r2).sort_values()

print("\nWorst 10 fires:")
print(fire_r2.head(10))

print("\nBest 10 fires:")
print(fire_r2.tail(10))


Train rows: 21742, Test rows: 1716
Train bins: 104, Test bins: 27

Overall performance:
MAE: 52.484592610990184
R²: 0.22060330434861253

Top 20 features:
active_fire_count: 0.6678
wind_dir: 0.0404
ERC: 0.0238
wind_speed: 0.0206
forecast_specific_humidity: 0.0175
forecast_wind_speed: 0.0175
M11: 0.0160
slope: 0.0159
temp_max: 0.0150
NDVI: 0.0142
aspect: 0.0134
elevation: 0.0134
temp_min: 0.0129
PDSI: 0.0128
forecast_temp: 0.0127
lon: 0.0121
I2: 0.0117
lat: 0.0104
precip: 0.0101
specific_humidity: 0.0100

Worst 10 fires:
fire_24604832                -417.907101
fire_SD4411109928220221010   -157.521004
fire_MI4514808434720220513   -121.384094
fire_24604731                 -72.538119
fire_TX3072910023420220516    -22.863481
fire_25295966                 -19.720935
fire_23757522                 -14.320296
fire_KY3727108427720221106     -7.949422
fire_NE4194610014320220423     -6.835608
fire_25088018                  -6.571006
dtype: float64

Best 10 fires:
fire_24604783                 0.17