# Pre-processing

In [None]:
# Install the packages this notebook depends on (python-dotenv reads the .env file); uncomment to run
# !pip install python-dotenv scikit-learn joblib 

In [None]:
from dotenv import load_dotenv
import os

# Load variables from the .env file into the process environment
load_dotenv()

# API key that is interpolated into the NASA FIRMS request URL further down
MAP_KEY = os.getenv("MAP_KEY")

# Report only whether the key was found; printing the value would store the
# secret in the notebook's saved output.
print("MAP_KEY loaded:", MAP_KEY is not None)


In [None]:
import pandas as pd
from pathlib import Path

# Directory that is scanned for processed CSV files
root_dir = Path("../data/processed")
# Keep entries whose underscore-separated name contains the token "combined.csv"
# (order follows os.listdir, which gives no ordering guarantee)
file_list = [
    root_dir / entry
    for entry in os.listdir(root_dir)
    if "combined.csv" in entry.split("_")
]
print(file_list)

In [None]:
# Combine Nepal and Korea data into a single CSV file
combined_data_path = Path("../data")

# file_list comes from os.listdir, which returns entries in arbitrary order, so a
# position in the list does not reliably identify a country. Prefer matching on the
# file name; fall back to the positional choice only when the names are not
# recognisable.
nepal_candidates = [p for p in file_list if "nepal" in Path(p).name.lower()]
korea_candidates = [p for p in file_list if "korea" in Path(p).name.lower()]
if nepal_candidates and korea_candidates:
    nepal_path, korea_path = nepal_candidates[0], korea_candidates[0]
else:
    nepal_path, korea_path = file_list[0], file_list[1]

nepal_df = pd.read_csv(nepal_path)
korea_df = pd.read_csv(korea_path)
# Rename the Nepal columns datetime -> date and datetime.1 -> time
# (presumably so they line up with the Korea column names -- confirm against the data).
# A new frame is returned instead of mutating in place so the cell is safe to re-run.
nepal_df = nepal_df.rename(columns={"datetime": "date", "datetime.1": "time"})
combined_data_df = pd.concat([nepal_df, korea_df], ignore_index=True)
combined_data_df.to_csv(combined_data_path / "combined_data.csv", index=False)
combined_data_df.head()

# Model Training

### Feature Selection

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib

# Columns excluded from the feature set
unnecessary_columns = ["version", "instrument", "type", "satellite", "date", "time"]
# Remaining columns, in their original order
selected_columns = list(
    filter(lambda name: name not in unnecessary_columns, combined_data_df.columns)
)
combined_data_df = combined_data_df[selected_columns]

# Features: everything except the target, with daynight encoded as D -> 1, N -> 0
X = combined_data_df.drop(columns=["confidence"]).assign(
    daynight=lambda frame: frame["daynight"].map({"D": 1, "N": 0})
)

# Target: the confidence column as a flat array
y = combined_data_df["confidence"].values.ravel()

# Random 80/20 train/test split with a fixed seed (no stratify argument is passed)
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Train the Model

In [6]:
from sklearn.ensemble import RandomForestRegressor

# Directory where the trained artifacts are written
model_dir = Path("../models")
# joblib.dump does not create missing parent directories, so make sure the
# target exists before training (avoids losing a finished fit to a save error).
model_dir.mkdir(parents=True, exist_ok=True)
# regressor model: 200-tree random forest with a fixed seed for reproducibility
model = RandomForestRegressor(
    max_features="log2", min_samples_leaf=2, n_estimators=200, random_state=42
)
model.fit(X_train_scaled, y_train)
# Save the model
joblib.dump(model, model_dir / "wildfire_predictor_model.pkl")

['../models/wildfire_predictor_model.pkl']

### Model Testing

In [7]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

def evaluate_model(model, X_test, y_test):
    """Print and return regression metrics for `model` on a held-out set.

    Parameters
    ----------
    model : object exposing ``predict(X)``
    X_test : features passed unchanged to ``model.predict``
    y_test : true target values compared against the predictions

    Returns
    -------
    dict
        ``{"mae": ..., "mse": ..., "r2": ...}`` so callers can log or compare
        the scores instead of re-reading them from the printed output.
    """
    y_pred = model.predict(X_test)
    metrics = {
        "mae": mean_absolute_error(y_test, y_pred),
        "mse": mean_squared_error(y_test, y_pred),
        "r2": r2_score(y_test, y_pred),
    }
    print(f"Mean Absolute Error: {metrics['mae']}")
    print(f"Mean Squared Error: {metrics['mse']}")
    print(f"R^2 Score: {metrics['r2']}")
    return metrics

# Report hold-out metrics for the fitted model on the scaled test split
evaluate_model(model=model, X_test=X_test_scaled, y_test=y_test)

# Persist the fitted scaler next to the saved model
scaler_path = model_dir / "wildfire_predictor_scaler.pkl"
joblib.dump(scaler, scaler_path)

Mean Absolute Error: 8.897474547974468
Mean Squared Error: 149.4493040714875
R^2 Score: 0.6300527366727633


['../models/wildfire_predictor_scaler.pkl']

# Get new data

In [8]:
import pandas as pd

# Download MODIS near-real-time detections for Nepal (NPL) from the NASA FIRMS
# country endpoint; the trailing 10 is the day-range segment of the URL.
try:
    # os.getenv returns None when the variable is unset; fail with a clear message
    # instead of requesting a URL that contains the literal text "None".
    if not MAP_KEY:
        raise ValueError("MAP_KEY is not set; add it to the .env file")
    npl_url = f"https://firms.modaps.eosdis.nasa.gov/api/country/csv/{MAP_KEY}/MODIS_NRT/NPL/10"
    npl_data = pd.read_csv(npl_url)
    display(npl_data.head())
    # Keep an untouched copy of the raw download for later reference
    npl_original = npl_data.copy()
except Exception as e:
    # Best-effort: report the failure and let the notebook continue
    print("🔥 Failed to load Nepal wildfire data:", e)

Unnamed: 0,country_id,latitude,longitude,brightness,scan,track,acq_date,acq_time,satellite,instrument,confidence,version,bright_t31,frp,daynight
0,NPL,28.43337,81.10894,331.01,1.0,1.0,2025-05-13,417,Terra,MODIS,81,6.1NRT,307.6,16.2,D
1,NPL,28.20702,81.38847,332.81,1.02,1.01,2025-05-14,858,Aqua,MODIS,77,6.1NRT,317.55,12.78,D
2,NPL,28.55418,80.84904,327.92,1.0,1.0,2025-05-14,858,Aqua,MODIS,70,6.1NRT,311.51,10.81,D
3,NPL,28.55987,80.88885,326.4,1.01,1.0,2025-05-14,858,Aqua,MODIS,61,6.1NRT,310.56,9.7,D
4,NPL,28.75919,80.41531,326.49,1.0,1.0,2025-05-14,858,Aqua,MODIS,56,6.1NRT,313.54,7.13,D


In [9]:
# Build the feature frame from the untouched copy of the download so this cell can
# be re-run: dropping from npl_data itself raises KeyError on a second execution
# because the columns are already gone.
npl_data = npl_original.drop(
    columns=[
        "version",
        "country_id",
        "instrument",
        "acq_date",
        "acq_time",
        "satellite",
        "confidence",  # target column, not a model input
    ]
)
# Same D -> 1 / N -> 0 encoding that was applied to the training features
npl_data["daynight"] = npl_data["daynight"].map({"D": 1, "N": 0})
# Reuse the scaler fitted on the training split, then predict
npl_data_scaled = scaler.transform(npl_data)
npl_data_predictions = model.predict(npl_data_scaled)

for i, pred in enumerate(npl_data_predictions, start=1):
    print(f"Prediction for row {i}: {pred:.1f}")

Prediction for row 1: 74.2
Prediction for row 2: 67.2
Prediction for row 3: 60.4
Prediction for row 4: 57.3
Prediction for row 5: 49.8
Prediction for row 6: 55.7
Prediction for row 7: 73.2
Prediction for row 8: 61.4
Prediction for row 9: 63.6
Prediction for row 10: 48.1
Prediction for row 11: 51.0
Prediction for row 12: 86.9
Prediction for row 13: 96.8
Prediction for row 14: 55.3
Prediction for row 15: 63.4
Prediction for row 16: 51.3
Prediction for row 17: 69.5
Prediction for row 18: 78.2
Prediction for row 19: 69.1
Prediction for row 20: 47.5
Prediction for row 21: 67.2
Prediction for row 22: 39.6
Prediction for row 23: 37.8
Prediction for row 24: 98.4
Prediction for row 25: 77.1
Prediction for row 26: 40.3
Prediction for row 27: 52.7
Prediction for row 28: 52.5
Prediction for row 29: 67.6
Prediction for row 30: 97.3
Prediction for row 31: 67.0
Prediction for row 32: 44.9
Prediction for row 33: 62.1
Prediction for row 34: 27.6
Prediction for row 35: 44.4
Prediction for row 36: 47.6
P