In [5]:
import os

import geopandas as gpd
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

Examine why the model is now performing poorly.

# Logistic Model

Note the high precision score of the logistic model, at ~0.90.

In [12]:
# Location of the GeoPackage read below. The default is the original
# workstation path; set the STRUCTURES_GPKG_PATH environment variable to run
# the notebook on another machine without editing this cell.
gpkg_path = os.environ.get(
    "STRUCTURES_GPKG_PATH",
    "D:/terrain_generation_project/NAIP_processed/structures_ndvi_points.gpkg",
)
layer_name = "points"

# Read the named layer of the GeoPackage into a GeoDataFrame.
gdf = gpd.read_file(gpkg_path, layer=layer_name)

In [13]:
def train_logistic_model_with_pvalues(gdf, features, target='burned',
                                      threshold=0.5, test_size=0.3, random_state=42):
    """Fit a statsmodels logistic regression and print a test-set report.

    Rows with a missing value in any of ``features`` or in ``target`` are
    dropped, an intercept column is added, and the remaining rows are split
    into train and test portions. The model is fit on the training portion;
    predicted probabilities on the test portion are cut at ``threshold`` to
    obtain 0/1 labels. A classification report and a table of coefficients
    and p-values are printed.

    Parameters
    ----------
    gdf : DataFrame-like
        Table holding the feature and target columns.
    features : sequence of str
        Names of the predictor columns.
    target : str, default 'burned'
        Name of the outcome column (presumably coded 0/1 -- confirm against
        the data source).
    threshold : float, default 0.5
        Predicted probabilities greater than or equal to this value are
        labelled 1, all others 0.
    test_size : float, default 0.3
        Passed to ``train_test_split``.
    random_state : int, default 42
        Passed to ``train_test_split``.

    Returns
    -------
    The fitted results object returned by ``sm.Logit(...).fit``.
    """
    # Accept any sequence of column names (a tuple would break `+ [target]`).
    features = list(features)

    # Drop rows with missing values in features or target
    data = gdf.dropna(subset=features + [target])

    # Add constant for intercept
    X = sm.add_constant(data[features])
    y = data[target]

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state, test_size=test_size
    )

    # Fit model using statsmodels for p-values
    model = sm.Logit(y_train, X_train).fit(disp=False)

    # Predict on test set
    y_pred = (model.predict(X_test) >= threshold).astype(int)

    # Print results
    print(f"\n=== Logistic Regression Report: {features} ===")
    # When a class is never predicted its precision is undefined; report it as
    # 0 explicitly instead of letting sklearn emit UndefinedMetricWarning.
    print(classification_report(y_test, y_pred, zero_division=0))

    print("\n--- Coefficients and p-values ---")
    summary = pd.DataFrame({
        "Coefficient": model.params,
        "p-value": model.pvalues
    })
    print(summary)

    return model

# === Define feature sets for four models ===
# Start with the two burned-proportion sets ...
feature_sets = [
    ["burned_proportion_20m"],
    ["burned_proportion_20m", "burned_proportion_50m", "burned_proportion_100m"],
]
# ... then the same three proportions plus NDVI distance (a new list, so the
# second set is not modified) ...
feature_sets.append(feature_sets[1] + ["ndvi_dist_m"])
# ... and finally NDVI distance on its own.
feature_sets.append(["ndvi_dist_m"])

In [14]:
# Fit one model per feature set, storing each under "model_<position>"
# (positions start at 1). The dict is filled as each fit completes.
models = {}
for i, feature_cols in enumerate(feature_sets, 1):
    fitted = train_logistic_model_with_pvalues(gdf, feature_cols)
    models[f"model_{i}"] = fitted


=== Logistic Regression Report: ['burned_proportion_20m'] ===
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       853
           1       0.70      1.00      0.83      2015

    accuracy                           0.70      2868
   macro avg       0.35      0.50      0.41      2868
weighted avg       0.49      0.70      0.58      2868


--- Coefficients and p-values ---
                       Coefficient        p-value
const                     0.163985   7.499087e-07
burned_proportion_20m     2.381560  3.503176e-199

=== Logistic Regression Report: ['burned_proportion_20m', 'burned_proportion_50m', 'burned_proportion_100m'] ===
              precision    recall  f1-score   support

           0       0.78      0.69      0.73       853
           1       0.87      0.92      0.90      2015

    accuracy                           0.85      2868
   macro avg       0.83      0.80      0.81      2868
weighted avg       0.85      0.85      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
