In [5]:
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from tabulate import tabulate 
import geopandas as gpd

Examine why the model is now performing poorly.

# Logistic Model

Note the high precision score of the logistic model, at ~0.90.

In [None]:
# GeoPackage file and the name of the layer to read from it.
gpkg_path, layer_name = "data/structures_ndvi_points_min_and_mean.gpkg", "points"

# Load only the requested layer into a GeoDataFrame used by the cells below.
gdf = gpd.read_file(
    gpkg_path,
    layer=layer_name,
)

In [11]:
def train_logistic_model_with_summary(gdf, features, target='burned',
                                      test_size=0.3, random_state=42, threshold=0.5):
    """Fit a statsmodels logistic regression and summarise its test performance.

    Rows with a missing value in any of ``features`` or in ``target`` are
    dropped, an intercept column is added, the data is split into train and
    test sets, and a ``sm.Logit`` model is fitted on the training part.

    Parameters
    ----------
    gdf : DataFrame-like
        Table containing the ``features`` columns and the ``target`` column.
    features : sequence of str
        Names of the predictor columns (a list or a tuple).
    target : str, default 'burned'
        Name of the binary outcome column; its values are cast to int
        (0 / 1) before fitting.
    test_size : float, default 0.3
        Share of rows held out for evaluation (passed to ``train_test_split``).
    random_state : int, default 42
        Seed for the train/test split.
    threshold : float, default 0.5
        Predicted probabilities at or above this value are labelled 1.

    Returns
    -------
    metrics_row : dict
        ``Features`` (comma-joined feature names), ``Accuracy`` and the
        per-class ``Precision_*``, ``Recall_*`` and ``F1_*`` values for
        classes 0 and 1, computed on the test split.
    coef_df : pandas.DataFrame
        One row per fitted parameter with its coefficient and p-value, plus a
        ``Model`` column holding the comma-joined feature names.

    Raises
    ------
    ValueError
        If no rows remain after dropping missing values.
    """
    # Copy into a list so tuples work with the concatenation below.
    features = list(features)

    data = gdf.dropna(subset=features + [target])
    if len(data) == 0:
        raise ValueError(
            f"No rows left after dropping missing values in {features + [target]}"
        )

    X = sm.add_constant(data[features])
    # A column that can hold NaN is typically float-coded; without the cast
    # the report keys would be '0.0' / '1.0' and the lookups below would fail.
    y = data[target].astype(int)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state, test_size=test_size
    )

    model = sm.Logit(y_train, X_train).fit(disp=False)
    y_pred = (model.predict(X_test) >= threshold).astype(int)

    # labels=[0, 1] guarantees both class entries exist in the report even if
    # one class is absent from the test split and from the predictions.
    report = classification_report(
        y_test, y_pred, labels=[0, 1], output_dict=True, zero_division=0
    )
    accuracy = accuracy_score(y_test, y_pred)

    model_label = ", ".join(features)

    metrics_row = {
        'Features': model_label,
        'Accuracy': accuracy,
        'Precision_0': report['0']['precision'],
        'Recall_0': report['0']['recall'],
        'F1_0': report['0']['f1-score'],
        'Precision_1': report['1']['precision'],
        'Recall_1': report['1']['recall'],
        'F1_1': report['1']['f1-score']
    }

    coef_df = pd.DataFrame({
        'Feature': model.params.index,
        'Coefficient': model.params.values,
        'p-value': model.pvalues.values,
        'Model': model_label
    })

    return metrics_row, coef_df


# === Candidate feature combinations, one logistic model per entry ===
feature_sets = [
    ["burned_proportion_20m"],
    [
        "burned_proportion_20m",
        "burned_proportion_50m",
        "burned_proportion_100m",
    ],
    [
        "burned_proportion_20m",
        "burned_proportion_50m",
        "burned_proportion_100m",
        "ndvi_min_dist_m",
        "ndvi_mean_dist_m",
    ],
    ["ndvi_min_dist_m"],
    ["ndvi_mean_dist_m"],
    ["ndvi_min_dist_m", "ndvi_mean_dist_m"],
]

# === Fit every model, then separate the metric rows from the coefficient tables ===
fitted = [train_logistic_model_with_summary(gdf, names) for names in feature_sets]
metrics_list = [row for row, _ in fitted]
coef_list = [table for _, table in fitted]

# === Assemble one summary frame per kind of result ===
metrics_df = pd.DataFrame(metrics_list)
coef_df = pd.concat(coef_list, ignore_index=True)

# Print both tables in the same psql layout, each under its own heading
for heading, frame in (
    ("\n=== Model Performance Summary ===", metrics_df),
    ("\n=== Logistic Regression Coefficients ===", coef_df),
):
    print(heading)
    print(tabulate(frame, headers="keys", tablefmt="psql"))

# # Optionally save to CSV
# metrics_df.to_csv("model_performance_summary.csv", index=False)
# coef_df.to_csv("logistic_regression_coefficients.csv", index=False)


=== Model Performance Summary ===
+----+---------------------------------------------------------------------------------------------------------+------------+---------------+------------+-----------+---------------+------------+----------+
|    | Features                                                                                                |   Accuracy |   Precision_0 |   Recall_0 |      F1_0 |   Precision_1 |   Recall_1 |     F1_1 |
|----+---------------------------------------------------------------------------------------------------------+------------+---------------+------------+-----------+---------------+------------+----------|
|  0 | burned_proportion_20m                                                                                   |   0.70258  |      0        | 0          | 0         |      0.70258  |   1        | 0.825312 |
|  1 | burned_proportion_20m, burned_proportion_50m, burned_proportion_100m                                    |   0.85007  |      0.7808