In [1]:
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from tabulate import tabulate 
import geopandas as gpd

In [7]:
# GeoPackage produced upstream and the layer inside it that holds the
# merged structure records.
gpkg_path, layer_name = "outputs/merged_structures.gpkg", "merged_structures"

# Read that single layer into a GeoDataFrame; the modelling cell below works off `gdf`.
gdf = gpd.read_file(
    gpkg_path,
    layer=layer_name,
)

In [8]:
# Bare expression: renders the loaded GeoDataFrame as this cell's output so
# its columns and a sample of rows can be inspected.
gdf

Unnamed: 0,OBJECTID,index_right,BUILD_ID,OCC_CLS,PRIM_OCC,SEC_OCC,PROP_ADDR,PROP_CITY,PROP_ST,PROP_ZIP,...,total_neighbors_20m,burned_neighbors_20m,burned_proportion_20m,total_neighbors_50m,burned_neighbors_50m,burned_proportion_50m,total_neighbors_100m,burned_neighbors_100m,burned_proportion_100m,geometry
0,10340,2092,15239379,Residential,Multi - Family Dwelling,,,,California,,...,1,0,0.0,3,1,0.333333,10,4,0.400000,POINT (-13153318.783 4053842.793)
1,11265,3125,15241102,Residential,Single Family Dwelling,,,,California,,...,1,0,0.0,4,1,0.250000,12,4,0.333333,POINT (-13153141.694 4054235.977)
2,10933,9475,8100222,Residential,Single Family Dwelling,,,,California,,...,0,0,0.0,1,0,0.000000,3,0,0.000000,POINT (-13152989.148 4054094.15)
3,14585,4318,15242665,Residential,Single Family Dwelling,,,,California,,...,1,1,1.0,8,3,0.375000,12,4,0.333333,POINT (-13148330.439 4054674.044)
4,1779,5618,15244268,Residential,Single Family Dwelling,,,,California,,...,2,1,0.5,11,4,0.363636,13,4,0.307692,POINT (-13148491.095 4055047.733)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9555,8917,9510,8100893,Residential,Single Family Dwelling,,,,California,,...,2,0,0.0,3,1,0.333333,7,3,0.428571,POINT (-13152562.867 4054278.343)
9556,7852,7455,15246496,Residential,Multi - Family Dwelling,,,,California,,...,1,0,0.0,3,1,0.333333,5,1,0.200000,POINT (-13149193.358 4055667.054)
9557,9212,3702,15241912,Residential,Single Family Dwelling,,,,California,,...,2,1,0.5,4,1,0.250000,7,2,0.285714,POINT (-13149755.753 4054473.552)
9558,4683,3949,15242213,Residential,Single Family Dwelling,,,,California,,...,3,3,1.0,11,5,0.454545,25,8,0.320000,POINT (-13152090.605 4054515.688)


In [9]:
def train_logistic_model_with_summary(gdf, features, target='burned'):
    """Fit a statsmodels logistic regression and summarise its performance.

    Rows with a missing value in any feature or in the target are dropped,
    an intercept column is added, and the remaining rows are split 70/30
    into train and test partitions with a fixed ``random_state`` of 42.
    The model is fitted on the training partition; test-set probabilities
    are turned into class predictions with a 0.5 cut-off.

    Parameters
    ----------
    gdf : pandas.DataFrame or geopandas.GeoDataFrame
        Table containing the feature columns and the target column.
    features : sequence of str
        Names of the predictor columns (list or tuple).
    target : str, default 'burned'
        Name of the binary outcome column. Values must be 0/1 (integer,
        float or boolean); they are normalised to integer 0/1.

    Returns
    -------
    metrics_row : dict
        ``'Features'`` (comma-joined feature names), ``'Accuracy'`` and the
        per-class ``Precision_*``, ``Recall_*`` and ``F1_*`` scores for
        classes 0 and 1, all computed on the test partition.
    coef_df : pandas.DataFrame
        One row per fitted parameter (including the intercept) with columns
        ``'Feature'``, ``'Coefficient'``, ``'p-value'`` and ``'Model'``
        (the comma-joined feature names, to tell models apart after
        concatenation).
    """
    # Accept any sequence of column names; `features + [target]` below
    # would raise TypeError for a tuple.
    features = list(features)

    data = gdf.dropna(subset=features + [target])
    X = data[features]
    # Normalise the outcome to integer 0/1. A float (0.0/1.0) or boolean
    # target would otherwise make classification_report key its output by
    # '0.0'/'1.0' or 'False'/'True', and the '0'/'1' lookups below would fail.
    y = data[target].astype(int)

    X = sm.add_constant(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)

    model = sm.Logit(y_train, X_train).fit(disp=False)
    y_pred = (model.predict(X_test) >= 0.5).astype(int)

    # labels=[0, 1] guarantees both class entries exist in the report even
    # when one class is absent from the test partition and the predictions;
    # zero_division=0 then reports 0 instead of warning.
    report = classification_report(
        y_test, y_pred, labels=[0, 1], output_dict=True, zero_division=0
    )
    accuracy = accuracy_score(y_test, y_pred)

    metrics_row = {
        'Features': ", ".join(features),
        'Accuracy': accuracy,
        'Precision_0': report['0']['precision'],
        'Recall_0': report['0']['recall'],
        'F1_0': report['0']['f1-score'],
        'Precision_1': report['1']['precision'],
        'Recall_1': report['1']['recall'],
        'F1_1': report['1']['f1-score']
    }

    coef_df = pd.DataFrame({
        'Feature': model.params.index,
        'Coefficient': model.params.values,
        'p-value': model.pvalues.values,
        'Model': ", ".join(features)
    })

    return metrics_row, coef_df


# === Define your feature sets ===
# Column groups that the candidate models are assembled from.
neighbor_near = ["burned_proportion_10m", "burned_proportion_20m"]
neighbor_all = neighbor_near + ["burned_proportion_50m", "burned_proportion_100m"]
ndvi_pair = ["ndvi_min_dist_m_x", "ndvi_mean_dist_m_x"]

feature_sets = [
    neighbor_near[:1],         # 10 m burned proportion only
    neighbor_near[:],          # 10 m + 20 m
    neighbor_all[:],           # all four neighbourhood radii
    neighbor_all + ndvi_pair,  # every neighbourhood radius plus both NDVI distances
    ndvi_pair[:1],             # NDVI minimum distance only
    ndvi_pair[1:],             # NDVI mean distance only
    ndvi_pair[:],              # both NDVI distances
]

# === Run all models ===
# One metrics dict and one coefficient table per feature set, in order.
metrics_list, coef_list = [], []

for fs in feature_sets:
    metrics, coefs = train_logistic_model_with_summary(gdf, fs)
    metrics_list.append(metrics)
    coef_list.append(coefs)

# === Combine and display results ===
metrics_df = pd.DataFrame(metrics_list)
coef_df = pd.concat(coef_list, ignore_index=True)

# Print the tables: a heading line followed by the psql-formatted frame.
for heading, table in (
    ("\n=== Model Performance Summary ===", metrics_df),
    ("\n=== Logistic Regression Coefficients ===", coef_df),
):
    print(heading)
    print(tabulate(table, headers='keys', tablefmt='psql'))

# Optional CSV export, left disabled:
# metrics_df.to_csv("model_performance_summary.csv", index=False)
# coef_df.to_csv("logistic_regression_coefficients.csv", index=False)


=== Model Performance Summary ===
+----+------------------------------------------------------------------------------------------------------------------------------------+------------+---------------+------------+-----------+---------------+------------+----------+
|    | Features                                                                                                                           |   Accuracy |   Precision_0 |   Recall_0 |      F1_0 |   Precision_1 |   Recall_1 |     F1_1 |
|----+------------------------------------------------------------------------------------------------------------------------------------+------------+---------------+------------+-----------+---------------+------------+----------|
|  0 | burned_proportion_10m                                                                                                              |   0.70258  |      0        | 0          | 0         |      0.70258  |   1        | 0.825312 |
|  1 | burned_proportion_10m,

### Model Performance Summary

| #  | Features                                                                                                                          | Accuracy | Precision_0 | Recall_0 | F1_0     | Precision_1 | Recall_1 | F1_1     |
|----|-----------------------------------------------------------------------------------------------------------------------------------|----------|-------------|----------|----------|--------------|----------|----------|
| 0  | burned_proportion_10m                                                                                                             | 0.70258  | 0           | 0        | 0        | 0.70258      | 1        | 0.825312 |
| 1  | burned_proportion_10m, burned_proportion_20m                                                                                      | 0.70258  | 0           | 0        | 0        | 0.70258      | 1        | 0.825312 |
| 2  | burned_proportion_10m, burned_proportion_20m, burned_proportion_50m, burned_proportion_100m                                      | 0.70258  | 0           | 0        | 0        | 0.70258      | 1        | 0.825312 |
| 3  | burned_proportion_10m, burned_proportion_20m, burned_proportion_50m, burned_proportion_100m, ndvi_min_dist_m_x, ndvi_mean_dist_m_x | 0.704712 | 0.4375      | 0.0166271| 0.0320366| 0.70773      | 0.991102 | 0.825783 |
| 4  | ndvi_min_dist_m_x                                                                                                                 | 0.705759 | 0.470588    | 0.0095012| 0.0186263| 0.707163     | 0.995551 | 0.826935 |
| 5  | ndvi_mean_dist_m_x                                                                                                                | 0.705061 | 0.448276    | 0.0154394| 0.0298507| 0.707687     | 0.992091 | 0.826096 |
| 6  | ndvi_min_dist_m_x, ndvi_mean_dist_m_x                                                                                             | 0.705061 | 0.451613    | 0.0166271| 0.0320733| 0.707833     | 0.991597 | 0.826024 |
