# New concept

2. Network Graph Model

    Analyze:
    - How burning "propagated" through the graph.
    - How graph centrality, degree (number of neighbors), and clustering coefficient relate to burn probability.
    - Create an animation of how the spread occurred between nodes.

In [1]:
import geopandas as gpd
import numpy as np
from sklearn.neighbors import BallTree
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import statsmodels.api as sm

# Prepare Dataset

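Add the columns `burned_proportion_20m`, `burned_proportion_50m`, and `burned_proportion_100m`: for each structure, the share of its neighbors within 20 m, 50 m, and 100 m that were destroyed.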

In [2]:
def preprocess_structures(gpkg_path, layer_name):
    """Load a structures layer and derive a binary ``burned`` label.

    Parameters
    ----------
    gpkg_path
        Path of the GeoPackage, passed straight to ``geopandas.read_file``.
    layer_name
        Name of the layer to read from the GeoPackage.

    Returns
    -------
    GeoDataFrame
        The loaded layer with an extra integer ``burned`` column: 1 where
        ``DAMAGE`` equals ``'Destroyed (>50%)'`` (case-insensitive), else 0.

    Raises
    ------
    ValueError
        If the layer's CRS is not a projected CRS.

    The distinct ``DAMAGE`` values are printed as a quick sanity check.
    """
    # Load the structures layer
    gdf = gpd.read_file(gpkg_path, layer=layer_name)

    # Assume Web Mercator only when the file carries no CRS at all. Stamping
    # EPSG:3857 unconditionally made the projected-CRS check below
    # unreachable and failed inside set_crs for layers stored in another CRS.
    if gdf.crs is None:
        gdf = gdf.set_crs(epsg=3857)

    # Neighbor radii are expressed in linear units, so a geographic
    # (degree-based) CRS cannot be used.
    if not gdf.crs.is_projected:
        raise ValueError("CRS must be projected (meters). Please reproject your data first.")

    # Create a binary "burned" column: 1 = Destroyed, 0 = Otherwise
    destroyed_label = 'destroyed (>50%)'
    gdf['burned'] = [int(str(value).lower() == destroyed_label) for value in gdf['DAMAGE']]
    print("Unique values:")
    print(gdf['DAMAGE'].unique())

    return gdf

def calculate_neighbor_stats(gdf, radii_meters=(20, 50, 100)):
    """Add per-radius neighbor counts and burned-neighbor proportions.

    For every radius ``r`` in ``radii_meters`` three columns are written to
    ``gdf`` (which is modified in place and also returned):

    * ``total_neighbors_{r}m``   - number of other structures within ``r``
    * ``burned_neighbors_{r}m``  - how many of those have ``burned == 1``
    * ``burned_proportion_{r}m`` - burned / total, or 0 when a structure has
      no neighbors within ``r``

    Parameters
    ----------
    gdf
        GeoDataFrame with a ``burned`` column; ``gdf.geometry.x`` / ``.y``
        are read, so point geometries are assumed.
    radii_meters
        Iterable of search radii, in the units of the layer's coordinates.

    Returns
    -------
    GeoDataFrame
        The same ``gdf`` object with the new columns attached.

    NOTE(review): distances are plain Euclidean distances on the layer
    coordinates. If the layer is in EPSG:3857, those units are stretched
    relative to ground meters away from the equator - confirm the radii are
    still meaningful for the study area.
    """
    coords = np.column_stack((gdf.geometry.x, gdf.geometry.y))
    tree = BallTree(coords, metric='euclidean')

    # Does not depend on the radius, so fetch it once.
    burned_array = gdf['burned'].to_numpy()

    for radius in radii_meters:
        indices = tree.query_radius(coords, r=radius)

        # query_radius returns each point among its own neighbors; drop it.
        neighbor_lists = [
            neighbors[neighbors != i] for i, neighbors in enumerate(indices)
        ]

        total_neighbors = np.array(
            [len(neighbors) for neighbors in neighbor_lists], dtype=np.int64
        )
        # Summing over an empty index array yields 0, so isolated structures
        # need no special case here.
        burned_neighbors = np.array(
            [burned_array[neighbors].sum() for neighbors in neighbor_lists]
        )
        # Proportion stays 0.0 wherever there are no neighbors to divide by.
        burned_proportion = np.divide(
            burned_neighbors,
            total_neighbors,
            out=np.zeros(total_neighbors.shape, dtype=float),
            where=total_neighbors > 0,
        )

        # Save to GeoDataFrame
        gdf[f'total_neighbors_{radius}m'] = total_neighbors
        gdf[f'burned_neighbors_{radius}m'] = burned_neighbors
        gdf[f'burned_proportion_{radius}m'] = burned_proportion

    return gdf

In [3]:
# Input layer and output destination
gpkg_file = "data/structures.gpkg"
layer_name = "postfire"
output_file = "data/structures_with_neighbors.gpkg"

# Load and label the structures, then attach the neighbor statistics
gdf = calculate_neighbor_stats(
    preprocess_structures(gpkg_file, layer_name),
    radii_meters=[20, 50, 100],
)

# Persist the enriched layer
gdf.to_file(output_file, driver="GPKG")
print(f"Saved enriched structures to {output_file}")

# Predictor columns used by the logistic regression models
feature_columns = [f'burned_proportion_{radius}m' for radius in (20, 50, 100)]

Unique values:
['No Damage' 'Destroyed (>50%)' 'Affected (1-9%)' 'Minor (10-25%)'
 'Inaccessible' 'Major (26-50%)']
Saved enriched structures to data/structures_with_neighbors.gpkg


# Logistic Model

Note the high precision of the logistic model on the held-out test set: ~0.90 for class 0 (not destroyed) and ~0.89 for class 1 (destroyed).

In [4]:
def train_logistic_model(gdf, features, target='burned'):
    """Fit a scikit-learn logistic regression and print a hold-out report.

    Rows with a missing value in any of ``features`` or in ``target`` are
    discarded. The remaining rows are split 70/30 (``random_state=42``), the
    model is fitted on the 70% part, and a classification report for the 30%
    part is printed.

    Parameters
    ----------
    gdf
        Table holding the feature and target columns.
    features
        List of predictor column names.
    target
        Name of the label column.

    Returns
    -------
    LogisticRegression
        The fitted estimator.
    """
    required_columns = features + [target]
    complete_rows = gdf.dropna(subset=required_columns)

    predictors = complete_rows[features]
    labels = complete_rows[target]

    train_X, test_X, train_y, test_y = train_test_split(
        predictors, labels, test_size=0.3, random_state=42
    )

    classifier = LogisticRegression(max_iter=500)
    classifier.fit(train_X, train_y)

    predictions = classifier.predict(test_X)

    print("=== Logistic Regression Report ===")
    print(classification_report(test_y, predictions))

    return classifier

In [5]:
# Fit the scikit-learn logistic regression on the burned-proportion features;
# the classification report is printed as a side effect of the call.
model = train_logistic_model(gdf, features=feature_columns)

=== Logistic Regression Report ===
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      2687
           1       0.89      0.91      0.90      2841

    accuracy                           0.89      5528
   macro avg       0.89      0.89      0.89      5528
weighted avg       0.89      0.89      0.89      5528



In [6]:
def train_logistic_model_with_pvalues(gdf, features, target='burned'):
    """Fit a statsmodels logit on all complete rows and print its summary.

    Rows with a missing value in any of ``features`` or in ``target`` are
    discarded before fitting. No train/test split is made here; the model is
    fitted on every remaining row.

    Parameters
    ----------
    gdf
        Table holding the feature and target columns.
    features
        List of predictor column names.
    target
        Name of the label column.

    Returns
    -------
    The fitted statsmodels results object (the value returned by
    ``sm.Logit(...).fit()``).
    """
    complete_rows = gdf.dropna(subset=features + [target])

    # An explicit constant column provides the intercept term.
    design_matrix = sm.add_constant(complete_rows[features])
    response = complete_rows[target]

    fitted = sm.Logit(response, design_matrix).fit()

    print(fitted.summary())

    return fitted

Note that the p-value is less than 0.001 for each of the variables `burned_proportion_20m`, `burned_proportion_50m`, and `burned_proportion_100m`.

In [7]:
# Refit with statsmodels to get the coefficient table and p-values. This
# rebinds `model`, replacing the scikit-learn estimator from the earlier cell.
model = train_logistic_model_with_pvalues(gdf, features=feature_columns)

Optimization terminated successfully.
         Current function value: 0.273373
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                 burned   No. Observations:                18426
Model:                          Logit   Df Residuals:                    18422
Method:                           MLE   Df Model:                            3
Date:                Sun, 27 Apr 2025   Pseudo R-squ.:                  0.6055
Time:                        23:01:42   Log-Likelihood:                -5037.2
converged:                       True   LL-Null:                       -12767.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                             coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                     -3.4742      0.057    -61.219      0.000      -3.585      