In [1]:
"""
Identify wildfire events with >= 1% pre-fire aspen forest cover across western U.S. ecoregions
Landcover data: LANDFIRE Existing Vegetation Type (EVT) ca. 2016
Author: maxwell.cook@colorado.edu
"""

import os, time, glob, gc
import numpy as np
import pandas as pd
import geopandas as gpd
import rioxarray as rxr
import rasterio as rio
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import multiprocessing as mp

from shapely.geometry import box
from datetime import datetime
from rasterstats import zonal_stats

import warnings
warnings.filterwarnings("ignore") # suppresses annoying geopandas warning

# Projected CRS used for all vector data in this notebook
proj = 'EPSG:5070'

# Root data directory. Defaults to the original author's local path, but can be
# overridden without editing the notebook by setting ASPEN_FIRE_MAINDIR.
maindir = os.environ.get(
    'ASPEN_FIRE_MAINDIR',
    '/Users/max/Library/CloudStorage/OneDrive-Personal/mcook/'
)
projdir = os.path.join(maindir, 'aspen-fire/Aim2/')

print("Ready to go !")

Ready to go !


In [40]:
def compute_band_stats(geom_chunk, band_data, affine, nodataval, id_col):
    """
    Compute categorical zonal statistics for a chunk of geometries and a raster band.
    This function is passed to the multiprocessing workers.

    Parameters
    ----------
    geom_chunk : table of features with at least the `id_col` and 'geometry' columns
    band_data : raster band passed to `zonal_stats` as `raster`
    affine : affine transform passed to `zonal_stats` for `band_data`
    nodataval : value passed to `zonal_stats` as `nodata`
    id_col : name of the column identifying each geometry

    Returns
    -------
    pandas.DataFrame with one row per (geometry, raster class) and the columns
    [id_col, 'evt', 'count', 'total_pixels', 'pct_cover'], where 'total_pixels' is the
    sum of 'count' over all classes of the same `id_col` value and 'pct_cover' is
    count / total_pixels * 100.
    Geometries for which no class was counted contribute no rows; if nothing was
    counted at all, an empty frame with the same columns is returned.
    """
    out_cols = [id_col, 'evt', 'count', 'total_pixels', 'pct_cover']

    stats = zonal_stats(
        vectors=geom_chunk[[id_col, 'geometry']],
        raster=band_data,
        categorical=True,
        affine=affine,
        all_touched=True,
        nodata=nodataval,
        geojson_out=True
    )

    # Each returned feature carries its properties as a dict: the id column plus one
    # {class value: pixel count} entry per class. Flatten that straight into
    # (id, evt, count) records; a feature without class entries simply adds nothing,
    # which avoids the NaN rows that exploding an empty list would create.
    records = [
        (feature['properties'].get(id_col), evt, count)
        for feature in stats
        for evt, count in feature['properties'].items()
        if evt != id_col
    ]
    if not records:
        return pd.DataFrame(columns=out_cols)

    props = pd.DataFrame(records, columns=[id_col, 'evt', 'count'])

    # Percent cover of each class relative to all counted pixels of the same geometry
    props['total_pixels'] = props.groupby(id_col)['count'].transform('sum')
    props['pct_cover'] = (props['count'] / props['total_pixels']) * 100

    return props
    
class BandStatistics:
    """
    Categorical zonal statistics of a single raster band over a set of geometries.

    The constructor reads the geometries and the raster, aligns their CRS, crops the
    raster to the geometries' total bounds and keeps the band as an in-memory array.
    `parallel_compute_stats` then distributes the geometries over worker processes.
    """

    def __init__(self, geo_fp, img_fp, uid):
        """
        Initializes the BandStatistics object

        Parameters
        ----------
        geo_fp : path to the vector file read with `gpd.read_file`
        img_fp : path to the raster opened with `rxr.open_rasterio`
        uid : name of the column identifying each geometry (stored as a string)

        Raises
        ------
        KeyError : if the `uid` column is not present in the vector file
        """
        self.geometries = gpd.read_file(geo_fp)
        self.id_col = str(uid)
        # Fail here rather than inside a worker process after the raster was loaded
        if self.id_col not in self.geometries.columns:
            raise KeyError(f"Column '{self.id_col}' not found in {geo_fp}")

        image_da = rxr.open_rasterio(img_fp, cache=False).squeeze()
        # Check the CRS information matches
        if image_da.rio.crs != self.geometries.crs:
            self.geometries = self.geometries.to_crs(image_da.rio.crs)

        # Crop the raster image to the extent of fire polygons
        minx, miny, maxx, maxy = self.geometries.total_bounds
        image_da = image_da.rio.clip_box(minx=minx, miny=miny, maxx=maxx, maxy=maxy)

        # Keep only what the workers need: transform, pixel array and NoData value
        self.affine = image_da.rio.transform()
        self.band_da = image_da.values
        self.nodataval = image_da.rio.nodata

        # Drop the DataArray wrapper now that the array has been extracted
        del image_da
        gc.collect()

    def parallel_compute_stats(self):
        """
        Parallelizes the categorical statistics computation for all geometries for the single band.
        The number of workers is the number of available CPU cores minus two, but never
        fewer than one and never more than the number of geometries.

        Returns
        -------
        pandas.DataFrame : concatenation of the per-chunk results of `compute_band_stats`.
        If there are no geometries, an empty frame with the columns
        [id_col, 'evt', 'count', 'total_pixels', 'pct_cover'] is returned.
        """
        n_geoms = len(self.geometries)
        if n_geoms == 0:
            # Nothing to sample: do not start a pool, and avoid concatenating nothing
            return pd.DataFrame(
                columns=[self.id_col, 'evt', 'count', 'total_pixels', 'pct_cover']
            )

        # os.cpu_count() may return None when the count cannot be determined
        cpu_total = os.cpu_count() or 1
        num_workers = max(1, min(cpu_total - 2, n_geoms))
        print(f"Using {num_workers} workers.")

        # Split geometries into chunks for parallel processing. The split is done on
        # row positions (not on the frame itself), and since num_workers <= n_geoms
        # no chunk is empty.
        row_groups = np.array_split(np.arange(n_geoms), num_workers)
        chunks = [self.geometries.iloc[rows] for rows in row_groups]

        with mp.Pool(processes=num_workers) as pool:
            results = pool.starmap(
                compute_band_stats,
                [(chunk, self.band_da, self.affine, self.nodataval, self.id_col) for chunk in chunks]
            )

        # Concatenate the starmap results (one DataFrame per chunk)
        results_df = pd.concat(results, ignore_index=True)

        return results_df


def create_bounds(gdf, buffer=None):
    """
    Replace every geometry by the rectangle spanned by its bounds.

    When `buffer` is given (anything but None), the rectangles are additionally
    buffered by that distance. The input is left untouched; a copy carrying the
    new geometry is returned.
    """
    def _bounding_rectangle(geom):
        # Rectangle spanned by the geometry's bounds
        return box(*geom.bounds)

    rectangles = gdf.geometry.apply(_bounding_rectangle)
    if buffer is not None:
        rectangles = rectangles.buffer(buffer)

    # Work on a copy so the caller's frame keeps its original geometry
    result = gdf.copy()
    result.geometry = rectangles
    return result
    

# Confirmation message shown once the definitions in this cell have been executed
print("Functions loaded !")

Functions loaded !


In [3]:
# Load the land cover data

In [3]:
# Load the LANDFIRE EVT (ca. 2016)
# Only the raster's metadata is inspected here; the raster is re-opened from
# `evt_fp` later by the zonal-statistics code.
evt_fp = os.path.join(maindir,'data/landcover/LANDFIRE/LF2016_EVT_200_CONUS/Tif/LC16_EVT_200.tif')
# NOTE(review): opened with masked=True, and the values printed below are
# NoData = nan / dtype = float32, i.e. they describe the masked array; presumably
# the NoData value and integer dtype stored in the file differ - confirm if needed.
evt = rxr.open_rasterio(evt_fp, masked=True, cache=False).squeeze()
shp, gt, wkt, nd = evt.shape, evt.spatial_ref.GeoTransform, evt.rio.crs, evt.rio.nodata
print(
    f"Shape: {shp}; \n"
    f"GeoTransform: {gt}; \n"
    f"WKT: {wkt}; \n"
    f"NoData Value: {nd}; \n"
    f"Data Type: {evt[0].dtype}")
# Release the raster handle; gc.collect() is the last expression, so its return
# value is what the cell displays.
del evt
gc.collect()

Shape: (97283, 154207); 
GeoTransform: -2362425.0 30.0 0.0 3177435.0 0.0 -30.0; 
WKT: EPSG:5070; 
NoData Value: nan; 
Data Type: float32


106

In [5]:
# Load FIRED perimeters

In [4]:
# Load the FIRED perimeters (2018-2024)
# `daily_fp` is reused later as the vector input of the zonal statistics.
daily_fp = os.path.join(maindir,'aspen-fire/Aim2/data/spatial/mod/FIRED/fired-daily_west_2018_to_2024.gpkg')
daily = gpd.read_file(daily_fp)
daily = daily.to_crs(proj) # ensure albers projection
# Display the available attribute columns ('did' = daily perimeter id, 'id' = event id
# as used by the merges further down)
daily.columns

Index(['did', 'id', 'date', 'ig_date', 'ig_day', 'ig_month', 'ig_year',
       'last_date', 'event_day', 'event_dur', 'pixels', 'tot_pix', 'dy_ar_km2',
       'tot_ar_km2', 'fsr_px_dy', 'fsr_km2_dy', 'mx_grw_px', 'mn_grw_px',
       'mu_grw_px', 'mx_grw_km2', 'mn_grw_km2', 'mu_grw_km2', 'mx_grw_dte',
       'x', 'y', 'ig_utm_x', 'ig_utm_y', 'lc_code', 'lc_mode', 'lc_name',
       'lc_desc', 'lc_type', 'eco_mode', 'eco_name', 'eco_type', 'geometry'],
      dtype='object')

In [7]:
# Calculate the daily land cover (EVT) proportions

In [6]:
t0 = time.time()

if __name__ == '__main__':
    # Use the 'fork' start method for the worker pool. force=True keeps this cell
    # re-runnable: without it a second call raises
    # "RuntimeError: context has already been set".
    mp.set_start_method('fork', force=True)
    # Create the band stats class
    band_stats_obj = BandStatistics(daily_fp, evt_fp, uid='did')
    # Run parallel band statistics computation
    sampled = band_stats_obj.parallel_compute_stats()

# Report the wall-clock time of the whole cell in minutes
t1 = (time.time() - t0) / 60
print(f"Total elapsed time: {t1:.2f} minutes.")
print("\n~~~~~~~~~~\n")

Using 6 workers.
Total elapsed time: 3.88 minutes.

~~~~~~~~~~



In [7]:
# Preview the per-daily-perimeter EVT table (one row per 'did' and EVT class)
sampled.head()

Unnamed: 0,did,evt,count,total_pixels,pct_cover
0,0003d89c00208d55887fec4a95f58ac8,7080,1,600,0.166667
1,0003d89c00208d55887fec4a95f58ac8,7127,3,600,0.5
2,0003d89c00208d55887fec4a95f58ac8,7296,5,600,0.833333
3,0003d89c00208d55887fec4a95f58ac8,7297,1,600,0.166667
4,0003d89c00208d55887fec4a95f58ac8,7299,47,600,7.833333


In [9]:
gc.collect() # clear any unused memory; the displayed value is gc.collect()'s return value

0

In [None]:
# Load the lookup info to get landcover type

In [8]:
# Path to the LANDFIRE EVT attribute (lookup) table
lookup = os.path.join(maindir,'data/landcover/LANDFIRE/LF2016_EVT_200_CONUS/CSV_Data/LF16_EVT_200.csv')
# NOTE: `lookup` is rebound here from the CSV path (str) to the loaded DataFrame
lookup = pd.read_csv(lookup)
print(lookup.columns)

Index(['VALUE', 'EVT_NAME', 'LFRDB', 'EVT_FUEL', 'EVT_FUEL_N', 'EVT_LF',
       'EVT_PHYS', 'EVT_GP', 'EVT_GP_N', 'SAF_SRM', 'EVT_ORDER', 'EVT_CLASS',
       'EVT_SBCLS', 'R', 'G', 'B', 'RED', 'GREEN', 'BLUE'],
      dtype='object')


In [13]:
# Subset the codes we want to join, join back to the dataframe
lookup = lookup[['VALUE','EVT_NAME','EVT_PHYS','EVT_GP_N','EVT_CLASS']]
# Merge back to the data
# Left join on the raster class value: every sampled row is kept, and classes
# missing from the lookup table would get NaN attributes.
props_df = sampled.merge(lookup, left_on='evt', right_on='VALUE', how='left')
props_df.head()

Unnamed: 0,did,evt,count,total_pixels,pct_cover,VALUE,EVT_NAME,EVT_PHYS,EVT_GP_N,EVT_CLASS
0,0003d89c00208d55887fec4a95f58ac8,7080,1,600,0.166667,7080,Inter-Mountain Basins Big Sagebrush Shrubland,Shrubland,Big Sagebrush Shrubland and Steppe,Shrubland
1,0003d89c00208d55887fec4a95f58ac8,7127,3,600,0.5,7127,Inter-Mountain Basins Semi-Desert Shrub-Steppe,Shrubland,Desert Scrub,Shrubland
2,0003d89c00208d55887fec4a95f58ac8,7296,5,600,0.833333,7296,Developed-Low Intensity,Developed-Low Intensity,Developed-Low Intensity,No Dominant Lifeform
3,0003d89c00208d55887fec4a95f58ac8,7297,1,600,0.166667,7297,Developed-Medium Intensity,Developed-Medium Intensity,Developed-Medium Intensity,No Dominant Lifeform
4,0003d89c00208d55887fec4a95f58ac8,7299,47,600,7.833333,7299,Developed-Roads,Developed-Roads,Developed-Roads,No Dominant Lifeform


In [15]:
# retrieve the FIRED_ID from the original dataframe
# - drop a previously attached 'id' first so re-running this cell does not create
#   'id_x' / 'id_y' columns
# - de-duplicate the (id, did) pairs so a repeated pair cannot multiply rows
#   (and therefore pixel counts) in the left join
props_df = props_df.drop(columns='id', errors='ignore').merge(
    daily[['id', 'did']].drop_duplicates(), on='did', how='left'
)

In [17]:
# Report how many distinct events ('id') and daily perimeters ('did') were sampled
print(f"There are [{len(props_df['id'].unique())}] unique fires in the sampled data.\n\t[{len(props_df['did'].unique())}] individual daily perimeters.")

There are [9936] unique fires in the sampled data.
	[32493] individual daily perimeters.


In [18]:
# Check the columns available after the lookup and event-id merges
props_df.columns

Index(['did', 'evt', 'count', 'total_pixels', 'pct_cover', 'VALUE', 'EVT_NAME',
       'EVT_PHYS', 'EVT_GP_N', 'EVT_CLASS', 'id'],
      dtype='object')

In [None]:
# Calculate event-level EVT proportions

In [22]:
# Total number of sampled pixels per fire event, summed over all of its daily
# perimeters and EVT classes
event_evt_count = (
    props_df
    .groupby('id')['count']
    .sum()
    .rename('total_count')
    .reset_index()
)
event_evt_count.head()

Unnamed: 0,id,total_count
0,10,299
1,13,301
2,15,299
3,18,2784
4,19,602


In [23]:
# Attach each event's pixel total to its daily rows, then express every row's
# pixel count as a percentage of that event-wide total
props_df_ = props_df.merge(event_evt_count, how='left', on='id')
props_df_['event_pct_cover'] = 100 * props_df_['count'].div(props_df_['total_count'])
props_df_['event_pct_cover'].head()

0    0.047664
1    0.142993
2    0.238322
3    0.047664
4    2.240229
Name: event_pct_cover, dtype: float64

In [24]:
# Collapse the daily rows to one row per fire event and EVT class.
# `event_pct_cover` is each daily row's share of the event-wide pixel total, so the
# event-level share of a class is the SUM of its daily shares (= summed count /
# total_count). Averaging them would divide the share by the number of daily
# perimeters that contain the class.
props_df_event = props_df_.groupby(['id', 'evt']).agg({
    'count': 'sum',
    'event_pct_cover': 'sum',
    'EVT_NAME': 'first',  # Add any additional columns you want in the summary
}).reset_index()
props_df_event.head()

Unnamed: 0,id,evt,count,event_pct_cover,EVT_NAME
0,10,7035,1,0.334448,North Pacific Dry Douglas-fir-(Madrone) Forest...
1,10,7037,124,41.471572,North Pacific Maritime Dry-Mesic Douglas-fir-W...
2,10,7039,99,33.110368,North Pacific Maritime Mesic-Wet Douglas-fir-W...
3,10,7043,67,22.408027,Mediterranean California Mixed Evergreen Forest
4,10,7063,2,0.668896,North Pacific Broadleaf Landslide Forest


In [None]:
# Save the files out.

In [27]:
# Output folder for the EVT summary tables
results_dir = os.path.join(projdir,'data/tabular/mod/EVT/')
# Create the folder (including parents) only if it is missing; exist_ok=True avoids
# the separate check-then-create step
os.makedirs(results_dir, exist_ok=True)
print(f"Saving EVT summary tables to {results_dir}")

Saving EVT summary tables to /Users/max/Library/CloudStorage/OneDrive-Personal/mcook/aspen-fire/Aim2/data/tabular/mod/EVT/


In [28]:
# Save the daily-level EVT summary
# NOTE: to_csv is called without index=False, so the row index is written as the
# first (unnamed) column of each file.
out_fp = os.path.join(results_dir, 'fired-daily_west_2018_to_2024-EVT.csv')
props_df.to_csv(out_fp)

# Save the event-level EVT summary
# `out_fp` is reused for the second file
out_fp = os.path.join(results_dir, 'fired-events_west_2018_to_2024-EVT.csv')
props_df_event.to_csv(out_fp)

print("Tables saved !")

Tables saved !


In [None]:
# Compare with a broader-scale approach (EVT summary within a 3-km buffer of each event's bounding box)

In [44]:
# Load the event-level FIRED perimeters
event_fp = os.path.join(projdir,'data/spatial/mod/FIRED/fired-events_west_2018_to_2024.gpkg')
# Read from `event_fp`: the name `fp` used previously is not defined anywhere in the
# visible notebook, so the cell only worked with a stale variable left in the kernel.
events = gpd.read_file(event_fp)
print(f"There are [{len(events)}] across western U.S. ecoregions (2018-2024).")
print(events.columns)

There are [9936] across western U.S. ecoregions (2018-2024).
Index(['id', 'ig_date', 'ig_day', 'ig_month', 'ig_year', 'last_date',
       'event_dur', 'tot_pix', 'tot_ar_km2', 'fsr_px_dy', 'fsr_km2_dy',
       'mx_grw_px', 'mn_grw_px', 'mu_grw_px', 'mx_grw_km2', 'mn_grw_km2',
       'mu_grw_km2', 'mx_grw_dte', 'x', 'y', 'ig_utm_x', 'ig_utm_y', 'lc_code',
       'lc_mode', 'lc_name', 'lc_desc', 'lc_type', 'eco_mode', 'eco_name',
       'eco_type', 'tot_perim', 'na_l3name', 'geometry'],
      dtype='object')


In [45]:
# Create new geometry (bounding rectangles) with a 3-km buffer.
# The buffer distance is expressed in CRS units, so the events are first put into
# the projected CRS `proj` (as is done for the daily perimeters) to make 3000 mean
# 3000 m regardless of the CRS stored in the file.
bounds = create_bounds(events.to_crs(proj), buffer=3000)
event_fp_3k = os.path.join(projdir,'data/spatial/mod/FIRED/fired-events_west_2018_to_2024_3km.gpkg')
bounds.to_file(event_fp_3k)
# Number of buffered event rectangles written
len(bounds)

9936

In [46]:
t0 = time.time()

if __name__ == '__main__':
    # Set the start method here as well, so this cell does not depend on an earlier
    # cell having done it; force=True makes the call safe when it was already set.
    mp.set_start_method('fork', force=True)
    # Create the band stats class
    band_stats_obj = BandStatistics(event_fp_3k, evt_fp, uid='id')
    # Run parallel band statistics computation
    # NOTE: this rebinds `sampled`, replacing the daily-perimeter result
    sampled = band_stats_obj.parallel_compute_stats()

# Report the wall-clock time of the whole cell in minutes
t1 = (time.time() - t0) / 60
print(f"Total elapsed time: {t1:.2f} minutes.")
print("\n~~~~~~~~~~\n")

Using 6 workers.
Total elapsed time: 3.80 minutes.

~~~~~~~~~~



In [47]:
# Preview the event-level (3-km bounds) EVT table (one row per event 'id' and EVT class)
sampled.head()

Unnamed: 0,id,evt,count,total_pixels,pct_cover
0,10,7008,21,44300,0.047404
1,10,7035,710,44300,1.602709
2,10,7037,20936,44300,47.259594
3,10,7039,12813,44300,28.923251
4,10,7043,7282,44300,16.437923


In [48]:
# Load the lookup table
# (same CSV and column subset as used for the daily-perimeter table; it is re-read
# here so this section does not depend on the earlier `lookup` variable)
lookup = os.path.join(maindir,'data/landcover/LANDFIRE/LF2016_EVT_200_CONUS/CSV_Data/LF16_EVT_200.csv')
lookup = pd.read_csv(lookup)
# Subset the codes we want to join, join back to the dataframe
lookup = lookup[['VALUE','EVT_NAME','EVT_PHYS','EVT_GP_N','EVT_CLASS']]
# Merge back to the data
props_df_3k = sampled.merge(lookup, left_on='evt', right_on='VALUE', how='left')
props_df_3k.head()

Unnamed: 0,id,evt,count,total_pixels,pct_cover,VALUE,EVT_NAME,EVT_PHYS,EVT_GP_N,EVT_CLASS
0,10,7008,21,44300,0.047404,7008,North Pacific Oak Woodland,Hardwood,Western Oak Woodland and Savanna,Open tree canopy
1,10,7035,710,44300,1.602709,7035,North Pacific Dry Douglas-fir-(Madrone) Forest...,Conifer,Douglas-fir Forest and Woodland,Closed tree canopy
2,10,7037,20936,44300,47.259594,7037,North Pacific Maritime Dry-Mesic Douglas-fir-W...,Conifer,Douglas-fir-Western Hemlock Forest and Woodland,Closed tree canopy
3,10,7039,12813,44300,28.923251,7039,North Pacific Maritime Mesic-Wet Douglas-fir-W...,Conifer,Douglas-fir-Western Hemlock Forest and Woodland,Closed tree canopy
4,10,7043,7282,44300,16.437923,7043,Mediterranean California Mixed Evergreen Forest,Conifer,California Mixed Evergreen Forest and Woodland,Closed tree canopy


In [49]:
# Save the event-level EVT summary for the 3-km buffered event bounds
out_fp = os.path.join(results_dir, 'fired-events_west_2018_to_2024-EVT-3km.csv')
props_df_3k.to_csv(out_fp)

In [50]:
# Drop the lookup table and the sampled table now that the results are saved,
# then run the garbage collector
del lookup, sampled
gc.collect()

0