In [3]:
import pandas as pd
import numpy as np
import xarray as xr
import cdsapi 

# viz
import matplotlib.pyplot as plt
import plotly.express as px
import cartopy
import cartopy.crs as ccrs
from cartopy.io import shapereader
import plotly.graph_objects as go

import glob
import sys
import os

import geopandas as gpd
import rioxarray
from shapely.geometry import mapping

# creds
sys.path.append('/Users/max/Deep_Sky')
from creds import CDS_UID, CDS_API_key

In [2]:
# point to local UNSEEN open dir to import retrieve and preprocess
sys.path.append('/Users/max/Deep_Sky/GitHub/UNSEEN-open-Deep-Sky')

import src.cdsretrieve as retrieve
import src.preprocess as preprocess

c = cdsapi.Client()

In [4]:
ERA5 = xr.open_mfdataset('../../data/UNSEEN/wildfire/cems/ERA5/ERA5_????.nc',combine='by_coords')
ERA5

OSError: no files to open

In [4]:
mean_fwinx = ERA5['fwinx'].mean(['latitude','longitude'])

fig = go.Figure(data=go.Scatter(
    x=mean_fwinx.time.values, 
    y=mean_fwinx.values,
    mode='markers',
    marker=dict(
        size=6,
        color='LightSeaGreen',
    ),
    text=mean_fwinx.time.values  # Display the year on hover
))

fig.update_layout(
    xaxis_title="Time",
    yaxis_title="Mean fwinx",
    title="Mean FWI",
)

fig.show()

In [1]:
def season_mean(ds, years, calendar='standard'):
    # Make a DataArray with the number of days in each month, size = len(time)
    month_length = ds.time.dt.days_in_month

    # Calculate the weights by grouping by 'time.season'
    weights = month_length.groupby('time.year') / month_length.groupby('time.year').sum()

    # Test that the sum of the weights for each season is 1.0
    np.testing.assert_allclose(weights.groupby('time.year').sum().values, np.ones(years))

    # Calculate the weighted average
    return (ds * weights).groupby('time.year').sum(dim='time', min_count = 1)

def count_extreme_fwi_days(location_data):
    # Calculate the 90th, 95th, and 99th percentiles between 1994 and 2014
    ds_1983_2023 = location_data.sel(time=slice('1973-01-01', '1999-12-31')).chunk({'time': -1})  # rechunking here
    p90 = ds_1983_2023['fwinx'].quantile(0.9).values
    p95 = ds_1983_2023['fwinx'].quantile(0.95).values
    p99 = ds_1983_2023['fwinx'].quantile(0.99).values
    p995 = ds_1983_2023['fwinx'].quantile(0.995).values

    # Initialize a DataFrame to store the counts
    extreme_fwi_days_df = pd.DataFrame(columns=['Year', 'p90_count', 'p95_count', 'p99_count', 'p995_count'])

    # For each year, count the number of days above each percentile in the specific location
    for year in pd.to_datetime(location_data['time'].values).year.unique():
        ds_year = location_data.sel(time=str(year))
        p90_count = (ds_year['fwinx'] > p90).sum().values
        p95_count = (ds_year['fwinx'] > p95).sum().values
        p99_count = (ds_year['fwinx'] > p99).sum().values
        p995_count = (ds_year['fwinx'] > p995).sum().values

        new_row = pd.DataFrame({'Year': [year], 'p90_count': [p90_count], 'p95_count': [p95_count], 'p99_count': [p99_count], 'p995_count': [p995_count]})
        extreme_fwi_days_df = pd.concat([extreme_fwi_days_df, new_row], ignore_index=True)
    return extreme_fwi_days_df

def count_extreme_fwi_days_by_month(ds):
    # Initialize a DataFrame to store the counts
    extreme_fwi_days_df = pd.DataFrame(index=pd.to_datetime(ds['time'].values).year.unique(), 
                                       columns=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])

    # Calculate the 95th percentile for the entire period between 1994 and 2004
    ds_1983_2023 = ds.sel(time=slice('1973-01-01', '1999-12-31')).chunk({'time': -1})
    p95_total = ds_1983_2023.quantile(0.95)['fwinx'].values

    # For each year and each month, count the number of days above the 95th percentile
    for year in extreme_fwi_days_df.index:
        for month in range(1, 13):
            ds_year_month = ds.sel(time=(ds['time.year'] == year) & (ds['time.month'] == month))
            count = (ds_year_month['fwinx'] > p95_total).sum().values
            extreme_fwi_days_df.loc[year, extreme_fwi_days_df.columns[month-1]] = count

    # Reset the index to make 'Year' a column
    extreme_fwi_days_df.reset_index(inplace=True)
    extreme_fwi_days_df.rename(columns={'index': 'Year'}, inplace=True)

    return extreme_fwi_days_df    

In [50]:
def get_shape_us(shapefile, state_abbreviation):
    shapefile = shapefile.to_crs("EPSG:4326")
    state_shape = shapefile[shapefile.STUSPS == state_abbreviation].geometry
    return state_shape

def get_shape_ca(shapefile, province_name):
    shapefile = shapefile.to_crs("EPSG:4326")
    province_shape = shapefile[shapefile.PRENAME == province_name].geometry
    return province_shape

def mask_xr_dataset(xr_data, shape):
    xr_data = xr_data.assign_coords(longitude=(((xr_data.longitude + 180) % 360) - 180)).sortby('longitude')
    xr_data.rio.write_crs("EPSG:4326", inplace=True)
    xr_data_masked = xr_data.rio.clip(shape.geometry.apply(mapping), shape.crs)
    return xr_data_masked

def aggregate_annually(xr_data, vars):
    xr_data_yearly = season_mean(xr_data, len(pd.to_datetime(xr_data.time.values).year.unique()))
    area_weights = np.cos(np.deg2rad(xr_data.latitude))
    events = (
        xr_data_yearly[vars].       
        weighted(area_weights).
        mean(['longitude', 'latitude'])
    )
    return events

def store_xr_events_data(events, location):
    df_output = events.to_dataframe().reset_index()
    df_output = df_output.rename(columns={'fwinx': 'mean_fire_weather_index', 'year': 'Year'}, )  # rename 'fwinx' to 'mean_fire_weather_index'
    df_output = df_output[['Year', 'mean_fire_weather_index']]  # keep only 'year' and 'fwinx' columns
    df_output.to_csv(f'../../data/UNSEEN/wildfire/cems/preprocessed/ERA5_mean_fwi_{location}.csv', index=False)

In [2]:
us_shapefile = gpd.read_file('../../data/shapefiles/us/tl_2023_us_state.shp')
ca_shapefile = gpd.read_file('../../data/shapefiles/canada/lpr_000b16a_e.shp')
ERA5 = xr.open_mfdataset('../../data/UNSEEN/wildfire/cems/ERA5/ERA5_????.nc', combine='by_coords')
states_of_interest = ['ND', 'SD', 'MT', 'WY', 'ID', 'WA', 'OR', 'CA', 'NV', 'UT', 'CO', 'AZ', 'NM', 'TX']
states_of_interest_2 = ['CO', 'KS', 'OR', 'WA', 'CA', 'NE']
provinces_of_interest = ['Alberta', 'British Columbia', 'Saskatchewan', 'Quebec', 'Ontario', 'Manitoba']

NameError: name 'gpd' is not defined

In [49]:
# preprocess and store data
# calculate annual means and number of extreme fwi days
for state in states_of_interest_2:
    shape = get_shape_us(us_shapefile, state)
    state_data_era5 = mask_xr_dataset(ERA5, shape)
    era5_events = aggregate_annually(state_data_era5, ['fwinx'])
    era5_extreme_fwi_days = count_extreme_fwi_days(state_data_era5)
    #era5_extreme_fwi_days_by_month = count_extreme_fwi_days_by_month(state_data_era5)
    store_xr_events_data(era5_events, state)
    era5_extreme_fwi_days.to_csv(f'../../data/UNSEEN/wildfire/cems/preprocessed/ERA5_extreme_fwi_days_{state}.csv', index=False)    
    #era5_extreme_fwi_days_by_month.to_csv(f'../../data/UNSEEN/wildfire/cems/preprocessed/ERA5_extreme995_fwi_days_by_month_{state}.csv', index=False)


In [31]:
for province in provinces_of_interest:
    shape = get_shape_ca(ca_shapefile, province)
    province_data_era5 = mask_xr_dataset(ERA5,shape)
    #era5_events = aggregate_annually(province_data_era5, ['fwinx'])
    #era5_extreme_fwi_days = count_extreme_fwi_days(province_data_era5, ERA5)
    #era5_extreme_fwi_days_by_month = count_extreme_fwi_days_by_month(province_data_era5)
    era5_max_fwinx_by_month = max_fwinx_by_month(province_data_era5)
    era5_max_fwinx_by_month.to_csv(f'../../data/UNSEEN/wildfire/cems/preprocessed/canada/ERA5_max_fwi_by_month_{province}.csv')
    #store_xr_events_data(era5_events, province)
    #era5_extreme_fwi_days.to_csv(f'../../data/UNSEEN/wildfire/cems/preprocessed/canada/ERA5_extreme_fwi_days_{province}.csv', index=False)    
    #era5_extreme_fwi_days_by_month.to_csv(f'../../data/UNSEEN/wildfire/cems/preprocessed/canada/ERA5_extreme995_fwi_days_by_month_{province}.csv', index=False)

KeyboardInterrupt: 

In [8]:
era5_extreme_fwi_days = count_extreme_fwi_days(ERA5)
era5_extreme_fwi_days = era5_extreme_fwi_days[era5_extreme_fwi_days['Year'] != 2024]
era5_extreme_fwi_days.to_csv(f'../../data/UNSEEN/wildfire/cems/preprocessed/ERA5_extreme_fwi_days_N_America_726.csv', index=False)

In [10]:
era5_extreme_fwi_days = pd.read_csv('../../data/UNSEEN/wildfire/cems/preprocessed/ERA5_extreme_fwi_days_CO.csv')

In [11]:
import plotly.graph_objects as go

fig = go.Figure()

fig.add_trace(go.Scatter(x=era5_extreme_fwi_days['Year'], y=era5_extreme_fwi_days['p90_count'], mode='lines', name='p90_count'))
fig.add_trace(go.Scatter(x=era5_extreme_fwi_days['Year'], y=era5_extreme_fwi_days['p95_count'], mode='lines', name='p95_count'))
fig.add_trace(go.Scatter(x=era5_extreme_fwi_days['Year'], y=era5_extreme_fwi_days['p99_count'], mode='lines', name='p99_count'))
fig.add_trace(go.Scatter(x=era5_extreme_fwi_days['Year'], y=era5_extreme_fwi_days['p995_count'], mode='lines', name='p995_count'))

fig.show()

In [32]:
import pandas as pd

# Create an empty list to store the results
results = []

for province in provinces_of_interest:
    shape = get_shape_ca(ca_shapefile, province)
    province_data_era5 = mask_xr_dataset(ERA5,shape)
    
    # Group the data by year
    yearly_data = province_data_era5.groupby('time.year')
    
    for year, data in yearly_data:
        # Calculate the min, max, and mean of the 'fwinx' values
        min_fwinx = data['fwinx'].min().values
        max_fwinx = data['fwinx'].max().values
        mean_fwinx = data['fwinx'].mean().values
        
        # Store the results in the list
        results.append({
            'Province': province,
            'Year': year,
            'Min of fwinx': min_fwinx,
            'Max of fwinx': max_fwinx,
            'Mean of fwinx': mean_fwinx
        })

# Convert the list to a DataFrame
results_df = pd.DataFrame(results)

# Print the DataFrame
results_df.to_csv('../../data/UNSEEN/wildfire/province_stats.csv')

KeyboardInterrupt: 

In [36]:
import pandas as pd

# Create an empty list to store the results
results = []

for province in provinces_of_interest:
    shape = get_shape_ca(ca_shapefile, province)
    province_data_era5 = mask_xr_dataset(ERA5,shape)
    
    # Create a new 'year_month' coordinate
    province_data_era5['year_month'] = province_data_era5['time'].dt.strftime('%Y-%m')
    
    # Group the data by 'year_month'
    monthly_data = province_data_era5.groupby('year_month')
    
    for year_month, data in monthly_data:
        # Calculate the max of the 'fwinx' values
        max_fwinx = data['fwinx'].max().values
        
        # Store the results in the list
        results.append({
            'Province': province,
            'Month': pd.to_datetime(year_month),  # Convert 'year_month' to a datetime object
            'fwinx_max': max_fwinx,
        })

# Convert the list to a DataFrame
results_df = pd.DataFrame(results)

# Print the DataFrame
results_df.to_csv('../../data/UNSEEN/wildfire/cems/preprocessed/canada/province_stats_monthly.csv')

In [44]:
ERA5 = xr.open_mfdataset('../../data/UNSEEN/wildfire/cems/ERA5/ERA5_????.nc',combine='by_coords')

for province in provinces_of_interest:
    shape = get_shape_ca(ca_shapefile, province)
    province_data_era5 = mask_xr_dataset(ERA5,shape)
    ERA5_fwinx_max = province_data_era5['fwinx'].max(dim=['latitude', 'longitude'])
    ERA5_fwinx_max_yearly = ERA5_fwinx_max.resample(time='1YS').max()

    df = ERA5_fwinx_max_yearly.to_dataframe()
    df = df.reset_index().loc[:, ['time', 'fwinx']]
    df.to_csv(f'../../data/UNSEEN/wildfire/cems/preprocessed/canada/ERA5_yearly_max_{province}.csv', index = False)

In [4]:
import pandas as pd
import geopandas as gpd
import xarray as xr
from rioxarray.exceptions import NoDataInBounds
import rioxarray

def get_masked_data(xr_data, shape):
    xr_data = xr_data.assign_coords(longitude=(((xr_data.longitude + 180) % 360) - 180)).sortby('longitude')
    xr_data.rio.write_crs("EPSG:4326", inplace=True)
    minx, miny, maxx, maxy = shape.bounds
    buffer = 0.1
    minx -= buffer
    miny -= buffer
    maxx += buffer
    maxy += buffer
    try:
        clipped_data = xr_data.rio.clip_box(minx, miny, maxx, maxy)
        if clipped_data['fwinx'].shape[0] < 2 or clipped_data['fwinx'].shape[1] < 2:
            return None
        else:
            return clipped_data
    except rioxarray.exceptions.NoDataInBounds:
        return None

def get_extreme_fwi_occurrences(data, percentile_threshold):
    yearly_data = data.groupby('time.year')
    p95_count = yearly_data.map(lambda x: (x['fwinx'] > percentile_threshold).sum()).values
    mean_count = p95_count.mean()
    return mean_count

def get_filtered_data(data, start_year, end_year, positive_threshold=1e5, negative_threshold=-1e5):
    condition = (data['time'].dt.year >= start_year) & (data['time'].dt.year <= end_year)
    filtered_data = data.where(condition, drop=True)
    filtered_data = filtered_data.where((filtered_data < positive_threshold) & (filtered_data > negative_threshold))
    return filtered_data.chunk({'time': -1})

ERA5 = xr.open_mfdataset('../../data/UNSEEN/wildfire/cems/ERA5/ERA5_????.nc',combine='by_coords')
counties = gpd.read_file('../../data/shapefiles/us/counties/cb_2018_us_county_500k.shp')

unique_years = pd.to_datetime(ERA5['time'].values).year.unique()

results = []
for i, county_row in counties.iterrows():
    print(f"Processing county {i+1} of {len(counties)}: {county_row['NAME']}")
    try:
        county_data = get_masked_data(ERA5, county_row.geometry)
        if county_data is None:
            print(f"Clipped data for {county_row['NAME']} is too small, skipping...")
            continue
    except NoDataInBounds:
        print(f"No data found for {county_row['NAME']}, skipping...")
        continue

    baseline_data = get_filtered_data(county_data, 1973, 1999)
    current_data = get_filtered_data(county_data, 2000, 2023)

    # frequency
    p95 = baseline_data['fwinx'].quantile(0.95).values
    baseline_95_count = get_extreme_fwi_occurences(baseline_data, p95)
    current_95_count = get_extreme_fwi_occurences(current_data, p95)

    # severity
    baseline_annual_max = baseline_data.groupby('time.year').max(['latitude', 'longitude'])
    baseline_annual_max_fwinx = baseline_annual_max['fwinx']
    baseline_max = baseline_annual_max_fwinx.mean(dim='time').values

    current_annual_max = current_data.groupby('time.year').max(['latitude', 'longitude'])
    current_annual_max_fwinx = current_annual_max['fwinx']
    current_max = current_annual_max_fwinx.mean(dim='time').values

    percentage_increase_count = 100 * (current_95_count - baseline_95_count) / baseline_95_count
    percentage_increase_max = 100 * (current_max - baseline_max) / baseline_max


    # print(f'baseline_95_count: {baseline_95_count}')
    # print(f'current_95_count: {current_95_count}')
    # print(f'baseline_max: {baseline_max}')
    # print(f'current_max: {current_max}')
    # print(f'percentage_increase_count: {percentage_increase_count}')
    # print(f'percentage_increase_max: {percentage_increase_max}')
    result =  {
        'county_name': county_row['NAME'],
        'statefp': county_row.STATEFP,
        'countyfp': county_row.COUNTYFP,
        'percentage_increase_count': percentage_increase_count,
        'percentage_increase_max': percentage_increase_max
    }
    results.append(result)

result_df = pd.DataFrame(results)
result_df.to_csv('../../data/UNSEEN/wildfire/cems/preprocessed/county_stats_granular.csv', index=False)

Processing county 1 of 3233: Ballard


NameError: name 'get_extreme_fwi_occurences' is not defined

In [38]:
# examine the fwi of major famous wildfires

def get_filtered_data(data, start_date, end_date, positive_threshold=1e5, negative_threshold=-1e5):
    condition = (data['time'] >= start_date) & (data['time'] <= end_date)
    filtered_data = data.where(condition, drop=True)
    filtered_data = filtered_data.where((filtered_data < positive_threshold) & (filtered_data > negative_threshold))
    return filtered_data.chunk({'time': -1})

def get_extreme_fwi_occurrences(data, percentile_threshold):
    count = (data['fwinx'] > percentile_threshold).sum().values
    total = data['fwinx'].count().values
    if total == 0 or np.isnan(total):
        return np.nan  # Return NaN if total is zero or NaN
    else:
        percentage = (count / total) * 100  # Calculate the percentage
        return percentage
    
def get_months_between(start_date, end_date):
    dates = pd.date_range(start_date, end_date, freq='M')
    months = dates.month.unique().tolist()
    return months

ERA5 = xr.open_mfdataset('../../data/UNSEEN/wildfire/cems/ERA5/ERA5_????.nc',combine='by_coords')
counties = gpd.read_file('../../data/shapefiles/us/counties/cb_2018_us_county_500k.shp')

def process_fire(county, state_fips, state_abb, months):
    shape = counties[(counties['NAME'] == county) & (counties['STATEFP'] == state_fips)].geometry.iloc[0]
    data = get_masked_data(ERA5, shape)
    data = data.where(data['time.year'] != 2024, drop=True)  # Use .where() instead of indexing
    years = np.unique(data['time.year'].values)

    baseline_data = get_filtered_data(data, start_date=pd.to_datetime('1973-01-01'), end_date=pd.to_datetime('1999-12-31'))
    filtered_baseline_data = baseline_data.where((baseline_data.time.dt.month >= months[0]) & (baseline_data.time.dt.month <= months[-1]), drop=True)

    p95 = filtered_baseline_data['fwinx'].quantile(0.95).values

    # Initialize an empty list to store the counts for each year
    extreme_fwi_percents = []

    # Iterate over the years
    for year in years:
        start_date = pd.to_datetime(f'{year}-{months[0]}-01')
        end_date = pd.to_datetime(f'{year}-{months[-1]+1}-01')
        data_for_year = get_filtered_data(data, start_date=start_date, end_date=end_date)
        # Calculate the count for the current year and append it to the list
        extreme_fwi_percents.append(get_extreme_fwi_occurrences(data_for_year, p95))  # Index the first element of the array


    # Create a DataFrame with 'Year' and 'Count' columns
    df = pd.DataFrame({
        'year': years,
        'months': [months]*len(years),
        'county': [county]*len(years),
        'state': [state_fips]*len(years),
        'county_name': [f'{county} County, {state_abb}']*len(years),
        'extreme_fwi_percentage': extreme_fwi_percents
    })
    return df

# tested and not good
# dixie = process_fire('Butte', '06', 'CA', [7,10])
# texas = process_fire('Hutchinson', '48', 'TX', [2])


# ones to add
yellowstone = process_fire('Park', '56', 'WY', [6,9])
campfire = process_fire('Butte', '06', 'CA', [11])
bay_area = process_fire('Glenn', '06', 'CA', [8,11])
dixie_plumas = process_fire('Plumas', '06', 'CA', [7,9])

combined = pd.concat([yellowstone, campfire, bay_area, dixie_plumas])
combined.to_csv('../../data/UNSEEN/wildfire/cems/preprocessed/famous_fires.csv', index=False)



