In [7]:
import pandas as pd
import numpy as np
import xarray as xr
import cdsapi 

# viz
import matplotlib.pyplot as plt
import cartopy
import cartopy.crs as ccrs
from cartopy.io import shapereader
import plotly.graph_objects as go
import plotly.express as px
import folium
import seaborn as sns

import glob
import sys
import os

from scipy.interpolate import griddata
from numpy.polynomial.polynomial import Polynomial

ModuleNotFoundError: No module named 'folium'

In [8]:
import geopandas as gpd
import rioxarray
from shapely.geometry import mapping

# functions for state masking

def get_shape_us(shapefile, state_abbreviation):
    shapefile = shapefile.to_crs("EPSG:4326")
    state_shape = shapefile[shapefile.STUSPS == state_abbreviation].geometry
    return state_shape

def mask_xr_dataset(xr_data, shape):
    xr_data = xr_data.assign_coords(longitude=(((xr_data.longitude + 180) % 360) - 180)).sortby('longitude')
    xr_data.rio.write_crs("EPSG:4326", inplace=True)
    xr_data_masked = xr_data.rio.clip(shape.geometry.apply(mapping), shape.crs)
    return xr_data_masked

In [3]:
from numpy.polynomial import Polynomial

# function to add trend line

def fit_polynomial(df):
    # add trend line
    p = Polynomial.fit(df.index, df['extreme_frequency'], 1)
    print('Slope: ', p.convert().coef[1])
    x_values = np.linspace(df.index.min(), df.index.max(), len(df.index))
    y_values = p(x_values)
    df['trend'] = y_values
    df = df[['year', 'extreme_frequency', 'trend']]
    return df

In [12]:
################################################
# MDK Final Frequency Data preprocessing
################################################

ERA5 = xr.open_mfdataset('../../../data/UNSEEN/hurricane/europe/ERA5_hourly_Jan_21_update/ERA5_????.nc', combine='by_coords')
LSMask = xr.open_dataset('../../../data/UNSEEN/hurricane/lsm_1279l4_0.1x0.1.grb_v4_unpack.nc')
# print("ERA5 Coordinates:")
# print(ERA5.coords)

# print("LSMask Coordinates:")
# print(LSMask.coords)

# LSMask_interp = LSMask.interp(latitude=ERA5.latitude, longitude=ERA5.longitude, method='nearest')

# # Apply the mask to the ERA5 data
# ERA5_land = ERA5.where(LSMask_interp.lsm > 0.5)

# # Print the resulting dataset
# print(ERA5['tp'].values)
# print(ERA5_land['tp'].values)

LSMask['longitude'] = (((LSMask['longitude'] + 180) % 360) - 180)
LSMask_interp = LSMask.interp(latitude=ERA5.latitude, longitude=ERA5.longitude, method='nearest')

ERA5_land = (ERA5.where(LSMask_interp.lsm > 0.5))
print(ERA5_land)
# # Check the unique values in the interpolated LSMask
# unique_lsm_values = np.unique(LSMask_interp['lsm'].values)
# print("Unique values in the interpolated LSMask:", unique_lsm_values)

# # Check the unique values in the masked ERA5 data
# unique_era5_land_values = np.unique(ERA5_land['tp'].values)
# print("Unique values in the masked ERA5 data:", unique_era5_land_values)

<xarray.Dataset> Size: 10GB
Dimensions:     (valid_time: 253896, latitude: 88, longitude: 113, time: 1)
Coordinates:
    number      int64 8B 0
  * valid_time  (valid_time) datetime64[ns] 2MB 1994-01-01 ... 2024-12-31T23:...
  * latitude    (latitude) float64 704B 58.35 58.1 57.85 ... 37.1 36.85 36.6
  * longitude   (longitude) float64 904B -11.8 -11.55 -11.3 ... 15.7 15.95 16.2
    expver      (valid_time) <U4 4MB dask.array<chunksize=(8184,), meta=np.ndarray>
  * time        (time) datetime64[ns] 8B 2013-11-29
Data variables:
    tp          (valid_time, latitude, longitude, time) float32 10GB dask.array<chunksize=(2728, 30, 38, 1), meta=np.ndarray>
Attributes:
    GRIB_centre:             ecmf
    GRIB_centreDescription:  European Centre for Medium-Range Weather Forecasts
    GRIB_subCentre:          0
    Conventions:             CF-1.7
    institution:             European Centre for Medium-Range Weather Forecasts
    history:                 2025-01-21T21:48 GRIB to CDM+CF via cf

In [13]:
area_weights = np.cos(np.deg2rad(ERA5_land.latitude))
area_weights = area_weights.expand_dims({'longitude': ERA5_land.longitude})

ERA5_land['tp'] = ERA5_land['tp'] * area_weights

ERA5_coarsened = ERA5_land.coarsen(latitude=3, longitude=3, boundary='trim').sum()
ERA5_agg = ERA5_coarsened.resample(time='1D').sum()

# ERA5_agg_df = ERA5_agg.to_dataframe().reset_index()
# ERA5_agg_df = ERA5_agg_df.loc[ERA5_agg_df['time'].dt.month.between(6, 10)]
# ERA5_agg_df = ERA5_agg_df[['time', 'tp']]
# ERA5_agg_df.to_csv('../../../data/UNSEEN/hurricane/europe/ERA5_1d_raw.csv', index=False)


In [14]:

# data preprocessing for data viz of 999th percentile count data

ERA5_agg = ERA5_agg.chunk({'valid_time': -1})
extreme_precipitation_threshold = ERA5_agg.sel(valid_time=slice('2000', '2010')).tp.quantile(.999, dim='valid_time')
count_above_threshold = xr.where(ERA5_agg['tp'] > extreme_precipitation_threshold, 1, 0)
ERA5_agg_annual = count_above_threshold.resample(valid_time='YE').sum()
ERA5_agg_annual_total = ERA5_agg_annual.sum(dim=['latitude', 'longitude'])
print(ERA5_agg_annual_total)
ERA5_extreme_frequency_df = ERA5_agg_annual_total.to_dataframe(name='extreme_frequency').reset_index()
ERA5_extreme_frequency_df['year'] = ERA5_extreme_frequency_df['valid_time'].dt.year
# plot_df = fit_polynomial(ERA5_extreme_frequency_df)
ERA5_extreme_frequency_df.to_csv('../../../data/UNSEEN/hurricane/europe/ERA5_999_counts_all_months.csv', index=False)

<xarray.DataArray 'tp' (time: 1, valid_time: 31)> Size: 248B
dask.array<sum-aggregate, shape=(1, 31), dtype=int64, chunksize=(1, 1), chunktype=numpy.ndarray>
Coordinates:
    number      int64 8B 0
  * time        (time) datetime64[ns] 8B 2013-11-29
    quantile    float64 8B 0.999
  * valid_time  (valid_time) datetime64[ns] 248B 1994-12-31 ... 2024-12-31


In [22]:
import xarray as xr

# Assuming ERA5_agg is already defined and loaded

# Chunk the data
ERA5_agg = ERA5_agg.chunk({'time': -1})

# Calculate the extreme precipitation threshold
extreme_precipitation_threshold = ERA5_agg.sel(time=slice('2000', '2010')).tp.quantile(.999, dim='time')

# Count occurrences above the threshold
count_above_threshold = xr.where(ERA5_agg['tp'] > extreme_precipitation_threshold, 1, 0)

# Resample to annual counts
ERA5_agg_annual = count_above_threshold.resample(time='YE').sum()

# Sum over latitude and longitude to get total annual counts
ERA5_agg_annual_total = ERA5_agg_annual.sum(dim=['latitude', 'longitude'])

# Ensure the DataArray is reduced to a single dimension and compute the result
ERA5_agg_annual_total = ERA5_agg_annual_total.compute()

# Print the resulting DataArray
print(ERA5_agg_annual_total)

# Convert to DataFrame
ERA5_extreme_frequency_df = ERA5_agg_annual_total.to_dataframe(name='extreme_frequency').reset_index()

# Print the DataFrame
print(ERA5_extreme_frequency_df)

TypeError: only integer scalar arrays can be converted to a scalar index

In [24]:
# data preprocessing for state level frequency analysis for damages

state_abbr = 'MS'

def count_extreme_rainfall_events(xr_data, percentile):
    era5_5d = xr_data.resample(time='1D').sum()
    era5_5d = era5_5d.chunk({'time': -1})
    extreme_precipitation_treshold = era5_5d.sel(time=slice(None, '2000')).tp.quantile(percentile, dim=['time'])
    count_above_threshold = xr.where(era5_5d['tp'] > extreme_precipitation_treshold, 1, 0)
    count_above_threshold = count_above_threshold.sel(time=count_above_threshold['time'].dt.month.isin([6, 7, 8, 9, 10]))
    count_above_threshold_df = count_above_threshold.to_dataframe().reset_index()
    return count_above_threshold_df

ERA5 = xr.open_mfdataset('../../../data/UNSEEN/hurricane/ERA5_hourly/ERA5_????.nc', combine='by_coords')
# lat_era5 = ERA5.latitude.values
# lon_era5 = ERA5.longitude.values

# LSMask = xr.open_dataset('../../../data/UNSEEN/hurricane/ERA5_land_sea_mask_new.nc')
# LSMask['longitude'] = (((LSMask['longitude'] + 180) % 360) - 180)
# LSMask_interp = LSMask.interp(latitude=lat_era5, longitude=lon_era5)

# uncomment below lines to filter for a specific state

us_shapefile = gpd.read_file('../../../data/shapefiles/us/tl_2023_us_state.shp')
shape = get_shape_us(us_shapefile, state_abbr)
filtered_era5 = mask_xr_dataset(ERA5, shape)

# ERA5_land = (
#     ERA5.where(LSMask_interp['lsm'].sel(time = '1981').squeeze('time') > 0.5)
# )

# area_weights = np.cos(np.deg2rad(filtered_era5.latitude))
# area_weights = area_weights.expand_dims({'longitude': filtered_era5.longitude})

#filtered_era5['tp'] = filtered_era5['tp'] * area_weights

ERA5_coarsened = filtered_era5.coarsen(latitude=3, longitude=3, boundary='trim').sum()
ERA5_extreme_count = count_extreme_rainfall_events(ERA5_coarsened, 0.999)
ERA5_extreme_count['year'] = ERA5_extreme_count['time'].dt.year
ERA5_extreme_count_grouped = ERA5_extreme_count.groupby('year')['tp'].sum().reset_index(name='tp')
print(ERA5_extreme_count_grouped.head())
ERA5_extreme_count_grouped.to_csv(f'../../../data/UNSEEN/hurricane/rainfall_preprocessed/states/ERA5_999_counts_{state_abbr}.csv', index=False)

   year  tp
0  1981   0
1  1982   1
2  1983   0
3  1984   3
4  1985   6
