#### Testing the Copula methodology

In [1]:
# Import libraries
import os
import xarray as xr
# Disable warnings for data download via API
import urllib3 
urllib3.disable_warnings()
# Disable xarray runtime warnings
import warnings
warnings.simplefilter("ignore", category=RuntimeWarning)
import pandas as pd
import numpy as np
from lmoments3 import distr
from scipy.stats import gumbel_r, kstest
from tqdm import tqdm

#### Prepare GLOFAS data

In [6]:
# Set up data directory
DATADIR = r"D:\projects\sovereign-risk\Thailand\data\flood\dependence\glofas"
os.makedirs(DATADIR, exist_ok=True)

start_year = 1979
end_year = 2023

def combine_datasets(start, end, dir):
    '''
    combine all datasets into one xarray for analysis
    '''
    all_files = [os.path.join(dir, f"glofas_THA_{year}.grib") for year in range(start, end+1)]
    # Load all datasets into array
    datasets = [xr.open_dataset(file, engine='cfgrib') for file in all_files]
    # Concatenate all datasets along the time dimension
    combined_dataset = xr.concat(datasets, dim='time')
    # Make sure datasets are sorted by time
    combined_dataset = combined_dataset.sortby('time')
    
    return combined_dataset

# Load glofas data and combine
glofas_data = combine_datasets(start_year, end_year, DATADIR)
glofas_data

# Reduce the Upstream area data to the domain of the river discharge
upstream_area_fname = f"uparea_glofas_v4_0.nc"
upstream_area_file = os.path.join(DATADIR, upstream_area_fname)
# Filter glofas timeseries based on upstream accumulating area
area_filter = 500

# Open the file and print the contents
upstream_area = xr.open_dataset(upstream_area_file, engine='netcdf4')

# Get the latitude and longitude limits of the data
lat_limits = [glofas_data.latitude.values[i] for i in [0, -1]]
lon_limits = [glofas_data.longitude.values[i] for i in [0, -1]]
up_lats = upstream_area.latitude.values.tolist()
up_lons = upstream_area.longitude.values.tolist()

lat_slice_index = [
    round((i-up_lats[0])/(up_lats[1]-up_lats[0]))
    for i in lat_limits
]
lon_slice_index = [
    round((i-up_lons[0])/(up_lons[1]-up_lons[0]))
    for i in lon_limits
]

# Slice upstream area to Thailand region:
red_upstream_area = upstream_area.isel(
    latitude=slice(lat_slice_index[0], lat_slice_index[1]+1),
    longitude=slice(lon_slice_index[0], lon_slice_index[1]+1),
)

# There are very minor rounding differences, so we update with the lat/lons from the glofas data
red_upstream_area = red_upstream_area.assign_coords({
    'latitude': glofas_data.latitude,
    'longitude': glofas_data.longitude,
})

# Add the upstream area to the main data object and print the updated glofas data object:
glofas_data['uparea'] = red_upstream_area['uparea']
glofas_data

# Mask the river discharge data
glofas_data_masked = glofas_data.where(glofas_data.uparea>=area_filter*1e6)

# Load the basin outlet data
basin_outlet_file = r"D:\projects\sovereign-risk\Thailand\data\flood\dependence\thailand-basins\lev06_outlets_final_clipped_Thailand_no_duplicates.csv"
basin_outlet_df = pd.read_csv(basin_outlet_file)
# Note to align the two datasets we need to make the following adjustment to lat lons (based on previous trial and error)
basin_outlet_df['Latitude'] = basin_outlet_df['Latitude'] + 0.05/2
basin_outlet_df['Longitude'] = basin_outlet_df['Longitude'] - 0.05/2

# Define function for checking timeseries
def check_timeseries(array, latitude, longitude):
    test_point = array.sel(latitude=latitude, longitude=longitude, method='nearest')
    test_timeseries = test_point['dis24']
    test_acc = float(test_point['uparea'])
    # check for NaN values
    non_nan_count = test_timeseries.count().item()
    total_count = test_timeseries.size
    nan_ratio = non_nan_count/total_count

    # Does the timeseries pass the NaN threshold
    if nan_ratio < 1:
        return False, test_acc, "NaN values found"

    # Check for constant values
    if test_timeseries.min() == test_timeseries.max():
        return False, test_acc, "Constant timeseries values"

    # If all checks pass
    return True, test_acc, "Valid timeseries"

# Loop through basins and check whether timeseries is valid
results = []
for index, row in basin_outlet_df.iterrows():
    latitude = row['Latitude']
    longitude = row['Longitude']

    valid, acc, message = check_timeseries(glofas_data_masked, latitude, longitude)

    # Store the results
    results.append({
        'HYBAS_ID': row['HYBAS_ID'],
        'Latitude': latitude,
        'Longitude': longitude,
        'Acc': acc,
        'Valid': valid,
        'Message': message
    })
    if not valid:
        print(f"ID: {row['HYBAS_ID']}, Lat: {latitude}, Lon: {longitude}, Acc: {acc}, Valid: {valid}, Message: {message}")

# over what years do we want to extract the data?
start_year = 1979
end_year = 2016
sliced_data = glofas_data_masked.sel(time=slice(str(start_year), str(end_year)))
# Dictionary to store timeseries data for each basin
basin_timeseries = {}

# Loop through basin outlets, storing each in turn
for index, row in basin_outlet_df.iterrows():
    basin_id = row['HYBAS_ID']
    lat = row['Latitude']
    lon = row['Longitude']
    point_data = sliced_data.sel(latitude=lat, longitude=lon, method='nearest')
    timeseries = point_data['dis24'].to_series()
    # store in dictionary
    basin_timeseries[basin_id] = timeseries

#### Perform EVA

In [7]:
# Dictionary to store fitted parameters for each basin
gumbel_params = {}
fit_quality = {}

# Loop through basins, calculating annual maxima and fitting Gumbel distribution using L-moments
for basin_id, timeseries in basin_timeseries.items():
    annual_maxima = timeseries.groupby(timeseries.index.year).max()

    # Fit Gumbel distribution using L-moments
    params = distr.gum.lmom_fit(annual_maxima)

    # Perform the Kolmogorov-Smirnov test (checking quality of fit)
    D, p_value = kstest(annual_maxima, 'gumbel_r', args=(params['loc'], params['scale']))

    gumbel_params[basin_id] = params
    fit_quality[basin_id] = (D, p_value)

# Will do this using the CDF of the fitted Gumbel distribution 

# Dictionary to story uniform marginals for each basin
uniform_marginals = {}

for basin_id, timeseries in basin_timeseries.items():
    annual_maxima = timeseries.groupby(timeseries.index.year).max()
    params = gumbel_params[basin_id]
    uniform_marginals[basin_id] = gumbel_r.cdf(annual_maxima, loc=params['loc'], scale=params['scale'])

basin_ids = list(uniform_marginals.keys())

#### Run Copula Analysis

In [19]:
from copulas.bivariate import Clayton
clayton_copula_models = {}
clayton_error_basins = [] # list to store basins that cause an error

for id1, margins1 in uniform_marginals.items():
    for id2, margins2 in uniform_marginals.items():
        if id1 < id2: # to avoid duplicate pairs
            try:
                # Prepare the data for copula
                data = np.column_stack((1-margins1, 1-margins2)) # interested in upper tail dependence so take inverse of CDF
                
                # Fit the Clayton copula
                flipped_clayton = Clayton()
                flipped_clayton.fit(data)
    
                # Store the copula model
                clayton_copula_models[(id1, id2)] = flipped_clayton
            except ValueError as e:
                # print(f"Error fitting Clayton copula for basins {id1} and {id2}: {e}")
                clayton_error_basins.append((id1, id2))

# Store these copula pairs in a matrix
basin_ids = list(uniform_marginals.keys()) # take the basin IDs from the uniform marginals dictionary
N = len(basin_ids)

# Initialize the matrix with NaNs
dependence_matrix = np.full((N, N), np.nan)

# Map from basin ID to matrix index
id_to_index = {basin_id: index for index, basin_id in enumerate(basin_ids)}

for (id1, id2), copula_model in clayton_copula_models.items():
    index1, index2 = id_to_index[id1], id_to_index[id2]
    dependence_matrix[index1, index2] = copula_model.theta
    dependence_matrix[index2, index1] = copula_model.theta

# For error basins do the same but set theta to -1
for (id1, id2) in clayton_error_basins:
    index1, index2 = id_to_index[id1], id_to_index[id2]
    dependence_matrix[index1, index2] = -1
    dependence_matrix[index2, index1] = -1

# Debug (for infinity values) - not sure if needed but there are a few where I had to reassign basin outlets.
dependence_matrix[np.isinf(dependence_matrix)] = 1000
                
# Step 1: Find the most dependent pair
# Initialize a set to keep track of selected basin indices
selected_indices = set()
# convert dependence_matrix to a masked array, so that NaN values and -1 are not considered in the operation
masked_dependence_matrix = np.ma.masked_less(dependence_matrix, 0) # masking out values < 0
np.fill_diagonal(masked_dependence_matrix, np.ma.masked) # we want to ignore diagonal (NaN values)
max_theta_index = np.unravel_index(np.argmax(masked_dependence_matrix, axis=None), masked_dependence_matrix.shape)
ordered_basins = [basin_ids[max_theta_index[0]], basin_ids[max_theta_index[1]]]
# Add indices to the set of selected indices
selected_indices.update([max_theta_index[0], max_theta_index[1]])

# Step 2-4: Loop until all basins are ordered
while len(ordered_basins) < len(basin_ids):
    # Step 2: Choose basin k that is dependent on both basin i, j (last two basins in ordered_basins). Minimax approach
    # Exclude already selected basins from the selection process
    potential_next_indices = [i for i in range(len(basin_ids)) if i not in selected_indices]
    # Find the indices of the last two basins in ordered_basins
    last_two_indices = [id_to_index[basin] for basin in ordered_basins[-2:]]
    # Find dependency vectors for the last two basins
    dependency_vectors = masked_dependence_matrix[last_two_indices, :]
    # Calculate the minimum dependency for each row of the vector
    min_deps = np.ma.min(dependency_vectors, axis=0)
    # Mask already selected indices
    min_deps_masked = np.ma.copy(min_deps)
    # Debug 
    # Ensure min_deps_masked.mask is an array
    if np.isscalar(min_deps_masked.mask):
        min_deps_masked.mask = np.zeros(min_deps_masked.shape, dtype=bool)
    for idx in selected_indices:
        min_deps_masked.mask[idx] = True # mask the index if it's already in selected indices
    # Step 3: Find the maximum dependency value over the minimized vector - which will be the next basin
    next_basin_index = np.ma.argmax(min_deps_masked, fill_value=-np.inf)
    # Step 4: Continue iterations until there are no more basins left to process
    # Check if all options are effectively masked
    if min_deps_masked.mask.all():
        print("No suitable next basin found. Ending process.")
        break
    next_basin = basin_ids[next_basin_index]
    ordered_basins.append(next_basin)
    selected_indices.add(next_basin_index)

#### Run risk analysis

In [28]:
def interpolate_damages(RPs, losses, sim_aep, protection_level=0.5):
    aeps = [1/i for i in RPs]
    # Ensure AEPs are in ascending order for np.interp
    aeps.sort() 
    print(losses)
    losses = losses[::-1]

    # test
    # sim_aep = 1 - sim_aep # convert extreme simulated AEPs (e.g. 0.95) to equivalent AEPs for interpolation (e.g. 0.05)

    # Interpolate based off simulated AEP
    if sim_aep >= protection_level: 
        return 0 
    else:
        interpolated_value = np.interp(sim_aep, aeps, losses)
        return interpolated_value

def basin_loss_curve(loss_df, basin_id, basin_col, rps):
    losses = {} # initialize empty dictionary to store losses and protection level
    basin_df = loss_df[loss_df[basin_col]==basin_id]
    grouped_basin_df = basin_df.groupby([basin_col, 'RP', 'Pr_L_AEP']).agg({'damages':'sum'}).reset_index()
    # # Pull unique protection levels from the grouped dataframe
    unique_protection_levels = grouped_basin_df['Pr_L_AEP'].unique()
    if len(unique_protection_levels) == 0:
        unique_protection_levels = [1]
    for i in unique_protection_levels:
        losses[i] = [grouped_basin_df.loc[(grouped_basin_df['RP'] == rp) & (grouped_basin_df['Pr_L_AEP']==i), 'damages'].sum() for rp in rps]
    return losses

In [29]:
# Load risk data
risk_data = pd.read_csv(r"D:\projects\sovereign-risk\Thailand\test\copulas\risk_basin_zonal_sum.csv")
# Add columne for annual exceedance probability
risk_data['AEP'] = 1 / risk_data['RP']
# Add a column converting current prorection level into AEP
risk_data['Pr_L_AEP'] = np.where(risk_data['Pr_L'] == 0, 0, 1 / risk_data['Pr_L']) # using numpy where avoids zero division errors
rps = [2, 5, 10, 25, 50, 100, 200, 500, 1000]
risk_data.head()

Unnamed: 0.1,Unnamed: 0,FID,GID_1,NAME,HB_L4,HB_L5,HB_L6,HB_L7,Pr_L,Add_Pr,New_Pr_L,damages,RP,AEP,Pr_L_AEP
0,0,0,THA.62_1,Si Sa Ket,4041145000.0,4051145000.0,4061140000.0,4071125000.0,16.387501,83.612499,100.0,0.0,2,0.5,0.061022
1,1,1,THA.62_1,Si Sa Ket,4041145000.0,4051145000.0,4061140000.0,4071125000.0,16.387501,83.612499,100.0,0.0,2,0.5,0.061022
2,2,2,THA.62_1,Si Sa Ket,4041145000.0,4051145000.0,4061131000.0,4071121000.0,16.387501,83.612499,100.0,0.0,2,0.5,0.061022
3,3,3,THA.62_1,Si Sa Ket,4041109000.0,4051109000.0,4061109000.0,4071109000.0,16.387501,83.612499,100.0,110669.8,2,0.5,0.061022
4,4,4,THA.62_1,Si Sa Ket,4041109000.0,4051110000.0,4061110000.0,4071111000.0,16.387501,83.612499,100.0,2922638.0,2,0.5,0.061022


In [30]:
def monte_carlo_dependence_simulation(
        loss_df, 
        rps, 
        basin_col, 
        protection_level, 
        num_years, 
        num_simulations=1000):
    '''
    Adjusted to account for urban protection
    Perform Monte Carlo simulations of yearly losses incorporating basin dependencies. This function is specifically for simulating urban flood protection

    :param loss_df: dataframe with losses from risk analysis
    :param rps: list of return periods to consider. 
    :param basin_col: name of column for basins (e.g. 'HB_L6')
    :param protection_level: what is the baseline protection level (e.g. 0.5 or 1 in 2 years)
    :param num_years: Number of years to simulate
    :param ordered_basins: List of basin IDs ordered by dependency
    :param copula_models: Dictionary holding copula model for each basin pair
    :param gumbel_params: Gumbel distribution parameters for each basin.
    :param num__simulations: Number of simulations (default is 10,000).
    :return: Dataframe of simulated national losses for each year.
    '''
    
    # To speed up the Monte-Carlo simulation we are going to pre-compute some variables
    # precompute loss-probability curves for each basin
    basin_loss_curves = {basin_id: basin_loss_curve(loss_df, basin_id, basin_col, rps) for basin_id in ordered_basins}
    # Initialize array for national losses
    national_losses_per_year = np.zeros((num_simulations, num_years))
    # Generate all random numbers in advance
    random_numbers = np.random.uniform(0, 1, (num_simulations, num_years, len(ordered_basins))).astype(np.float32)

    i = 0

    for sim in tqdm(range(num_simulations)):
        for year in range(num_years):
            # Initialize a list to store losses for each basin for the current year
            yearly_losses = []
            yearly_loss_values = []
            i += 1
            for i, basin_id in enumerate(ordered_basins):
                if i == 0:
                    # Handle first basin
                    r = random_numbers[sim, year, i]
                    loss_curve = basin_loss_curves[basin_id]
                    yearly_losses.append(r) # will store losses as AEP
                    yearly_loss_values.append(interpolate_damages(rps, loss_curve, r)) # store losses as interpolated values
                else:
                    loss_curve = basin_loss_curves[basin_id]
                    # Handle subsequent basins with dependencies
                    copula = get_copula_model(copula_models, ordered_basins[i-1], basin_id)
                    if copula is not None:
                        # Apply dependency model if theta exists
                        r = random_numbers[sim, year, i]
                        previous_loss = yearly_losses[i-1]
                        current_loss = generate_conditional_sample(previous_loss, copula.theta, r)
                        # print(copula.theta, previous_loss, r, current_loss)
                        yearly_losses.append(current_loss)
                        yearly_loss_values.append(interpolate_damages(rps, loss_curve, (1-current_loss))) # here we are inverting the random number to AEP to simulate tail dependence
                    else:
                        # Independent simulation for this basin
                        r = random_numbers[sim, year, i]
                        yearly_losses.append(r)
                        yearly_loss_values.append(interpolate_damages(rps, loss_curve, r))

            # Aggregate losses for the current year
            national_losses_per_year[simulation, year] = sum(yearly_loss_values)
            
    return pd.DataFrame(national_losses_per_year, columns=[f'Year_{i+1}' for i in range(num_years)])

monte_carlo_dependence_simulation(risk_data, rps, 'HB_L6',0.5, 100)

  0%|                                                                                         | 0/1000 [00:00<?, ?it/s]

{1: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}





TypeError: unhashable type: 'slice'