### Preparation Notebook - Copula Flood Dependence


This notebook estimates the dependence structure between river basins by fitting a t-copula to GloFAS discharge data.

In [1]:
# Import live code changes in
%load_ext autoreload
%autoreload 2

import os
from pathlib import Path
import pandas as pd
import numpy as np
from scipy.stats import t, chi2

from sovereign.flood import combine_glofas, extract_discharge_timeseries, fit_gumbel_distribution, calculate_uniform_marginals

import warnings
warnings.filterwarnings("ignore")

##### 1. User Config

In [2]:
# Parameters controlling the dependence analysis
start_year, end_year = 1979, 2016  # first and last year of the discharge data
area_filter = 500  # minimum upstream area in km^2; rivers below this are not considered
n_samples = 100_000  # number of copula samples (used in the subsequent Monte Carlo simulation)

##### 2. Set filepaths

In [3]:
# Project root is taken to be two levels above the current working directory
root = Path.cwd().parent.parent

# Shared folder prefixes (os.path.join keeps every path a plain string)
dependence_inputs = os.path.join(root, 'inputs', 'flood', 'dependence')
dependence_outputs = os.path.join(root, 'outputs', 'flood', 'dependence')

glofas_path = os.path.join(dependence_inputs, 'glofas')
# Lat/lon points located at the basin outlets
basin_outlets_path = os.path.join(dependence_inputs, 'basin_outlets_match.csv')
# Destination file for the generated copula samples
copula_samples_path = os.path.join(dependence_outputs, 'copulas', 'copula_random_numbers.gzip')

##### 3. Extract Discharge Data

In [4]:
# GloFAS river discharge together with the upstream accumulating area data
glofas_data = combine_glofas(start_year, end_year, glofas_path, area_filter)

# Basin outlet points. Their coordinates are nudged so that the outlets line up
# with the GloFAS data; the offset was found by previous trial and error.
coord_shift = 0.05 / 2
basin_outlets = pd.read_csv(basin_outlets_path).assign(
    Latitude=lambda outlets: outlets['Latitude'] + coord_shift,
    Longitude=lambda outlets: outlets['Longitude'] - coord_shift,
)

# Discharge timeseries at each basin outlet
basin_timeseries = extract_discharge_timeseries(basin_outlets, glofas_data)

##### 4. Fit Gumbel Distribution and Compute Uniform Marginals

In [5]:
# Gumbel fit (on annual maxima) followed by the transform to uniform marginals
gumbel_params, fit_quality = fit_gumbel_distribution(basin_timeseries)
uniform_marginals = calculate_uniform_marginals(basin_timeseries, gumbel_params)

# Group the L6 basin marginals under their L3 (major) river basin.
# Major river basins are assumed independent of each other; there are 5 L3 basins in total.
marginals = pd.DataFrame(uniform_marginals)
l3_basins = basin_outlets.HYBAS_ID_L3.unique()
l3_data = {
    l3_id: marginals[
        list(basin_outlets.loc[basin_outlets.HYBAS_ID_L3 == l3_id, 'HYBAS_ID_L6'].unique())
    ]
    for l3_id in l3_basins
}

##### 5. Fit T-Copula and generate samples

In [12]:
# Sampling settings for the t-copula
copula_df = 3     # degrees of freedom of the t-copula
random_seed = 42  # fixed seed so that Restart & Run All reproduces the same samples
rng = np.random.default_rng(random_seed)

t_samples = {}

for basin, data in l3_data.items():
    # Correlation between the uniform marginals of the L6 basins in this L3 basin.
    # NOTE(review): warnings are silenced at the top of the notebook, so a
    # non-positive-semidefinite matrix would pass through here unnoticed.
    corr_matrix = data.corr().values
    n_dims = len(corr_matrix)

    # Multivariate t draw: correlated normals scaled by sqrt(df / chi-square(df))
    chi2_draws = chi2.rvs(df=copula_df, size=n_samples, random_state=rng)[:, np.newaxis]
    normal_draws = rng.multivariate_normal(np.zeros(n_dims), corr_matrix, n_samples)
    t_draws = np.sqrt(copula_df / chi2_draws) * normal_draws

    # Push every margin through the t CDF to obtain uniform copula samples
    uniform_draws = t.cdf(t_draws, df=copula_df)
    t_samples[basin] = pd.DataFrame(uniform_draws, columns=data.columns)

# Join all basins column-wise in a single concat instead of growing a frame in a loop;
# an empty input still yields an empty DataFrame.
generated_samples = (
    pd.concat(list(t_samples.values()), axis=1) if t_samples else pd.DataFrame()
)

In [15]:
# Write the samples to disk, creating the output folder first if it is missing
output_dir = Path(copula_samples_path).parent
output_dir.mkdir(parents=True, exist_ok=True)
generated_samples.to_parquet(copula_samples_path, index=False, compression='gzip')