# Dependent Sample Generation

In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from matplotlib.pylab import plt
from scipy.stats import t, chi2

from prisk.analysis_functions import (
    combine_glofas, 
    extract_discharge_timeseries, 
    fit_gumbel_distribution, 
    calculate_uniform_marginals)

In [2]:
# Directory holding the local GloFAS files; it is handed to `combine_glofas` below.
# The location is machine-specific, so it can be overridden with the GLOFAS_DIR
# environment variable. When the variable is unset (or empty) the original
# hard-coded location is used, so existing setups keep working unchanged.
_default_glofas_dir = "/Users/rubenkerkhofs/Desktop/glofas/"
# Joining with "" guarantees a trailing path separator, matching the format of the
# original value even when the override is given without one.
glofas_dir = os.path.join(os.environ.get("GLOFAS_DIR") or _default_glofas_dir, "")

# Basin outlet points (CSV), read straight from the project's public bucket.
basin_outlet_file = "https://kuleuven-prisk.s3.eu-central-1.amazonaws.com/lev06_outlets_final_clipped_Thailand_no_duplicates.csv"

In a first step, we obtain discharge data for the basins:

In [3]:
# --- Step 1: GloFAS discharge and upstream-area data --------------------------
# The discharge data used for producing the GIRI maps spans 1979-2016, so the same
# window is loaded here.
start_year, end_year = 1979, 2016
area_filter = 500  # km^2; rivers with a smaller upstream area are not considered
glofas_data = combine_glofas(start_year, end_year, glofas_dir, area_filter)

# --- Step 2: basin outlets and their discharge timeseries ---------------------
# The outlet coordinates are shifted by 0.05/2 (added to the latitude, subtracted
# from the longitude) to align them with the GloFAS data; this adjustment was found
# by trial and error.
half_step = 0.05 / 2
basin_outlets = pd.read_csv(basin_outlet_file).assign(
    Latitude=lambda outlets: outlets['Latitude'] + half_step,
    Longitude=lambda outlets: outlets['Longitude'] - half_step,
)

# Extract the discharge timeseries for each basin outlet.
# NOTE(review): no validity checks are performed in this cell itself; presumably
# they happen inside `extract_discharge_timeseries` - confirm.
basin_timeseries = extract_discharge_timeseries(basin_outlets, glofas_data)

Once the time series are obtained, we fit a Gumbel distribution to each individual basin:

In [4]:
# Fit a Gumbel distribution to the discharge timeseries of each basin.
# NOTE(review): `fit_quality` is presumably a goodness-of-fit summary per basin; it is
# not used in any of the visible cells - confirm whether it should be inspected.
gumbel_params, fit_quality = fit_gumbel_distribution(basin_timeseries)

Once the Gumbel distributions are fitted, we compute the uniform marginals:

In [5]:
# Convert the basin timeseries to uniform marginals using the fitted Gumbel parameters.
# presumably this applies each basin's fitted Gumbel CDF (probability integral
# transform) - confirm in prisk.analysis_functions
uniform_marginals = calculate_uniform_marginals(basin_timeseries, gumbel_params)

These uniform marginals are used to estimate the dependency structure between basins.

### Gaussian Copula
The Gaussian copula only requires the correlation matrix as an input parameter. We use the `GaussianMultivariate` object of the `copulas` package to estimate and sample from this copula. Note that `GaussianMultivariate` also estimates the univariate (marginal) distributions; however, our data have already been transformed to uniform marginals. For that reason, we fix every marginal to the standard uniform distribution (`loc=0`, `scale=1`) instead of letting it be re-estimated.

In [6]:
from copulas.multivariate import GaussianMultivariate
from copulas.univariate import UniformUnivariate

class UniformUnivariateFixed(UniformUnivariate):
    """Uniform marginal whose parameters are pinned to ``loc=0`` and ``scale=1``.

    Both fitting hooks are overridden so that the stored parameters never depend
    on the data passed in.
    """

    @staticmethod
    def _standard_uniform_params():
        """Return a fresh parameter dict with ``loc=0`` and ``scale=1``."""
        return {'loc': 0, 'scale': 1}

    def _fit_constant(self, X):
        """Store the fixed parameters; ``X`` is accepted but ignored."""
        self._params = self._standard_uniform_params()

    def _fit(self, X):
        """Store the fixed parameters; ``X`` is accepted but ignored."""
        self._params = self._standard_uniform_params()

# Collect the uniform marginals in a single frame; the columns of this frame are
# reused as labels for the samples generated further down.
data = pd.DataFrame(data=uniform_marginals)

# Despite its name, the `distribution` argument selects the model used for the
# marginals. Passing the fixed uniform class keeps every marginal at loc=0, scale=1.
copula = GaussianMultivariate(distribution=UniformUnivariateFixed)
copula.fit(data)

Then, the sample function can be used to obtain samples:

In [7]:
# Draw 10 joint samples from the fitted Gaussian copula and display them as the cell
# output (one row per draw, one column per basin).
# NOTE(review): no random seed is set in the visible cells, so the displayed values
# presumably differ between runs.
copula.sample(10)

Unnamed: 0,4060020990,4060019720,4060021260,4060021270,4060021320,4060019710,4060019420,4060021330,4060034180,4060019410,...,4061043000,4061041330,4061042900,4061040840,4061040790,4061041310,4061031450,4061031350,4061028990,4061028980
0,0.63649,0.988095,0.691865,0.838977,0.7454,0.940065,0.670337,0.456904,0.284302,0.278923,...,0.470945,0.344917,0.114695,0.193549,0.158586,0.133049,0.374829,0.166754,0.675015,0.485134
1,0.19928,0.531631,0.620359,0.662151,0.48934,0.564834,0.543732,0.078096,0.252351,0.158217,...,0.562068,0.545671,0.569693,0.711798,0.545718,0.549002,0.329975,0.570385,0.561057,0.639682
2,0.839681,0.395206,0.436182,0.401196,0.746892,0.612521,0.454982,0.840294,0.906488,0.392815,...,0.84014,0.064739,0.107914,0.347859,0.20413,0.160542,0.209342,0.224893,0.222524,0.303005
3,0.429144,0.35101,0.664871,0.05698,0.203546,0.438759,0.892694,0.315993,0.282647,0.498687,...,0.973934,0.918804,0.978862,0.952325,0.960676,0.974442,0.937931,0.935753,0.95227,0.964497
4,0.539915,0.33988,0.622111,0.313087,0.538497,0.361496,0.563959,0.858645,0.935492,0.896559,...,0.79659,0.903506,0.796039,0.322777,0.789507,0.782663,0.374254,0.812476,0.649138,0.22949
5,0.44912,0.606481,0.887919,0.459109,0.318746,0.449202,0.822613,0.106147,0.115349,0.284914,...,0.776838,0.659842,0.583251,0.833159,0.623485,0.610177,0.559023,0.634546,0.427989,0.607511
6,0.148687,0.160345,0.258517,0.190573,0.130086,0.281306,0.447597,0.246791,0.709867,0.112892,...,0.478282,0.638555,0.941552,0.64758,0.942997,0.94977,0.681794,0.932327,0.645254,0.901401
7,0.002482,0.014733,0.128656,0.06169,0.162172,0.03064,0.378706,0.273169,0.05144,0.500056,...,0.023168,0.365648,0.360027,0.446225,0.397558,0.3311,0.113627,0.466342,0.321224,0.663658
8,0.47645,0.598461,0.733198,0.760574,0.901954,0.854176,0.709876,0.677009,0.388889,0.447368,...,0.627236,0.210273,0.172686,0.394257,0.256823,0.194959,0.162208,0.286553,0.193514,0.874019
9,0.583935,0.694873,0.931833,0.936042,0.990597,0.697422,0.989171,0.965129,0.991116,0.978839,...,0.13528,0.573603,0.302156,0.337932,0.285647,0.285754,0.115016,0.299243,0.358565,0.241627


### T-Copula
The T-copula requires two model inputs: (1) the correlation matrix, and (2) the degrees of freedom. In this case, we set the degrees of freedom equal to 3. Samples of the T-Copula are obtained as follows:


In [8]:
n_samples = 5
t_copula_dof = 3      # degrees of freedom; shared by the chi-square draw, the scaling and the t CDF
t_copula_seed = 2024  # fixed seed so the cell produces the same draws on every run

# A single generator drives both random draws, so one seed controls the whole cell.
t_copula_rng = np.random.default_rng(t_copula_seed)

# Pairwise (Pearson) correlation between the columns of the uniform marginals.
# NOTE(review): a t-copula correlation matrix is often estimated from Kendall's tau or
# from transformed scores rather than from the uniforms directly - confirm this choice.
corr_matrix = data.corr().values
mu = np.zeros(len(corr_matrix))

# Multivariate t construction: X = Z * sqrt(nu / S) with Z ~ N(0, R) and S ~ chi2(nu).
# `s` is reshaped to a column so that every sample row of Z is scaled by its own draw.
s = chi2.rvs(df=t_copula_dof, size=n_samples, random_state=t_copula_rng)[:, np.newaxis]
Z = t_copula_rng.multivariate_normal(mu, corr_matrix, n_samples)
X = np.sqrt(t_copula_dof / s) * Z

# Pushing each coordinate through the univariate t CDF yields values in [0, 1].
U = t.cdf(X, df=t_copula_dof)

t_samples = pd.DataFrame(U, columns=data.columns)

### Vine Copula
The vine copula is estimated using the vinecopulas package. The estimated parameters are pickled.

In [9]:
from vinecopulas.vinecopula import fit_vinecop

# Fit a vine copula to the uniform marginals. `data.values` passes a plain array, so
# the fit output refers to variables by integer position rather than by basin id.
# NOTE(review): `copsi=[8]` presumably restricts the candidate pair-copula families to
# a single family id; the printed fit shows every edge as Clayton180 - confirm that
# 8 maps to Clayton180 in vinecopulas.
# presumably M is the vine structure matrix, P the pair-copula parameters and C the
# pair-copula families - confirm against the vinecopulas documentation
M, P, C = fit_vinecop(data.values, copsi=[8])

** Tree:  1
0,5  --->  Clayton180 : parameters =  1.0000063115605533
1,5  --->  Clayton180 : parameters =  2.5477230062564127
5,6  --->  Clayton180 : parameters =  1.0000063115605533
3,2  --->  Clayton180 : parameters =  2.254895231979731
2,6  --->  Clayton180 : parameters =  1.7852770881311841
6,4  --->  Clayton180 : parameters =  1.0577130637922114
24,16  --->  Clayton180 : parameters =  1.0000063115605533
16,15  --->  Clayton180 : parameters =  1.0000063115605533
15,13  --->  Clayton180 : parameters =  1.0000063115605533
4,8  --->  Clayton180 : parameters =  1.0000063115605533
13,10  --->  Clayton180 : parameters =  1.5739396571395023
36,35  --->  Clayton180 : parameters =  2.425712651471719
8,7  --->  Clayton180 : parameters =  2.7777431484774184
11,9  --->  Clayton180 : parameters =  1.592478286578292
7,10  --->  Clayton180 : parameters =  1.4144383309048647
30,32  --->  Clayton180 : parameters =  2.6848059177687404
32,29  --->  Clayton180 : parameters =  1.3541947529548404
35,45 

In [11]:
from vinecopulas.vinecopula import sample_vinecop
from prisk.analysis_functions import calculate_basin_copula_pairs, minimax_ordering

# NOTE(review): the original remark here states that the vine parameters were estimated
# on an *ordered* version of the basin data. In the visible cells, however, M/P/C are
# fitted on `data.values`, and `ordered_basins` below is computed but never used - the
# samples are labelled with `data.columns`. Confirm which column order matches M/P/C.
(
    clayton_copula_models,
    clayton_error_basins,
    dependence_matrix,
) = calculate_basin_copula_pairs(uniform_marginals)
basin_ids = [*uniform_marginals.keys()]
ordered_basins = minimax_ordering(dependence_matrix, basin_ids)

# Draw 500 joint samples from the fitted vine copula and display them as the cell
# output, labelled with the columns of `data`.
vine_draws = sample_vinecop(M, P, C, 500)
pd.DataFrame(vine_draws, columns=data.columns)