# Comparison With Other Distributions
This notebook investigates the fit of the gamma distribution to the regional
aggregate heat flow distributions from the *Random Global $R$-Disk Coverings* (RGRDCs)
of the NGHF data set (Lucazeau, 2019).

In [None]:
import json
import numpy as np
from pyproj import Proj
from plotconfig import *
from cmcrameri.cm import *
from pickle import Unpickler
from cache import cached_call
import matplotlib.pyplot as plt
from zeal2022hf import get_cm_colors
from matplotlib.patches import FancyBboxPatch
from loaducerf3 import Polygon, PolygonSelector
from pdtoolbox.gof import LillieforsTable, AndersonDarlingTable
from pdtoolbox import FrechetDistribution, GammaDistribution, InverseGammaDistribution, \
                      LogLogisticDistribution, LogNormalDistribution, \
                      NakagamiDistribution, NormalDistribution, ShiftedGompertzDistribution, \
                      WeibullDistribution
from reheatfunq.coverings import random_global_R_disk_coverings

#### Load the Data

In [None]:
# Load the intermediate heat flow selection. Axis 1 enumerates the data
# points; rows 1 and 2 are later fed to the map projection (presumably
# longitude and latitude -- TODO confirm). The file name suggests that the
# heat flow is given in mW/m^2.
hf_continental = np.load('intermediate/heat-flow-selection-mW_m2.npy')

In [None]:
# Restore the pickled geometry dictionary.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open('intermediate/02-Geometry.pickle','rb') as geometry_file:
    saf_geometry = Unpickler(geometry_file).load()

# The projection string stored with the geometry, and the projection built
# from that very string:
proj_str = saf_geometry["proj_str"]
proj_saf = Proj(proj_str)

In [None]:
# Restore the pickled buffered polygon that is later handed to the covering
# algorithm. NOTE: unpickling can execute arbitrary code -- only load files
# you trust.
with open('intermediate/03-Buffered-Poly.pickle','rb') as polygon_file:
    buffered_poly = Unpickler(polygon_file).load()

In [None]:
# Project rows 1 and 2 of the heat flow table to the working coordinate system,
# one projected point per row of `hf_xy`:
hf_xy = np.stack(proj_saf(*hf_continental[1:3,:]), axis=1)

# Start by keeping every data point, then successively drop all points that
# fall into any of the selection polygons:
mask = np.ones(hf_continental.shape[1], dtype=bool)
for poly in saf_geometry["selection_polygons_xy"]:
    # The last vertex is skipped (presumably it repeats the first one to
    # close the ring -- TODO confirm).
    inside = PolygonSelector(Polygon(*poly[:-1].T)).array_mask(hf_xy)
    mask &= ~inside

# One row per remaining data point:
hf_independent = hf_continental.T[mask]

#### All Critical Tables

In [None]:
def load_critical_tables(name):
    """
    Loads the critical tables of the Lilliefors and the Anderson-Darling
    test for one distribution.

    Arguments:
       name : Distribution part of the file name. The tables are read
              from `intermediate/A1-Critical-{name}.json`.

    Returns:
       Tuple `(lilliefors, anderson_darling)`, a LillieforsTable built
       from the first and an AndersonDarlingTable built from the second
       entry of the JSON document.
    """
    with open(f"intermediate/A1-Critical-{name}.json", 'r') as f:
        tables = json.load(f)
    return (LillieforsTable.from_json(tables[0]),
            AndersonDarlingTable.from_json(tables[1]))


# One pair of critical tables per candidate distribution:
LF, ADF = load_critical_tables("Frechet")
LG, ADG = load_critical_tables("Gamma")
LIG, ADIG = load_critical_tables("Inverse-Gamma")
LLL, ADLL = load_critical_tables("Log-Logistic")
LLN, ADLN = load_critical_tables("Log-Normal")
LNAK, ADNAK = load_critical_tables("Nakagami")
LN, ADN = load_critical_tables("Normal")
LSG, ADSG = load_critical_tables("Shifted-Gompertz")
LW, ADW = load_critical_tables("Weibull")

In [None]:
# Candidate distributions, each paired with its critical tables:
#
#    (distribution class, Lilliefors table, Anderson-Darling table)
#
# The order of this list defines the distribution index that is used in the
# result arrays and in the plots of this notebook.
DISTRIBUTIONS = [(GammaDistribution, LG, ADG),
                 (LogLogisticDistribution, LLL, ADLL),
                 (NormalDistribution, LN, ADN),
                 (FrechetDistribution, LF, ADF),
                 (InverseGammaDistribution, LIG, ADIG),
                 (LogNormalDistribution, LLN, ADLN),
                 (NakagamiDistribution,LNAK, ADNAK),
                 (ShiftedGompertzDistribution, LSG, ADSG),
                 (WeibullDistribution, LW, ADW)]

In [None]:
# Parameters that are forwarded to the random global R-disk coverings:
# Disk radius `R` (presumably in meters -- TODO confirm).
R = 80e3
# Passed as `min_points` (presumably the minimum sample size per disk -- TODO confirm).
MIN_POINTS = 10
# Passed as `dmin` (presumably a minimum distance in meters -- TODO confirm).
DMIN = 20e3

In [None]:
def bic(x, logL, k):
    """
    Computes the Bayesian information criterion

        BIC = k * ln(n) - 2 * ln(L)

    where n is the sample size.

    Arguments:
       x     : Sample points.
       logL  : Log-likelihood(s) of the model(s) to consider, evaluated
               at the maximum likelihood estimate for the data x. May
               be a number, a list or array of numbers (one per model),
               or a callable that returns the log-likelihood when called
               with x.
       k     : Number of parameters. Has to be a list or an integer,
               the latter if all models have the same number of
               parameters.

    Returns:
       The BIC of the model or, if `logL` or `k` are lists, an array
       of the BIC of each model.
    """
    n = np.size(x)
    # The original implementation called an undefined name here. Accept
    # both precomputed log-likelihoods and a log-likelihood function:
    log_likelihood = logL(x) if callable(logL) else logL
    return np.asarray(k) * np.log(n) - 2 * np.asarray(log_likelihood)

In [None]:
def analyze_distributions(data_sets, distributions, alpha=0.05, silent_fail=False):
    """
    Performs a probability distribution analysis: Enumerates
    the data set and for each perform goodness-of-fit tests
    for all the PDF models given (`distributions`).
    
    Arguments:
       data_sets     : List of data sets.
       distributions : List of tuples, each characterizing a
                       probability distribution. Each tuple should
                       have the following components:

                       (label, mle, logL, cdf, (test1, ...), nparams)

                       where `label` is the name of the distribution,
                       `mle` a maximum-likelihood estimator, `logL` the
                       distribution's log-likelihood, and `cdf` the
                       distribution's cumulative distribution function.
                       The latter three should be callable.
                       Finally, `(test1, ...)` should be an enumeration of
                       critical test table instances, i.e. LillieforsTable
                       and AndersonDarlingTable instances, and
                       `nparams` an integer quantifying the number of
                       parameters.
    """
    # Initialize the goodness-of-fit result arrays:
    M = len(distributions)
    reject = np.ones((len(data_sets), M, 2), dtype=bool)
    bic = np.zeros((len(data_sets), M))
    
    
    # Perform the analysis:
    for i,data in enumerate(data_sets):
        logLs = []
        for j in range(M):
            # Shortcut for negative data:
            if np.any(data < distributions[j][0].xinf()):
                aic[i,j] = np.inf
                reject[i,j,:] = True
                continue
                
            # Perform the tests:
            for l,test in enumerate(distributions[j][1:3]):
                try:
                    reject[i,j,l] = test.test_reject(data)
                except Exception as e:
                    if not silent_fail:
                        print("data:",data)
                        print("distribution:",distributions[j][0])
                        print("MLE:", distributions[j][0].mle(data)._params)
                        print("data size:", data.size)
                        #raise e
                        
            # MLE to establish parameters:
            bic[i,j] = distributions[j][0].mle(data).bic(data)        
    
    return reject, bic

The following code outputs hierarchical results. The returned arrays are indexable, and their indices
iterate over the regional samples (or the coverings), the distributions, and the tests.

In [None]:
def monte_carlo_regional_analysis_backend(N, R, min_points, hf_independent, exclusion_poly,
                                          distributions, dmin=DMIN, proj_str=proj_str, alpha=0.05,
                                          seed=98764655947253418677864477968794449493):
    """
    Performs a number of global analyses, for each determining a nearly-covering
    set of regional heat flow distributions and analyzing how well the
    candidate distributions describe them.

    Arguments:
       N              : Number of random global R-disk coverings to generate.
       R              : Disk radius, forwarded to the covering algorithm.
       min_points     : Forwarded to the covering algorithm as `min_points`.
       hf_independent : Heat flow data, forwarded to the covering algorithm.
       exclusion_poly : Polygon that is forwarded to the covering algorithm.
       distributions  : List of `(distribution, lilliefors, anderson_darling)`
                        tuples as expected by `analyze_distributions`.
       dmin           : Forwarded to the covering algorithm as `dmin`.
       proj_str       : Projection string, forwarded to the covering algorithm.
       alpha          : Forwarded to `analyze_distributions`.
       seed           : Entropy of the seed sequence from which one child
                        seed per covering is spawned.

    Returns:
       points            : Concatenation of the central points of all
                           coverings.
       tests             : Array of shape (K, M, 3), K being the total number
                           of regional samples and M the number of
                           distributions. The last axis contains the
                           Lilliefors rejection, the Anderson-Darling
                           rejection, and the BIC.
       rejection_rates   : Array of shape (N, M, 2) of the per-covering
                           rejection rates of the Lilliefors and the
                           Anderson-Darling test.
       gamma_params      : List of the `GammaDistribution.mle` fits of all
                           regional samples.
       all_distributions : List of all regional samples.
       all_aic           : Always an empty list. Retained so that the number
                           of return values stays unchanged.
       all_bic           : List of N arrays of BIC values, one per covering.
    """
    all_points = []
    all_lilliefors = []
    all_anderson_darling = []
    all_aic = []
    all_bic = []
    all_distributions = []
    gamma_params = []
    K = 0
    M = len(distributions)

    # One independent child seed per covering:
    seeds = np.random.SeedSequence(seed).spawn(N)
    for i in range(N):
        print("i =",i,"/",N)

        # Determine the regional distributions. Use the polygon and the
        # significance level passed by the caller rather than the notebook
        # globals / hard-coded values:
        central_points, _, hf_distributions, _, _ \
           = cached_call(random_global_R_disk_coverings, R, min_points, hf_independent,
                         exclusion_poly, proj_str, dmin=dmin, seed=seeds[i])
        K += len(central_points)

        # For reproducibility, save the distributions:
        all_distributions += hf_distributions

        # Obtain gamma fits:
        gamma_params.extend(GammaDistribution.mle(dist) for dist in hf_distributions)

        # Analyze the distributions:
        reject, bic = analyze_distributions(hf_distributions, distributions, alpha=alpha,
                                            silent_fail=False)

        # Collect the data:
        all_points.extend(central_points)
        all_lilliefors.append(reject[..., 0])
        all_anderson_darling.append(reject[..., 1])
        all_bic.append(bic)

    # NOTE(review): `all_points` holds the individual entries of each
    # `central_points`; if those entries are 1D coordinate pairs, this
    # concatenation yields a flat array -- confirm the intended shape.
    points = np.concatenate(all_points, axis=0)

    # Stack the per-covering results to one row per regional sample:
    tests = np.empty((K, M, 3))
    tests[:, :, 0] = np.concatenate(all_lilliefors, axis=0)
    tests[:, :, 1] = np.concatenate(all_anderson_darling, axis=0)
    tests[:, :, 2] = np.concatenate(all_bic, axis=0)

    # Fraction of rejected regional samples per covering and distribution:
    rejection_rates = np.empty((N, M, 2))
    rejection_rates[:, :, 0] = [ll.mean(axis=0) for ll in all_lilliefors]
    rejection_rates[:, :, 1] = [ad.mean(axis=0) for ad in all_anderson_darling]

    return points, tests, rejection_rates, gamma_params, all_distributions, all_aic, all_bic


# We call this cached function from a convenience wrapper:

def monte_carlo_regional_analysis(N, R, min_points, hf_independent, exclusion_poly, distributions, dmin, alpha):
    """
    Cached front end of `monte_carlo_regional_analysis_backend`.

    Forwards all arguments to the backend through `cached_call` and
    returns the backend's return value. The backend's `proj_str` and
    `seed` parameters are not exposed here and hence keep their
    default values.

    Arguments:
       N, R, min_points, hf_independent, exclusion_poly, distributions :
                   Passed on as positional arguments.
       dmin, alpha : Passed on as keyword arguments.
    """
    return cached_call(monte_carlo_regional_analysis_backend, N, R, min_points, hf_independent, exclusion_poly,
                       distributions, dmin=dmin, alpha=alpha)

In [None]:
# Run the (cached) Monte Carlo analysis with N=100 coverings for all candidate
# distributions, using the disk parameters defined above:
mc_points, mc_tests, mc_rejection_rates, mc_gamma_params, mc_all_distributions, mc_all_aic, mc_all_bic\
    = monte_carlo_regional_analysis(100, R, MIN_POINTS, hf_independent, buffered_poly, DISTRIBUTIONS, dmin=DMIN,
                                    alpha=0.05)

In [None]:
def compute_delta_bic(bic):
    """
    Computes the BIC margin of the selected model, that is, the
    difference between the second-lowest and the lowest BIC of
    a one-dimensional array of BIC values. NaN entries are treated
    as infinite BIC. The input array is not modified.

    If the margin itself is NaN, the BIC values of the non-selected
    models are printed for diagnosis.
    """
    values = bic.copy()
    values[np.isnan(values)] = np.inf

    # Split into the winner and the remaining models:
    best = np.argmin(values)
    others = np.delete(values, best)

    delta_bic = others.min() - values[best]
    if np.isnan(delta_bic):
        print(others)
    return delta_bic

# All BIC values stacked over the coverings: one row per regional sample,
# one column per distribution.
bic_all = np.concatenate(mc_all_bic, axis=0)
n_dist = bic_all.shape[1]

# Index of the distribution with the lowest BIC for each regional sample.
# `np.argmin` would select NaN entries, hence NaN is treated as infinite BIC,
# consistent with `compute_delta_bic`:
bic_select = [np.argmin(np.where(np.isnan(b), np.inf, b), axis=1) for b in mc_all_bic]

# BIC margin between the selected and the second-best distribution:
delta_bic = [np.array([compute_delta_bic(row) for row in b]) for b in mc_all_bic]
bic_select_all = np.concatenate(bic_select, dtype=int)
delta_bic_all = np.concatenate(delta_bic)
bic_critical_select = bic_select_all[delta_bic_all > 2]

# Difference between the BIC of the selected distribution and the BIC of each
# distribution (<= 0), collected per distribution with NaN removed.
# NOTE(review): each row also contains the selected distribution itself
# (difference 0) -- confirm that this is intended for the "if not selected"
# panel of the figure.
bic_selected = bic_all[np.arange(bic_all.shape[0]), bic_select_all]
delta_bic_nonselect = bic_selected[:,np.newaxis] - bic_all
delta_bic_nonselect = [delta_bic_nonselect[~np.isnan(delta_bic_nonselect[:,i]), i] for i in range(n_dist)]

# BIC margins grouped by the selected distribution:
delta_bic_set = [delta_bic_all[bic_select_all == i] for i in range(n_dist)]

In [None]:
# Per regional sample and distribution:
#   [..., 0] : whether the distribution was selected by the BIC,
#   [..., 1] : whether the Anderson-Darling test rejected the distribution.
# The number of distributions is taken from the test results instead of
# being hard-coded.
bic_vs_gof = np.zeros((bic_select_all.size, mc_tests.shape[1], 2), dtype=bool)
bic_vs_gof[np.arange(bic_select_all.size), bic_select_all, 0] = True
bic_vs_gof[:,:,1] = mc_tests[:,:,1]

In [None]:
# Human-readable names of the distribution classes, used for the printed
# summary and for the tick labels of the figure:
label_dict = {
    GammaDistribution : "Gamma",
    FrechetDistribution : "Fréchet",
    InverseGammaDistribution : "Inverse gamma",
    LogNormalDistribution : "Log Normal",
    LogLogisticDistribution : "Log Logistic",
    NakagamiDistribution : "Nakagami",
    NormalDistribution : "Normal",
    ShiftedGompertzDistribution : "Shifted Gompertz",
    WeibullDistribution : "Weibull"
}

Print some information about which distributions have ΔBIC > 2 when being selected (all relative to the total number of samples investigated):

In [None]:
# Total number of regional samples over all coverings:
n_samples_total = sum(len(x) for x in mc_all_bic)

# Per distribution, the number of samples in which it was selected with a
# BIC margin of at least 2:
n_critical = [np.count_nonzero(delta_bic_set[i] >= 2.0) for i in range(len(DISTRIBUTIONS))]

print("Fraction of total samples which have dBIC>2:")
for dist, count in zip(DISTRIBUTIONS, n_critical):
    print(label_dict[dist[0]],":", 100*count / n_samples_total)
print("total:",100*sum(count / n_samples_total for count in n_critical))

## Publication Plot:

In [None]:
# Colors for the publication plot, picked from the `vik` color map:
colors = get_cm_colors(vik, 7)
color0 = colors[0]  # bars of the BIC selection rate
color1 = colors[4]  # bars of the Anderson-Darling rejection rate
color2 = colors[5]  # median lines of the box plots

In [None]:
# Three-panel publication figure:
#   (a) BIC margin of each distribution when it is selected,
#   (b) BIC difference to the selected distribution,
#   (c) BIC selection rate and Anderson-Darling rejection rate.
with plt.rc_context({'axes.labelpad': 0.1, 'xtick.major.pad': 1.2, 'ytick.major.pad': 1.2}):
    fig = plt.figure(figsize=(6.975, 4.0), dpi=200)
    # ax_bg = fig.add_axes((0,0,1,1)) # Design canvas

    # One slot per distribution, in the order of DISTRIBUTIONS:
    n_dist = len(DISTRIBUTIONS)
    positions = np.arange(n_dist)
    dist_labels = [label_dict[d[0]].replace(' ','\n') for d in DISTRIBUTIONS]

    #
    # Panel (c): selection and rejection rates. The y axis is reversed so
    # that the first distribution is shown on top.
    #
    ax0 = fig.add_axes((0.70, 0.2, 0.28, 0.78))
    ax0.text(0.94, -0.4, '(c)', ha='center', va='center')
    ax0.barh(positions[::-1]+0.15, bic_vs_gof[:,:,0].mean(axis=0), height=0.3, label='BIC selection',
             color=color0)
    ax0.barh(positions[::-1]-0.15, bic_vs_gof[:,:,1].mean(axis=0), height=0.3, label='AD rejection',
             color=color1)
    ax0.set_yticks(positions[::-1])
    ax0.set_yticklabels(dist_labels)
    ax0.set_xlim(0,1.0)
    ax0.set_xticks([0,0.25, 0.5, 0.75, 1.0])
    ax0.set_xticklabels([0, 25, 50, 75, 100])
    ax0.set_xlabel('Selection rate (%)')
    ax0.legend()

    # Split the available height between the two box plot panels
    # proportionally to their y ranges:
    dy_pos = 6
    dy_neg = 40
    dy_nrm = dy_pos + dy_neg
    dy_pos_rel = dy_pos / dy_nrm
    dy_neg_rel = dy_neg / dy_nrm
    dy_tot = (0.185 + 0.61) # total axes height available in the figure

    #
    # The positive dBIC:
    #
    ax01 = fig.add_axes((0.062, 1.0 - dy_pos_rel * dy_tot - 0.011, 0.53, dy_pos_rel * dy_tot))# ax0.twinx()
    ax01.text(-0.3, 4.9, '(a)', ha='center', va='center')
    ax01.set_ylim(0, dy_pos)
    ax01.boxplot(delta_bic_set, positions=positions, patch_artist=True,
                 whis=(0, 95),
                 boxprops=dict(facecolor='w', linewidth=0.8),
                 medianprops=dict(color=color2),
                 whiskerprops=dict(linewidth=0.8),
                 capprops=dict(linewidth=0.8),
                 flierprops=dict(markersize=1.5, markeredgecolor='None',
                                 marker='s', markerfacecolor='k'))
    ax01.axhline(2.0, color='gray', linewidth=0.8, linestyle=':')
    ax01.set_ylabel('$\\Delta\\mathrm{BIC}$\nif selected', labelpad=1.3, loc='top')
    ax01.set_xticks(positions)
    ax01.set_yticks([0,2,4,6])
    ax01.set_xticklabels(dist_labels, rotation=90)


    #
    # The negative dBIC:
    #
    ax1 = fig.add_axes((0.062, 0.02, 0.53, dy_neg_rel * dy_tot))
    ax1.text(-0.2, -38.0, '(b)', ha='center', va='center')
    ax1.boxplot(delta_bic_nonselect, positions=positions, patch_artist=True,
                whis=(5, 100),
                boxprops=dict(facecolor='w', linewidth=0.8),
                medianprops=dict(color=color2),
                whiskerprops=dict(linewidth=0.8),
                capprops=dict(linewidth=0.8),
                flierprops=dict(markersize=1.5, markeredgecolor='None',
                                marker='s', markerfacecolor='k'))
    ax1.set_xlim(ax01.get_xlim())
    ax1.plot(ax01.get_xlim(), (-6, -6), zorder=0, color='gray', linewidth=0.8, linestyle=':')
    ax1.set_ylim(-40,0)
    ax1.set_ylabel('$\\Delta \\mathrm{BIC}$ if not selected', va='bottom')
    ax1.xaxis.tick_top()
    ax1.set_xticks(positions)
    ax1.set_xticklabels([])
    ax1.set_yticks((-40,-20,-6))


    # Custom Boxplot legend: a box plot of evenly spaced values from 0 to 100
    # illustrates which quantiles the box plot elements correspond to.
    ax1.add_patch(FancyBboxPatch((4.5, -39), 3.9, 10, facecolor='none', edgecolor=[0.8]*3,
                                 boxstyle="round,pad=0,rounding_size=0.2"))
    ax2 = fig.add_axes((0.365, 0.119, 0.2, 0.07))
    ax2.set_xlim(0,100)
    ax2.set_ylim(0.9, 1.1)
    xbp = np.linspace(0,100,41)
    ax2.boxplot(xbp, vert=False, patch_artist=True,
                whis=(5, 100),
                boxprops=dict(facecolor='w', linewidth=0.8),
                medianprops=dict(color=color2),
                whiskerprops=dict(linewidth=0.8, clip_on=False),
                capprops=dict(linewidth=0.8, clip_on=False),
                flierprops=dict(markersize=1.5, markeredgecolor='None',
                                marker='s', markerfacecolor='k', clip_on=False))
    ax2.set_yticks([])
    ax2.set_xticks([0, 5, 25, 50, 75, 100])
    ax2.set_xticklabels(["0","5","25","50","75","100"], fontsize='small')
    ax2.spines.right.set_visible(False)
    ax2.spines.top.set_visible(False)
    ax2.spines.left.set_visible(False)
    # Escaped backslash: '\D' is an invalid escape sequence in a regular
    # string literal. The resulting label text is unchanged.
    ax2.set_xlabel('$\\Delta$BIC Quantile (%)')

    fig.savefig('figures/A6-Comparison-Various-Distributions-BIC-GOF.pdf')

## References:
> Lucazeau, F. (2019). Analysis and mapping of an updated terrestrial heat
>    flow data set. Geochemistry, Geophysics, Geosystems, 20, 4001– 4024.
>    https://doi.org/10.1029/2019GC008389

### License
```
A notebook to perform model selection for regional aggregate heat flow
among various univariate probability distributions.

This file is part of the REHEATFUNQ model.

Author: Malte J. Ziebarth (ziebarth@gfz-potsdam.de)

Copyright © 2019-2022 Deutsches GeoForschungsZentrum Potsdam,
            2022 Malte J. Ziebarth
            

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <https://www.gnu.org/licenses/>.
```