# Gamma Distribution Landscape
This notebook applies the point-of-interest (POI) sampling algorithm to a
synthetic, spatially correlated gamma-distributed heat flow landscape and
investigates how REHEATFUNQ's minimum distance criterion handles the
resulting clustering.

In [None]:
%load_ext cython

In [None]:
import numpy as np
from plotconfig import *
from cmcrameri.cm import *
from pickle import Pickler
from pdtoolbox import gamma_cdf, gamma_pdf, gamma_mle
import matplotlib.pyplot as plt
from scipy.spatial import KDTree
from zeal2022hf import get_cm_colors
from scipy.spatial.distance import pdist, squareform
from matplotlib.patches import Circle
from joblib import Memory
from reheatfunq.coverings.poisampling import generate_point_of_interest_sampling
# joblib memoization used via `@cache.cache` for the expensive functions
# below; the store lives in the `.cache` directory.
cache = Memory('.cache')

In [None]:
# Parameters of the gamma distribution of the synthetic heat flow values:
# K is passed to `rng.gamma` as the shape and THETA multiplies the draws,
# i.e. it acts as the scale.
K = 10.0
THETA = 8.0

In [None]:
# Fixed seed: everything drawn from `rng` further down depends on the order
# in which the cells are executed, so run the notebook top to bottom.
rng = np.random.default_rng(839782973432)
# Edge length of the square grid.
n = 200
# Spatially uncorrelated n x n field of gamma(K, THETA) samples.
q = THETA*rng.gamma(K, size=(n,n))

In [None]:
%%cython
import numpy as np
from libc.stdint cimport uint32_t, uint8_t
from cpython.pycapsule cimport PyCapsule_IsValid, PyCapsule_GetPointer
from numpy.random cimport bitgen_t
cimport cython

# Local cost of placing the value `q` at grid cell (i,j): the sum of squared
# differences between `q` and the values of `qc` at all stencil offsets
# around (i,j). Indices wrap around periodically. Both axes are wrapped with
# n = qc.shape[1], so `qc` is expected to be square.
@cython.boundscheck(False)
cdef double cost(double q, const double[:,::1] qc, size_t i, size_t j, const long[:,::1] stencil) nogil:
    cdef size_t n = qc.shape[1]
    cdef double cost = 0
    cdef long k,ik,jk
    for k in range(stencil.shape[0]):
        # Row index of the k-th neighbor, wrapped into [0, n).
        # NOTE(review): the single `+= n` only covers offsets down to -n;
        # confirm that the stencil radius is always smaller than n.
        ik = (<long>i) + stencil[k,0]
        if ik < 0:
            ik += n
        ik = ik % n
        # Column index of the k-th neighbor, wrapped the same way.
        jk = (<long>j) + stencil[k,1]
        if jk < 0:
            jk += n
        jk = jk % n
        cost += (q - qc[ik,jk])**2
    return cost

# This function is taken from the NumPy "extending.pyx" example.
# It draws an integer from the closed interval [lb, ub] by masked rejection
# sampling: random 32-bit words are reduced to the smallest all-ones bit mask
# covering `ub - lb` and redrawn until the value does not exceed `ub - lb`.
cdef uint32_t bounded_uint(uint32_t lb, uint32_t ub, bitgen_t *rng) nogil:
    cdef uint32_t mask, delta, val
    mask = delta = ub - lb
    # Smear the highest set bit of `delta` into all lower bits.
    mask |= mask >> 1
    mask |= mask >> 2
    mask |= mask >> 4
    mask |= mask >> 8
    mask |= mask >> 16

    val = rng.next_uint32(rng.state) & mask
    # Reject values outside of [0, delta].
    while val > delta:
        val = rng.next_uint32(rng.state) & mask

    return lb + val

# Greedy minimum-distance thinning of a point set. The points are visited in
# a random order drawn from `rng`; each point that is still selected
# deselects every point later in that order that lies within `dmin`
# (inclusive) of it. Returns a boolean NumPy array with one entry per row of
# `xy` that is True for the retained points.
@cython.boundscheck(False)
def dmin_mask(const double[:,::1] xy, double dmin, rng):
    assert xy.shape[1] == 2
    # Random visiting order; this consumes random numbers from `rng`.
    cdef const long[::1] permutation = rng.permutation(np.arange(xy.shape[0]))
    cdef size_t i,j,k,l
    cdef uint8_t[::1] mask = np.ones(xy.shape[0], dtype=np.uint8)
    # Compare squared distances so that no square root is needed.
    cdef double dmin2 = dmin * dmin
    with nogil:
        for i in range(xy.shape[0]):
            k = permutation[i]
            # A point that has already been removed cannot remove others.
            if not mask[k]:
                continue
            for j in range(i+1,xy.shape[0]):
                l = permutation[j]
                if (xy[k,0]-xy[l,0])**2 + (xy[k,1] - xy[l,1])**2 <= dmin2:
                    mask[l] = False
    
    return mask.base.astype(bool)


# Flags the query points that keep the minimum distance to a source point
# set. Returns a boolean NumPy array with one entry per row of `xy_query`.
# Note the polarity: an entry is True if the query point is farther than
# `dmin` from every row of `xy_src`, and False as soon as one source point
# lies within `dmin` (inclusive).
@cython.boundscheck(False)
def within_dmin(const double[:,::1] xy_src, const double[:,::] xy_query, double dmin):
    assert xy_src.shape[1] == 2
    assert xy_query.shape[1] == 2
    cdef size_t i,j
    cdef uint8_t[::1] mask = np.ones(xy_query.shape[0], dtype=np.uint8)
    # Compare squared distances so that no square root is needed.
    cdef double dmin2 = dmin * dmin
    with nogil:
        for i in range(xy_query.shape[0]):
            for j in range(xy_src.shape[0]):
                if (xy_src[j,0]-xy_query[i,0])**2 + (xy_src[j,1] - xy_query[i,1])**2 <= dmin2:
                    mask[i] = False
                    # One close source point suffices; skip the rest.
                    break
    
    return mask.base.astype(bool)


# Introduces spatial correlation into the square field `qc` in place by
# random pair swaps: `steps` times, two random cells are drawn and their
# values are exchanged if this lowers the summed squared difference to the
# neighbors within the stencil radius `s`. Since values are only swapped,
# the set of values in `qc` is preserved. Random numbers are drawn directly
# from the bit generator of `rng`.
# Raises RuntimeError if `qc` is not square and ValueError if the bit
# generator capsule of `rng` is invalid.
@cython.boundscheck(False)
def erdnussflips(double[:,::1] qc, size_t steps, rng, size_t s):
    cdef size_t n= qc.shape[1]
    if qc.shape[0] != n:
        raise RuntimeError("Must be quadratic.")
    
    # Obtain rng:
    cdef bitgen_t *bitgen
    cdef const char *capsule_name = "BitGenerator"
    capsule = rng.bit_generator.capsule
    if not PyCapsule_IsValid(capsule, capsule_name):
        raise ValueError("Invalid pointer to anon_func_state")
    bitgen = <bitgen_t *> PyCapsule_GetPointer(capsule, capsule_name)
    
    # Generate stencil:
    # All integer offsets (i,j) of the (2s+1) x (2s+1) square are listed in
    # `sten`; `mask` then keeps those within Euclidean distance s of the
    # center, excluding the center itself.
    cdef long i,j,l
    cdef double ds2 = s
    ds2 *= s
    cdef double dist_i, dist_j
    sten = np.empty(((2*s+1)*(2*s+1),2), dtype=np.int64)
    mask = np.zeros((2*s+1)*(2*s+1), dtype=bool)
    l = 0
    for j in range(0,2*s+1):
        # Shift the loop index to the offset range [-s, s].
        j -= s
        for i in range(0,2*s+1):
            i -= s
            sten[l,0] = i
            sten[l,1] = j
            dist_i = i
            dist_j = j
            if i == 0 and j == 0:
                mask[l] = False
            elif (i*dist_i) + (j*dist_j) > ds2:
                mask[l] = False
            else:
                mask[l] = True
            
            l += 1
    
    cdef size_t k = 0
    cdef uint32_t i0,j0,i1,j1
    cdef long[:,::1] stencil = sten[mask,:]
    cdef double cost_old, cost_new, q0, q1
    # NOTE(review): cost_prop_0 and cost_prop_1 are declared but never used.
    cdef double cost_prop_0, cost_prop_1
    with rng.bit_generator.lock, nogil:
        for k in range(steps):
            # Two cells, each index uniform in the closed interval [0, n-1].
            i0 = bounded_uint(0, n-1, bitgen)
            j0 = bounded_uint(0, n-1, bitgen)
            i1 = bounded_uint(0, n-1, bitgen)
            j1 = bounded_uint(0, n-1, bitgen)
            q0 = qc[i0,j0]
            q1 = qc[i1,j1]
            # Cost of the current assignment versus the swapped assignment.
            # NOTE(review): both are evaluated on the un-swapped field, so if
            # the two cells lie within each other's stencil the comparison is
            # only approximate.
            cost_old = cost(q0, qc, i0, j0, stencil) + cost(q1, qc, i1, j1, stencil)
            cost_new = cost(q1, qc, i0, j0, stencil) + cost(q0, qc, i1, j1, stencil)
            # Accept the swap only if it strictly lowers the cost.
            if cost_new < cost_old:
                qc[i0,j0] = q1
                qc[i1,j1] = q0

In [None]:
@cache.cache
def cached_flips(q, steps, seed, s):
    """
    Memoized wrapper around `erdnussflips`.

    Works on a copy of `q`, so the input field is left untouched, and
    seeds a fresh generator from `seed` so that the result depends only on
    the arguments.

    Parameters
    ----------
    q : array
        Field to start from; passed to `erdnussflips` as a copy.
    steps : int
        Number of swap attempts.
    seed : int
        Seed of the random number generator used for the swaps.
    s : int
        Stencil radius handed to `erdnussflips`.

    Returns
    -------
    array
        The copy of `q` after the swaps have been applied.
    """
    landscape = q.copy()
    erdnussflips(landscape, steps, np.random.default_rng(seed), s)
    return landscape

In [None]:
# Spatially correlated landscape: 1e8 swap attempts with stencil radius 3.
qc = cached_flips(q, 100_000_000, 899283989, 3)

In [None]:
# Store the landscape on disk. The `intermediate` directory has to exist
# already; `open` does not create it.
with open('intermediate/A13-gamma-landscape.pickle','wb') as f:
    Pickler(f).dump(qc)

## "Geothermal" Sampling
Generate sampling points that preferentially explore areas of high heat flow ("geothermal reservoirs"):

*Note: this algorithm is currently unused.*

In [None]:
# Number of candidate points, drawn uniformly within [0, n-1) x [0, n-1).
M = 5000
xy = (n-1)*rng.random((M,2))
# Nearest grid cell of each candidate.
closest_ij = np.round(xy).astype(int)
def probability(q):
    # Acceptance probability that grows exponentially with the heat flow
    # `q` and reaches 1 at the maximum of the landscape `qc`.
    return np.exp((q-qc.max()) / 40)

# Heat flow at the candidates, followed by rejection sampling: a candidate
# is kept with the probability given by its heat flow.
qxy = qc[*closest_ij.T]
p = probability(qxy)
mask = rng.random(M) < p
xy = xy[mask,:]
qxy = qxy[mask]

In [None]:
# Overview figure with three panels.
fig = plt.figure(figsize=(15,5))
# Left: the uncorrelated field `q`.
ax = fig.add_subplot(131)
ax.imshow(q)
# Center: histogram of `q`, with the acceptance probability of the
# "geothermal" sampling on a second y axis.
ax = fig.add_subplot(132)
ax.hist(q.flat, bins='auto');
twax = ax.twinx()
twax.plot(np.linspace(0, qc.max()), probability(np.linspace(0, qc.max())), color='tab:orange')
twax.set_ylim(0,1)
# Right: the correlated landscape `qc` with the accepted sampling points.
ax = fig.add_subplot(133)
ax.pcolormesh(np.arange(0,n), np.arange(0,n), qc.T)
ax.scatter(*xy.T)

In [None]:
# 200 uniformly random grid cells, thinned greedily: going through the
# points in index order, every point that is still selected deselects all
# later points within a distance of 5 grid cells.
# NOTE(review): the resulting `mask` is referenced only in commented-out
# plotting code further down, and `I`, `J` are overwritten two cells below.
I,J = rng.integers(q.shape[0],size=(2,200))
dist = squareform(pdist(np.stack((I,J),axis=1)))
mask = np.ones(I.size, dtype=bool)
for i,j in np.argwhere(dist <= 5):
    if i < j and mask[i]:
        mask[j] = False

In [None]:
# Select every grid cell independently with the heat-flow dependent
# acceptance probability.
mask2 = rng.random(qc.shape) < probability(qc)

In [None]:
# Greedy thinning of the cells selected by `mask2` with a minimum distance
# of 20 grid cells; `mask3` refers to the rows of (I, J).
I,J = np.argwhere(mask2).T
dist = squareform(pdist(np.stack((I,J),axis=1)))
mask3 = np.ones(I.size, dtype=bool)
for i,j in np.argwhere(dist <= 20):
    if i < j and mask3[i]:
        mask3[j] = False

In [None]:
# Greedy thinning of the "geothermal" sampling points `xy` with a minimum
# distance of 25; `mask5` refers to the rows of `xy`.
dist = squareform(pdist(xy))
mask5 = np.ones(xy.shape[0], dtype=bool)
for i,j in np.argwhere(dist <= 25):
    if i < j and mask5[i]:
        mask5[j] = False

In [None]:
# The landscape with all "geothermal" sampling points and, drawn on top,
# the subset retained by `mask5`.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.pcolormesh(np.arange(0,n), np.arange(0,n), qc.T)
ax.scatter(*xy.T)
ax.scatter(*xy[mask5,:].T)

In [None]:
# Histogram of the full field compared with the preferentially selected
# cells (`mask2`) and with their thinned subset (`mask3`).
fig = plt.figure()
ax = fig.add_subplot(111)
ax.hist(q.flat, density=True)
#ax.hist(q[I,J], density=True, histtype='step', bins='auto')
#ax.hist(q[I[mask],J[mask]], density=True, histtype='step', bins='auto')
ax.hist(qc[mask2], density=True, histtype='step', bins='auto')
ax.hist(qc[I[mask3],J[mask3]], density=True, histtype='step', bins='auto')

In [None]:
# The gamma(K, THETA) CDF compared with the empirical CDFs of the full
# field, of the preferentially selected cells (`mask2`) and of their
# thinned subset (`mask3`).
fig = plt.figure()
ax = fig.add_subplot(111)
qlsp = np.linspace(q.min(), q.max())
ax.plot(qlsp, gamma_cdf(qlsp, K, THETA))
ax.plot(np.sort(q.flat), (np.arange(q.size)+1)/q.size)
ax.plot(np.sort(qc[mask2]), (np.arange(np.count_nonzero(mask2))+1) / np.count_nonzero(mask2))
ax.plot(np.sort(qc[I[mask3],J[mask3]]), (np.arange(np.count_nonzero(mask3))+1) / np.count_nonzero(mask3))

## Point-of-Interest Sampling
Here we check a type of borehole development in which some boreholes become "points of interest":
follow-up boreholes are then preferentially placed in their neighborhood.

In [None]:
xy_poi, types_poi = generate_point_of_interest_sampling(100, 0.2, 0.7, 200, 8.0, 89212352)
xy_poi, types_poi = generate_point_of_interest_sampling(100, 0.05, 0.8, 200, 8.0, 89212352)

closest_ij_poi = np.round(xy_poi).astype(int)
q_poi = qc[*closest_ij_poi.T]

In [None]:
# Random greedy d_min thinning of the POI sample with a minimum distance of
# 25. This draws a permutation from the shared `rng`.
mask6 = dmin_mask(xy_poi, 25.0, rng)

#### Illustrative Figure

In [None]:
# Thirteen colors sampled from the `vik` color map; five of them are used
# as the fixed palette of the figures below.
colors = get_cm_colors(vik, 13)
color0, color1, color2, color3, color4 = (
    colors[0], colors[8], colors[5], colors[9], colors[2]
)

In [None]:
# Panel (a): the landscape with the POI sample. Points removed by the d_min
# thinning are drawn as open markers, retained points as filled black ones.
fig = plt.figure(figsize=(6.0,3.5), dpi=250)
#ax_bg = fig.add_axes((0,0,1,1))
ax = fig.add_axes((0.08, 0.18, 0.42, 0.89))
ax.set_xlabel('x (km)')
ax.set_ylabel('y (km)')
ax.pcolormesh(np.arange(0,n), np.arange(0,n), qc.T, cmap=bilbao, rasterized=True)
# Marker style, size and edge color, indexed by the point type t.
markers = ['o','s','^']
s = [20,20,15]
edgecolor=['#555555','#555555','#bbbbbb']
# Legend handles and labels, collected in plotting order.
h = []
lbls = []
for t in [2,0,1]:
    maski = types_poi == t
    h.append(
        ax.scatter(*xy_poi[maski & ~mask6].T, marker=markers[t], s=s[t],
                   facecolor='none', edgecolor=edgecolor[t], zorder=2)
    )
    lbls.append(['Random','POI','Dependent'][t] + '\n(unselected)')
    h.append(
        ax.scatter(*xy_poi[mask6 & maski,:].T, marker=markers[t], s=s[t],
                    facecolor='k', edgecolor='k', zorder=3)
    )
    lbls.append(['Random','POI','Dependent'][t] + '\n(selected)')


# Exclusion disk of radius d_min = 25 around every retained point.
for i in np.argwhere(mask6).flat:
    ax.add_artist(Circle(xy_poi[i,:], radius=25.0, facecolor='none', edgecolor='lightgray', linestyle='--',
                  alpha=0.5))
ax.set_aspect('equal')
ax.text(2, 190, '(a)')

# The legend lives in its own, otherwise invisible axes.
ax_leg = fig.add_axes((0.505, 0.69, 0.41, 0.3))
ax_leg.set_axis_off()
ax_leg.legend(handles=h, labels=lbls, loc='center', ncols=2);


#
# Now plot the CDFs:
#
# Panel (b): empirical CDF of all sampled heat flow values together with the
# CDF of the gamma distribution fitted to them by `gamma_mle`.
qpl = np.linspace(15, 140, 100)
ax2 = fig.add_axes((0.58, 0.345, 0.35, 0.225))
qplot1 = np.sort(q_poi)
# The prepended (0, 0) lets the step function start at zero.
ax2.step(np.concatenate(((0,),qplot1)),
        100*np.concatenate(((0,),(np.arange(qplot1.size)+1)/qplot1.size)),
         linewidth=1.0, color=color0, where='post',
         label='All data')
k,t = gamma_mle(qplot1)
ax2.plot(qpl, 100*gamma_cdf(qpl, k, t), color=color0, linewidth=0.8, linestyle='--',
         label='MLE')
ax2.yaxis.tick_right()
ax2.set_xlim(qpl.min(), qpl.max())
ax2.set_ylim(0,100)
ax2.set_ylabel('CDF (%)', labelpad=0)
ax2.text(17, 85, '(b)')
ax2.yaxis.set_label_position('right')
ax2.legend(fontsize='small', loc='lower right')


# Panel (c): same as panel (b), restricted to the points retained by the
# d_min thinning. The axes are placed directly below panel (b).
ax3 = fig.add_axes((ax2.get_position().extents[0], 0.12,
                    ax2.get_position().width, 0.225))
qplot2 = np.sort(q_poi[mask6])
# The prepended (0, 0) lets the step function start at zero.
ax3.step(np.concatenate(((0,),qplot2)),
         100*np.concatenate(((0,),(np.arange(qplot2.size)+1)/qplot2.size)),
         color=color3, linewidth=0.8, where='post',
         label='$d_\\mathrm{min}$ sample')
ax3.text(17, 85, '(c)')
k,t = gamma_mle(qplot2)
ax3.plot(qpl, 100*gamma_cdf(qpl, k, t), color=color3, linewidth=0.8, linestyle='--',
         label='MLE')
ax3.set_xlim(qpl.min(), qpl.max())
ax3.set_ylim(0,100)
ax3.set_ylabel('CDF (%)', labelpad=0)
# The backslash of the thin space `\,` is escaped like the others; the
# unescaped form is an invalid escape sequence in a non-raw string.
ax3.set_xlabel('Heat flow $q$ ($\\mathrm{mW}\\,\\mathrm{m}^{-2}$)')
ax3.legend(fontsize='small', loc='lower right')


fig.savefig('figures/A13-gamma-landscape-preferential-sampling-dmin.pdf')

## Anomaly Posterior

In [None]:
from reheatfunq.anomaly.bayes import *
from reheatfunq.anomaly.postbackend import *
from reheatfunq.anomaly import AnomalyLS1980

In [None]:
# Probability density of the gamma distribution underlying the landscape.
# The notebook-wide constants K and THETA replace the repeated literals
# 10.0 and 8.0, so the plot cannot drift from the simulated distribution.
fig = plt.figure()
ax = fig.add_subplot(111)
qplt = np.linspace(0, 200, 200)
ax.plot(qplt, gamma_pdf(qplt, K, THETA))

In [None]:
from reheatfunq.anomaly import HeatFlowAnomalyPosterior
from reheatfunq import GammaConjugatePrior
from reheatfunq.regional import default_prior

In [None]:
# REHEATFUNQ default prior; its attributes `lp`, `s`, `n` and `v` provide
# the default prior parameters of the model comparison below.
gcp = default_prior()

In [None]:
@cache.cache
def reheatfunq_batch_model_comparison_anomaly(Nsamp, Nperm, PH, N=30, Noff=10, k=10.0, theta=8.0, PH_true=100e6,
                                              L=100e3, R=5e3, dmin=5e3, d_ano=10e3,
                                              point_generator='point-of-interest',
                                              data_generator='independent', qc=qc,
                                              p_poi=0.05, p_follow_up=0.8, p=np.exp(gcp.lp),
                                              s=gcp.s, n=gcp.n, v=gcp.v,
                                              quantiles = np.array([0.01, 0.05, 0.1, 0.5, 0.9, 0.95, 0.99]),
                                              seed=75564900264581):
    """
    Compare the performance of the heat flow anomaly quantification for different
    point set generators.
    """
    rng = np.random.default_rng(seed)
    
    if point_generator not in ['point-of-interest','uniform']:
        raise ValueError("point_generator must be either 'point-of-interest' or 'uniform'.")
    if data_generator not in ['independent','spatial']:
        raise ValueError("data_generator must be either 'independent' or 'spatial'.")
        
    ano = AnomalyLS1980(np.array([(L/2, 0.0), (L/2, L)]), d_ano)

    KL = []
    tq = []
    mean = []
    dists = []
    for i in range(Nsamp):
        print("iteration",i+1,"of",Nsamp)
        # Generate new data:
        if point_generator == 'uniform':
            xy_i = (L-1) * rng.random(size=(N,2))
        else:
            xy_i = generate_point_of_interest_sampling(N, p_poi, p_follow_up, L, R, rng=rng)[0]
        
        if data_generator == 'independent':
            qi_i = theta * rng.gamma(k, size=N)
        else:
            closest_ij = np.round((np.array(qc.shape)[np.newaxis,:] - 1) * xy_i/L).astype(int)
            qi_i = qc[*closest_ij.T]
        
        # c_i according to unit (mW/m²)
        ci_i = ano(xy_i)
        ci_i *= 1e3

        qi_ano_i = qi_i + PH_true * ci_i
        
        if np.any(np.isnan(qi_ano_i)):
            print("------------------------------------------")
            print("xy_i:")
            print(xy_i)
            print("ci_i:")
            print(ci_i)
            print("qi_ano_i:")
            print(qi_ano_i)
            print("------------------------------------------")
            raise RuntimeError("Found qi_ano_i is nan!")
        
        # Full data set posterior:
        gcp = GammaConjugatePrior(p, s, n, v)
        try:
            post_i = HeatFlowAnomalyPosterior(qi_ano_i, *xy_i.T, ano, gcp, 0.0, rng=seed)
        except:
            with open('error-config.pickle','wb') as f:
                Pickler(f).dump(((qi_ano_i, *xy_i.T, ano, gcp, 0.0), dict(rng=seed)))
            raise RuntimeError("Error in init.")
            
        
        pdf_cmp_3 = post_i.pdf(PH)
        if quantiles is not None:
            try:
                tail_i = post_i.tail_quantiles(quantiles)
            except RuntimeError as e:
                with open('error-config.pickle','wb') as f:
                    Pickler(f).dump(((qi_ano_i, *xy_i.T, ano, gcp, 0.0), dict(rng=seed)))
                raise e
        mask = pdf_cmp_3 > 0

        try:
            post_batch_i = HeatFlowAnomalyPosterior(qi_ano_i, *xy_i.T, ano, gcp, dmin, n_bootstrap=Nperm, rng=seed)
        except RuntimeError as e:
            args = (qi_ano_i, *xy_i.T, None, gcp, dmin)
            kwargs = dict(n_bootstrap=Nperm, rng=seed)
            with open('error-config2.pickle','wb') as f:
                Pickler(f).dump({
                    "args" : args,
                    "kwargs" : kwargs,
                    "ano_args" : (np.array([(L/2, 0.0), (L/2, L)]), d_ano)
                })
            raise e
        qi_batch_i = [post_batch_i.q[ids] for w,j,ids in post_batch_i.bootstrap]
        ci_batch_i = [post_batch_i.c[j,ids] for w,j,ids in post_batch_i.bootstrap]

        pdf_batch_i = post_batch_i.pdf(PH)
        if quantiles is not None:
            try:
                tail_batch_i = post_batch_i.tail_quantiles(quantiles)
            except RuntimeError as e:
                with open('error-config.pickle','wb') as f:
                    Pickler(f).dump((qi_batch_i, ci_batch_i, np.ones(Nperm), p, s, n, v, 1.0, 1e-5))
                raise e

        pdf_batch_old_i = np.zeros_like(PH) + np.NaN
        tail_batch_old_i = np.zeros_like(quantiles)+ np.NaN

        # Kullback-Leibler distance (kind of)
        KL.append((np.sum(pdf_cmp_3[mask] * np.log(pdf_cmp_3[mask] / pdf_batch_i[mask])),
                   np.sum(pdf_cmp_3[mask] * np.log(pdf_cmp_3[mask] / pdf_batch_old_i[mask]))))
    
        # The distribution's mean:
        dPH = PH[1] - PH[0]
        mean.append(((PH * pdf_cmp_3 * dPH).sum(), (PH * pdf_batch_i * dPH).sum(),
                     (PH * pdf_batch_old_i * dPH).sum()))

        dists.append(np.stack((pdf_cmp_3, pdf_batch_i, pdf_batch_old_i)))
        
        if quantiles is not None:
            tq.append((tail_i, tail_batch_i, tail_batch_old_i))

    KL = np.array(KL)
    mean = np.array(mean)
    
    return KL, mean, dists, tq

In [None]:
# Equidistant grid on which the posterior pdfs of the anomaly strength are
# evaluated.
PH = np.linspace(0.0, 8e8, 100)
# Anomaly strength used to generate the synthetic data.
PH_true=100e6
# Quantiles passed to `tail_quantiles`; index 3 is the median.
quantiles = np.array([0.01, 0.05, 0.1, 0.5, 0.9, 0.95, 0.99])

In [None]:
def test_illustration(PH, N=50, k=10.0, theta=8.0, PH_true=100e6,
                      L=100e3, R=5e3, dmin=5e3, d_ano=10e3,
                       point_generator='point-of-interest',
                       data_generator='independent', qc=qc,
                       p_poi=0.12, p_follow_up=0.8, p=1.0, s=0.0, n=0.0, v=0.0,
                       quantiles = np.array([0.01, 0.05, 0.1, 0.5, 0.9, 0.95, 0.99]),
                       seed=75564900264581):
        
    rng = np.random.default_rng(seed)
    
    if point_generator not in ['point-of-interest','uniform']:
        raise ValueError("point_generator must be either 'point-of-interest' or 'uniform'.")
    if data_generator not in ['independent','spatial']:
        raise ValueError("data_generator must be either 'independent' or 'spatial'.")
        
    ano = AnomalyLS1980(np.array([(L/2, 0.0), (L/2, L)]), d_ano)

    if point_generator == 'uniform':
        xy_i = (L-1) * rng.random(size=(N,2))
    else:
        xy_i = generate_point_of_interest_sampling(N, p_poi, p_follow_up, L, R, rng=rng)[0]
    
    if data_generator == 'independent':
        qi_i = theta * rng.gamma(10.0, size=N)
    else:
        closest_ij = np.round((np.array(qc.shape)[np.newaxis,:] - 1) * xy_i/L).astype(int)
        qi_i = qc[*closest_ij.T]

    # c_i according to unit (mW/m²)
    ci_i = ano(xy_i)
    ci_i *= 1e3

    xy_plot = np.stack(
        (np.linspace(0, L, 101),
         np.zeros(101)),
        axis=1
    )
    ci_plot = ano(xy_plot) * 1e3

    qi_ano_i = qi_i + PH_true * ci_i

    fig = plt.figure(figsize=(12,4.5))
    ax = fig.add_subplot(131)
    ax.plot(xy_plot[:,0], ci_plot * PH_true)
    ax.scatter(xy_i[:,0], qi_i, marker='.')
    ax.scatter(xy_i[:,0], qi_ano_i, marker='^')

    ax2 = fig.add_subplot(132)
    ax2.pcolormesh(np.linspace(0, L, qc.shape[0]),
                   np.linspace(0, L, qc.shape[1]),
                   qc.T)
    ax2.scatter(*xy_i.T, c=qi_i, vmin=qc.min(),
                vmax=qc.max(), edgecolor='k')
    ax2.plot(*np.array([(L/2, 0.0), (L/2, L)]).T,
             color='k', linewidth=1.0)
    ax2.set_xlim(0, L)
    ax2.set_ylim(0, L)

    gcp = GammaConjugatePrior(p, s, n, v)
    ax3 = fig.add_subplot(133)
    post2 = HeatFlowAnomalyPosterior(qi_ano_i, *xy_i.T, ano, gcp, dmin, n_bootstrap=200, rng=seed)
    post = HeatFlowAnomalyPosterior(qi_ano_i, *xy_i.T, ano, gcp, 0.0, rng=seed)
    ax3.plot(PH, post.pdf(PH))
    ax3.axvline(PH_true, color='k', linewidth=1.0)
    ax3.plot(PH, post2.pdf(PH))

In [None]:
# Illustrate one data set read from the correlated landscape.
test_illustration(PH, PH_true=PH_true, quantiles=quantiles,
                  data_generator='spatial')

In [None]:
import cProfile, pstats, io
from pstats import SortKey

In [None]:
# Number of synthetic data sets of the model comparison.
Nsamp = 10001

In [None]:
# Profile the model comparison. The function is wrapped by `cache.cache`, so
# the identical call further down is expected to be served from that cache.
pr = cProfile.Profile()
pr.enable()
try:
    reheatfunq_batch_model_comparison_anomaly(Nsamp, 1000, PH, PH_true=PH_true, quantiles=quantiles,
                                              data_generator='spatial')
finally:
    # Stop profiling even if the computation fails.
    pr.disable()

In [None]:
# Dimensions of the landscape.
qc.shape, qc.size

In [None]:
# Report the profile collected above, sorted by internal time. The report is
# written to the buffer and printed from there; previously the buffer was
# created but never handed to `pstats.Stats`. The buffer gets a descriptive
# name so it does not rebind `s`, which the figure cell above uses for the
# marker sizes.
stats_stream = io.StringIO()
sortby = SortKey.TIME
ps = pstats.Stats(pr, stream=stats_stream).sort_stats(sortby)
ps.print_stats()
print(stats_stream.getvalue())

In [None]:
# Same call as in the profiling cell above; this time the results are kept.
KL, mean, dists, tq = reheatfunq_batch_model_comparison_anomaly(Nsamp, 1000, PH, PH_true=PH_true, quantiles=quantiles,
                                              data_generator='spatial')

In [None]:
# Posterior pdf of the first data set using the dmin criterion (row 1 of the
# stacked pdfs).
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(PH, dists[0][1])

In [None]:
# Relative deviation (in percent) of one tail quantile from the true anomaly
# strength, using all data (y0) and using the dmin criterion (y1).
fig = plt.figure(figsize=(6,3))
ax = fig.add_subplot(121)
# Index into `quantiles`; 3 selects 0.5.
iq = 3
y0 = np.array([t[0][iq] for t in tq])
y1 = np.array([t[1][iq] for t in tq])
ax.hist(100*(y0-PH_true)/PH_true, label='all', bins='auto', density=True)
ax.hist(100*(y1-PH_true)/PH_true, label='in likelihood', histtype='step', bins='auto', density=True)

# Same histograms, zoomed to [-100, 150] %.
ax = fig.add_subplot(122)
#ax.set_yscale('log')
ax.hist(100*(y0-PH_true)/PH_true, label='all', bins='auto', density=True)
ax.hist(100*(y1-PH_true)/PH_true, label='in likelihood', histtype='step', bins='auto', density=True)
ax.legend()
ax.set_title(quantiles[iq]);
ax.set_xlim(-100, 150)

In [None]:
# Publication figure. Left: deviation of the quantile selected by `iq` (see
# title) from the true anomaly strength, drawn on a broken x axis (linear
# up to 150 %, logarithmic beyond). Right: deviation of the posterior mean.
fig = plt.figure(figsize=(6,3))
#ax_bg = fig.add_axes((0,0,1,1))
ax = fig.add_axes((0.065, 0.15, 0.27, 0.78))
ax.hist(100*(y0-PH_true)/PH_true, label='all', bins='auto', density=True, color=color1,
        histtype='stepfilled')
ax.hist(100*(y1-PH_true)/PH_true, label='in likelihood', histtype='step', bins='auto', density=True, color=color0)
ax.axvline(0.0, linewidth=0.8, color='k', linestyle='--')
ax.set_ylabel('Density ($10^{-3}$)')
ax.set_xlabel('Deviation from true value (%)', x=0.85, ha='center')
ax.set_title("Median $P_H$", x=0.9, ha='center');
ax.set_xlim(-100, 150)
# Fix the tick locations first so that the relabeling in units of 1e-3
# refers to exactly these ticks.
ax.set_yticks(ax.get_yticks())
ax.set_yticklabels([str(round(1e3*yt)) for yt in ax.get_yticks()])
ax.set_xticks([-100, 0, 100, 150])
ax.spines['right'].set_visible(False)
# Continuation of the left panel with a logarithmic x axis, attached to its
# right edge and sharing its y limits.
ax2 = fig.add_axes((ax.get_position().extents[2], 0.15, 0.22, 0.78))
ax2.set_yticks([])
ax2.set_ylim(ax.get_ylim())
ax2.hist(100*(y0-PH_true)/PH_true, label='All data', bins='auto', density=True, color=color1)
ax2.hist(100*(y1-PH_true)/PH_true, label='$d_\\mathrm{min}$ samples', histtype='step', bins='auto', density=True, color=color0)
ax2.set_xlim(ax.get_xlim()[1], ax2.get_xlim()[1])
ax2.set_xscale('log')
ax2.legend()
ax2.spines['left'].set_visible(False)
# Short vertical line at the junction of the two axes.
ax2.plot((150,150), ax2.get_ylim()[1]*np.array((0.995, 0.95)),
        clip_on=False, color='k', linewidth=ax2.spines["top"].get_linewidth())


ax = fig.add_axes((0.65, 0.15, 0.34, 0.78))
ax.set_title("Mean $P_H$")
ax.hist(100*(mean[:,0]-PH_true)/PH_true, bins='auto', density=True, color=color1,
        histtype='stepfilled')
ax.hist(100*(mean[:,1]-PH_true)/PH_true, bins='auto', histtype='step', density=True, color=color0)
#ax.hist(mean[:,2], bins='auto', histtype='step', density=True);
ax.axvline(0.0, color='k', linewidth=0.8, linestyle='--')
ax.set_ylabel('Density ($10^{-3}$)')
ax.set_xlabel('Deviation from true value (%)')
ax.set_yticks(ax.get_yticks())
ax.set_yticklabels([str(round(1e3*yt)) for yt in ax.get_yticks()]);

fig.savefig('figures/A13-POI-Sampling-Median-Mean-P_H.pdf')

## Posterior Predictive

In [None]:
from pdtoolbox.gof.statistics import _anderson_darling
from pdtoolbox.distributions import gamma_cdf

In [None]:
def preferential_sampling_dmin_monte_carlo(N_MC, Nsamp, p_poi, p_follow_up, rng, L, R, dmin):
    A2 = np.empty((N_MC, 2))
    A2_dmin = np.empty((N_MC,2))
    counter = []
    for i in range(N_MC):
        # Generate a sampling:
        xy_poi, types_poi = generate_point_of_interest_sampling(Nsamp, p_poi, p_follow_up, L, R, rng=rng)
        closest_ij_poi = np.round(xy_poi).astype(int)
        q_poi = qc[*closest_ij_poi.T]
        
        # Generate the filtered data:
        mask = dmin_mask(xy_poi, dmin, rng)
        q_poi_dmin = q_poi[mask]
        m_poi = q_poi_dmin.size
        
        counter.append(m_poi)
        
        # Evaluate the cdfs:
        cdf_poi = np.sort(gamma_cdf(q_poi, K, THETA))
        cdf_poi_dmin = np.sort(gamma_cdf(q_poi_dmin, K, THETA))
        
        # Anderson-Darling statistic:
        A2[i,0] = _anderson_darling(cdf_poi.reshape((1,-1)), 0.0, True)[0]
        A2_dmin[i,0] = _anderson_darling(cdf_poi_dmin.reshape((1,-1)), 0.0, True)[0]

    for i in range(N_MC):
        q_uni = THETA * rng.gamma(K, size=Nsamp)
        cdf = gamma_cdf(q_uni, K, THETA)
        
        # Evaluate the CDFs:
        cdf_uni = np.sort(cdf)
        cdf_uni_dmin = np.sort(cdf[:counter[i]])

        # Anderson-Darling statistic:
        A2[i,1] = _anderson_darling(cdf_uni.reshape((1,-1)), 0.0, True)[0]
        A2_dmin[i,1] = _anderson_darling(cdf_uni_dmin.reshape((1,-1)), 0.0, True)[0]

        i += 1

        
    return A2, A2_dmin

In [None]:
# Minimum distances investigated below; also used for the panel titles of
# the A² figure.
DMIN = [2.0, 8.0, 20.0]

In [None]:
# One Monte Carlo run per minimum distance. Iterating over DMIN, rather than
# over a repeated literal list, keeps the runs in sync with the panel
# titles of the figure below, which read DMIN[i].
A2 = []
A2_dmin = []
for dmin in DMIN:
    A2_i, A2_dmin_i = preferential_sampling_dmin_monte_carlo(10000, 200,  0.2, 0.7, rng, 200.0, 8.0, dmin)
    A2.append(A2_i)
    A2_dmin.append(A2_dmin_i)

In [None]:
# Panel (a): distribution of the Anderson-Darling statistic without d_min
# for independent gamma samples and for the preferential sampling.
# Panels (b)-(d): the same after applying d_min, one panel per DMIN entry.
fig = plt.figure(figsize=(6,4))
#ax_bg = fig.add_axes((0,0,1,1))
axi_w = 0.24
x0l = 0.08
# The top panel spans the width of the three bottom panels; its x range is
# scaled accordingly so that the x scale of all panels agrees.
ax0_w = 0.33 * 2 + axi_w
ax0 = fig.add_axes((x0l, 0.6, ax0_w, 0.33))
ax0.hist(A2[0][:,1], bins='auto', label='i.i.d.', density=True, color=color1, histtype='stepfilled')
ax0.hist(A2[0][:,0], bins='auto', histtype='step', label='Preferential sampling', density=True, color=color0)
ax0.legend()
ax0.set_xlabel('Anderson-Darling statistic A²')
ax0.set_ylabel('Density')
# Raw string: `\m` is an invalid escape sequence in a normal string.
ax0.set_title(r'No $d_\mathrm{min}$',loc='left')
ax0.set_xlim(0,10 * ax0_w/axi_w)
ax0.annotate('continues', (10 * ax0_w/axi_w, 0.1), (0.85 * 10 * ax0_w/axi_w, 0.1),
             arrowprops={
                 'arrowstyle' : '->'
             }, ha='left', va='center')
ax0.set_ylim(0, 1.8)
ax0.text(0.3, 1.6, '(a)')


for i in range(3):
    ax = fig.add_axes((0.08 + 0.33 * i, 0.1, axi_w, 0.33))
    ax.hist(A2_dmin[i][:,1], bins='auto', color=color1, density=True)
    ax.hist(A2_dmin[i][:,0], bins='auto', histtype='step', density=True, color=color0);
    ax.set_xlabel('A²')
    # Raw f-string: `\m` and `\,` are invalid escape sequences otherwise.
    ax.set_title(rf'$d_\mathrm{{min}}={DMIN[i]}\,\mathrm{{km}}$')
    ax.set_xlim(0,10)
    ax.set_ylabel('Density')
    ax.set_ylim(0, 1.8)
    ax.text(0.3 if i != 2 else 0.6, 1.6, ['(b)','(c)','(d)'][i])

fig.savefig('figures/A13-A2-improvement-with-dmin.pdf')

In [None]:
# Empirical CDFs of the full field, of the POI sample of the illustrating
# figure and of its d_min-thinned subset (`mask6`).
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(np.sort(q.flat), (np.arange(q.size)+1)/q.size)
ax.plot(np.sort(q_poi), (np.arange(q_poi.size)+1) / q_poi.size)
ax.plot(np.sort(q_poi[mask6]), (np.arange(np.count_nonzero(mask6))+1) / np.count_nonzero(mask6))

### License
```
Investigating a model of preferentially clustered sampling on a
gamma heat flow landscape, and the mitigating effect of the dmin
criterion thereon.

This file is part of the REHEATFUNQ model.

Author: Malte J. Ziebarth (ziebarth@gfz-potsdam.de)

Copyright © 2019-2022 Deutsches GeoForschungsZentrum Potsdam,
            2023 Malte J. Ziebarth
            

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <https://www.gnu.org/licenses/>.
```