In [None]:
# %load init.ipy
%reload_ext autoreload
%autoreload 2

# %load init.ipy
import os, sys, logging, datetime, warnings, shutil
from importlib import reload

import numpy as np
import scipy as sp
import scipy.stats
import matplotlib as mpl
import matplotlib.pyplot as plt
from nose import tools

import kalepy as kale
import kalepy.utils
import kalepy.plot
# The `nbshow` command runs `plt.show()` in interactive jupyter notebooks, but closes
#   figures when run from the command-line (notebooks are converted to scripts and run as tests)
from kalepy.plot import nbshow

# NOTE: `warnings` is already imported at the top of this cell; the repeated import is harmless.
import warnings
# Escalate every warning to an exception, so any warning raised by later cells
# (including deprecation warnings) stops the run instead of passing silently.
warnings.simplefilter("error")
mpl.style.use('default')   # avoid dark backgrounds from dark theme vscode

# Edge Construction

In [None]:
# Number of sample points (kept as an integer so it can be used as an array size)
SIZE = int(1e3)

# Build a correlated, strictly positive 2D sample:
#   - `yy` is drawn (in log10-space) around the initial normal deviates `xx`, plus a log-normal offset
#   - `xx` is then perturbed with extra scatter and folded onto the positive axis
xx = np.random.normal(0.0, 1.0, SIZE)
yy = 10.0 ** np.random.normal(xx, 0.2, SIZE)
xx = np.fabs(xx + np.random.normal(0.0, 0.2, SIZE))
yy = yy + 10.0 ** np.random.normal(0.0, 0.1, SIZE)
data = [xx, yy]

# Linear x-axis, logarithmic y-axis (the y-values span orders of magnitude)
ax = plt.gca()
ax.set_xscale('linear')
ax.set_yscale('log')

plt.scatter(xx, yy, color='r', alpha=0.05)

nbshow()

In [None]:
# reload(kale.utils)
# des = kale.utils.Data_Edges(data)

# Trapezoid Integration

In [None]:
# `nbins` is the number of bins along (x, y); each axis therefore has one more edge than bins
nbins = [20, 30]
xx = np.linspace(-3.0, 3.0, nbins[0] + 1)
yy = np.linspace(-4.0, 8.0, nbins[1] + 1)

# Midpoints of the bin edges (`log=False`), plus 2D grids of the edges and of the midpoints
xc, yc = [kale.utils.midpoints(ee, log=False) for ee in (xx, yy)]
grid = np.meshgrid(xx, yy)
grid_cc = np.meshgrid(xc, yc)

# Random sample from a correlated 2D normal distribution, transposed to shape (2, N)
zz = np.random.multivariate_normal([0.0, 2.0], [[1.0, 0.2], [0.2, 4.0]], 3000).T
print(zz.shape)

# Corner-plot layout: hide the upper-right panel, and order the remaining panels as
#   [joint distribution, x-marginal, y-marginal]
fig, axes = plt.subplots(figsize=[10, 10], ncols=2, nrows=2)
axes[0, 1].set_visible(False)
axes = [axes[1, 0], axes[0, 0], axes[1, 1]]

# ---- Joint distribution: normalized 2D histogram on the bin edges
ax = axes[0]
# the `density` parameter in `hist2d` is incompatible with python3.5, so histogram with numpy
pdf = np.histogram2d(zz[0], zz[1], bins=(xx, yy), density=True)[0]
ax.pcolormesh(*grid, pdf.T)
print(pdf.shape)

# ---- x-marginal: 1D histogram vs. the 2D density integrated over `axis=1` with `trapz_nd`
ax = axes[1]
xpdf = ax.hist(zz[0], bins=xx, density=True, alpha=0.5, edgecolor='0.5')[0]

proj_pdf = kale.utils.trapz_nd(pdf, [xc, yc], axis=1)
print(np.allclose(xpdf, proj_pdf, rtol=1e-1))
ax.plot(xc, proj_pdf, 'r-')

# ---- y-marginal: same comparison, integrating over `axis=0`; drawn sideways
ax = axes[2]
ypdf = ax.hist(
    zz[1], bins=yy, orientation='horizontal', density=True, alpha=0.5, edgecolor='0.5'
)[0]

proj_pdf = kale.utils.trapz_nd(pdf, [xc, yc], axis=0)
print(np.allclose(ypdf, proj_pdf, rtol=1e-1))
ax.plot(proj_pdf, yc, 'r-')

nbshow()

In [None]:
# Same comparison as the previous cell, repeated with a fresh random sample.
# `nbins` is the number of bins along (x, y); the edge arrays have one more element each.
nbins = [20, 30]
xx = np.linspace(-3.0, 3.0, nbins[0] + 1)
yy = np.linspace(-4.0, 8.0, nbins[1] + 1)

# Midpoints of the bin edges (`log=False`)
xc, yc = [kale.utils.midpoints(ee, log=False) for ee in (xx, yy)]

grid = np.meshgrid(xx, yy)
# Correlated 2D normal sample, transposed to shape (2, N)
zz = np.random.multivariate_normal([0.0, 2.0], [[1.0, 0.2], [0.2, 4.0]], 3000).T
print(zz.shape)

# Corner-plot layout; panels re-ordered as [joint distribution, x-marginal, y-marginal]
fig, axes = plt.subplots(figsize=[10, 10], ncols=2, nrows=2)
axes[0, 1].set_visible(False)
axes = [axes[1, 0], axes[0, 0], axes[1, 1]]

# ---- Joint distribution: normalized 2D histogram computed with numpy
ax = axes[0]
pdf = np.histogram2d(zz[0], zz[1], bins=(xx, yy), density=True)[0]
ax.pcolormesh(*grid, pdf.T)
print(pdf.shape)

# ---- x-marginal: histogram of the x-samples vs. `trapz_nd` integration over `axis=1`
ax = axes[1]
xpdf = ax.hist(zz[0], bins=xx, density=True, alpha=0.5, edgecolor='0.5')[0]

proj_pdf = kale.utils.trapz_nd(pdf, [xc, yc], axis=1)
print(np.allclose(xpdf, proj_pdf, rtol=1e-1))
ax.plot(xc, proj_pdf, 'r-')

# ---- y-marginal: histogram of the y-samples vs. `trapz_nd` integration over `axis=0`
ax = axes[2]
ypdf = ax.hist(
    zz[1], bins=yy, orientation='horizontal', density=True, alpha=0.5, edgecolor='0.5'
)[0]

proj_pdf = kale.utils.trapz_nd(pdf, [xc, yc], axis=0)
print(np.allclose(ypdf, proj_pdf, rtol=1e-1))
ax.plot(proj_pdf, yc, 'r-')

nbshow()

In [None]:
def _test_ndim_a1(ndim):
    """Check `kale.utils.trapz_dens_to_mass` when integrating over one axis of an ND grid.

    For every axis of a randomly sized, irregularly spaced `ndim`-dimensional grid, a density is
    built that is constant along the integrated axis and varies randomly along one other axis.
    The result is then checked for:

    - shape: one element shorter along the integrated axis, unchanged elsewhere;
    - values: equal to the density times the bin widths along the integrated axis.

    Parameters
    ----------
    ndim : int
        Number of dimensions of the test grid (must be >= 2 so that an "other" axis exists).

    """
    BIN_SIZE_RANGE = [10, 30]
    num_bins = np.random.randint(*BIN_SIZE_RANGE, ndim)

    # Monotonically increasing edges with random (non-uniform) spacing in each dimension
    edges = [np.cumsum(np.random.uniform(0.0, 2.0, nb)) for nb in num_bins]
    shp = [len(ee) for ee in edges]

    for axis in range(ndim):
        # The next axis (cyclically) carries the variation of the density; using `% ndim`
        # (rather than a hard-coded 2) keeps this a valid, distinct axis in any dimension
        not_axis = (axis + 1) % ndim
        print("\nndim = {}, axis = {}, other = {}".format(ndim, axis, not_axis))

        # Random normalization varying only along `not_axis`, broadcastable to the full grid
        bcast_norm = [np.newaxis] * ndim
        bcast_norm[not_axis] = slice(None)
        norm = np.random.uniform(0.0, 10.0, shp[not_axis])[tuple(bcast_norm)]

        # Bin widths along the integrated axis, broadcastable to the full grid
        bcast_wids = [np.newaxis] * ndim
        bcast_wids[axis] = slice(None)
        wids = np.diff(edges[axis])[tuple(bcast_wids)]

        # Density sampled at the grid edges: constant along `axis`, so the expected mass in
        # each bin is simply `norm * width`
        pdf = np.ones(shp) * norm
        pmf = kale.utils.trapz_dens_to_mass(pdf, edges, axis=axis)

        new_shp = list(shp)
        new_shp[axis] -= 1
        kale.utils.alltrue(np.shape(pmf) == np.array(new_shp), "Output shape is {fail:}correct")

        kale.utils.alltrue(pmf == norm*wids, 'Values do {fail:}match')

    return

for ii in range(2, 5):
    _test_ndim_a1(ii)

In [None]:
def _test_ndim_a2(ndim):
    """Check `kale.utils.trapz_dens_to_mass` when integrating over two axes of an ND grid.

    For every ordered pair of distinct axes of a randomly sized, irregularly spaced
    `ndim`-dimensional grid, a density is built that is constant along the integrated axes and
    varies randomly along all remaining axes.  The result is then checked for:

    - shape: one element shorter along each integrated axis, unchanged elsewhere;
    - values: equal to the density times the product of bin widths along the integrated axes.

    Parameters
    ----------
    ndim : int
        Number of dimensions of the test grid (the driver below uses 3 and 4).

    """
    BIN_SIZE_RANGE = [10, 30]
    num_bins = np.random.randint(*BIN_SIZE_RANGE, ndim)

    # Monotonically increasing edges with random (non-uniform) spacing in each dimension
    edges = [np.cumsum(np.random.uniform(0.0, 2.0, nb)) for nb in num_bins]
    shp = np.array([len(ee) for ee in edges])

    for axis in np.ndindex(ndim, ndim):
        # Skip pairs that name the same axis twice
        if len(set(axis)) != len(axis):
            continue

        axis = np.asarray(axis)
        # Axes that are *not* integrated over.  These must be in ascending order so that the
        # random `norm` array below lines up with the slices placed in `bcast_norm`
        not_axis = np.array(sorted(set(range(ndim)) - set(axis.tolist())), dtype=int)
        print("\nndim = {}, axis = {}, other = {}".format(ndim, axis, not_axis))

        # Random normalization varying along every non-integrated axis
        bcast_norm = [np.newaxis] * ndim
        for na in not_axis:
            bcast_norm[na] = slice(None)
        norm = np.random.uniform(0.0, 10.0, shp[not_axis])[tuple(bcast_norm)]

        # Product of bin widths over the integrated axes, built by broadcasting one dimension
        # at a time (non-integrated axes contribute a factor of one)
        wids = np.ones([1] * ndim)
        for ii in range(ndim):
            if ii in axis:
                wid = np.diff(edges[ii])
            else:
                wid = np.ones(shp[ii])

            # Create new axes along all but the current dimension, slice along the current one
            cut = [np.newaxis] * ndim
            cut[ii] = slice(None)
            wids = wids * wid[tuple(cut)]

        # Density sampled at the grid edges: constant along the integrated axes, so the
        # expected mass in each bin is `norm` times the product of widths
        pdf = np.ones(tuple(shp)) * norm
        pmf = kale.utils.trapz_dens_to_mass(pdf, edges, axis=axis)

        new_shp = shp.copy()
        new_shp[axis] -= 1

        kale.utils.alltrue(np.shape(pmf) == np.array(new_shp), "Output shape is {fail:}correct")

        kale.utils.alltrue(pmf == norm*wids, 'Values do {fail:}match')

    return

for ii in range(3, 5):
    _test_ndim_a2(ii)

# Bound Indices

# ND cumsum

In [None]:
def pad(aa, axis=None):
    """Prepend zeros to an array, e.g. to give a cumulative sum a leading zero.

    Parameters
    ----------
    aa : array_like
        Input values.
    axis : int or None
        Axis along which a single zero-valued slice is prepended.
        If `None`, one leading zero is added along *every* axis.

    Returns
    -------
    ndarray
        Array that is one element longer than `aa` along the padded axis (or axes).

    """
    if axis is not None:
        # Bring the target axis to the front, stack a zero slice ahead of it, then move it back
        moved = np.moveaxis(aa, axis, 0)
        zero_slab = np.zeros_like(moved[0])[np.newaxis]
        padded = np.concatenate([zero_slab, moved], axis=0)
        return np.moveaxis(padded, 0, axis)

    # `[1, 0]` ==> one element before and none after, applied to all axes
    return np.pad(aa, [1, 0])

# Exercise `pad` on a small random 3D array, padding all axes (`None`) and each axis in turn
aa = np.random.randint(0, 10, (3, 4, 2))
print(aa.shape)
for axis in [None, 0, 1, 2]:
    bb = pad(aa, axis)
    print(axis, bb.shape)
    # The padded axis (every axis, for `None`) must grow by exactly one element ...
    expect = tuple(ss + int(axis is None or axis == dd) for dd, ss in enumerate(aa.shape))
    assert bb.shape == expect, "pad(axis={}) gave shape {}, expected {}".format(axis, bb.shape, expect)
    # ... and padding with zeros must not change the total of the values
    assert bb.sum() == aa.sum(), "pad(axis={}) changed the array contents".format(axis)
    print()
    

# Bin Centroids

In [None]:
def gen_grid(ndim=2, size=[3, 6], seed=None):
    """Generate a random, irregularly spaced `ndim`-dimensional grid.

    Parameters
    ----------
    ndim : int
        Number of dimensions.
    size : [int, int]
        Range `[low, high)` from which the number of edges in each dimension is drawn.
    seed : int or None
        If given, seed for the global numpy random state (for reproducible grids).

    Returns
    -------
    edges : list of `ndim` lists of float
        Sorted edge locations along each dimension.
    grid : sequence of `ndim` ndarrays
        Output of `np.meshgrid(*edges, indexing='ij')`.

    """
    if seed is not None:
        np.random.seed(seed)

    edges = []
    for _ in range(ndim):
        # Random lower bound and extent, each between 0.1 and 10
        lo, span = 10.0**np.random.uniform(-1, 1, 2)
        num = np.random.randint(*size)
        # Edge values are drawn uniformly between the log10 of the lower and upper bounds
        lo_log, hi_log = np.log10([lo, lo + span])
        ee = list(np.random.uniform(lo_log, hi_log, num))
        ee.sort()
        edges.append(ee)

    grid = np.meshgrid(*edges, indexing='ij')
    return edges, grid

def gen_dens_1(grid):
    """Draw a random density, uniform in [0, 10), at every point of the given grid.

    Parameters
    ----------
    grid : sequence of ndarray
        Grid coordinate arrays (e.g. from `gen_grid`); only the shape of the first is used.

    Returns
    -------
    ndarray
        Random values with the same shape as `grid[0]`.

    """
    return np.random.uniform(0.0, 10.0, size=grid[0].shape)

In [None]:
def test_1d():
    """Check `kale.utils.centroids` on a random 1D grid with random densities.

    Three properties are asserted:

    - with uniform densities, the centroids equal the midpoints of neighbouring grid points;
    - with random densities, no centroid coincides exactly with a midpoint;
    - with random densities, the centroids match a manual density-weighted average of the two
      grid points bounding each bin.

    The grid, densities, centroids (red dots) and midpoints (blue crosses) are also plotted.

    """
    _, grid = gen_grid(1)
    dens = gen_dens_1(grid)

    xx = grid[0]
    plt.plot(xx, dens)
    for x in xx:
        plt.axvline(x, ls='--', alpha=0.5, color='0.5')

    cent = kale.utils.centroids(grid, dens)

    # pass in a uniform (1.0) density to calculate midpoints of bin edges
    mids = kale.utils.centroids(grid, np.ones_like(dens))

    # draw both sets of points at the mean density, purely so they are visible on the plot
    height = dens.mean() * np.ones_like(cent)
    plt.scatter(cent, height, color='r')
    plt.scatter(mids, height, color='b', marker='x')

    # ---- Make sure uniform weightings give the exact mid-points
    # calculate midpoints manually
    mids_check = xx[:-1] + 0.5 * np.diff(xx)
    print("mids       = ", mids[0])
    print("mids_check = ", mids_check)
    assert np.allclose(mids[0], mids_check), "Uniform weighted centroids do not match midpoints!"

    # ---- Make sure weighted centroids do *not* match the midpoints (to machine precision)
    assert not np.any(cent == mids_check), "Weighted centroids exactly match midpoints!"

    # --- Make sure weightings match exact weighted centroids
    # calculate the density-weighted average of the two points bounding each bin manually
    lo_wt = xx[:-1] * dens[:-1]
    hi_wt = xx[1:] * dens[1:]
    manual = (lo_wt + hi_wt) / (dens[:-1] + dens[1:])
    check = np.isclose(cent[0], manual)
    print("centroids: ", cent[0])
    print("manual   : ", manual)
    print("check    : ", check)
    assert np.all(check), "centroids do not match manual calculation!"

    nbshow()

test_1d()

In [None]:
def draw_cents(ax, cent, **kw):
    """Scatter-plot 2D centroid locations on the given axes.

    Parameters
    ----------
    ax : axes-like or None
        Object providing a `scatter(x, y, **kwargs)` method (e.g. a matplotlib `Axes`).
        If `None`, the current pyplot axes are used.
    cent : sequence of two array_like
        x and y coordinates of the centroids (any shape; they are flattened before plotting).
    **kw : dict
        Extra keyword arguments for `scatter`, overriding the defaults
        (`color='r'`, `alpha=0.5`, `s=30`).

    Returns
    -------
    Whatever `ax.scatter` returns.

    """
    if ax is None:
        ax = plt.gca()

    defs = dict(color='r', alpha=0.5, s=30)
    defs.update(kw)
    xx, yy = [np.ravel(cc) for cc in cent]
    # Draw on the axes that were passed in, not on whichever axes pyplot considers current
    return ax.scatter(xx, yy, **defs)

def check2d(grid, dens):
    """Brute-force reference calculation of the density-weighted centroid of each 2D bin.

    Each bin is bounded by four neighbouring grid points; its centroid is the average of those
    four corner positions, weighted by the density at each corner.

    Parameters
    ----------
    grid : sequence of two (N, M) ndarrays
        x and y coordinates of the grid points.
    dens : (N, M) ndarray
        Density (weight) at each grid point.

    Returns
    -------
    xx, yy : (N-1, M-1) ndarrays
        x and y coordinates of the centroid of each bin.

    """
    gx, gy = grid[0], grid[1]
    nx, ny = dens.shape
    xx = np.zeros((nx - 1, ny - 1))
    yy = np.zeros((nx - 1, ny - 1))
    # offsets to the four corners of a bin, relative to its lowest-index corner
    corners = [(0, 0), (0, 1), (1, 0), (1, 1)]

    for ii in range(nx - 1):
        for jj in range(ny - 1):
            sum_x = 0.0
            sum_y = 0.0
            tot = 0.0
            for aa, bb in corners:
                wt = dens[ii + aa, jj + bb]
                sum_x += wt * gx[ii + aa, jj + bb]
                sum_y += wt * gy[ii + aa, jj + bb]
                tot += wt

            xx[ii, jj] = sum_x / tot
            yy[ii, jj] = sum_y / tot

    return xx, yy

def test_2d():
    """Check `kale.utils.centroids` on a random 2D grid against the brute-force `check2d`.

    Both the uniformly weighted centroids (i.e. bin midpoints) and the density-weighted
    centroids are compared with the manual calculation.  The density, grid lines, and all sets
    of points are drawn on the current figure.

    """
    edges, grid = gen_grid(2)
    dens = gen_dens_1(grid)
    # uniform weights, used to obtain the plain midpoints of each bin
    flat = np.ones_like(dens)

    plt.pcolormesh(*grid, dens, shading='gouraud')
    plt.colorbar()
    ax = plt.gca()

    # mark the grid: vertical lines at the x-edges, horizontal lines at the y-edges
    for draw_line, edge in zip([plt.axvline, plt.axhline], edges):
        for ee in edge:
            draw_line(ee, color='r', ls=':', zorder=100, alpha=0.5)

    cent = kale.utils.centroids(grid, dens)
    mids = kale.utils.centroids(grid, flat)
    draw_cents(ax, cent)
    draw_cents(ax, mids, color='b', marker='x')

    # --- Manually calculate midpoints of each bin
    mids_manual = check2d(grid, flat)
    assert np.allclose(mids, mids_manual), "2D midpoints do not match manual calculation!"

    # --- Manually calculate centroids of each bin, and overplot them for visual comparison
    cent_manual = check2d(grid, dens)
    draw_cents(ax, cent_manual, color='r', marker='+', s=200)
    # make sure they match the calculated values
    assert np.allclose(cent, cent_manual), "2D centroids do not match manual calculation!"
    return

test_2d()
nbshow()

In [None]:
def checknd(grid, dens):
    """Brute-force reference calculation of density-weighted bin centroids in N dimensions.

    Each bin is bounded by `2**D` neighbouring grid points; its centroid is the average of those
    corner positions, weighted by the density at each corner.

    Parameters
    ----------
    grid : sequence of D array_like, each of shape (N1, ..., ND)
        Coordinates of the grid points along each dimension.
    dens : array_like of shape (N1, ..., ND)
        Density (weight) at each grid point.

    Returns
    -------
    cent : list of D ndarrays, each of shape (N1-1, ..., ND-1)
        Coordinates of the centroid of each bin along each dimension.

    """
    coords = [np.array(gg) for gg in grid]
    weights = np.asarray(dens)
    ndim = len(coords)
    # one fewer bin than grid points along every dimension
    nbins = tuple(nn - 1 for nn in np.shape(weights))
    # each bin has two bounding grid points per dimension
    corners = (2,) * ndim
    cent = [np.zeros(nbins) for _ in range(ndim)]

    # iterate over each bin
    for loc in np.ndindex(nbins):
        tot = 0.0
        # iterate over every corner of this bin, accumulating the weighted positions
        for off in np.ndindex(corners):
            idx = tuple(ll + oo for ll, oo in zip(loc, off))
            wt = weights[idx]
            tot += wt
            for cc, xx in zip(cent, coords):
                cc[loc] += wt * xx[idx]

        # normalize by the total weight of the corners
        for cc in cent:
            cc[loc] /= tot

    return cent

# checknd([[1.0, 2.0, 3.0]], [1.0, 1.0, 1.0])

# Relative tolerance used when comparing `kale.utils.centroids` against the manual calculation
RTOL = 1e-10

for ndim in range(1, 5):
    edges, grid = gen_grid(ndim)
    dens = gen_dens_1(grid)
    print(f"{ndim=}, {dens.shape=}")
    cent = kale.utils.centroids(grid, dens)

    # ---- Positive check: the brute-force centroids must agree everywhere
    check = checknd(grid, dens)
    test = np.isclose(cent, check, rtol=RTOL)
    status = test.all()
    print(f"good is {status}")
    assert status

    # ---- Negative check: with deliberately shifted weights in the manual calculation,
    #      *no* value may agree (even at a 10x looser tolerance)
    check = checknd(grid, dens+10.0)
    test = np.isclose(cent, check, rtol=RTOL*10)
    status = test.any()
    print(f"bad is {status}")
    assert not status
    