In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib widget

import warnings
warnings.simplefilter(action='ignore', category=RuntimeWarning)

import numpy as np
import pandas as pd
import geopandas as gpd
import random

import matplotlib.pyplot as plt
from cartopy import crs as ccrs

from geographic_sampling_ensemble import (
    generate_random_spherical_points, 
    find_nearest_neighbors,
    make_polygon_find_tg_inside,
)


## Load global analysis and metadata

In [None]:
# directory holding the outputs of the global analysis
output_dir = "./output"

# read the global analysis table, then attach a point geometry to every row
# built from its lon/lat columns so the table can be used in spatial queries
analysis = pd.read_csv(f"{output_dir}/global_analysis.csv")
station_points = gpd.points_from_xy(analysis.lon, analysis.lat)
analysis = gpd.GeoDataFrame(analysis, geometry=station_points)

## Generate random geographic divisions on a sphere

In [None]:
# Build a pool of random areas, each described by the lat/lon of its vertices.
# NOTE(review): seeding numpy's global RNG only makes this reproducible if
# generate_random_spherical_points draws from np.random -- confirm.
np.random.seed(10)
N_iterations = 1000
num_vertices = 10

# request 50% more areas than needed, since some will turn out to be malformed
n_candidates = N_iterations + int(0.5 * N_iterations)

random_areas = []
for _ in range(n_candidates):
    # keep drawing fresh vertex sets until the nearest-neighbor search
    # returns a path (it signals failure by returning None)
    while True:
        lat, lon = generate_random_spherical_points(num_vertices)
        result = find_nearest_neighbors(lat, lon)
        if result is not None:
            break
    plat, plon = result[0], result[1]
    random_areas.append(dict(plat=plat, plon=plon))

### Make a figure showing examples of the random geographic divisions

In [None]:
# indices into random_areas: the polygon drawn in each of the four panels, and
# an optional second polygon drawn on top (None -> single-polygon panel)
iterations = [3, 27, 54, 25]
iterations2 = [None, None, 109, 200]

# panel letters; only the left-column panels (n = 0 and n = 2) get one
sp_labels = ["a", "", "b", ""]

fig = plt.figure(figsize=(9, 4.75))

for n, (i, i2) in enumerate(zip(iterations, iterations2)):

    axis = fig.add_subplot(2, 2, n + 1, projection=ccrs.Mollweide())
    axis.coastlines(zorder=-10, lw=0.5)

    # legend entries are attached to the last panel only; the legend below is
    # built from that panel's handles
    is_last_panel = n == 3
    labels = ["Decagon 1", None, "TG Group 1"] if is_last_panel else None
    labels2 = ["Decagon 2", "Excluded", "TG Group 2"] if is_last_panel else None

    # color triplets are ordered: polygon, tg outside, tg inside
    has_second_polygon = i2 is not None
    if has_second_polygon:
        colors = ["#C9E7F8", "darkgray", "#D55E00"]
        colors2 = ["#F9F4B4", "lightgray", "#008F69"]
    else:
        colors = ["#C9E7F8", "#008F69", "#D55E00"]

    # first polygon (copies are passed so the stored vertices are not modified)
    first_area = random_areas[i]
    plon, plat = first_area["plon"].copy(), first_area["plat"].copy()
    make_polygon_find_tg_inside(axis, plon, plat, analysis, colors, labels=labels)

    # optional second polygon
    if has_second_polygon:
        second_area = random_areas[i2]
        plon, plat = second_area["plon"].copy(), second_area["plat"].copy()
        make_polygon_find_tg_inside(
            axis, plon, plat, analysis, colors2, zshift=-1, labels=labels2
        )

    # configure plot
    axis.gridlines(linestyle=":")
    axis.set_global()

    # subplot label in the upper-left corner of the panel
    axis.annotate(
        text=sp_labels[n],
        xy=(0.02, 0.93),
        xycoords="axes fraction",
        fontsize=12,
        fontweight="bold",
    )

# shared legend, built from the last panel's entries and reordered
handles, labels = axis.get_legend_handles_labels()
leg_order = [0, 2, 1, 3, 4]
handles = [handles[k] for k in leg_order]
labels = [labels[k] for k in leg_order]
leg = fig.legend(
    handles,
    labels,
    ncols=1,
    loc="center",
    bbox_to_anchor=(0.5, 0.49),
    frameon=False,
    fontsize=9,
)

fig.tight_layout()

# write raster and vector versions for the manuscript
fig.savefig("./figures/manuscript/random_geographic_sampling.png", dpi=300)
fig.savefig("./figures/manuscript/random_geographic_sampling.pdf", dpi=300)

## Distributions of differences between large geographic regions/groupings

In [None]:
# minimum number of tide gauges in the regions for the iteration to count
min_group_tgs = 20

# seed for pairing each random area with a second one; fixed so the exclusion
# ensemble is reproducible (the global `random` state is not seeded in the
# cells shown here)
shuffle_seed = 10

# percentiles of the ensemble distributions to report
percentiles = [80, 90, 95, 99]

# don't try to calculate median differences on columns that are not numeric
data_columns = [
    c
    for c in analysis.columns
    if (analysis[c].dtype == "float64") and c not in ["lat", "lon"]
]
analysis_data = analysis.loc[:, data_columns]


def _tgs_inside_area(axis, area):
    """Return the mask of tide gauges inside ``area``, or None if it cannot be built.

    Sometimes (albeit rarely) polygons are malformed, which makes
    ``make_polygon_find_tg_inside`` raise; those areas are reported as None so
    the caller can skip them. Only ``Exception`` is caught so that interrupting
    the kernel still works. Copies of the vertices are passed so the entries of
    ``random_areas`` are never modified.
    """
    try:
        return make_polygon_find_tg_inside(
            axis,
            area["plon"].copy(),
            area["plat"].copy(),
            analysis,
            show_polygon=False,
        )
    except Exception:
        return None


def _abs_group_difference(group_a, group_b, statistic):
    """Absolute per-column difference of ``statistic`` between two groups.

    ``group_a`` / ``group_b`` are row selectors for ``analysis_data`` (used
    here as boolean masks); ``statistic`` is the name of a DataFrame
    reduction, "median" or "mean".
    """
    stat_a = getattr(analysis_data.loc[group_a], statistic)(axis=0)
    stat_b = getattr(analysis_data.loc[group_b], statistic)(axis=0)
    return (stat_a - stat_b).abs()


def _save_ensemble(records, filename):
    """Stack per-iteration differences, keep the first N_iterations, write to CSV."""
    ensemble = pd.DataFrame(records).iloc[:N_iterations]
    ensemble.to_csv(f"{output_dir}/{filename}", index=False)
    return ensemble


def _save_percentiles(ensemble, filename):
    """Compute the requested percentiles of every column and write them to CSV."""
    table = pd.DataFrame(
        [ensemble.quantile(p / 100, axis=0) for p in percentiles],
        index=percentiles,
    ).T
    table.to_csv(f"{output_dir}/{filename}")
    return table


# make_polygon_find_tg_inside needs an axis even when nothing is drawn, so use
# a throwaway figure that is closed once the loop finishes
fig = plt.figure()
axis = fig.add_subplot(111, projection=ccrs.Mollweide())

io_median_diff_ensemble = []
io_mean_diff_ensemble = []
io_exclusion_median_diff_ensemble = []
io_exclusion_mean_diff_ensemble = []

# pair every area with a randomly chosen second area
area_order = list(range(len(random_areas)))
area_order2 = area_order.copy()
random.Random(shuffle_seed).shuffle(area_order2)

try:
    for i, i2 in zip(area_order, area_order2):

        # get tg locations inside the random area; skip malformed polygons
        tg_in = _tgs_inside_area(axis, random_areas[i])
        if tg_in is None:
            continue
        tg_out = ~tg_in

        # check to be sure there are some minimum number of tgs in each group
        if (tg_in.sum() < min_group_tgs) or (tg_out.sum() < min_group_tgs):
            continue

        # absolute differences between inside and outside the random area
        io_median_diff_ensemble.append(
            _abs_group_difference(tg_in, tg_out, "median")
        )
        io_mean_diff_ensemble.append(_abs_group_difference(tg_in, tg_out, "mean"))

        # get tg locations outside the first random area and inside a second one
        tg_in2 = _tgs_inside_area(axis, random_areas[i2])
        if tg_in2 is None:
            continue
        tg_in2 = tg_in2 & tg_out

        # the first group was already checked above; only the second can fail
        if tg_in2.sum() < min_group_tgs:
            continue

        # absolute differences between the two groups with random exclusions
        io_exclusion_median_diff_ensemble.append(
            _abs_group_difference(tg_in, tg_in2, "median")
        )
        io_exclusion_mean_diff_ensemble.append(
            _abs_group_difference(tg_in, tg_in2, "mean")
        )
finally:
    plt.close(fig)

# make dataframes of differences across iterations
io_median_diff_ensemble = _save_ensemble(
    io_median_diff_ensemble, "group_median_differences_ensemble.csv"
)
io_mean_diff_ensemble = _save_ensemble(
    io_mean_diff_ensemble, "group_mean_differences_ensemble.csv"
)
io_exclusion_median_diff_ensemble = _save_ensemble(
    io_exclusion_median_diff_ensemble,
    "group_exclusion_median_differences_ensemble.csv",
)
io_exclusion_mean_diff_ensemble = _save_ensemble(
    io_exclusion_mean_diff_ensemble,
    "group_exclusion_mean_differences_ensemble.csv",
)

# get percentiles across the ensemble; each table is written to its own file
# (previously the two "mean" files were filled with the median tables)
io_median_diff_percentiles = _save_percentiles(
    io_median_diff_ensemble, "group_median_differences_percentiles.csv"
)
io_mean_diff_percentiles = _save_percentiles(
    io_mean_diff_ensemble, "group_mean_differences_percentiles.csv"
)
io_exclusion_median_diff_percentiles = _save_percentiles(
    io_exclusion_median_diff_ensemble,
    "group_exclusion_median_differences_percentiles.csv",
)
io_exclusion_mean_diff_percentiles = _save_percentiles(
    io_exclusion_mean_diff_ensemble,
    "group_exclusion_mean_differences_percentiles.csv",
)

io_median_diff_percentiles

In [None]:
# sanity check: (rows, columns) of the exclusion ensemble; the row count is at
# most N_iterations because the frame was truncated with .iloc[:N_iterations]
io_exclusion_median_diff_ensemble.shape