# Bootstrapping

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from _notebooks import analysis
from importlib import reload
from tqdm import tqdm
import warnings

# Notebook-wide constants.
# NOTE(review): L_box is not referenced in the cells visible here; presumably
# the simulation box size - confirm before relying on it.
L_box = 50
mu_factor = 6  # conversion to microns
min_factor = 8  # conversion to minutes

In [None]:
import yaml


def apply_time_filter(df, dt):
    """Keep the first row of every run inside each time window of width ``dt``.

    A window id ``ts = time[hr] * 60 // dt`` is computed for every row, and
    only the first row per ``(ts, rid)`` pair is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``"time[hr]"`` and ``"rid"``. It is not
        modified; the window ids are attached to a copy.
    dt : float, in min
        Width of the time window. Must be positive.

    Returns
    -------
    pandas.DataFrame
        Filtered rows with a fresh 0..n-1 index and an extra ``"ts"`` column.

    Raises
    ------
    ValueError
        If ``dt`` is not positive.
    """
    if dt <= 0:
        raise ValueError(f"dt must be a positive number of minutes, got {dt!r}")

    # assign() returns a new frame, so the caller's DataFrame does not
    # silently gain a "ts" column.
    windowed = df.assign(ts=df["time[hr]"] * 60 // dt)
    deduplicated = windowed.drop_duplicates(subset=["ts", "rid"], keep="first")
    return deduplicated.reset_index(drop=True)


def _get_mp_type(yfile):
    """Return the ``substrate -> kind`` entry of the YAML file at ``yfile``."""
    with open(yfile) as stream:
        config = yaml.safe_load(stream)
    return config["substrate"]["kind"]


def _make_title(df):
    tbl = {
        "gamma": r"$\gamma$",
        "R_eq": r"$R_{eq}$",
        "mag_std": r"$\sigma_{MVG}$",
        "add_rate": r"$\tau_{MVG}$",
        "gid": "ID",
    }
    d = dict(df.iloc[0][3:8])
    title = ""
    for key, val in d.items():
        title += tbl[key] + " = " + f"{val}" + "\n"
    return title


def linear_init_pts(xmin, xmax, vmin, vmax, n_pts, s=1, basin_only=False):
    """Return points on a straight line in the (x, v) plane.

    The line passes through ``(xmin, vmin)`` with slope
    ``s * (vmax - vmin) / (xmax - xmin)``.

    With ``basin_only=False`` the x values are ``n_pts`` evenly spaced samples
    on ``[xmin, xmax]``. With ``basin_only=True`` they are ``n_pts`` samples
    within +/-10 of ``xmin`` followed by ``n_pts`` samples within +/-10 of
    ``xmax``.

    Returns an array of shape ``(n, 2)`` whose columns are x and v.
    """
    slope = s * (vmax - vmin) / (xmax - xmin)

    if basin_only:
        half_width = 10
        xs = np.concatenate(
            [
                np.linspace(xmin - half_width, xmin + half_width, n_pts),
                np.linspace(xmax - half_width, xmax + half_width, n_pts),
            ]
        )
    else:
        xs = np.linspace(xmin, xmax, n_pts)

    vs = slope * (xs - xmin) + vmin
    return np.column_stack([xs, vs])


def F_init_pts(F, bounds):
    """Return (x, v) coordinates for every non-NaN entry of ``F``.

    ``bounds`` is ``(xmin, xmax, vmin, vmax, nbins)``. A column index ``j``
    maps to ``xmin + j * (xmax - xmin) / nbins`` and a row index ``i`` to
    ``vmin + i * (vmax - vmin) / nbins``.

    Returns an array of shape ``(n, 2)`` with columns x and v.
    """
    xmin, xmax, vmin, vmax, nbins = bounds
    rows, cols = np.nonzero(~np.isnan(F))

    x_coords = xmin + cols * (xmax - xmin) / nbins
    v_coords = vmin + rows * (vmax - vmin) / nbins
    return np.column_stack([x_coords, v_coords])


def init_lattice(F, bounds, buffer=1):
    """Build an ``nbins x nbins`` lattice and pick its nodes where ``F`` is defined.

    ``bounds`` is ``(xmin, xmax, vmin, vmax, nbins)``. The lattice axes run
    from ``min + buffer`` to ``max - buffer`` with ``nbins`` nodes each.

    Returns ``(X, Y, pts)``: the two meshgrid arrays and an ``(n, 2)`` array
    holding the (x, v) coordinates of every lattice node whose entry in ``F``
    is not NaN.
    """
    xmin, xmax, vmin, vmax, nbins = bounds
    x_axis = np.linspace(xmin + buffer, xmax - buffer, nbins)
    v_axis = np.linspace(vmin + buffer, vmax - buffer, nbins)
    X, Y = np.meshgrid(x_axis, v_axis)

    # Row index selects the v node, column index selects the x node.
    rows, cols = np.nonzero(~np.isnan(F))
    pts = np.column_stack([x_axis[cols], v_axis[rows]])
    return X, Y, pts


def get_xv_traj_for_gid(gid):
    """Load the long single run of grid ``gid`` and return its (x, v) trajectory.

    Positions are rescaled by ``mu_factor`` before
    ``analysis.calc_v_a_from_position`` is applied to the x coordinate and
    the ``"time[hr]"`` column. The ``"x"`` and ``"v"`` columns of its result
    are returned as a 2-column NumPy array.
    """
    pkl_path = (
        f"../_server/sim_data/corners_only_2/single_run_long/pkls/fulltake_gid{gid}.pkl"
    )
    # NOTE(review): unpickling can execute arbitrary code; only point this at
    # files produced by trusted runs.
    long_run = pd.read_pickle(pkl_path)

    for coord in ("x", "y"):
        long_run[coord] *= mu_factor

    xva = analysis.calc_v_a_from_position(long_run.x, long_run["time[hr]"])
    return xva[["x", "v"]].to_numpy()


def sample_dataframe(df, size, s=0):
    """Draw ``size`` whole runs from ``df`` with replacement.

    ``df`` is split into runs by its ``"rid"`` column and the drawn runs are
    concatenated in draw order. The generator is seeded with ``s`` plus the
    current Unix time in whole seconds, so results differ between calls made
    at different times.

    A run that is drawn several times appears several times in the result
    and keeps its original ``rid`` (and index labels).
    """
    import time

    seed = s + int(time.time())
    rng = np.random.RandomState(seed=seed)

    runs = [run for _, run in df.groupby("rid")]
    picks = rng.choice(np.arange(len(runs)), replace=True, size=size)
    return pd.concat([runs[pick] for pick in picks])


def prep_dir(gid, parent):
    """Create the bootstrap output folders for one grid and return their paths.

    Parameters
    ----------
    gid : int or str
        Grid id; used in the ``grid_{gid}`` folder name.
    parent : str
        Dataset folder below ``../_server/sim_data/``.

    Returns
    -------
    tuple of str
        ``(root, sub_root, stream_root)``: the grid folder, its ``SDs/``
        subfolder and its ``streamlines/`` subfolder. Each path ends with a
        slash.
    """
    import os

    root = f"../_server/sim_data/{parent}/phase_space_bootstrap_samples/grid_{gid}/"
    sub_root = f"{root}SDs/"
    stream_root = f"{root}streamlines/"

    # exist_ok avoids the check-then-create race of a separate exists() test.
    for path in (root, sub_root, stream_root):
        os.makedirs(path, exist_ok=True)

    return root, sub_root, stream_root


def pretty_imshow(
    im,
    cbar_title,
    mark_basins=False,
    save_path=None,
    *,
    basin_edges=(133, 167),
    **imshow_kwargs,
):
    """Show ``im`` as a phase-space image with a labelled colorbar.

    Parameters
    ----------
    im : array-like
        Image data passed to ``plt.imshow``.
    cbar_title : str
        Label of the colorbar.
    mark_basins : bool
        If true, draw a dashed orange vertical line at every x position in
        ``basin_edges``.
    save_path : str or None
        If given, the figure is saved there and closed; otherwise it is shown.
    basin_edges : sequence of float, keyword-only
        x positions of the marker lines (defaults to the previously
        hard-coded 133 and 167).
    **imshow_kwargs
        Forwarded to ``plt.imshow``.
    """
    plt.figure(figsize=(5, 3.5), dpi=300)

    plt.imshow(im, **imshow_kwargs)

    cbar = plt.colorbar()
    cbar.set_label(cbar_title)
    plt.xlabel(r"$x$ ($\mu$m)")
    plt.ylabel(r"$v$ ($\mu$m/hr)")
    plt.axis("auto")

    if mark_basins:
        # axvline spans the full height of the axes, so the markers no longer
        # depend on notebook-level vmin/vmax being defined.
        for edge in basin_edges:
            plt.axvline(edge, color="orange", linestyle="dashed")

    if save_path is not None:
        plt.savefig(save_path)
        plt.close()
    else:
        # Only show when nothing was saved; after close() there is no figure
        # of this call left to display.
        plt.show()


#### Load grid by grid
- Load `fulltake_gid*.parquet`, which contains all of the runs for one configuration (grid id)

In [None]:
from glob import glob

# Configuration (grid id) to analyse and the dataset folder that holds it.
gid = 16
root = "rectangular_only"

# One parquet file stores every run of this configuration.
gid_pkl = f"../_server/sim_data/{root}/fulltake_gid{gid}.parquet"
df_gid = pd.read_parquet(gid_pkl)

# Rescale both position columns by mu_factor.
df_gid["x"] *= mu_factor
df_gid["y"] *= mu_factor

# Number of distinct run ids in this configuration.
n_uniq_runs = len(df_gid["rid"].unique())

display(n_uniq_runs, df_gid)

#### Bootstrap for one grid

Get `N_sample` bootstrapped samples, each with the original number of runs

In [None]:
# Output folders for this grid: bootstrap root, squared-deviation maps and
# streamline plots.
root_path, msd_path, stream_root = prep_dir(gid, root)

# Every bootstrap sample draws as many runs as the data set has run ids.
n = len(df_gid["rid"].unique())

# Number of bootstrap samples to generate.
N_sample = 2

For each bootstrap sample, compute $x$, $v$, and $a$. Bin them. Compute $F$ and $\sigma$.

In [None]:
F_samples = []
sigma_samples = []
x_pos = []

# Running phase-space bounds over all bootstrap samples. Starting from an
# empty range lets the first sample set them regardless of the data's scale.
xmin, xmax = np.inf, -np.inf
vmin, vmax = np.inf, -np.inf

# with a 112mu mp, this gives bins of dim 3.5mu x 3.5mu
nbins = 32

for k in tqdm(range(N_sample), total=N_sample):
    sample_df = sample_dataframe(df_gid, size=n, s=k)

    x_pos.append(sample_df.x.values)

    # Entries from position 5 onward of the sample's first row are copied
    # onto every per-run result below.
    meta = sample_df.iloc[0].iloc[5:]

    # x, v, a values for this entire config
    grid_x_v_a = []
    run_time_steps = []

    # compute speed and acc for each run
    for _, df_rid in sample_df.groupby("rid"):
        # Sampling is with replacement, so a run drawn more than once shows up
        # as back-to-back copies under one rid. Split wherever time steps
        # backwards so each copy is differentiated on its own.
        # NOTE(review): assumes time increases monotonically within a run.
        draw_id = (df_rid["time[hr]"].diff() < 0).cumsum().to_numpy()

        for _, df_run in df_rid.groupby(draw_id):
            x_v_a = analysis.calc_v_a_from_position(df_run.x, df_run["time[hr]"])
            # Column-by-column assignment broadcasts each scalar without
            # relying on positional Series[int] access.
            for col, val in meta.items():
                x_v_a[col] = val
            grid_x_v_a.append(x_v_a)
            run_time_steps.append(np.diff(df_run["time[hr]"].to_numpy()))

    grid_x_v_a = pd.concat(grid_x_v_a)

    # Average time step taken within runs only. Differencing the concatenated
    # time column would also include the backwards jump between consecutive
    # runs, which cancels the within-run steps in the mean.
    dt = np.mean(np.concatenate(run_time_steps))  # hr

    analysis.get_bin_indices(grid_x_v_a, nbins)

    F, F_std_err, sigma = analysis.calc_F_sigma(grid_x_v_a, dt, nbins, min_pts=5)
    F_samples.append(F)
    sigma_samples.append(sigma)

    # Widen the global bounds to cover this sample.
    xmin = min(xmin, grid_x_v_a["x"].min())
    xmax = max(xmax, grid_x_v_a["x"].max())
    vmin = min(vmin, grid_x_v_a["v"].min())
    vmax = max(vmax, grid_x_v_a["v"].max())

Compute $\left[F_s(x, v) - F_{\mathrm{avg}}(x, v)\right]^2$

- Plot each instance
- Average over the samples to get the mean squared deviation (MSD) and plot its natural log

In [None]:
# Bin-wise mean of F over all bootstrap samples, ignoring NaN bins.
F_avg = np.nanmean(F_samples, axis=0)
F_SD = []

# Plot options shared by every per-sample squared-deviation map.
sd_plot_options = {
    "mark_basins": True,
    "origin": "lower",
    "extent": [xmin, xmax, vmin, vmax],
    "interpolation": "bilinear",
}

for s in range(N_sample):
    # Squared deviation of this sample from the bootstrap mean.
    F_sqrd_diff = (F_samples[s] - F_avg) ** 2
    F_SD.append(F_sqrd_diff)
    pretty_imshow(
        F_sqrd_diff,
        cbar_title=r"$\left(F_s - F_{\mathrm{avg}}\right)^2$",
        save_path=msd_path + f"sample_{s}",
        **sd_plot_options,
    )

In [None]:
# Mean of the squared deviations over all samples, ignoring NaN bins.
F_MSD = np.nanmean(F_SD, axis=0)

# The map is shown on a natural-log scale.
log_F_MSD = np.log(F_MSD)
msd_cbar_title = r"ln$\left(\left\langle\left(F_s - F_{\mathrm{avg}}\right)^2\right\rangle\right)$"

pretty_imshow(
    log_F_MSD,
    cbar_title=msd_cbar_title,
    save_path=root_path + "MSD",
    mark_basins=True,
    origin="lower",
    extent=[xmin, xmax, vmin, vmax],
    interpolation="bilinear",
)


Plot streamlines from each $F_s(x, v)$

In [None]:
# plot_title = _make_title(df_gid)
mp_type = _get_mp_type(f"../configs/IM/grid_id{gid}/simbox.yaml")
plot_title = f"substrate = {mp_type}"
bounds = (xmin, xmax, vmin, vmax, nbins)
title = {"title": plot_title, "size": 20}

# Streamline appearance shared by all samples.
stream_style = {
    "integration_direction": "forward",
    "color": "black",
    "broken_streamlines": False,
    "density": 1,
    "linewidth": 0.5,
}

for k, F in enumerate(F_samples):
    X, Y, init_pts = init_lattice(F, bounds)

    # Only the first five lattice points seed streamlines. A fresh copy of
    # the style dict is passed on every call.
    fig = analysis.F_streamplot(
        F,
        bounds,
        stream_init_pts=init_pts[:5],
        title=title,
        interp="bilinear",
        do_try=False,
        streamplot_kwargs=dict(stream_style),
        save_path=stream_root + f"sample_{k}.png",
    )

    # fig.gca().vlines(
    #     [133, 167],
    #     vmin,
    #     vmax,
    #     colors=["orange", "orange"],
    #     linestyles=["dashed", "dashed"],
    # )


Color each phase-space starting point by where its streamline ends (left, right, or middle), for each $F_s$

In [None]:
# x positions separating the "Left", "Middle" and "Right" regions.
_LEFT_EDGE_X = 133
_RIGHT_EDGE_X = 167


def _stream_end_color(xx, yy, F):
    """Return the color for the streamline started at ``(xx, yy)``.

    Integrates one forward streamline of the field ``(Y, F)`` on the
    notebook-level lattice ``X``, ``Y`` and classifies the x coordinate of
    its last point: ``"red"`` below 133, ``"blue"`` above 167, ``"orange"``
    in between. ``"gainsboro"`` is returned when no streamline is produced
    or matplotlib raises ``ValueError``.
    """
    # streamplot needs an Axes to draw on; this figure is throwaway.
    fig, ax = plt.subplots(1, 1)
    try:
        stream = ax.streamplot(
            X,
            Y,
            Y,
            F,
            linewidth=0.5,
            start_points=[[xx, yy]],
            integration_direction="forward",
            color="black",
            broken_streamlines=False,
            density=1,
        )
        segments = np.array(stream.lines.get_segments())
    except ValueError:
        return "gainsboro"
    finally:
        # Always release the figure, whatever streamplot did.
        plt.close(fig)

    if len(segments) == 0:
        return "gainsboro"

    end_x = segments[-1][-1][0]
    if end_x < _LEFT_EDGE_X:
        return "red"
    if end_x > _RIGHT_EDGE_X:
        return "blue"
    return "orange"


def highlight_end_pts(init_pts, F, filepath):
    """Scatter ``init_pts`` colored by where their streamlines end, and save it.

    Parameters
    ----------
    init_pts : array of shape (n, 2)
        Starting points, columns x and v.
    F : 2-D array
        Second component of the vector field handed to ``streamplot``; the
        first component is the lattice ``Y``.
    filepath : str
        Where the figure is saved.

    Notes
    -----
    Reads the notebook-level names ``X``, ``Y`` (lattice matching ``F``) and
    ``vmin``, ``vmax`` (vertical extent of the dashed marker lines); they
    must be defined before this is called.
    """
    colors = [_stream_end_color(xx, yy, F) for xx, yy in init_pts]

    import matplotlib.patches as mpatches

    legend = [
        mpatches.Patch(color="red", label="Left"),
        mpatches.Patch(color="blue", label="Right"),
        mpatches.Patch(color="orange", label="Middle"),
    ]

    plt.figure(figsize=(3, 3), dpi=300)
    plt.vlines(
        x=[_LEFT_EDGE_X, _RIGHT_EDGE_X],
        ymin=vmin,
        ymax=vmax,
        linewidth=1,
        colors=["black"],
        linestyles=["dashed"],
    )

    plt.legend(handles=legend, bbox_to_anchor=(1.01, 1))
    plt.scatter(init_pts[:, 0], init_pts[:, 1], color=colors, s=5)
    plt.xlabel(r"$x$ ($\mu$m)")
    plt.ylabel(r"$v$ ($\mu$m/hr)")
    plt.savefig(filepath)
    plt.close()

In [None]:
mp_type = _get_mp_type(f"../configs/IM/grid_id{gid}/simbox.yaml")
plot_title = f"substrate = {mp_type}"
bounds = (xmin, xmax, vmin, vmax, nbins)
title = {"title": plot_title, "size": 20}

# One end-point map per bootstrap sample. X and Y are rebound on every pass
# because highlight_end_pts reads them from the notebook namespace.
for k, F in tqdm(enumerate(F_samples), total=N_sample):
    X, Y, init_pts = init_lattice(F, bounds)
    sample_png = stream_root + f"sample_{k}.png"
    highlight_end_pts(init_pts, F, sample_png)

In [None]:
import os
import subprocess

F_avg = np.nanmean(F_samples, axis=0)
# Squared deviation of every sample from the bootstrap mean.
F_MSD = [(F - F_avg) ** 2 for F in F_samples]

bounds = (xmin, xmax, vmin, vmax, nbins)
plot_title = _make_title(grid_x_v_a)
mp_type = _get_mp_type(f"../configs/IM/grid_id{gid}/simbox.yaml")
plot_title += f"substrate = {mp_type}"
title = {"title": plot_title, "size": 20}

# Bootstrap end points & x histogram
for k, F in enumerate(F_samples):
    # highlight_end_pts takes (init_pts, F, filepath) and reads the lattice
    # from the notebook-level X and Y, so rebuild them for this F first.
    X, Y, init_pts = init_lattice(F, bounds)
    highlight_end_pts(init_pts, F, stream_root + f"sample_{k}.png")

    # NOTE(review): this two-panel figure is closed without being saved or
    # shown, and its left panel stays empty; confirm whether the x histogram
    # is still wanted.
    fig, axs = plt.subplots(1, 2, figsize=(6, 2), dpi=300)
    axs[1].hist(x_pos[k], bins=32, density=True)

    plt.subplots_adjust(wspace=0.5)
    plt.close(fig)

# Stitch the per-sample PNGs into a movie. An argument list avoids shell
# quoting problems with the paths.
cmd_streams = [
    "ffmpeg",
    "-i",
    f"{stream_root}/sample_%d.png",
    "-b:v",
    "4M",
    "-s",
    "600x600",
    "-pix_fmt",
    "yuv420p",
    "-filter:v",
    "setpts=2.*PTS",
    f"{root_path}/stream_endings.mp4",
    "-y",
    "-hide_banner",
    "-loglevel",
    "fatal",
]
try:
    ffmpeg_run = subprocess.run(cmd_streams, check=False)
except FileNotFoundError:
    # Movie creation stays best-effort, but no longer fails silently.
    warnings.warn("ffmpeg executable not found; stream_endings.mp4 was not written")
else:
    if ffmpeg_run.returncode != 0:
        warnings.warn(
            f"ffmpeg exited with status {ffmpeg_run.returncode}; "
            "stream_endings.mp4 may be missing or incomplete"
        )