# Compute and store in NetCDF: lon/lat ms diagnosis on the main combination

In [1]:
import numpy as np
import pandas as pd
import xarray as xr

import dask.dataframe as dd
import os
from glob import glob

import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import cartopy.geodesic as cgeo

import matplotlib.transforms as mtransforms

# Plate carree projection object from cartopy, kept as a notebook-level name.
crs = ccrs.PlateCarree()

import histlib.matchup as match
from histlib.diagnosis import (
    build_matchup_dataframe,
    synthetic_figure,
    path_csv,
    put_fig_letter,
    lonlat_stats,
)
from histlib.cstes import labels, zarr_dir, matchup_dir, c0, U

# Square of the constant U imported from histlib.cstes.
U2 = U**2

ERROR! Session/line number was not unique in database. History logging moved to new session 6022


In [2]:
# Set to False to run on a LocalCluster instead of submitting a PBS job.
USE_PBS_CLUSTER = True

from dask.distributed import Client

if USE_PBS_CLUSTER:
    from dask_jobqueue import PBSCluster

    # Other allocations tried previously:
    #   cores=56, processes=28, walltime="04:00:00"
    #   cores=7,  processes=7,  walltime="04:00:00"
    cluster = PBSCluster(cores=10, processes=10, walltime="00:30:00")
    # Request a single PBS job.
    cluster.scale(jobs=1)
else:
    from dask.distributed import LocalCluster

    cluster = LocalCluster()

client = Client(cluster)
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.PBSCluster
Dashboard: http://10.148.0.140:8787/status,

0,1
Dashboard: http://10.148.0.140:8787/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://10.148.0.140:48734,Workers: 0
Dashboard: http://10.148.0.140:8787/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [8]:
# NOTE(review): this cell carries execution count In [8], i.e. it was executed
# after every other cell, although it sits directly after the cluster setup.
# Run top-to-bottom, it closes `cluster` before the cells below that compute
# on it; run it only once the results have been written.
cluster.close()

In [3]:
# DL and DT are forwarded to path_csv to select the matchup file
# (presumably a length and a time window -- TODO confirm units).
DL = 25e3
DT = 30 * 60

# Name of the column used for each term of the main combination.
comb_25 = dict(
    acc="drifter_acc_x_25",
    cor="drifter_coriolis_x_25",
    ggx="alti_ggx_adt_filtered_ocean_tide_internal_tide",
    wd="es_cstrio_z15_drifter_wd_x",
)

# Lazily read the CSV, then spread it over 10 partitions and keep it in
# distributed memory.
df = dd.read_csv(path_csv(comb_25, "both", True, DL, DT))
df = df.repartition(npartitions=10).persist()

# Columns for which a bootstrap error is computed (order is preserved in the
# output columns).
vars_errors = (
    ["B_acc", "E_acc", "E_cor", "B_cor", "E_ggx", "B_ggx", "E_wd", "B_wd"]
    + ["X_acc_cor", "X_acc_ggx", "X_acc_wd", "X_cor_ggx", "X_cor_wd", "X_ggx_wd"]
    + ["ACC", "COR", "GGX", "WD", "S", "sigma"]
)

In [4]:
def mean_df(df):
    """Return ``df.mean()`` for any object exposing a ``mean`` method."""
    average = df.mean()
    return average


from scipy.stats import bootstrap


def compute_bootstrap_error(dff):
    """Bootstrap standard error of the mean of a 1-D sample.

    Parameters
    ----------
    dff : array-like
        One-dimensional sample (e.g. the values of one variable in one bin).

    Returns
    -------
    float
        ``nan`` when the sample holds fewer than 3 values, otherwise the
        standard deviation of the bootstrap distribution of the mean.
    """
    # Too few samples for a meaningful resampling estimate.
    if len(dff) < 3:
        return np.nan

    result = bootstrap(
        (dff,),  # samples must be passed as a sequence
        # np.mean accepts ``axis``, so SciPy versions that auto-detect
        # vectorised statistics evaluate all resamples in one call.
        statistic=np.mean,
        # Only the standard error is used. It is the std of the bootstrap
        # distribution and does not depend on the confidence-interval method,
        # so skip the default BCa interval and its n x (n-1) jackknife array.
        method="percentile",
    )
    return result.standard_error


def lonlat_stats(df, dl=5, vars_errors=("S",)):
    """Bin matchups on a regular lon/lat grid and compute per-bin statistics.

    NOTE(review): this definition shadows the ``lonlat_stats`` imported from
    ``histlib.diagnosis`` in the imports cell -- confirm which one is intended.

    Parameters
    ----------
    df : dask.dataframe.DataFrame
        Must expose ``lat`` and ``lon`` columns, ``obs`` (as a column or as
        the named index) and every column listed in ``vars_errors``.
        The frame is not modified.
    dl : number, default 5
        Bin width, in the units of ``lat``/``lon`` (presumably degrees).
    vars_errors : iterable of str, default ("S",)
        Columns for which the bootstrap standard error of the bin mean is
        computed.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(latbin, lonbin)``; holds the mean of every column of
        ``df``, the count ``nb_coloc_bin`` and one ``be__<var>`` column per
        entry of ``vars_errors``.
    """
    vars_errors = list(vars_errors)
    keys = ["latbin", "lonbin"]

    # Label each row with the lower edge of its bin. ``assign`` returns a new
    # frame, so the caller's dataframe does not gain the bin columns.
    binned = df.assign(
        latbin=(df.lat // dl) * dl,
        lonbin=(df.lon // dl) * dl,
    )

    # Mean of every column in each bin.
    mean = binned.groupby(keys).mean().compute()

    # Built once and reused by the count and by every bootstrap group-by.
    flat = binned.reset_index()

    # Number of non-null ``obs`` per bin.
    count = (
        flat[["obs"] + keys]
        .groupby(keys)
        .count()
        .obs.compute()
        .rename("nb_coloc_bin")
    )

    # Bootstrap error per variable. ``meta`` declares the float output so dask
    # does not have to infer it by calling the function on dummy data.
    errors_by_var = [
        flat[[v] + keys]
        .groupby(keys)[v]
        .apply(compute_bootstrap_error, meta=(v, "f8"))
        .compute()
        for v in vars_errors
    ]

    pieces = [mean, count]
    if errors_by_var:  # pd.concat raises on an empty list
        pieces.append(pd.concat(errors_by_var, axis=1).add_prefix("be__"))

    return pd.concat(pieces, axis=1)

_________
# Compute all lon/lat-binned statistics

In [5]:
# Per-bin mean, count and bootstrap error on a grid of width 5 (in lat/lon
# units) for every variable listed in vars_errors.
dfm = lonlat_stats(df, dl=5, vars_errors=vars_errors)

In [6]:
# Convert the (latbin, lonbin)-indexed table to an xarray object for export.
dsm = dfm.to_xarray()

In [7]:
# Derive the output file name from the input CSV path: prefix the file name
# with "lonlat_" and swap the extension for ".nc".
csv_path = path_csv(comb_25, "both", True, DL, DT)
nc_path = csv_path.replace("analysis_files/", "analysis_files/lonlat_")
nc_path = nc_path.replace(".csv", ".nc")
dsm.to_netcdf(nc_path)