In [1]:
%cd ..

/mnt/c/Users/XCB/OneDrive - Van Oord/Documents/thesis/code


In [2]:
import pandas as pd
import geopandas as gpd
from glob import glob
import seaborn as sns
from atl_module.plotting import (
    error_lidar_pt_vs_truth_pt,
    plot_photon_map,
    plot_tracklines_overview,
)
from sklearn.metrics import r2_score
import numpy as np
from atl_module.geospatial_utils.raster_interaction import (
    query_raster,
    query_from_lines,
)
import matplotlib.pyplot as plt
from atl_module.bathymetry_extraction.refraction_correction import correct_refr
from atl_module.plotting import set_size

# aggregation functions

In [3]:
# defining some useful aggregation functions

def rms_lambda(x):
    """Return the root-mean-square of `x`, i.e. sqrt(mean(x**2))."""
    squared = np.square(x)
    return np.sqrt(np.mean(squared))


def mae_lambda(x):
    """Return the mean absolute value of `x`, i.e. mean(|x|)."""
    magnitudes = np.abs(x)
    return np.mean(magnitudes)

# this is just for by-trackline groupbys
def agg_photon_counts(arr):
    """Collapse a group whose entries are all identical into that single value.

    Meant as an aggregation function for by-trackline groupbys, where every row of a
    group carries the same per-trackline photon count.

    Parameters
    ----------
    arr : array-like
        The values of one group (e.g. the pandas Series handed over by `.agg()`).

    Returns
    -------
    The value shared by all entries of `arr`.

    Raises
    ------
    AssertionError
        If the entries of `arr` are not all equal.
    """
    # Work on a plain array so that `[0]` always means "first element". On a pandas
    # Series `arr[0]` is a *label* lookup and raises KeyError for any group that does
    # not happen to contain a row labelled 0.
    values = np.asarray(arr)
    assert np.all(
        values == values[0]
    ), "the groupby must be by tracklines with identical values for `n_subsurf_points` and `n_total_points`"
    return values[0]


# This dictionary can be unpacked into `.agg(**unpackable_error_dict)` to create a summary
# table with named columns. It can only be used with the bathymetry points GDF because it
# relies on that frame's column names (`error` and `X`).
unpackable_error_dict = {
    "RMS Error [m]": pd.NamedAgg(column="error", aggfunc=rms_lambda),
    "MAE [m]": pd.NamedAgg(column="error", aggfunc=mae_lambda),
    # The string alias "mean" is used instead of `np.mean`: pandas already maps `np.mean`
    # to its own groupby mean and, from pandas 2.1 on, warns that this mapping will go away.
    # NOTE(review): the unit is spelled "[M]" here but "[m]" in the other keys; the key is
    # left unchanged because it becomes the output column name.
    "Mean Error [M]": pd.NamedAgg(column="error", aggfunc="mean"),
    # the total number of points for any grouping is the count of entries in the bathy points gdf
    # column X is counted because it is expected to always hold a valid value
    # (presumably X is the point's x coordinate, not the latitude - TODO confirm)
    "n bathy points": pd.NamedAgg(column="X", aggfunc="count"),
    # to get the number of subsurface photons, we can take the mean because it will be the same if we are averaging by site and unique trackline
    # uses a function that raises an error if the points aren't all the same
    # "n photons":pd.NamedAgg(column='n_total_points',aggfunc=agg_photon_counts),
    # "n subsurf photons":pd.NamedAgg(column='n_subsurf_points',aggfunc=agg_photon_counts)
}

# get all AOIs

In [4]:
# Read the AOI layer of every test site and tag its rows with the site name.
# The site name is element 3 of the "/"-separated path
# "../data/test_sites/<site>/AOI.gpkg", i.e. the folder directly under `test_sites`.
aoidflist = [
    gpd.read_file(aoifile).assign(site=aoifile.split("/")[3])
    for aoifile in glob("../data/test_sites/*/AOI.gpkg")
]

aoi_combined = pd.concat(aoidflist)

# get all secchi depth points

In [5]:
# Read the Secchi-depth points of every test site; the site name is the folder
# directly under `test_sites` (element 3 of the "/"-separated path).
secchi_pt_list = [
    gpd.read_file(file).assign(site=file.split("/")[3])
    for file in glob("../data/test_sites/*/secchi_pts.gpkg")
]

# Stack all sites, renumber the rows and expose the point coordinates as plain columns.
secchi_pt_combined = (
    pd.concat(secchi_pt_list)
    .reset_index(drop=True)
    .assign(X=lambda pts: pts.geometry.x, Y=lambda pts: pts.geometry.y)
)

# get all tracklines

In [6]:
# tracklines_dflist = [gpd.read_file(file) for file in glob('../data/test_sites/*/tracklines')]

# Read the trackline layer of every test site; the site name is the folder directly
# under `test_sites` (element 3 of the "/"-separated path).
tracklinesdflist = [
    gpd.read_file(file).assign(site=file.split("/")[3])
    for file in glob("../data/test_sites/*/tracklines")
]

# Stack all sites, renumber the rows and reduce the `date` column to calendar dates.
tracklines_combined = (
    pd.concat(tracklinesdflist)
    .reset_index(drop=True)
    .assign(date=lambda lines: pd.to_datetime(lines.date).dt.date)
)

tracklines_combined.sort_values(by="avg_ph_cou")

Unnamed: 0,file,beam,rgt,date,beam_type,n_photons,p_hconf,avg_ph_cou,avg_fsat,length,geometry,site
898,processed_ATL03_20190115094052_02740207_005_01,gt1l,274,2019-01-15,strong,5,96.328679,1.000000,0.000000,184.521091,"LINESTRING (-157.83223 21.56362, -157.83208 21...",oahu5
583,processed_ATL03_20200713073941_02740807_005_01,gt2r,274,2020-07-13,weak,34,95.260816,1.147059,0.000000,6218.042512,"LINESTRING (-157.95395 21.27779, -157.95125 21...",oahu1
1200,processed_ATL03_20210507051755_06631101_005_01,gt3r,663,2021-05-07,strong,20,96.302326,1.150000,0.000000,638.312852,"LINESTRING (-158.28003 21.55493, -158.28069 21...",oahu8
510,processed_ATL03_20220531053325_10561501_005_02,gt2l,1056,2022-05-31,strong,216,96.222648,1.310185,0.000000,14332.084721,"LINESTRING (-81.14391 24.61500, -81.15778 24.7...",florida_keys
570,processed_ATL03_20200413115954_02740707_005_01,gt3l,274,2020-04-13,weak,55,95.813700,1.327273,0.000000,6129.874607,"LINESTRING (-157.97985 21.27356, -157.97778 21...",oahu1
...,...,...,...,...,...,...,...,...,...,...,...,...
1063,processed_ATL03_20200606211436_11050701_005_01,gt1l,1105,2020-06-06,strong,209501,3.120823,2726.849695,0.000000,2851.411856,"LINESTRING (-158.20881 21.57528, -158.21371 21...",oahu7
1159,processed_ATL03_20200606211436_11050701_005_01,gt1l,1105,2020-06-06,strong,850030,3.120823,2727.128385,0.000000,9954.309631,"LINESTRING (-158.17409 21.37328, -158.18971 21...",oahu8
1051,processed_ATL03_20190915220417_12190407_005_01,gt1l,1219,2019-09-15,weak,255324,0.000000,2882.606555,0.000000,3234.898573,"LINESTRING (-158.16393 21.57320, -158.16248 21...",oahu7
1163,processed_ATL03_20200606211436_11050701_005_01,gt3l,1105,2020-06-06,strong,216092,2.836104,3122.430442,0.000000,2382.689088,"LINESTRING (-158.10246 21.31773, -158.10621 21...",oahu8


In [7]:
# NOTE(review): `bathy_pts_df` is only created in the "Concatenate all bathymetry points"
# cell further down, so on a fresh kernel this cell raises NameError (as its saved output
# shows). Move this cell below that one, or remove it.
bathy_pts_df.sort_values('ph_count').ph_count

NameError: name 'bathy_pts_df' is not defined

# Concatenate all bathymetry points from all sites

In [None]:
# Read the bathymetry points of every test site; the site name is the folder directly
# under `test_sites` (element 3 of the "/"-separated path).
dflist = [
    gpd.read_file(file).assign(site=file.split("/")[3])
    for file in glob("../data/test_sites/*/all_bathy_pts.gpkg")
]

# Rows with any missing value are dropped *after* the index is renumbered, so the
# resulting index may contain gaps. `date` is the calendar date of `delta_time`.
bathy_pts_df = (
    pd.concat(dflist)
    .reset_index(drop=True)
    .dropna()
    .assign(date=lambda pts: pd.to_datetime(pts.delta_time).dt.date)
)

# looking into refraction correction

In [None]:
# Evaluate `correct_refr` on a regular 1000 x 1000 grid of pointing-vector angles.
# The first argument (5) is presumably the depth - TODO confirm against `correct_refr`.
az_vecs = np.linspace(start=-3, stop=3, num=1000)
elev_vecs = np.linspace(start=1.47, stop=1.58, num=1000)
azgrid, elevgrid = np.meshgrid(az_vecs, elev_vecs)
xcorr, ycorr, zcorr = correct_refr(5, azgrid, elevgrid)

# Contours of the third returned component, with the pointing angles of the
# bathymetry points drawn on top (elevation on x, azimuth on y).
fig, ax = plt.subplots()
ax.contour(elevgrid, azgrid, zcorr)
bathy_pts_df.plot.scatter(x="p_vec_elev", y="p_vec_az", ax=ax)

In [None]:
# Largest pointing-vector elevation in the data set and the azimuth of that same row.
# `idxmax()` returns an index *label*, so the matching azimuth has to be looked up with
# `.loc`. `bathy_pts_df` is built with `reset_index(drop=True).dropna()`, so its labels can
# have gaps and `.iloc[label]` would select a different row (or run out of bounds).
el1 = bathy_pts_df.p_vec_elev.max()
az1 = bathy_pts_df.p_vec_az.loc[bathy_pts_df.p_vec_elev.idxmax()]

In [None]:
# NOTE(review): the grid evaluation earlier in this notebook calls
# `correct_refr(5, azgrid, elevgrid)` (azimuth before elevation), whereas this call passes
# the elevation (`el1`) before the azimuth (`az1`). One of the two argument orders is
# presumably wrong - confirm against the signature of `correct_refr`.
correct_refr(10, el1, az1)

## Some stats about all the bathymetry points found

Do we see a spike around 2.3 m or 4.2 m due to ringing?


In [None]:
# Bin edges for the elevation histograms: bins of width 0.1 starting at -25.05 and
# stopping before 0.6. The half-bin offset places the elevations where ringing is
# suspected (-2.3 and -4.2 m) near the centre of a bin instead of on an edge.
binedges = np.arange(start=-25.05, stop=0.6, step=0.1)

In [None]:
# Both histograms share the same bins and are drawn as normalised step outlines so
# they can be compared directly.
hist_style = dict(bins=binedges, histtype="step", density=True)

ax = bathy_pts_df["sf_elev_MSL"].plot.hist(
    label="ICESat-2 Estimate", figsize=(20, 10), **hist_style
)
bathy_pts_df["true_elevation"].plot.hist(ax=ax, label="Actual Value", **hist_style)
# ax.set_xlim((-25,0))

# mark the elevations where a ringing spike would be expected
for ringing_elev in (-2.3, -4.2):
    ax.axvline(ringing_elev)
ax.legend(loc="upper left")

In [None]:
# Error summary per site (columns defined by `unpackable_error_dict`), rounded to 2 decimals.
error_by_site = (
    bathy_pts_df.groupby("site")
    .agg(**unpackable_error_dict)
    .round(2)
)
error_by_site

Based on the histogram above, we do not see a significant deviation from the overall distribution of true bathymetry.

# Sites by Secchi Depth

In [None]:
# Look up a GEBCO value for every Secchi point. X/Y are re-derived from the geometry
# before the lookup. (`query_raster` presumably samples the raster at each point's
# X/Y - confirm against its implementation.)
secchi_xy = secchi_pt_combined.assign(
    X=secchi_pt_combined.geometry.x, Y=secchi_pt_combined.geometry.y
)
gebco_at_secchi = query_raster(secchi_xy, "../data/GEBCO/GEBCO_2021_sub_ice_topo.nc")

# attach the GEBCO value and keep only the points where it is negative
secchi_pt_combined = secchi_pt_combined.assign(gebco_elev=gebco_at_secchi).loc[
    lambda pts: pts.gebco_elev < 0
]

In [None]:
# Median of the numeric Secchi-point columns per site / date / beam.
# `numeric_only=True` is passed explicitly: since pandas 2.0 GroupBy.median no longer
# drops non-numeric columns (e.g. the geometry) silently but raises a TypeError instead.
zsd_date_beam = (
    secchi_pt_combined.dropna()
    .groupby(["site", secchi_pt_combined.date, "beam"])
    .median(numeric_only=True)
    .reset_index()
)

In [None]:
# Box plot of the Secchi depth (`zsd`) for each site.
fig, ax = plt.subplots(figsize=set_size())
ax = secchi_pt_combined.boxplot(column="zsd", by="site", ax=ax)

# tilt the site labels
site_ticklabels = ax.get_xticklabels()
_ = ax.set_xticklabels(site_ticklabels, rotation=45)

ax.set(ylabel="Secchi Depth [m]", title="Distribution of $Z_{sd}$ at test sites")
# clear the figure-level title so only the axes title remains
fig.suptitle(None)

In [None]:
# Write the current `fig` to the document's figure folder as a PDF.
# Assumes `fig` is still the Secchi-depth boxplot created in the cell above.
fig.savefig("../document/figures/secchi_by_site_boxplot.pdf")
# fig.savefig('../document/figures/secchi_by_site_boxplot.pgf',backend='pgf')

In [None]:
# Per-site median of the numeric Secchi columns next to the per-site error summary.
# `numeric_only=True` keeps this working on pandas >= 2.0, where GroupBy.median raises on
# non-numeric columns (e.g. the geometry) instead of dropping them.
secchi_pt_combined.groupby("site").median(numeric_only=True).merge(
    error_by_site, left_index=True, right_index=True
)

The code below makes the table prettier by dropping extraneous columns, renaming the sites to their actual names instead of folder names, and adding LaTeX-formatted column names.

In [None]:
# tracklines_by_site = tracklines_combined.groupby(["site"]).median()
# Site-level summary: per-site median of the numeric Secchi columns joined with the
# per-site error table, minus the columns that are not wanted in the final table.
# `numeric_only=True` keeps this working on pandas >= 2.0, where GroupBy.median raises on
# non-numeric columns (e.g. the geometry) instead of dropping them.
tracklines_by_site = (
    secchi_pt_combined.groupby("site")
    .median(numeric_only=True)
    .merge(error_by_site, left_index=True, right_index=True)
    .drop(columns=["X", "Y", "gebco_elev", "diff_atten", "diff_atten_unc"])
)

# Display names for the sites.
# NOTE(review): the names are assigned by position, so the list must have one entry per
# row and follow the row order of `tracklines_by_site` (presumably the sorted `site`
# folder names - verify). Adding or removing a site silently shifts or breaks the labels.
namelist = [
    "Big Island",
    "Charlotte Amalie",
    "Florida Keys",
    "Oahu 1",
    "Oahu 2",
    "Oahu 3",
    "Oahu 4",
    "Oahu 5",
    "Oahu 6",
    "Oahu 7",
    "Oahu 8",
    "St. Croix",
]

tracklines_by_site["Site Name"] = namelist
output_table = (
    tracklines_by_site.round(4)
    .set_index("Site Name")
    .rename(
        columns={
            "zsd": "$Zsd_{50}$[m]",
            "sigma_zsd": "$Zsd_{50}$ uncertainty",
            # the two `diff_atten*` columns are dropped above, so these entries
            # currently have no effect
            "diff_atten": "Median $Kd_{490}$",
            "diff_atten_unc": "median $Kd_{490}$ uncertainty",
        }
    )
)
output_table

In [None]:
# Export the per-site summary as a LaTeX table. `escape=False` leaves the `$...$`
# math in the column names untouched; floats are written with two decimals.
output_table.to_latex(
    "../document/tables/secchi_site.tex",
    caption="Secchi Depth and RMSE for each site",
    label="tab:ocean_color_summary_by_site",
    escape=False,
    float_format="%.2f",
)

# Analysis of tracklines by site
going to start from scratch

In [None]:
# Mean of every numeric column per site / date / beam.
# `.mean(numeric_only=True)` replaces `.agg(np.mean)`: on pandas >= 2.0 the latter tries to
# average the non-numeric columns (e.g. the geometry) as well and raises a TypeError.
bathy_pts_df.groupby(["site", "date", "beam"]).mean(numeric_only=True)

In [None]:
# Error summary per transect, where a transect is one site / date / beam combination.
transect_keys = ["site", "date", "beam"]
error_by_transect = bathy_pts_df.groupby(transect_keys).agg(**unpackable_error_dict)
error_by_transect

In [None]:
# Attach the per-transect error summary to every trackline. The left join keeps
# tracklines without any bathymetry point; their missing point count is set to 0.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
final_trackline_summary = tracklines_combined.merge(
    error_by_transect, on=["site", "date", "beam"], how="left"
).replace({"n bathy points": np.nan}, value=0)

# photons per unit of trackline length
final_trackline_summary["ph_per_m"] = (
    final_trackline_summary.n_photons / final_trackline_summary["length"]
)

In [None]:
# Error summary (columns defined by `unpackable_error_dict`) for each value of `beamtype`.
bathy_pts_df.groupby("beamtype").agg(**unpackable_error_dict)

In [None]:
# Compare tracklines with and without bathymetry points: mean of every numeric column,
# grouped by whether the trackline has zero bathymetry points.
# `numeric_only=True` keeps this working on pandas >= 2.0, where GroupBy.mean raises on
# non-numeric columns (e.g. the geometry) instead of dropping them.
final_trackline_summary.groupby(
    final_trackline_summary["n bathy points"] == 0
).mean(numeric_only=True)

In [None]:
# Display the merged trackline / error table for inspection.
final_trackline_summary

In [None]:
# RMS error against `avg_ph_cou`; the y-axis is limited to 0-5 m.
final_trackline_summary.plot.scatter(
    x="avg_ph_cou",
    y="RMS Error [m]",
    ylim=(0, 5),
    figsize=set_size(fraction=2),
)

In [None]:
# Number of bathymetry points against `avg_ph_cou`.
final_trackline_summary.plot.scatter(
    x="avg_ph_cou",
    y="n bathy points",
    figsize=set_size(fraction=2),
)

In [None]:
# Number of bathymetry points against the photon density along the trackline (`ph_per_m`).
final_trackline_summary.plot.scatter(
    x="ph_per_m",
    y="n bathy points",
    figsize=set_size(fraction=2),
)

In [None]:
# Number of bathymetry points against `p_hconf`.
final_trackline_summary.plot.scatter(
    x="p_hconf",
    y="n bathy points",
    figsize=set_size(fraction=2),
)

In [None]:
# Number of bathymetry points against `avg_fsat`.
final_trackline_summary.plot.scatter(
    x="avg_fsat",
    y="n bathy points",
    figsize=set_size(fraction=2),
)

In [None]:
# RMS error against `p_hconf`; the y-axis is limited to 0-5 m.
final_trackline_summary.plot.scatter(
    x="p_hconf",
    y="RMS Error [m]",
    ylim=(0, 5),
    figsize=set_size(fraction=2),
)

In [None]:
# RMS error against `avg_fsat`; the y-axis is limited to 0-5 m.
final_trackline_summary.plot.scatter(
    x="avg_fsat",
    y="RMS Error [m]",
    ylim=(0, 5),
    figsize=set_size(fraction=2),
)

In [None]:
# RMS error against the photon density along the trackline (`ph_per_m`); y-axis limited to 0-5 m.
final_trackline_summary.plot.scatter(
    x="ph_per_m",
    y="RMS Error [m]",
    ylim=(0, 5),
    figsize=set_size(fraction=2),
)

In [None]:
# Pairwise correlation of the numeric columns. The non-numeric columns (file name, beam,
# geometry, ...) are removed first because, since pandas 2.0, `DataFrame.corr()` raises on
# them instead of skipping them.
final_trackline_summary.select_dtypes(include=["number", "bool"]).corr()

In [None]:
# Pairwise covariance of the numeric columns. The non-numeric columns (file name, beam,
# geometry, ...) are removed first because, since pandas 2.0, `DataFrame.cov()` raises on
# them instead of skipping them.
final_trackline_summary.select_dtypes(include=["number", "bool"]).cov()

In [None]:
# List the available columns before selecting the features for the PCA below.
final_trackline_summary.columns

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA,TruncatedSVD

In [None]:
# Number of non-null entries per column (shows which columns contain missing values).
final_trackline_summary.count()

In [None]:
# Feature matrix (independent variables): selected per-trackline columns, standardized
# by StandardScaler (zero mean and unit variance per column, not a 0-1 range).
# NOTE(review): 'n bathy points' is one of the features although the label `y` below is
# derived from that very column, so the projection already "knows" the label - confirm
# whether it should be part of the features.
x = final_trackline_summary.loc[:,['n_photons', 'p_hconf','avg_ph_cou','length', 'n bathy points', 'ph_per_m']].to_numpy()
x = StandardScaler().fit_transform(x)

# Label (dependent variable): True for tracklines with at most 5 bathymetry points.
y = final_trackline_summary.loc[:,'n bathy points'].to_numpy()<=5

# Project the standardized features onto their first two principal components.
pcs = PCA(n_components=2,).fit_transform(x)

In [None]:
# First against second principal component, coloured by the label `y`.
fig, ax = plt.subplots(figsize=set_size(fraction=3))

pc1, pc2 = pcs[:, 0], pcs[:, 1]
ax.scatter(pc1, pc2, c=y)

# PCA for each point to identify error

In [None]:
# Per-point columns used as features.
columns_of_interest = [
    "ph_count",
    "gebco_elev",
    "sea_level_interp",
    "sea_level_std_dev",
    "kde_val",
    "beamtype",
    "oc_hconf_perc",
    "n_subsurf_points",
    "n_total_points",
]

# `beamtype` is encoded as 1 for "weak" and 0 for anything else, then all features are
# standardized (zero mean, unit variance per column).
is_weak_beam = bathy_pts_df["beamtype"].eq("weak").astype("int")
point_features = bathy_pts_df.assign(beamtype=is_weak_beam)[columns_of_interest]
x = StandardScaler().fit_transform(point_features.to_numpy())

# dependent variable: the absolute error of each point
y = bathy_pts_df["error_abs"].to_numpy()

In [None]:
# Project the standardized point features onto their first two singular vectors.
# `random_state` is fixed because TruncatedSVD uses a randomized solver by default, so
# without it the projection can change between runs.
pcs = TruncatedSVD(n_components=2, random_state=0).fit_transform(x)

fig, ax = plt.subplots(figsize=set_size(fraction=3))

# colour by `y`; the colour scale saturates at 1
ax.scatter(pcs[:, 0], pcs[:, 1], c=y, s=2, vmax=1)

In [None]:
# Project the standardized point features onto their first singular vector only.
# `random_state` is fixed because TruncatedSVD uses a randomized solver by default, so
# without it the projection can change between runs.
pcs = TruncatedSVD(n_components=1, random_state=0).fit_transform(x)

fig, ax = plt.subplots(figsize=set_size(fraction=3))

# first component against `y`, coloured by `y` as well; colour scale and y-axis stop at 1
ax.scatter(pcs, y, c=y, s=2, vmax=1)
ax.set_ylim(0, 1)

# Bias plot of all sites

In [None]:
error_lidar_pt_vs_truth_pt(bathy_pts_df,'All Sites',