In [None]:
from data import KelpNCDataset
import numpy as np
import matplotlib.pyplot as plt
import tqdm
import xarray as xr
import pandas as pd

In [None]:
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN


def dbscan_knee_plot(X: np.ndarray, eps: float = None, ax: plt.Axes = None) -> int:
    """Draw a k-distance ("knee") plot to help pick the DBSCAN ``eps`` threshold.

    For every row of ``X`` the distance to its ``2 * d``-th nearest neighbour
    is computed (``X`` is queried against itself, so each point counts as its
    own first neighbour) and the distances are plotted in descending order.
    Pick ``eps`` at the knee of the curve.

    Parameters
    ----------
    X : array-like of shape (n, d)
        Feature matrix, one row per sample.
    eps : float, optional
        Candidate threshold; drawn as a red horizontal line when given.
    ax : matplotlib Axes, optional
        Axes to draw on. When omitted, a new figure is created and shown.

    Returns
    -------
    int
        Suggested ``min_samples`` for DBSCAN: ``2 * d + 1``, capped at ``n``.

    Raises
    ------
    ValueError
        If ``X`` is not two-dimensional.
    """
    # Dimensionality of data
    shape = np.shape(X)
    if len(shape) != 2:
        raise ValueError(
            f"X must be 2-dimensional (n_samples, n_features), got shape {shape}"
        )
    n, d = shape
    min_pts = min(n, (2 * d) + 1)  # +1 to include point itself
    k_nn = min(n, 2 * d)  # (2 * d - 1) from Schubert et al (2017)
    print(f"Based on dimensionality, use min_pts={min_pts}!")

    nn = NearestNeighbors(n_neighbors=min_pts)
    nn.fit(X)
    nn_dist, _ = nn.kneighbors(X, k_nn, return_distance=True)
    # Keep only the distance to the k-th neighbour, largest first.
    nn_dist = np.sort(nn_dist[:, -1])[::-1]

    # Remember whether the figure is ours, instead of probing for an unbound
    # `fig` with try/except NameError later on.
    created_fig = ax is None
    if created_fig:
        fig, ax = plt.subplots()

    ax.plot(nn_dist)
    if eps is not None:
        ax.hlines(eps, 0, len(nn_dist), color="red")

    # Only show if generated inside function
    if created_fig:
        fig.show()

    return min_pts

In [None]:
# Open the training images and masks through the project dataset wrapper.
kelp = KelpNCDataset(
    img_nc_path="data_ncf/train_imgs_fe.nc",
    mask_nc_path="data_ncf/train_masks.ncf",
)
# Chunk along `sample`, then keep only channels 0..4 of the images.
imgs = kelp.imgs.chunk(sample=250).isel(ch=slice(5))
masks = kelp.masks.chunk(sample=1000)

In [None]:
# Display the chunked training image array (channels 0..4) for inspection.
imgs

In [None]:
# Display the chunked training mask array for inspection.
masks

In [None]:
# Turn the per-sample NaN fraction from the quality table into a one-channel
# feature ("nan_frac") that carries the `sample` coordinate of `imgs`.
# NOTE(review): assumes the rows of quality.csv are in the same order as the
# samples in `imgs` - confirm.
df_quality = pd.read_csv("quality.csv", index_col=0)
nans = (
    df_quality["nan_fraction"]
    .to_xarray()
    .rename(index="sample")
    .expand_dims("ch")
    .assign_coords(sample=imgs.sample, ch=["nan_frac"])
)
# Disabled square-root transform of the NaN fraction:
# nans = np.power(nans, 1/2)

nans.plot.hist()

In [None]:
# Reduce every image over dims `i` and `j`, leaving one mean and one standard
# deviation per remaining index.
imgs_mean = imgs.mean(dim=["i", "j"])
imgs_std = imgs.std(dim=["i", "j"])
imgs_mean.sizes, imgs_std.sizes

In [None]:
# Stack mean, std and NaN-fraction features along `ch`, then materialise the
# result in memory.
feature_blocks = [imgs_mean, imgs_std, nans]
imgs_stats = xr.concat(feature_blocks, dim="ch").compute()
imgs_stats

In [None]:
# Knee plot for the training features; the red line marks the chosen eps.
dbscan_knee_plot(X=imgs_stats, eps=0.16)

In [None]:
# Cluster the per-image statistics with DBSCAN.
# Earlier attempt: eps=.13, min_samples=21
clst = DBSCAN(eps=0.16, min_samples=23)
clst = clst.fit(imgs_stats)

In [None]:
# Distinct cluster labels and how many samples carry each of them.
cluster_ids, cluster_sizes = np.unique(clst.labels_, return_counts=True)
cluster_ids, cluster_sizes

In [None]:
# Samples labelled -1 by the clustering are treated as outliers; pull their
# images into memory.
is_outlier = np.equal(clst.labels_, -1)
imgs_outlier = imgs.sel(sample=is_outlier)
imgs_outlier = imgs_outlier.load()
imgs_outlier

In [None]:
# Draw up to 25 distinct outlier samples for visual inspection.
# A seeded generator keeps the selection reproducible across kernel restarts,
# and sampling without replacement avoids showing the same image twice (and
# avoids over-sampling when there are fewer than 25 outliers).
rng = np.random.default_rng(0)
outlier_ids = imgs_outlier.sample.values
random_samples = rng.choice(
    outlier_ids, size=min(25, len(outlier_ids)), replace=False
)
random_samples

In [None]:
# 5x5 grid showing channel 0 of each selected outlier image. Every panel is
# titled with its sample id so an image can be traced back to the dataset.
fig, axarr = plt.subplots(ncols=5, nrows=5, figsize=(20, 20))
panels = axarr.ravel()
for s, ax in zip(random_samples, panels):
    ax.imshow(imgs_outlier.sel(sample=s).isel(ch=0))
    ax.set_title(f"sample {s}")
# Hide panels left empty when fewer than 25 samples were selected.
for ax in panels[len(random_samples):]:
    ax.set_axis_off()

In [None]:
# Persist the boolean outlier mask as a single-column CSV.
outlier_mask = pd.Series(is_outlier)
outlier_mask.to_csv("is_outlier.csv")

In [None]:
# Read the saved mask back; the values are in the column named "0".
saved_mask = pd.read_csv("is_outlier.csv")
saved_mask["0"]

# Application to test data
(Irrelevant in practice, because I need to predict on all test samples anyway.)

In [None]:
# Open the test images (no masks are passed) and keep channels 0..4, chunked
# along `sample`.
kelp_test = KelpNCDataset(img_nc_path="data_ncf/test_imgs_fe.nc")
imgs_test = kelp_test.imgs.chunk({"sample": 250}).isel(ch=slice(5))
imgs_test

In [None]:
# Per-image mean and std over dims `i` and `j` for the test set, stacked along
# `ch`. The NaN-fraction channel is left out here.
test_feature_blocks = [
    imgs_test.mean(dim=["i", "j"]),
    imgs_test.std(dim=["i", "j"]),
    # nans
]
imgs_stats_test = xr.concat(test_feature_blocks, dim="ch").compute()
imgs_stats_test

In [None]:
# fit_predict() re-fits the estimator it is called on. Cluster the test
# features with a fresh DBSCAN carrying the same hyper-parameters, so the
# training fit stored in `clst` (and its `labels_`) is not overwritten and
# earlier cells stay re-runnable.
# NOTE: the test features omit the NaN-fraction channel, so eps/min_samples
# chosen for the training features are reused as-is.
clst_test = DBSCAN(**clst.get_params())
np.unique(clst_test.fit_predict(imgs_stats_test), return_counts=True)