Match-up analysis (Sentinel-2 pixels vs in situ data)

In [1]:
import numpy as np
import xarray as xr
import rasterio
import uav
import os

from scipy.stats import zscore

# NDWI mask filename
ndwi_filename = r'D:\sentinel2-acolite-prod\jupyters\Mar\masks\ndwi.tif'

# Pixels whose NDWI value exceeds 0.1 are flagged as water. The boolean array is
# wrapped together with the raster's affine transform so the extraction functions
# can look the mask up through the uav.GeoreferenceData helpers.
# NOTE(review): src.read() returns every band, i.e. a (bands, rows, cols) array —
# confirm that this is the layout uav.GeoreferenceData expects.
with rasterio.open(ndwi_filename) as src:
    is_water = src.read() > 0.1
    water_mask_tif = uav.GeoreferenceData(is_water, src.transform)

# Tif of one band (ex: 442) for reference
# Only the grid geometry is needed, so it is taken from the dataset metadata
# instead of reading a whole band into memory just to inspect its shape.
with rasterio.open(r'D:\sentinel2-acolite-prod\jupyters\Mar\base_georeference_reduced.tif') as reference:
    transform = reference.transform
    height, width = reference.height, reference.width
    

# Function to extract values - NDWI masked
def match_points_mask_ndwi(netcdf_path, variables, points, clear_outliers : bool = False,
                        kernel_size = 1, method = np.nanmean) -> dict:
    """Sample NDWI-masked satellite variables around each match-up point.

    For every point the nearest grid cell of the NetCDF file is located and
    ``method`` is applied to the square window of ``2 * kernel_size + 1``
    pixels per side centred on it, after every non-water pixel has been set
    to NaN. The window is clipped at the raster borders.

    Relies on the module-level ``water_mask_tif``, ``height`` and ``width``
    and, when ``clear_outliers`` is set, on ``remove_by_zscore``.

    Args:
        netcdf_path: Path of the NetCDF file; it must expose ``x`` and ``y``
            coordinates.
        variables: Names of the variables to sample. Names missing from the
            file produce NaN entries.
        points: Array of shape (n, 2); column 0 is compared with ``dataset.x``
            and column 1 with ``dataset.y``.
        clear_outliers: When True the window values go through
            ``remove_by_zscore`` before being reduced.
        kernel_size: Half-width of the window in pixels (1 gives 3x3).
        method: Reducer applied to the window values.

    Returns:
        A dict with, for each variable, a list holding one reduced value per
        point; ``'x'`` and ``'y'`` holding the two columns of ``points``; and
        ``'<variable>_no_nans'`` holding the number of non-NaN window pixels
        per point.
    """
    def _window_stat(data, x_0, y_0):
        # Clip the lower bounds: a negative start would silently wrap around to the
        # opposite raster edge. Upper bounds are clipped by the slicing itself.
        y_start = max(y_0 - kernel_size, 0)
        x_start = max(x_0 - kernel_size, 0)
        # flatten() always copies, so the in-place outlier removal below cannot
        # alter the masked raster that is shared between points.
        values = data[y_start:y_0 + kernel_size + 1, x_start:x_0 + kernel_size + 1].flatten()

        if clear_outliers:
            values = remove_by_zscore(values)

        return float(method(values)), np.count_nonzero(~np.isnan(values))

    matches_by_variable = {variable : [] for variable in variables} | {'x' : points[:, 0], 'y' : points[:, 1]} \
                            |{f'{variable}_no_nans' : [] for variable in variables}

    with xr.open_dataset(netcdf_path) as dataset:
        x_coords = dataset.x.values
        y_coords = dataset.y.values

        # The grid must come from the NetCDF being sampled (not from the already
        # closed rasterio handle of the NDWI tif) so the mask lines up with the data.
        xs, ys = np.meshgrid(x_coords, y_coords)
        rows, cols = water_mask_tif.get_row_col_by_lon_lat(xs, ys)
        in_shape = water_mask_tif.get_in_shape_mask(rows, cols)
        is_water_mask = water_mask_tif.get_data_by_row_col(rows[in_shape], cols[in_shape])[0].reshape((height, width))

        # Mask each variable once, on an in-memory copy, instead of once per point.
        masked_by_variable = {}
        for variable in variables:
            if variable in dataset:
                data = np.squeeze(dataset[variable].values)
                # NaN can only be stored in floating-point arrays.
                data = data.copy() if np.issubdtype(data.dtype, np.floating) else data.astype(float)
                data[~is_water_mask] = np.nan
                masked_by_variable[variable] = data

        for point in points:
            x_index = np.absolute(x_coords - point[0]).argmin()
            y_index = np.absolute(y_coords - point[1]).argmin()

            for variable in variables:
                if variable in masked_by_variable:
                    value, no_nans = _window_stat(masked_by_variable[variable], x_index, y_index)
                    matches_by_variable[variable].append(float(value))
                    matches_by_variable[f'{variable}_no_nans'].append(int(no_nans))
                else:
                    matches_by_variable[variable].append(np.nan)
                    matches_by_variable[f'{variable}_no_nans'].append(np.nan)

    return matches_by_variable

# Function to extract values - NDWI masked and deepwater
def match_points_mask_ndwi_deepwater(netcdf_path, variables, points, clear_outliers : bool = False,
                        kernel_size = 1, method = np.nanmean, mask_deep = True,
                        deep_water_path = r"D:\sentinel2-acolite-prod\jupyters\Mar\masks\deep_water.nc") -> dict:
    """Sample satellite variables around each point with NDWI and deep-water masking.

    Works like the NDWI-only extraction, but additionally sets to NaN either
    the deep-water pixels (``mask_deep=True``, leaving shallow water) or
    everything outside the deep-water mask (``mask_deep=False``, leaving deep
    water only). The window is clipped at the raster borders.

    The deep-water mask is the variable of the ``deep_water_path`` file whose
    name contains the first four characters of the parent directory name of
    ``netcdf_path``; when several variables match, the last one is used.

    Relies on the module-level ``water_mask_tif``, ``height`` and ``width``
    and, when ``clear_outliers`` is set, on ``remove_by_zscore``.

    Args:
        netcdf_path: Path of the NetCDF file; it must expose ``x`` and ``y``
            coordinates.
        variables: Names of the variables to sample. Names missing from the
            file produce NaN entries.
        points: Array of shape (n, 2); column 0 is compared with ``dataset.x``
            and column 1 with ``dataset.y``.
        clear_outliers: When True the window values go through
            ``remove_by_zscore`` before being reduced.
        kernel_size: Half-width of the window in pixels (1 gives 3x3).
        method: Reducer applied to the window values.
        mask_deep: True discards deep-water pixels, False discards the rest.
        deep_water_path: NetCDF file holding the deep-water masks.

    Returns:
        A dict with, for each variable, a list holding one reduced value per
        point; ``'x'`` and ``'y'`` holding the two columns of ``points``; and
        ``'<variable>_no_nans'`` holding the number of non-NaN window pixels
        per point.

    Raises:
        KeyError: If no variable of the deep-water file matches the name
            prefix derived from ``netcdf_path``.
    """
    def _window_stat(data, x_0, y_0):
        # Clip the lower bounds: a negative start would silently wrap around to the
        # opposite raster edge. Upper bounds are clipped by the slicing itself.
        y_start = max(y_0 - kernel_size, 0)
        x_start = max(x_0 - kernel_size, 0)
        # flatten() always copies, so the in-place outlier removal below cannot
        # alter the masked raster that is shared between points.
        values = data[y_start:y_0 + kernel_size + 1, x_start:x_0 + kernel_size + 1].flatten()

        if clear_outliers:
            values = remove_by_zscore(values)

        return float(method(values)), np.count_nonzero(~np.isnan(values))

    matches_by_variable = {variable : [] for variable in variables} | {'x' : points[:, 0], 'y' : points[:, 1]} \
                            |{f'{variable}_no_nans' : [] for variable in variables}

    name = os.path.basename(os.path.dirname(netcdf_path))[:4]
    with xr.open_dataset(deep_water_path) as deep_file:
        matching = [var for var in deep_file if name in var]
        if not matching:
            raise KeyError(f'No deep-water mask variable containing {name!r} found in {deep_water_path}')
        # Load the values while the file is still open, and force a boolean dtype so
        # that "~" below is a logical inversion rather than a bitwise one.
        # NOTE(review): assumes non-zero means "deep water"; NaN would also count as True — confirm.
        excluded = np.squeeze(deep_file[matching[-1]].values).astype(bool)

    if not mask_deep:
        excluded = ~excluded

    with xr.open_dataset(netcdf_path) as dataset:
        x_coords = dataset.x.values
        y_coords = dataset.y.values

        # The grid must come from the NetCDF being sampled (not from the already
        # closed rasterio handle of the NDWI tif) so the mask lines up with the data.
        xs, ys = np.meshgrid(x_coords, y_coords)
        rows, cols = water_mask_tif.get_row_col_by_lon_lat(xs, ys)
        in_shape = water_mask_tif.get_in_shape_mask(rows, cols)
        is_water_mask = water_mask_tif.get_data_by_row_col(rows[in_shape], cols[in_shape])[0].reshape((height, width))

        # Mask each variable once, on an in-memory copy, instead of once per point.
        masked_by_variable = {}
        for variable in variables:
            if variable in dataset:
                data = np.squeeze(dataset[variable].values)
                # NaN can only be stored in floating-point arrays.
                data = data.copy() if np.issubdtype(data.dtype, np.floating) else data.astype(float)
                data[~is_water_mask] = np.nan
                data[excluded] = np.nan
                masked_by_variable[variable] = data

        for point in points:
            x_index = np.absolute(x_coords - point[0]).argmin()
            y_index = np.absolute(y_coords - point[1]).argmin()

            for variable in variables:
                if variable in masked_by_variable:
                    value, no_nans = _window_stat(masked_by_variable[variable], x_index, y_index)
                    matches_by_variable[variable].append(float(value))
                    matches_by_variable[f'{variable}_no_nans'].append(int(no_nans))
                else:
                    matches_by_variable[variable].append(np.nan)
                    matches_by_variable[f'{variable}_no_nans'].append(np.nan)

    return matches_by_variable


# Function to remove outliers
def remove_by_zscore(data : np.ndarray, threshold : float = 3.0) -> np.ndarray:
    """Replace z-score outliers of ``data`` with NaN, in place.

    Z-scores are computed ignoring NaN entries. With the default
    ``nan_policy='propagate'`` a single NaN turns every z-score into NaN, so
    no outlier would ever be detected in a partially masked sample.

    Note that with population z-scores the largest attainable ``|z|`` among
    ``n`` finite values is ``sqrt(n - 1)``: the default threshold of 3 can
    only trigger when at least 11 finite values are present.

    Args:
        data: Floating-point array; it is modified in place.
        threshold: Absolute z-score above which a value is discarded.

    Returns:
        The same array object, with outliers set to NaN.
    """
    if data.size == 0:
        return data

    # NaN z-scores (NaN inputs or zero variance) compare as False and are left untouched.
    with np.errstate(invalid = 'ignore'):
        abs_z_scores = np.abs(zscore(data, nan_policy = 'omit'))
        data[abs_z_scores > threshold] = np.nan

    return data

Extraction

In [2]:
import datetime
import os
import warnings
from glob import glob

import pandas as pd
from pyproj import Transformer


satellite_dates = []
filenames = []
# Load list of filenames to extract. The acquisition date is the first '_'-separated
# token (YYYYMMDD) of each file's parent folder name. The listing is sorted so that
# ties between equally distant images are always resolved the same way.
for filename in sorted(glob(r'D:\NRT-Sentinel2\outputs\FRP_Lagoon\section_1\FRP\*\*.nc')):
    date = datetime.datetime.strptime(os.path.basename(os.path.dirname(filename)).split('_')[0], '%Y%m%d')
    satellite_dates.append(date)
    filenames.append(filename)

if not filenames:
    # Fail with an explicit message instead of an obscure argmin error further down.
    raise FileNotFoundError('No satellite NetCDF file matched the search pattern')

# Load In-Situ data
in_situ = pd.read_excel(r'D:\sentinel2-acolite-prod\jupyters\Mar\Dados Historicos Lagoa_2025.xlsx')

# Compute nearest satellite images for each in-situ measurement
nearest_satellite_dates = []
nearest_satellite_diff = []
nearest_filenames = []
for date in in_situ.date:
    diff = [abs(date - satellite_date) for satellite_date in satellite_dates]
    nearest = int(np.argmin(diff))
    nearest_satellite_dates.append(satellite_dates[nearest])
    nearest_filenames.append(filenames[nearest])
    # Whole days only: any fractional part of the time difference is dropped.
    nearest_satellite_diff.append(diff[nearest].days)


match_up_dates = {
    'in_situ' : in_situ.date,
    'satellite' : nearest_satellite_dates,
    'filename' : nearest_filenames,
    'diference' : nearest_satellite_diff,
    'id' : in_situ.id,
    'lon' : in_situ.Longitude,
    'lat' : in_situ.Latitude
}


# Only use images with 2 days of difference
match_up_dates_df = pd.DataFrame(match_up_dates)
match_up_dates_df = match_up_dates_df[match_up_dates_df.diference <= 2]

latlon_projection : str = 'epsg:4326'
# NOTE: EPSG:32722 is WGS 84 / UTM zone 22S, not Pseudo-Mercator; the variable name
# is kept unchanged because other cells may reference it.
pseudo_mercator_projection : str = 'epsg:32722'
transformer : Transformer = Transformer.from_crs(latlon_projection, pseudo_mercator_projection, always_xy = True)

utm_x, utm_y = transformer.transform(match_up_dates_df.lon, match_up_dates_df.lat)

# One (x, y) row per retained in-situ measurement.
points = np.column_stack((utm_x, utm_y))
variables =  ['chl_re_mishra', 'SPM_Nechad2016_665']

Match-up for deep and shallow water together (NDWI water mask only)

In [3]:
# Extract variables data for each satellite image
# NOTE(review): the merges below use pandas' default key, i.e. every column name the
# two frames share (presumably only 'id'); if 'id' is not unique per row the merge
# duplicates rows — verify against the spreadsheet.
merged_df = pd.merge(left = match_up_dates_df, right = in_situ.loc[match_up_dates_df.index])
for method_name, method in [('mean', np.nanmean), ('median', np.nanmedian)]:
    data_by_variable = {f'{variable}_{method_name}' : [] for variable in variables}
    for i, netcdf_file in enumerate(match_up_dates_df.filename):
        # Keep the extraction best-effort (a failing image yields NaN), but report the
        # failure instead of hiding it; only the call itself is guarded so that every
        # column always receives exactly one value per point.
        try:
            res = match_points_mask_ndwi(netcdf_file, variables, points[i:i+1], method = method)
        except Exception as error:
            warnings.warn(f'Match-up extraction failed for {netcdf_file}: {error!r}')
            res = None
        for variable in variables:
            value = np.nan if res is None else res[variable][0]
            data_by_variable[f'{variable}_{method_name}'].append(value)
    df = pd.DataFrame(data_by_variable, index = match_up_dates_df.index)
    df['id'] = match_up_dates_df['id']
    merged_df = pd.merge(left = merged_df, right = df)


out_folder = r'D:\sentinel2-acolite-prod\jupyters\Mar\results\shallow_and_deep_water\csv\match_up'
os.makedirs(out_folder, exist_ok = True)

merged_df.to_excel(os.path.join(out_folder, 'Dados Historicos Lagoa_2025.xlsx'), index = False)
merged_df.to_csv(os.path.join(out_folder, 'Dados Historicos Lagoa_2025.csv'), index = False)

Match-up for deep water only (pixels outside the deep-water mask are discarded)

In [4]:
# Extract variables data for each satellite image
# NOTE(review): the merges below use pandas' default key, i.e. every column name the
# two frames share (presumably only 'id'); if 'id' is not unique per row the merge
# duplicates rows — verify against the spreadsheet.
merged_df = pd.merge(left = match_up_dates_df, right = in_situ.loc[match_up_dates_df.index])
for method_name, method in [('mean', np.nanmean), ('median', np.nanmedian)]:
    data_by_variable = {f'{variable}_{method_name}' : [] for variable in variables}
    for i, netcdf_file in enumerate(match_up_dates_df.filename):
        # Keep the extraction best-effort (a failing image yields NaN), but report the
        # failure instead of hiding it; only the call itself is guarded so that every
        # column always receives exactly one value per point.
        try:
            # mask_deep = False discards everything outside the deep-water mask.
            res = match_points_mask_ndwi_deepwater(netcdf_file, variables, points[i:i+1], method = method, mask_deep = False)
        except Exception as error:
            warnings.warn(f'Match-up extraction failed for {netcdf_file}: {error!r}')
            res = None
        for variable in variables:
            value = np.nan if res is None else res[variable][0]
            data_by_variable[f'{variable}_{method_name}'].append(value)
    df = pd.DataFrame(data_by_variable, index = match_up_dates_df.index)
    df['id'] = match_up_dates_df['id']
    merged_df = pd.merge(left = merged_df, right = df)


out_folder = r'D:\sentinel2-acolite-prod\jupyters\Mar\results\deep_water\csv\match_up'
os.makedirs(out_folder, exist_ok = True)

merged_df.to_excel(os.path.join(out_folder, 'Dados Historicos Lagoa_2025.xlsx'), index = False)
merged_df.to_csv(os.path.join(out_folder, 'Dados Historicos Lagoa_2025.csv'), index = False)

Match-up for shallow water only (deep-water pixels are masked out)

In [5]:
# Extract variables data for each satellite image
# NOTE(review): the merges below use pandas' default key, i.e. every column name the
# two frames share (presumably only 'id'); if 'id' is not unique per row the merge
# duplicates rows — verify against the spreadsheet.
merged_df = pd.merge(left = match_up_dates_df, right = in_situ.loc[match_up_dates_df.index])
for method_name, method in [('mean', np.nanmean), ('median', np.nanmedian)]:
    data_by_variable = {f'{variable}_{method_name}' : [] for variable in variables}
    for i, netcdf_file in enumerate(match_up_dates_df.filename):
        # Keep the extraction best-effort (a failing image yields NaN), but report the
        # failure instead of hiding it; only the call itself is guarded so that every
        # column always receives exactly one value per point.
        try:
            # mask_deep = True discards the deep-water pixels.
            res = match_points_mask_ndwi_deepwater(netcdf_file, variables, points[i:i+1], method = method, mask_deep = True)
        except Exception as error:
            warnings.warn(f'Match-up extraction failed for {netcdf_file}: {error!r}')
            res = None
        for variable in variables:
            value = np.nan if res is None else res[variable][0]
            data_by_variable[f'{variable}_{method_name}'].append(value)
    df = pd.DataFrame(data_by_variable, index = match_up_dates_df.index)
    df['id'] = match_up_dates_df['id']
    merged_df = pd.merge(left = merged_df, right = df)


out_folder = r'D:\sentinel2-acolite-prod\jupyters\Mar\results\shallow_water\csv\match_up'
os.makedirs(out_folder, exist_ok = True)

merged_df.to_excel(os.path.join(out_folder, 'Dados Historicos Lagoa_2025.xlsx'), index = False)
merged_df.to_csv(os.path.join(out_folder, 'Dados Historicos Lagoa_2025.csv'), index = False)