# SWOT PIXC pre processing

In [11]:
import xarray as xr
import pandas as pd
import numpy as np

### One file test

In [13]:
# ---------- Filepath ----------
aufeis_test_tile_filepath = "../../../../shared_space/SWOT_Aufeis/SWOT_PIXC_data/SWOT_L2_HR_PIXC_033_487_278R_20250605T070551_20250605T070602_PID0_01.nc"

# NetCDF structure check
import netCDF4
data_structure = netCDF4.Dataset(aufeis_test_tile_filepath)
# print(data_structure) # shows 'groups: pixel_cloud, tvp, noise'

# file pointer
fp = xr.open_dataset(aufeis_test_tile_filepath, group = 'pixel_cloud')
# print(fp)
# print(fp.attrs)

In [14]:
# Extract the variables we want into 1D numpy arrays
def getvar(name):
    return fp[name].values.ravel() if name in fp else np.nan

# Build a Pandas DataFrame
SWOT_Points = pd.DataFrame({
    "longitude": getvar("longitude"),
    "latitude": getvar("latitude"),
    "height": getvar("height"),
    "phase_noise_std": getvar("phase_noise_std"),
    "dheight": getvar("dheight_dphase"),
    "class": getvar("classification"),
    "classqual": getvar("classification"),
    "bright_land_flag": getvar("bright_land_flag"),
    "ancillary_surface_classification_flag": getvar("ancillary_surface_classification_flag"),
    "waterfrac": getvar("water_frac"),
    "waterfrac_uncert": getvar("water_frac_uncert"),
    "prior_water_prob": getvar("prior_water_prob"),
    "geolocqual": getvar("geolocation_qual"),
    "sig0": getvar("sig0"),
    "sig0_uncert": getvar("sig0_uncert"),
    "sig0_qual": getvar("sig0_qual"),
    "crosstrack": getvar("cross_track"),
    "pixel_area": getvar("pixel_area"),
    "darea_dheight": getvar("darea_dheight"),
    # elevation corrections
    "geoid": getvar("geoid"),
    "solid_tide": getvar("solid_earth_tide"),
    "load_tide": getvar("load_tide_fes"),
    "pole_tide": getvar("pole_tide")
})

# Derive elevation & height uncertainty
SWOT_Points["height_uncert"] = SWOT_Points["phase_noise_std"] * SWOT_Points["dheight"]
SWOT_Points["geoid_correction"] = SWOT_Points["geoid"] - SWOT_Points["solid_tide"] - SWOT_Points["load_tide"] - SWOT_Points["pole_tide"]
SWOT_Points["elevation"] = SWOT_Points["height"] - SWOT_Points["geoid_correction"]

# Drop any empty rows
SWOT_Points = SWOT_Points.dropna(how="all")

# Quality filtering
geolocqual_problem_bits = {
    4, 4101, 5, 6, 4100, 4102, 524292, 524293, 524294, 524295,
    528389, 528390, 7, 528388, 16777220, 17301508, 17305604,
    528391, 4103
}

SWOT_Points = SWOT_Points[
    (~SWOT_Points["geolocqual"].isin(geolocqual_problem_bits)) &
    (SWOT_Points["crosstrack"].abs().between(10000, 60000))]

print(SWOT_Points.head())
fp.close()

      longitude   latitude      height  phase_noise_std   dheight  class  \
211 -143.098029  68.201372  571.506653         0.117613  0.744377    6.0   
212 -143.096199  68.200914  571.323730         0.110314  0.758199    6.0   
213 -143.093815  68.200317  571.329895         0.131377  0.776198    3.0   
285 -142.934672  68.160307  572.359375         0.109845  1.980290    3.0   
286 -142.934076  68.160156  572.105469         0.061149  1.984812    3.0   

     classqual  bright_land_flag  ancillary_surface_classification_flag  \
211        6.0               0.0                                    1.0   
212        6.0               0.0                                    1.0   
213        3.0               0.0                                    1.0   
285        3.0               0.0                                    1.0   
286        3.0               0.0                                    1.0   

     waterfrac     ...        crosstrack   pixel_area  darea_dheight  \
211   0.888415     .

In [15]:
# Need to add metadata attributes

# Get the attributes of interest from the data_structure
cycle_number = data_structure.cycle_number
pass_number = data_structure.pass_number
tile_number = data_structure.tile_number
time_granule_start = data_structure.time_granule_start

# Add these as new columns to the SWOT_Points DataFrame
SWOT_Points["cycle_number"] = cycle_number
SWOT_Points["pass_number"] = pass_number
SWOT_Points["tile_number"] = tile_number
SWOT_Points["time"] = time_granule_start # just taking start time since it's a diff of seconds

In [None]:
# need to add cycle, pass, time to SWOT_Points df
# for ML:
# longitude, latitude, NORMALIZED to each tile elevation, phase_noise_std, sig0
# crosstrack, maybe cycle & pass??

### Batch processing

In [None]:
import glob
import os
import traceback
from datetime import datetime

import netCDF4
import xarray as xr
import numpy as np
import pandas as pd

# OPTIONAL for nicer progress bars; fallback harmless if not installed
try:
    from tqdm import tqdm
except Exception:
    tqdm = lambda x: x

# directory & filename pattern
base_dir = "../../../../shared_space/SWOT_Aufeis/SWOT_PIXC_data"
pattern = os.path.join(base_dir, "SWOT_L2_HR_PIXC_*.nc")

# target_crs
# target_crs = "EPSG:32606"   # UTM zone 6N

# same set of geolocqual "problem bits" you used
geolocqual_problem_bits = {
    4, 4101, 5, 6, 4100, 4102, 524292, 524293, 524294, 524295,
    528389, 528390, 7, 528388, 16777220, 17301508, 17305604,
    528391, 4103
}

# list of variables to extract from the pixel_cloud group
vars_to_extract = [
    "longitude", "latitude", "height", "phase_noise_std", "dheight_dphase",
    "classification", "bright_land_flag", "ancillary_surface_classification_flag",
    "water_frac", "water_frac_uncert", "prior_water_prob",
    "geolocation_qual", "sig0", "sig0_uncert", "sig0_qual",
    "cross_track", "pixel_area", "darea_dheight",
    "geoid", "solid_earth_tide", "load_tide_fes", "pole_tide"
]

# helper to safely get xr.DataArray -> 1D numpy (ravel) or fill with NaNs if missing
def xr_getvar(xds, name):
    if name in xds:
        da = xds[name]
        # ensure numeric dtype and flatten (ravel) to 1D
        vals = da.values
        if np.ma.isMaskedArray(vals):
            vals = vals.filled(np.nan)
        return vals.ravel()
    else:
        # return array of NaNs with length equal to first dimension if possible
        # but we cannot know the length here, so return np.nan and let pandas handle broadcasting
        return np.nan

def process_single_file(nc_path, geolocqual_problem_bits=geolocqual_problem_bits):
    """Process one SWOT PIXC file and return a DataFrame (or None on error)."""
    try:
        # read global attrs with netCDF4 (for metadata)
        ds_root = netCDF4.Dataset(nc_path)
        # open pixel_cloud group with xarray (convenient indexing/array ops)
        xr_fp = xr.open_dataset(nc_path, group="pixel_cloud", decode_times=False)

        # build data dict for DataFrame
        data = {}
        # map variables to your intended DataFrame column names (matching your earlier code)
        mapping = {
            "longitude": "longitude",
            "latitude": "latitude",
            "height": "height",
            "phase_noise_std": "phase_noise_std",
            "dheight_dphase": "dheight",
            "classification": "class",
            "bright_land_flag": "bright_land_flag",
            "ancillary_surface_classification_flag": "ancillary_surface_classification_flag",
            "water_frac": "waterfrac",
            "water_frac_uncert": "waterfrac_uncert",
            "prior_water_prob": "prior_water_prob",
            "geolocation_qual": "geolocqual",
            "sig0": "sig0",
            "sig0_uncert": "sig0_uncert",
            "sig0_qual": "sig0_qual",
            "cross_track": "crosstrack",
            "pixel_area": "pixel_area",
            "darea_dheight": "darea_dheight",
            "geoid": "geoid",
            "solid_earth_tide": "solid_tide",
            "load_tide_fes": "load_tide",
            "pole_tide": "pole_tide"
        }

        # extract each variable; result may be np.nan if missing
        for xr_name, col_name in mapping.items():
            data[col_name] = xr_getvar(xr_fp, xr_name)

        # Build DataFrame
        SWOT_df = pd.DataFrame(data)

        # if phase_noise column missing name mismatch handling:
        # your earlier code used "phase_noise_std" correctly; keep that
        # compute derived fields (guard against missing arrays)
        if "phase_noise_std" in SWOT_df.columns and "dheight" in SWOT_df.columns:
            SWOT_df["height_uncert"] = SWOT_df["phase_noise_std"] * SWOT_df["dheight"]
        else:
            SWOT_df["height_uncert"] = np.nan

        # geoid correction and elevation
        # ensure columns exist (fill with nan if not)
        for c in ["geoid", "solid_tide", "load_tide", "pole_tide", "height"]:
            if c not in SWOT_df.columns:
                SWOT_df[c] = np.nan

        SWOT_df["geoid_correction"] = (
            SWOT_df["geoid"] - SWOT_df["solid_tide"] - SWOT_df["load_tide"] - SWOT_df["pole_tide"]
        )
        SWOT_df["elevation"] = SWOT_df["height"] - SWOT_df["geoid_correction"]

        # Add the classification qual / duplicate columns (you previously used classqual from classification)
        if "class" in SWOT_df.columns:
            SWOT_df["classqual"] = SWOT_df["class"]
        else:
            SWOT_df["class"] = np.nan
            SWOT_df["classqual"] = np.nan

        # Drop rows that are completely empty
        SWOT_df = SWOT_df.dropna(how="all")

        # Apply quality filtering (only if geolocqual and crosstrack exist)
        if "geolocqual" in SWOT_df.columns and "crosstrack" in SWOT_df.columns:
            SWOT_df = SWOT_df[
                (~SWOT_df["geolocqual"].isin(geolocqual_problem_bits)) &
                (SWOT_df["crosstrack"].abs().between(10000, 60000))
            ]

        # extract root-level metadata and attach as constant columns
        # If an attribute is missing, fill with np.nan
        def safe_attr(ds, name):
            return getattr(ds, name, np.nan)

        SWOT_df["cycle_number"] = safe_attr(ds_root, "cycle_number")
        SWOT_df["pass_number"] = safe_attr(ds_root, "pass_number")
        SWOT_df["tile_number"] = safe_attr(ds_root, "tile_number")
        # time_granule_start may be a string; keep original string for traceability
        SWOT_df["time_granule_start"] = safe_attr(ds_root, "time_granule_start")

        # record file info
        SWOT_df["source_file"] = os.path.basename(nc_path)
        SWOT_df["source_filepath"] = os.path.abspath(nc_path)

        # close datasets
        xr_fp.close()
        ds_root.close()

        return SWOT_df

    except Exception as e:
        # print traceback and return None so the loop can continue
        print(f"Error processing {nc_path}: {e}")
        traceback.print_exc()
        try:
            xr_fp.close()
        except Exception:
            pass
        try:
            ds_root.close()
        except Exception:
            pass
        return None


# find all matching files
all_files = sorted(glob.glob(pattern))
print(f"Found {len(all_files)} files matching pattern.")

# process each file and gather DataFrames
dfs = []
for f in tqdm(all_files):
    df = process_single_file(f)
    if df is not None and not df.empty:
        dfs.append(df)
    else:
        print(f"Skipping empty/failed result for {f}")

# Concatenate all into one master DataFrame
if dfs:
    SWOT_Points_all = pd.concat(dfs, ignore_index=True)
    print("Combined DataFrame shape:", SWOT_Points_all.shape)
else:
    SWOT_Points_all = pd.DataFrame()
    print("No valid data found in files.")



  0%|          | 0/137 [00:00<?, ?it/s]

Found 137 files matching pattern.


  1%|          | 1/137 [00:02<05:25,  2.39s/it]