### Importing Libraries

In [1]:
import xarray as xr
import pandas as pd
import os
from tqdm import tqdm
import warnings

### Loading, Subsetting, and Saving Data

In [2]:
# --- File locations ---------------------------------------------------------
# Root directory that is searched for the per-day input folders.
BASE_PATH = "Data/Sea Surface Currents"
# Destination of the merged and spatially subset NetCDF file.
OUTPUT_FILE = "Data/Processed_SSC_Data.nc"

# --- Time window (sampled every 12 hours by the loader) ---------------------
START_DATE, END_DATE = '2021-01-01', '2023-11-12'

# --- Bounding box applied to the `lon` / `lat` coordinates -------------------
LON_MIN, LON_MAX = 13.9, 14.81
LAT_MIN, LAT_MAX = 35.6, 36.3

def _select_nearest_file(files, target_time):
    """Pick the file whose HHMM stamp lies closest to ``target_time``.

    The stamp is read from the first four characters of the last
    ``_``-separated token of each file name and is interpreted as a time of
    day on the *same calendar day* as ``target_time``.

    Parameters
    ----------
    files : iterable of str
        File names found in one day directory.
    target_time : pandas.Timestamp
        The time slot a file is wanted for.

    Returns
    -------
    str or None
        The best matching file name, or ``None`` when no name carries a valid
        HHMM stamp (including when ``files`` is empty).
    """
    day_start = target_time.normalize()
    best_name = None
    best_gap = None

    # Sorted so that ties are resolved deterministically, independent of the
    # arbitrary order returned by os.listdir.
    for name in sorted(files):
        stamp = name.split('_')[-1][:4]
        if len(stamp) != 4 or not stamp.isdigit():
            continue  # not a time-stamped data file
        hours, minutes = int(stamp[:2]), int(stamp[2:])
        if hours > 23 or minutes > 59:
            continue

        # Anchor the stamp on the target's own date. Parsing it with
        # format='%H%M' alone yields a 1900-01-01 timestamp, which makes the
        # distance to any modern target meaningless.
        file_time = day_start + pd.Timedelta(hours=hours, minutes=minutes)
        gap = abs(file_time - target_time)
        if best_gap is None or gap < best_gap:
            best_name, best_gap = name, gap

    return best_name


def load_and_select_ssc_data():
    """Load one file per 12-hourly time slot and merge them along ``time``.

    For every slot between ``START_DATE`` and ``END_DATE`` the directory
    ``BASE_PATH/SSC_MaltaSicily_<year>/<year>/<month>/<day>`` is searched for
    the file whose HHMM stamp is nearest to the slot. The file's ``time``
    coordinate is replaced by the slot timestamp. Slots whose directory is
    missing or contains no usable file are skipped and later filled by
    interpolating the merged dataset onto the full 12-hourly range.

    Returns
    -------
    xarray.Dataset or None
        The merged dataset on the full 12-hourly time axis, or ``None`` when
        no file could be loaded at all.
    """
    date_range = pd.date_range(START_DATE, END_DATE, freq='12h')
    per_slot_datasets = []

    for target_time in tqdm(date_range, desc="Processing files"):
        day_path = os.path.join(
            BASE_PATH,
            f"SSC_MaltaSicily_{target_time.year}",
            str(target_time.year),
            target_time.strftime('%m'),
            target_time.strftime('%d'),
        )
        if not os.path.isdir(day_path):
            continue

        nearest_file = _select_nearest_file(os.listdir(day_path), target_time)
        if nearest_file is None:
            continue

        with xr.open_dataset(os.path.join(day_path, nearest_file)) as ds:
            # Ensure the time coordinate matches the slot, then pull the
            # values into memory so nothing depends on the file once the
            # `with` block has closed it.
            per_slot_datasets.append(ds.assign_coords(time=[target_time]).load())

    if not per_slot_datasets:
        print("No data loaded.")
        print("="*125)
        return None

    print("All files loaded. Starting concatenation and interpolation...")
    print("="*125)
    combined_ds = xr.concat(per_slot_datasets, dim='time')
    # Interpolate to fill missing time points
    return combined_ds.interp(time=date_range)

def subset_data_for_polygon(dataset):
    """Return the part of ``dataset`` inside the configured lon/lat window.

    The selection is a label-based slice on the ``lon`` and ``lat``
    coordinates using ``LON_MIN``/``LON_MAX`` and ``LAT_MIN``/``LAT_MAX``,
    i.e. a rectangular box rather than an arbitrary polygon.
    """
    lon_window = slice(LON_MIN, LON_MAX)
    lat_window = slice(LAT_MIN, LAT_MAX)
    return dataset.sel(lon=lon_window, lat=lat_window)

def save_to_netcdf(dataset):
    """Write ``dataset`` to ``OUTPUT_FILE`` and report progress on stdout."""
    rule = "="*125

    # Each status message is followed by a separator line.
    print("Saving dataset to NetCDF file...", rule, sep="\n")
    dataset.to_netcdf(OUTPUT_FILE)
    print("Dataset successfully saved.", rule, sep="\n")

# Pipeline driver: load and merge the files, cut out the study area, save it.
combined_ds = load_and_select_ssc_data()

if combined_ds is None:
    # The loader returns None when no input file was found.
    print("Data processing was unsuccessful. Please check your data files and paths.")
    print("="*125)
else:
    print("Subsetting dataset for the specified polygon area...")
    print("="*125)
    subset_ds = subset_data_for_polygon(combined_ds)
    save_to_netcdf(subset_ds)
    print("Processing completed successfully.")
    print("="*125)

Processing files: 100%|██████████| 2091/2091 [00:27<00:00, 77.18it/s] 


All files loaded. Starting concatenation and interpolation...
Subsetting dataset for the specified polygon area...
Saving dataset to NetCDF file...
Dataset successfully saved.
Processing completed successfully.


### Ensuring that the merged file is correct

In [3]:
# Calculate the expected number of time points
date_range = pd.date_range(start=START_DATE, end=END_DATE, freq='12h')
expected_time_points = len(date_range)

# Open the merged dataset. The context manager closes the file after
# inspection, even when one of the assertions below fails.
with xr.open_dataset(OUTPUT_FILE) as ds:
    # Print section separator
    print("=" * 125)
    print("Sea Surface Currents Dataset Information")
    print("=" * 125)

    # Now, print the structure and check the dataset
    print("\nDataset Dimensions:")
    print(ds.dims)
    print("\nDataset Coordinates:")
    print(ds.coords)
    print("\nData Variables in the Dataset:")
    print(ds.data_vars)
    print("\nAttributes (Metadata) in the Dataset:")
    print(ds.attrs)

    # Check that the time dimension is as expected
    actual_time_points = ds.sizes['time']
    print("\nExpected Time Points:", expected_time_points)
    print("Actual Time Points:", actual_time_points)
    assert actual_time_points == expected_time_points, (
        f"expected {expected_time_points} time points, found {actual_time_points}"
    )

    # Report the lat/lon extent so it can be compared with the requested bounds
    lat_min, lat_max = ds['lat'].min().values, ds['lat'].max().values
    lon_min, lon_max = ds['lon'].min().values, ds['lon'].max().values
    print("\nLatitude Range in the Dataset:", lat_min, "to", lat_max)
    print("Longitude Range in the Dataset:", lon_min, "to", lon_max)

    # Check for expected variables
    assert 'u' in ds.variables, "u variable is missing from the dataset"
    assert 'v' in ds.variables, "v variable is missing from the dataset"

    print("")
    print("=" * 125)

Sea Surface Currents Dataset Information

Dataset Dimensions:

Dataset Coordinates:
Coordinates:
  * lat      (lat) float32 35.74 35.77 35.79 35.81 ... 36.21 36.23 36.26 36.28
  * lon      (lon) float32 13.92 13.96 14.0 14.04 ... 14.65 14.69 14.73 14.77
  * time     (time) datetime64[ns] 2021-01-01 2021-01-01T12:00:00 ... 2023-11-12

Data Variables in the Dataset:
Data variables:
    u        (time, lat, lon) float64 ...
    v        (time, lat, lon) float64 ...
    stdu     (time, lat, lon) float64 ...
    stdv     (time, lat, lon) float64 ...
    cov      (time, lat, lon) float64 ...
    velo     (time, lat, lon) float64 ...
    head     (time, lat, lon) float64 ...

Attributes (Metadata) in the Dataset:
{'NC_GLOBAL.Title': 'Near-Real time Surface Ocean Velocity', 'NC_GLOBAL.origin': 'BARK (measured);POZZ (measured);', 'NC_GLOBAL.source': 'HF Radar Derived Surface Currents obtained from CODAR combine method', 'NC_GLOBAL.history': '08-Jun-2023 14:46:23', 'NC_GLOBAL.grid_type': 'REGULA