### Importing Libraries

In [None]:
import xarray as xr
import pandas as pd
import os
from tqdm import tqdm

### Loading, Subsetting, and Saving Data

In [None]:
# Global Variables
# Paths are built with os.path.join so they resolve on any OS; the original
# hard-coded backslashes only worked as separators on Windows.
BASE_PATH = os.path.join("Data", "Sea Surface Currents")  # root of the raw per-year folders
OUTPUT_FILE = os.path.join("Data", "Processed_SSC_Data.nc")  # merged NetCDF output
# First and last calendar day (inclusive) whose folders are scanned.
START_DATE = '2020-02-25'
END_DATE = '2023-12-16'
# Longitude / latitude bounds of the area of interest.
LON_MIN = 13.9
LON_MAX = 14.81
LAT_MIN = 35.6
LAT_MAX = 36.3

def _collect_ssc_file_paths(dates):
    """Return the paths of all ``.nc`` files found in the day folders of *dates*.

    Each date maps to the folder
    ``BASE_PATH/SSC_MaltaSicily_<year>/<year>/<mm>/<dd>``. Dates whose folder
    does not exist are skipped. Within a folder the file names are sorted so
    the result is deterministic (``os.listdir`` returns entries in arbitrary
    order).
    """
    file_paths = []
    for single_date in dates:
        day_path = os.path.join(
            BASE_PATH,
            f"SSC_MaltaSicily_{single_date.year}",
            str(single_date.year),
            single_date.strftime('%m'),
            single_date.strftime('%d'),
        )
        if not os.path.isdir(day_path):
            continue
        file_paths.extend(
            os.path.join(day_path, file_name)
            for file_name in sorted(os.listdir(day_path))
            if file_name.endswith('.nc')
        )
    return file_paths


def load_and_merge_ssc_data():
    """Load every NetCDF file between START_DATE and END_DATE and concatenate them.

    Files are visited day by day and, within a day, in sorted file-name order
    (presumably chronological -- verify against the file naming scheme), then
    concatenated along the ``time`` dimension.

    Returns:
        The concatenated ``xarray.Dataset``, or ``None`` when no ``.nc`` file
        was found for any date in the range.
    """
    date_range = pd.date_range(START_DATE, END_DATE)

    # Walk the directory tree once; the list both sizes the progress bar and
    # drives the loading loop.
    file_paths = _collect_ssc_file_paths(date_range)

    # load_dataset reads each file into memory and closes it straight away, so
    # no file handle is left open for the lifetime of the merged dataset.
    merged_dataset = [
        xr.load_dataset(file_path)
        for file_path in tqdm(file_paths, desc="Merging files", unit="file")
    ]

    print("Finished loading all files. Starting concatenation process...")
    print("="*125)
    print("\n")

    if not merged_dataset:
        print("No data loaded.")
        print("="*125)
        print("\n")
        return None

    combined_ds = xr.concat(merged_dataset, dim='time')
    print("Concatenation complete. Now saving to file...")
    print("="*125)
    print("\n")

    return combined_ds

def save_to_netcdf(dataset):
    """Write *dataset* to the NetCDF file at ``OUTPUT_FILE``.

    A status banner is printed before and after the write.
    """
    separator = "=" * 125

    # Each banner is: message line, separator line, then two blank lines.
    print(f"Attempting to save the merged dataset to {OUTPUT_FILE}...", separator, "\n", sep="\n")

    dataset.to_netcdf(OUTPUT_FILE)

    print(f"Dataset successfully saved to {OUTPUT_FILE}", separator, "\n", sep="\n")

# Execution block: merge the raw files, then subset the result and save it.
combined_ds = load_and_merge_ssc_data()

if combined_ds is None:
    # The loader found no files for the configured date range.
    print("Data processing was unsuccessful!")
else:
    print("Subsetting dataset for the specified polygon area...", "=" * 125, "\n", sep="\n")
    # NOTE(review): subset_data_for_polygon is not defined in the visible part
    # of this file -- confirm it is defined in an earlier cell or module,
    # otherwise this line raises NameError.
    subset_ds = subset_data_for_polygon(combined_ds)
    save_to_netcdf(subset_ds)

### Ensuring that the merged file is correct

In [None]:
# Calculate the expected number of time points.
# The loader reads every file of each calendar day from START_DATE through
# END_DATE inclusive, so the hourly range has to run to 23:00 on END_DATE;
# stopping at END_DATE 00:00 would under-count the final day by 23 steps.
# Assumes one file per hour -- TODO confirm against the raw data layout.
date_range = pd.date_range(
    start=START_DATE,
    end=pd.Timestamp(END_DATE) + pd.Timedelta(hours=23),
    freq='h',
)
expected_time_points = len(date_range)

# Open the merged dataset; the context manager closes the file even when one
# of the checks below fails.
with xr.open_dataset(OUTPUT_FILE) as ds:
    # Print the structure of the dataset
    print(f"Dataset dimensions: {ds.dims}")
    print(f"Dataset coordinates: {ds.coords}")
    print(f"Data variables in the dataset: {ds.data_vars}")
    print(f"Attributes (metadata) in the dataset: {ds.attrs}")

    # Check that the time dimension is as expected
    # (ds.sizes is the supported mapping from dimension name to length).
    actual_time_points = ds.sizes['time']
    print(f"Expected time points: {expected_time_points}, Actual time points: {actual_time_points}")
    if actual_time_points != expected_time_points:
        print("WARNING: time dimension length differs from the expected count "
              "(missing or duplicated files?)")

    # Report the lat/lon extent so it can be compared with the requested bounds
    lat_min, lat_max = ds['lat'].min().values, ds['lat'].max().values
    lon_min, lon_max = ds['lon'].min().values, ds['lon'].max().values
    print(f"Latitude range in the dataset: {lat_min} to {lat_max}")
    print(f"Longitude range in the dataset: {lon_min} to {lon_max}")

    # Check for expected variables
    assert 'u' in ds.variables, "u variable is missing from the dataset"
    assert 'v' in ds.variables, "v variable is missing from the dataset"