# Importing needed packages

In [6]:
import os
import zipfile
import netCDF4 as nc
import numpy as np

# Sentinel level_2 data extraction
This process extracts the wind speed, wind direction, latitude, and longitude arrays from each archive, one file at a time, to minimize memory issues when loading the data.
## Count the number of .zip files 

In [9]:
# Folder that holds the downloaded Sentinel Level-2 .zip archives.
directory_sentinel_level_2_data = '/home/noriegac/Documents/Offshore Wind Research/data/sentinel_level_2_data/'
# os.listdir() returns entries in arbitrary order; sort them so that slices such
# as zip_files[:5] select the same archives on every run.
zip_files = sorted(f for f in os.listdir(directory_sentinel_level_2_data) if f.endswith('.zip'))
print(f"There are {len(zip_files)} files")

There are 5775 files


## Function to extract the arrays for wind speed, direction, lat and lon

In [10]:
def extract_numpy_arrays(directory_path, zip_file):
    """
    Extracts wind speed, direction, latitude, and longitude arrays
    from the first NetCDF (.nc) member of a zip archive.

    The member is read into memory and opened from the in-memory bytes, so
    nothing is unpacked to disk. Only the first .nc member is considered.

    Parameters:
        directory_path (str): The path to the directory containing the zip file
            (with or without a trailing path separator).
        zip_file (str): The name of the zip file to be processed.

    Returns:
        tuple or None: On success, a tuple containing:
            - output_name (str): The zip file name with its last four characters
              (the '.zip' extension) removed.
            - owiSpeed (np.ndarray): 'owiWindSpeed' values; entries equal to -999.0
              are replaced with NaN.
            - owiDir (np.ndarray): 'owiWindDirection' values; entries equal to -999.0
              are replaced with NaN.
            - owiLat (np.ndarray): 'owiLat' values.
            - owiLon (np.ndarray): 'owiLon' values.
        None is returned when the archive has no .nc member, or when reading the
        first .nc member fails. In both cases a message is printed instead of an
        exception being raised.

    Raises:
        OSError, zipfile.BadZipFile: If the archive itself cannot be opened.
    """
    # os.path.join works whether or not directory_path ends with a separator.
    zip_file_path = os.path.join(directory_path, zip_file)
    output_name = zip_file[:-4]  # Remove the .zip extension

    # Use a distinct name for the open archive so the zip_file argument (the
    # file name) stays available for the messages below.
    with zipfile.ZipFile(zip_file_path, 'r') as archive:
        nc_member = next((name for name in archive.namelist() if name.endswith('.nc')), None)
        if nc_member is None:
            print(f"No .nc file found in {zip_file}")
            return None

        with archive.open(nc_member) as nc_file:
            try:
                # 'sample.nc' is only a label; the data comes from the bytes passed via memory=.
                # The context manager closes the dataset once the arrays have been copied out.
                with nc.Dataset('sample.nc', memory=nc_file.read()) as dataset:
                    owiSpeed = np.array(dataset.variables['owiWindSpeed'])
                    # -999.0 is presumably the product's missing-data marker (TODO confirm).
                    owiSpeed[owiSpeed == -999.0] = np.nan

                    owiDir = np.array(dataset.variables['owiWindDirection'])
                    owiDir[owiDir == -999.0] = np.nan

                    owiLat = np.array(dataset.variables['owiLat'])
                    owiLon = np.array(dataset.variables['owiLon'])

                    return output_name, owiSpeed, owiDir, owiLat, owiLon

            except Exception as e:
                print(f"Error processing {zip_file}: {e}")

    return None

In [16]:
# Plain strings do not support the `/` path operator (that is pathlib.Path
# syntax), so build the archive path with os.path.join instead.
os.path.join(directory_sentinel_level_2_data, "S1A_IW_OCN__2SDV_20170403T222528_20170403T222553_015986_01A5DF_AE79.zip")

## Save the extracted arrays as compressed .npz files for later reuse

In [10]:
def save_numpy_arrays(path_numpyArrays, extracted_file):
    """
    Saves extracted NumPy arrays to a compressed .npz file.

    The file is written to <path_numpyArrays>/<output_name>.npz; NumPy appends
    the '.npz' suffix itself. The arrays are stored under the keys
    'owiSpeed', 'owiDir', 'lat' and 'lon'.

    Parameters:
        path_numpyArrays (str): The directory where the NumPy arrays will be saved
            (with or without a trailing path separator). It must already exist.
        extracted_file (tuple): A tuple containing the extracted data:
            - output_name (str): Name of the extracted file.
            - owiSpeed (np.ndarray): Array of wind speeds.
            - owiDir (np.ndarray): Array of wind directions.
            - lat (np.ndarray): Array of latitudes.
            - lon (np.ndarray): Array of longitudes.

    Raises:
        TypeError: If extracted_file is None (for example because the extraction
            step produced no result).
    """
    if extracted_file is None:
        # Indexing None would raise a TypeError anyway; say what actually went wrong.
        raise TypeError("extracted_file is None: nothing was extracted, so there is nothing to save")

    # os.path.join works whether or not path_numpyArrays ends with a separator.
    name_np = os.path.join(path_numpyArrays, extracted_file[0])
    np.savez_compressed(
        name_np,
        owiSpeed=extracted_file[1],
        owiDir=extracted_file[2],
        lat=extracted_file[3],
        lon=extracted_file[4],
    )

# Reading and extracting the data

In [18]:
# Destination folder for the compressed .npz files.
path_numpyArrays = '/home/noriegac/Documents/Offshore Wind Research/data/numpy_arrays_from_level_2/'

# Convert the first five archives one at a time: extract, then save.
for i in zip_files[:5]:
    extracted = extract_numpy_arrays(directory_sentinel_level_2_data, i)
    save_numpy_arrays(path_numpyArrays, extracted)
    print(f'{i} was processed')

S1A_IW_OCN__2SDH_20190420T224949_20190420T225014_026880_0305CC_6952.zip was processed
S1A_IW_OCN__2SDH_20190420T225014_20190420T225039_026880_0305CC_639F.zip was processed
S1A_IW_OCN__2SDH_20190420T225039_20190420T225104_026880_0305CC_6190.zip was processed
S1A_IW_OCN__2SDH_20221230T105800_20221230T105823_046560_059459_6696.zip was processed
S1A_IW_OCN__2SDH_20230123T105759_20230123T105822_046910_05A02B_87C1.zip was processed


# Checking that all files were processed

In [25]:
# Collect every .npz file already written to the output folder.
npz_files = list(filter(lambda entry: entry.endswith('.npz'), os.listdir(path_numpyArrays)))
print(f"There are {len(npz_files)} files")

There are 5774 files


## Remove the .npz extension - keep only the file name

In [26]:
# Drop the 4-character '.npz' suffix; a set gives fast comparison with the zip names.
npz_file_name = {npz_name[:-4] for npz_name in npz_files}

## Remove the .zip extension - keep only the file name

In [28]:
# Drop the 4-character '.zip' suffix so the names are comparable with the .npz names.
zip_file_names = {zip_name[:-4] for zip_name in zip_files}

## Find the missing files

In [32]:
# Archives that exist as .zip but have no matching .npz output yet.
not_yet_extracted = zip_file_names.difference(npz_file_name)
missing_zip_files = [f"{name}.zip" for name in not_yet_extracted]
print(f"The following files have not been extracted yet:\n{missing_zip_files}")

The following files have not been extracted yet:
['S1A_IW_OCN__2SDV_20230605T224319_20230605T224344_048857_05E021_5414.zip']


# Retry the missing files

In [33]:
# Retry the archives that have no .npz output yet. A failure is reported and the
# loop moves on to the next archive, instead of the error being silently dropped.
for i in missing_zip_files:
    try:
        save_numpy_arrays(path_numpyArrays, extract_numpy_arrays(directory_sentinel_level_2_data, i))
    except Exception as e:
        print(f"Could not extract {i}: {type(e).__name__}: {e}")
        continue
    print(f'{i} was processed')

In [2]:
# Requires the cell that defines directory_sentinel_level_2_data to have run in
# this kernel session. The directory is a plain str, so join with os.path.join
# rather than the pathlib-style `/` operator.
os.path.join(directory_sentinel_level_2_data, 'S1A_IW_OCN__2SDV_20170403T222528_20170403T222553_015986_01A5DF_AE79.zip')