In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import glob
import os.path
import pathlib
import platform 
import xarray as xr

In [32]:
# Derive the data directories from the notebook's working directory: the data
# root is four levels above the current working directory.
cwd = pathlib.Path().resolve()
src = cwd.parent
data = src.parent.parent.parent

# The folder layout differs per machine; macOS ('Darwin') uses a local
# `data_tana` folder, every other OS uses the OneDrive project folder.
OS_type = platform.system()
if OS_type == 'Darwin':
    username = 'Mats '
    data_path = os.path.join(data, 'data_tana', 'TAHMO', 'location_tahmo')
    data_path_location = os.path.join(data, 'data_tana')
else:
    username = 'Mootje'
    data_path = os.path.join(data, 'OneDrive - Delft University of Technology', 'TU Delft', 'Master ENVM', 'MDP', 'Model', 'Data', 'TAHMO')
    data_path_location = os.path.join(data, 'OneDrive - Delft University of Technology', 'TU Delft', 'Master ENVM', 'MDP', 'Model', 'Data')

# Defined for every platform: the cell that opens the NetCDF file reads this
# name unconditionally, so leaving it out of one branch raises NameError there.
# NOTE(review): assumes the non-macOS layout also stores the interpolated files
# under <data_path_location>/TAHMO/interpolated - confirm.
data_path_netcdf = os.path.join(data_path_location, 'TAHMO', 'interpolated')

print(f"Welcome {username}, have a wonderful day on your {OS_type} machine. Your data should be located in {data_path}")

Welcome Mats , have a wonderful day on your Darwin machine. Your data should be located in /Users/matskerver/Documents/data_tana/TAHMO/location_tahmo


In [33]:
# Open the NetCDF results file from the interpolated-data directory.
# (Judging by the file name these are kriging-interpolated evaporation
# fields - verify against the producing notebook.)
netcdf_file = 'kriging_results_evap.nc'
netcdf_path = os.path.join(data_path_netcdf, netcdf_file)
ds = xr.open_dataset(netcdf_path)


In [34]:
def calculate_evaporation_average(ds, variable='evap'):
    """Return the overall mean of one data variable of a dataset.

    The mean is taken over every element of ``ds[variable]`` (no dimension is
    kept), using the dataset that is passed in rather than any notebook-level
    global, so the function also works on a fresh kernel.

    Parameters
    ----------
    ds : mapping-like dataset
        Object supporting ``ds[variable]`` whose items provide ``.mean()``;
        in this notebook an ``xarray.Dataset``.
    variable : str, optional
        Name of the data variable to average. Defaults to ``'evap'``.

    Returns
    -------
    The ``.values`` attribute of the reduced result.
    """
    average_evap = ds[variable].mean().values
    return average_evap

In [35]:
# Name of the data variable that is validated and averaged
variable = 'evap'

# List to store image file paths
file_paths = []

# Drop every time step whose field contains a NaN or a negative value, then
# re-check the trimmed dataset. Once a pass finds nothing to remove, report the
# mean of what is left.
# NOTE: `ds` is rebound to the trimmed dataset, so later cells see the cleaned
# data; re-run the loading cell to get the raw dataset back.
while True:
    evap = ds[variable]

    # One boolean per time step: True when any cell of that step is NaN or < 0.
    # Reducing over all non-time dimensions replaces a per-time-step Python loop.
    spatial_dims = [dim for dim in evap.dims if dim != 'time']
    invalid = (evap.isnull() | (evap < 0)).any(dim=spatial_dims).values

    yes = int(invalid.sum())        # time steps flagged as invalid
    no = int(invalid.size) - yes    # time steps without anomalies

    if yes == 0:
        # Keep `ds_trim` bound even when the very first pass is already clean,
        # so code that refers to it does not depend on a removal having happened.
        ds_trim = ds
        print('No negative values detected, proceeding to average evaporation value')
        average_evap = calculate_evaporation_average(ds)
        print(f'Completed. The average evaporation data for this dataset is {average_evap} mm/day')
        break

    print(f'The used dataset contained {yes} timestamps with potential invalid data.')
    print(f'It contained {no} timestamps where no anomalies were detected.')
    print('Proceeding to remove these values...')

    # Keep only the time steps that were not flagged
    ds_trim = ds.isel(time=~invalid)
    ds = ds_trim

    print(f'Removed {yes} values correctly, checking new dataset...')

The used dataset contained 525 timestamps with potential invalid data.
It contained 1666 timestamps where no anomalies were detected.
Proceeding to remove these values...
Removed 525 values correctly, checking new dataset...
No negative values detected, proceeding to average evaporation value
Completed. The average evaporation data for this dataset is 0.41639290443606647 mm/day


In [28]:
# Print the text summary of `ds` (dimensions, coordinates, data variables).
# NOTE(review): this cell's execution count (28) is lower than the cells above
# it, so the recorded output may come from an earlier run - re-run to confirm.
print(ds)

<xarray.Dataset>
Dimensions:    (time: 1666, latitude: 120, longitude: 120)
Coordinates:
  * time       (time) datetime64[ns] 2018-01-01 2018-01-02 ... 2023-12-31
  * latitude   (latitude) float64 -4.0 -3.95 -3.9 -3.85 ... 1.8 1.85 1.9 1.95
  * longitude  (longitude) float64 36.0 36.05 36.1 36.15 ... 41.85 41.9 41.95
Data variables:
    evap       (time, latitude, longitude) float64 ...
