In [31]:
import numpy as np
import pandas as pd
import glob
import os.path
import pathlib
import platform 
import xarray as xr

In [32]:
# For data safety reasons the data is stored in a local directory outside the repository.
# Both path layouts below belong to the programmers who worked on the hydrological models.
# Please replace them with your own paths.

cwd = pathlib.Path().resolve()
src = cwd.parent
data = src.parent.parent.parent  # root directory under which the data folders are expected
OS_type = platform.system()
if OS_type == 'Darwin':
    username = 'Mats '
    data_path = os.path.join(data, 'data_tana', 'TAHMO', 'location_tahmo')
    data_path_location = os.path.join(data, 'data_tana')
    
else:
    username = 'Mootje'
    data_path = os.path.join(data, 'OneDrive - Delft University of Technology', 'TU Delft', 'Master ENVM', 'MDP', 'Model', 'Data', 'TAHMO')
    data_path_location = os.path.join(data, 'OneDrive - Delft University of Technology', 'TU Delft', 'Master ENVM', 'MDP', 'Model', 'Data')

# Directory holding the interpolated (kriging) NetCDF files. It is defined for every
# platform so that later cells never hit an undefined name on a non-Darwin machine.
# NOTE(review): assumes the non-Darwin setup uses the same 'TAHMO/interpolated' layout
# as the Darwin one -- adjust if your files live elsewhere.
data_path_netcdf = os.path.join(data_path_location, 'TAHMO', 'interpolated')

print(f"Welcome {username}, have a wonderful day on your {OS_type} machine. Your data should be located in {data_path}")

Welcome Mats , have a wonderful day on your Darwin machine. Your data should be located in /Users/matskerver/Documents/data_tana/TAHMO/location_tahmo


In [34]:
def calculate_evaporation_average(ds, variable='evap'):
    """Return the mean of one variable of a dataset over all of its values.

    Parameters
    ----------
    ds : mapping-like (e.g. ``xarray.Dataset``)
        Dataset to average. ``ds[variable]`` must provide ``.mean()`` whose
        result exposes ``.values``.
    variable : str, optional
        Name of the variable to average. Defaults to ``'evap'``.

    Returns
    -------
    The ``.values`` of ``ds[variable].mean()``.
    """
    # Average the dataset that was passed in rather than a notebook-level global,
    # so the result does not depend on which cells happened to run before.
    average_evap = ds[variable].mean().values
    return average_evap

In [35]:
# This cell cleans up the Kriging interpolation data, as some NaN values were still present in the TAHMO data.
# Because the evaporation is averaged in time and space, not every datapoint is needed. The cell therefore
# removes every timestep that contains a NaN or negative value and averages the timesteps that remain.


# Open the kriging NetCDF file and define necessary variables. Max file size netcdf_file = 4Gb.
netcdf_file = 'kriging_results_evap.nc'
ds = xr.open_dataset(os.path.join(data_path_netcdf, netcdf_file))

variable = 'evap'
file_paths = []

# Dataset.sizes is the supported way to read a dimension length; using Dataset.dims
# as a mapping is deprecated in recent xarray versions.
n_time = ds.sizes['time']
timestamps = []  # indices of the timesteps that contain invalid data

# The timesteps are checked one slice at a time instead of in a single vectorised
# operation, so only one slice has to be held in memory at once.
for time_index in range(n_time):
    data_array = ds.isel(time=time_index)[variable]

    # Both negative and NaN values indicate potential problems in the data and are detected here
    if bool(data_array.isnull().any()) or bool((data_array < 0).any()):
        # Any anomaly puts the timestep on the list of timesteps to be removed
        timestamps.append(time_index)

n_invalid = len(timestamps)
n_valid = n_time - n_invalid

# ds_trim is always bound, also when nothing needs to be removed, so code that reads
# it afterwards never sees an undefined or stale name.
ds_trim = ds

if n_invalid > 0:
    print(f'The used dataset contained {n_invalid} timestamps with potential invalid data.')
    print(f'It contained {n_valid} timestamps where no anomalies were detected.')
    print('Proceeding to remove these values...')

    # Boolean mask that is True for every timestep without NaN or negative values. Only
    # those timesteps are kept. Every kept timestep was already verified above, so the
    # trimmed dataset does not need to be scanned a second time.
    mask = ~np.isin(np.arange(n_time), timestamps)
    ds_trim = ds.isel(time=mask)
    ds = ds_trim

    print(f'Removed {n_invalid} values correctly, checking new dataset...')

# At this point the dataset contains no more flagged timesteps, so the average can be calculated
print('No negative values detected, proceeding to average evaporation value')
average_evap = calculate_evaporation_average(ds)
print(f'Completed. The average evaporation data for this dataset is {average_evap}mm/day')

The used dataset contained 525 timestamps with potential invalid data.
It contained 1666 timestamps where no anomalies were detected.
Proceeding to remove these values...
Removed 525 values correctly, checking new dataset...
No negative values detected, proceeding to average evaporation value
Completed. The average evaporation data for this dataset is 0.41639290443606647 mm/day


In the test case, the resulting average can simply be copied over to the dataset-compiling script. In a final application, the output can instead be saved as a NetCDF or .csv file. Alternatively, the function can be moved to a separate script and imported into another notebook.