In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import glob
import os.path
import pathlib
import platform 

In [2]:
# Resolve the project layout relative to the notebook's working directory.
cwd = pathlib.Path().resolve()
src = cwd.parent
data = src.parent.parent.parent  # folder that both per-user data locations below are joined onto
OS_type = platform.system()

# The data folder differs per collaborator; the operating system is used to tell them apart.
if OS_type == 'Darwin':
    username = 'Mats '
    data_path = os.path.join(data, 'data_tana', 'TAHMO')
else:
    # Must be an assignment: the former `username == 'Mootje'` was a comparison, which left
    # `username` undefined and made the print below raise NameError on non-macOS machines.
    username = 'Mootje'
    data_path = os.path.join(data, 'OneDrive - Delft University of Technology', 'TU Delft', 'Master ENVM', 'MDP', 'Model', 'Data', 'TAHMO')

print(f"Welcome {username}, have a wondeful day on your {OS_type} machine. Your data should be located in {data_path}")

Welcome Mats , have a wondeful day on your Darwin machine. Your data should be located in /Users/matskerver/Documents/data_tana/TAHMO


In [3]:
# Every CSV file directly inside the TAHMO data folder is treated as one station export.
csv_pattern = os.path.join(data_path, '*.csv')
data_files = glob.glob(csv_pattern)

# Station code -> raw dataframe, indexed by the first CSV column parsed as dates.
dataframes = {}

for file in data_files:
    # The station code is the part of the file name (extension stripped) before the first underscore.
    stem, _extension = os.path.splitext(os.path.basename(file))
    station_name = stem.split('_')[0]

    df = pd.read_csv(file, index_col=0, sep=',', parse_dates=True)
    dataframes[station_name] = df

In [4]:
# Display the frame loaded for station TA00023 as the cell output.
dataframes['TA00023']

Unnamed: 0_level_0,te,pr,ra
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-03-16 08:15:00,27.4,0.017,834.0
2018-03-16 08:20:00,27.3,0.000,643.0
2018-03-16 08:25:00,27.3,0.000,805.0
2018-03-16 08:30:00,27.2,0.000,728.0
2018-03-16 08:35:00,27.2,0.000,676.0
...,...,...,...
2023-12-30 23:40:00,22.4,0.000,0.0
2023-12-30 23:45:00,22.4,0.000,0.0
2023-12-30 23:50:00,22.3,0.000,0.0
2023-12-30 23:55:00,22.3,0.000,0.0


In [5]:
# Collect the keys of all stations whose dataframe lacks a plain 'te' or a plain 'pr' column.
invalid_keys = [
    key
    for key, df in dataframes.items()
    if not ('te' in df.columns and 'pr' in df.columns)
]

invalid_keys

['TA00360', 'TA00080', 'TA00166']

In [6]:
# Display the frame for TA00080, one of the stations reported in `invalid_keys`.
dataframes['TA00080']

Unnamed: 0_level_0,te_S001384,te_S000824,te_S000823,pr_S001384,pr_S000824,pr_S000822,ra_S001384,ra_S000820,ra_S000824
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-01-01 00:00:00,,,13.2,,,,,0.0,
2018-01-01 00:05:00,,,13.0,,,,,0.0,
2018-01-01 00:10:00,,,13.0,,,,,0.0,
2018-01-01 00:15:00,,,12.9,,,,,0.0,
2018-01-01 00:20:00,,,12.8,,,,,0.0,
...,...,...,...,...,...,...,...,...,...
2023-12-30 23:40:00,16.9,,,,,,0.0,,
2023-12-30 23:45:00,16.9,,,,,,0.0,,
2023-12-30 23:50:00,16.9,,,,,,0.0,,
2023-12-30 23:55:00,16.9,,,,,,0.0,,


In [11]:
# The dataset from WRA contained three files with strange formatting, presumably due to an error with the API.
# This snippet repairs those individual dataframes after manual inspection. For implementation with more stations
# it is recommended to contact TAHMO or skip these 'faulty' stations.

# Number of raw columns each faulty station is expected to have. The columns come in three
# equally sized, consecutive groups (te, pr, ra), one column per sensor. The width check also
# makes the cell safe to re-run: a frame that was already repaired has 3 columns and is skipped.
FAULTY_STATION_WIDTHS = {'TA00080': 9, 'TA00360': 6, 'TA00166': 9}


def _merge_sensor_columns(df):
    """Collapse the per-sensor columns of a faulty station into single 'te', 'pr' and 'ra' columns.

    The columns of ``df`` are split by position into three equally sized groups
    (temperature, precipitation, radiation). Within each group the row-wise mean of
    the available (non-NaN) sensor values is taken.

    Compared with filling NaN by 0.0 and summing, this
    * keeps a timestamp at which no sensor reported as NaN instead of turning it into a
      fake 0.0 measurement (which would distort daily minimum/mean temperatures), and
    * does not double-count when more than one sensor reports at the same timestamp.
    When exactly one sensor reports, the result equals that sensor's value.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw station frame whose column count is a multiple of three.

    Returns
    -------
    pandas.DataFrame
        New frame with columns ['te', 'pr', 'ra'] on the same index as ``df``.
    """
    n_sensors = df.shape[1] // 3
    merged = {}
    for position, variable in enumerate(('te', 'pr', 'ra')):
        sensor_block = df.iloc[:, position * n_sensors:(position + 1) * n_sensors]
        # mean() skips NaN by default and yields NaN for rows where every sensor is missing
        merged[variable] = sensor_block.mean(axis=1)
    return pd.DataFrame(merged)


for station, expected_width in FAULTY_STATION_WIDTHS.items():
    # Only touch stations that are present and still in their raw, multi-sensor layout.
    if station in dataframes and dataframes[station].shape[1] == expected_width:
        dataframes[station] = _merge_sensor_columns(dataframes[station])

    

In [12]:
def Extra_rad(Tmax, Tmin, df, lat):
    """Calculate extraterrestrial radiation (Ra) for every timestamp in ``df``.

    Uses the FAO-56 daily formulation: Ra depends only on the day of the year of
    each index entry and on the latitude.

    Parameters
    ----------
    Tmax, Tmin
        Not used by the calculation; kept so existing callers keep working.
    df : pandas.DataFrame
        Frame with a DatetimeIndex; only its index is read.
    lat : float
        Latitude of the station in decimal degrees (negative for the southern hemisphere).

    Returns
    -------
    pandas.DataFrame
        Single column 'Ra' on the same index as ``df``. With the solar constant given
        in MJ m^-2 min^-1 the result is in MJ m^-2 day^-1.
    """
    G = 0.0820  # solar constant in MJ/m^2/min (multiply by 10 ** 6 for J/m^2/min)

    # Latitude of the station in radians
    phi = np.pi / 180 * lat

    # Number of the day in the year, taken directly from the index
    J = df.index.dayofyear.values

    # Inverse relative Earth-Sun distance and solar declination
    dr = 1 + 0.033 * np.cos(2 * np.pi * J / 365)
    delta = 0.409 * np.sin((2 * np.pi * J / 365) - 1.39)

    # Sunset hour angle. Beyond the polar circles -tan(phi) * tan(delta) leaves [-1, 1] and
    # arccos would return NaN; clipping yields w = 0 (polar night, Ra = 0) or w = pi (polar day).
    w = np.arccos(np.clip(-np.tan(phi) * np.tan(delta), -1.0, 1.0))

    # Extraterrestrial radiation
    Ra = ((24 * 60) / np.pi) * G * dr * (w * np.sin(phi) * np.sin(delta) + np.cos(phi) * np.cos(delta) * np.sin(w))

    # Return the Ra values together with their timestamps
    Ra_df = pd.DataFrame({'Ra': Ra}, index=df.index)

    return Ra_df

In [17]:
def process_dataframe(df, lat=-1.071545386681787):
    """Aggregate a sub-daily station frame to daily values.

    Parameters
    ----------
    df : pandas.DataFrame
        Station frame with a datetime-like index and at least the columns 'te' and 'pr'.
        The frame passed in is not modified.
    lat : float, optional
        Latitude in decimal degrees used for the extraterrestrial radiation. Defaults to
        the value that was previously hard-coded, so existing calls behave as before.

    Returns
    -------
    pandas.DataFrame
        Daily frame with columns ['pr', 'te_mean', 'te_max', 'te_min', 'ra']:
        precipitation sum, mean / maximum / minimum temperature and extraterrestrial
        radiation as returned by ``Extra_rad``.
    """
    # Work on a shallow copy so that converting the index does not alter the caller's frame.
    df = df.copy(deep=False)
    df.index = pd.to_datetime(df.index)

    # Daily temperature statistics
    df_temp_mean = df['te'].resample('D').mean()
    df_temp_min = df['te'].resample('D').min()
    df_temp_max = df['te'].resample('D').max()

    # Daily precipitation sum
    df_pr_daily = df['pr'].resample('D').sum()

    # Extraterrestrial radiation depends only on the day of the year and the latitude, so it
    # is evaluated once per day on the daily index instead of once per raw timestamp followed
    # by a daily mean. Days without any observation therefore still receive a value.
    df_ra_daily = Extra_rad(df_temp_max, df_temp_min, df_temp_mean.to_frame(), lat)

    # Concatenate all daily data into a single dataframe
    df_daily = pd.concat([df_pr_daily, df_temp_mean, df_temp_max, df_temp_min, df_ra_daily], axis=1)

    # Rename columns
    df_daily.columns = ['pr', 'te_mean', 'te_max', 'te_min', 'ra']

    return df_daily



In [18]:
# Process each dataframe in the dictionary
for key, df in dataframes.items():
    # A frame that already has 'te_mean' was aggregated by an earlier run of this cell;
    # processing it again would fail because the raw 'te' column no longer exists.
    if 'te_mean' in df.columns:
        continue
    # Replace the raw frame by its daily aggregate (existing keys only, so iteration stays valid)
    dataframes[key] = process_dataframe(df)


NameError: name 'df_' is not defined

In [10]:
# Display the frame currently stored for station TA00023 as the cell output.
dataframes['TA00023']

Unnamed: 0_level_0,te,pr,ra
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-03-16 08:15:00,27.4,0.017,834.0
2018-03-16 08:20:00,27.3,0.000,643.0
2018-03-16 08:25:00,27.3,0.000,805.0
2018-03-16 08:30:00,27.2,0.000,728.0
2018-03-16 08:35:00,27.2,0.000,676.0
...,...,...,...
2023-12-30 23:40:00,22.4,0.000,0.0
2023-12-30 23:45:00,22.4,0.000,0.0
2023-12-30 23:50:00,22.3,0.000,0.0
2023-12-30 23:55:00,22.3,0.000,0.0
