# Normalizing the raw data

This notebook shows how the raw meteorological data (ERA5, GSMaP and CHIRPS) are preprocessed into one normalized dataset per basin.

In [1]:
import os
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Input folders: the loops below list and read CSV files from these.
era_dir = "./raw era5/"      # raw ERA5 exports (also rewritten in place once 'srad' is added)
gsmap_dir = "./GSMAP/"       # GSMaP precipitation files
chirps_dir = "./chirps/"     # CHIRPS precipitation files (may be missing for some basins)

# Output folder: the merged per-basin CSV files are written here.
fancy_dir = "./fancy_dataset/"
    

In [None]:
# Build one cleaned per-basin CSV in fancy_dir from every raw ERA5 export in era_dir.
for era_file in os.listdir(era_dir):
    # os.listdir also returns non-data entries (e.g. '.ipynb_checkpoints',
    # '.DS_Store'); only CSV exports can be parsed below, so skip the rest.
    if not era_file.lower().endswith('.csv'):
        continue

    # The output name is the first five characters of the second
    # '_'-separated token of the raw file name.
    basin_tag = era_file.split('_')[1][:5]

    # loading era5 raw data
    eradf = pd.read_csv(os.path.join(era_dir, era_file), parse_dates=['date'])

    dew_kelvin = eradf['dew_mean']
    dew_celsius = dew_kelvin - 273.15                        # from Kelvin to Celsius

    # Dict order defines the column order of the written CSV.
    mydf = pd.DataFrame({
        'basin_id': eradf['basin_id'],
        'date': eradf['date'],
        'prcp_era': eradf['prcp'] * 1000,                    # ERA5 reports prcp in m, not mm
        'temp_mean': eradf['temp_mean'] - 273.15,            # from Kelvin to Celsius
        'temp_min': eradf['temp_min'] - 273.15,
        'temp_max': eradf['temp_max'] - 273.15,
        'dew_mean': dew_celsius,
        # Wind speed is the magnitude of the (u, v) wind vector.
        # Dr. Vadim Yapiyev provided this formula
        'wind_speed': (eradf['u_comp_wind'] ** 2 + eradf['v_comp_wind'] ** 2) ** 0.5,
        # Vapour pressure from the dew point in Kelvin.
        # Dr. Vadim Yapiyev provided this formula
        'vp1': 0.1 * (dew_kelvin ** -4.9283) * (10 ** (23.5518 + (-2937.4 / dew_kelvin))),
        # Vapour pressure from the dew point in Celsius.
        # Dr. Vadim Yapiyev provided this formula
        'vp2': 0.6108 * np.exp(17.27 * dew_celsius / (dew_celsius + 237.3)),
        'srad_joules': eradf['srad_joules'],
    })

    mydf.to_csv(os.path.join(fancy_dir, basin_tag + '.csv'), index=False)
    print(basin_tag + ' is DONE')


In [None]:
# Add the GSMaP and CHIRPS precipitation columns to every per-basin file in fancy_dir.
for dataframe in os.listdir(gsmap_dir):
    # os.listdir also returns non-data entries (e.g. '.ipynb_checkpoints');
    # only CSV files can be merged, so skip the rest.
    if not dataframe.lower().endswith('.csv'):
        continue

    # The same file name identifies the basin in fancy_dir, gsmap_dir and
    # chirps_dir; the fancy file is read, extended and rewritten in place.
    fancy_path = os.path.join(fancy_dir, dataframe)
    bigdf = pd.read_csv(fancy_path, parse_dates=['date'])
    gsdf = pd.read_csv(os.path.join(gsmap_dir, dataframe), parse_dates=['date'])

    # The fancy file is overwritten by this loop, so on a re-run it already
    # holds the columns added below; drop them to avoid duplicated columns.
    bigdf = bigdf.drop(columns=['prcp_gsmap', 'prcp_chirps'], errors='ignore')

    gsmap_prcp = gsdf[['date', 'prcp']].rename(columns={'prcp': 'prcp_gsmap'})
    merged_df = pd.merge(bigdf, gsmap_prcp, on='date', how='left')
    print('\n')

    chirps_path = os.path.join(chirps_dir, dataframe)
    if os.path.exists(chirps_path):
        chirpsdf = pd.read_csv(chirps_path, parse_dates=['date'])
        # Merge only the precipitation column: any other column shared with
        # the fancy file would be suffixed '_x'/'_y' and break the selection below.
        chirps_prcp = chirpsdf[['date', 'precipitation']].rename(
            columns={'precipitation': 'prcp_chirps'}
        )
        final_df = pd.merge(merged_df, chirps_prcp, on='date', how='left')
    else:
        # No CHIRPS file for this basin: keep the column, filled with NaN.
        final_df = merged_df
        final_df['prcp_chirps'] = np.nan

    final_df[
        ['basin_id', 'date', 'prcp_era', 'prcp_gsmap', 'prcp_chirps',
         'temp_mean', 'temp_min', 'temp_max', 'dew_mean', 'wind_speed',
         'vp1', 'vp2', 'srad_joules']
    ].to_csv(fancy_path, index=False)

    print(dataframe + " is DONE")
    

In [63]:
from datetime import datetime
import pytz
# import astral 
from astral import LocationInfo
from astral.sun import sun

def daylight(date, lat, lon, timezone_str='Asia/Almaty'):
    """Return the time between sunrise and sunset, in seconds.

    Parameters
    ----------
    date : date-like
        Day to evaluate; it is combined with midnight and localized to
        ``timezone_str`` before being handed to ``astral``.
    lat, lon : float
        Latitude and longitude of the observer.
    timezone_str : str
        Name of the time zone the sunrise/sunset times are converted to.

    Returns
    -------
    float or int
        Seconds between sunrise and sunset, or ``-1`` when the computation
        raises ``ValueError`` or the sunset falls before the sunrise.
    """
    tz = pytz.timezone(timezone_str)
    place = LocationInfo(latitude=lat, longitude=lon)

    # Midnight of the requested day, made timezone-aware.
    local_midnight = tz.localize(datetime.combine(date, datetime.min.time()))

    try:
        # NOTE(review): sun() is called without a tzinfo argument and the
        # results are only converted afterwards -- confirm that the returned
        # sunrise/sunset pair belongs to the intended local day.
        sun_times = sun(place.observer, date=local_midnight)
        rise = sun_times['sunrise'].astimezone(tz)
        down = sun_times['sunset'].astimezone(tz)
    except ValueError:
        # astral could not provide the events for this location/date.
        return -1

    seconds = (down - rise).total_seconds()
    if seconds < 0:
        # A sunset before the sunrise is reported with the same sentinel.
        return -1
    return seconds



In [None]:
# Station table; the 'id', 'lng' and 'lat' columns are read below.
coordinates_df = pd.read_csv('selected_hydro_stations.csv')

# Lookup table: station id -> [longitude, latitude].
coord = {}
for _, station in coordinates_df.iterrows():
    coord[station['id']] = [station['lng'], station['lat']]
print(coord)


def daylight_count(row):
    """Return the daylight duration for one data row.

    The row's 'basin_id' selects the station coordinates from ``coord``
    (stored as [longitude, latitude]) and its 'date' selects the day;
    the result is whatever ``daylight`` returns for that day and place.
    """
    lng, lat = coord[row['basin_id']]
    return daylight(row['date'], lat, lng)

# Add 'daylight' (seconds) and 'srad' (srad_joules per daylight second) to
# every raw ERA5 file, rewriting the file in place.
for elem in os.listdir(era_dir):
    # os.listdir also returns non-data entries (e.g. '.ipynb_checkpoints');
    # only CSV exports can be processed, so skip the rest.
    if not elem.lower().endswith('.csv'):
        continue

    era_path = os.path.join(era_dir, elem)
    df = pd.read_csv(era_path, parse_dates=['date'])

    daylight_seconds = df.apply(daylight_count, axis=1)

    # daylight() reports failures as -1: turn them into NaN, then carry the
    # previous day's valid value forward. Series.ffill() replaces the
    # deprecated fillna(method='ffill').
    df['daylight'] = daylight_seconds.replace(-1, np.nan).ffill()

    df['srad'] = df['srad_joules'] / df['daylight']

    df[
        ['basin_id', 'date', 'prcp', 'temp_mean', 'temp_min', 'temp_max',
         'dew_mean', 'u_comp_wind', 'v_comp_wind', 'srad_joules',
         'daylight', 'srad']
    ].to_csv(era_path, index=False)
    print(elem + " is Done")
    

In [None]:
# Replace 'srad_joules' with the 'srad' column computed in the raw ERA5 files.
for era_file in os.listdir(era_dir):
    # The second '_'-separated token of the ERA5 file name is the name of
    # the matching file in fancy_dir.
    fancy_name = era_file.split('_')[1]
    fancy_path = fancy_dir + fancy_name

    merged = pd.read_csv(fancy_path, parse_dates=['date'])
    era = pd.read_csv(era_dir + era_file, parse_dates=['date'])

    # Column assignment aligns on the index; both frames carry read_csv's
    # default integer index, so rows are paired by position, not by date.
    merged['srad'] = era['srad']

    output_columns = [
        'basin_id', 'date', 'prcp_era', 'prcp_gsmap', 'prcp_chirps',
        'temp_mean', 'temp_min', 'temp_max', 'dew_mean', 'wind_speed',
        'vp1', 'vp2', 'srad',
    ]
    merged[output_columns].to_csv(fancy_path, index=False)
    print(fancy_name + " is DONE")
    
