In [1]:
import os
import sys
import glob
import xarray as xr
import numpy as np
from scipy.stats import genextreme, weibull_min
from joblib import Parallel, delayed
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.distributions.empirical_distribution import ECDF

operating_system = 'mac'

if operating_system == 'win':
    os.chdir('C:/Users/fabau/OneDrive/Documents/GitHub/master-project-cleaned/')
elif operating_system == 'curnagl':
    os.chdir('/work/FAC/FGSE/IDYST/tbeucler/default/fabien/repos/cleaner_version/')
else:
    os.chdir('/Users/fabienaugsburger/Documents/GitHub/master-project-cleaned/')

# Add the path to the custom library
custom_library_path = os.path.abspath('util/processing/')
sys.path.append(custom_library_path)

import time_series

custom_library_path_2 = os.path.abspath('util/gev')
sys.path.append(custom_library_path_2)

import data_processing
from data_processing import depickle, save_to_pickle

In [30]:
# TAKES 17 MINUTES TO RUN
without_storms = '_no_storms'
from_scratch = False

def initialize_datasets_winter(months, days):
    datasets_winter = []
    for month in months:
        for day in days:
            try:
                dataset = depickle(f'data/climatology/daily_winter_season{without_storms}/climatology_europe_winter_{month}_{day}.pkl')
                datasets_winter.append(dataset)
            except FileNotFoundError:
                print(f'No data for month {month} and day {day}')
    return datasets_winter

def process_storm_file(event, cluster_data_original, datasets_winter, tif_15):
    print(f"Processing event: {event}")
    
    # Storm data
    storm = data_processing.process_daily_climatology(
        f'data/time_series_rasters_storms_15h/{event}.tif', 
        cluster_data_original, 
        event
        )
    
    clusters = range(0, 15)
    log_cdf_max = pd.DataFrame()
    max_event = pd.DataFrame()
    
    for cluster_n in clusters:
        if from_scratch == True:
            dataset_wind_speed = []
            for dataset in datasets_winter:
                for ds in dataset:
                    ds_cluster = ds.where(tif_15['band_data'] == cluster_n)
                    daily_max = ds_cluster['i10fg'].max().values
                    dataset_wind_speed.append(daily_max)
        else:
            dataset_wind_speed = pd.read_csv(f'data/climatology_dm_winter_per_cluster{without_storms}/wind_data_cluster_{cluster_n}.csv')['0'].tolist()
                    
            technic = genextreme
            # Fit the GEV distribution
            shape, loc, scale = technic.fit(dataset_wind_speed)
            storm_subset = storm[storm['cluster_n'] == cluster_n]['wind_speed_None']
            max_storm = storm_subset.max()
            # Compute return period for max_storm
            cdf_max_storm = technic.cdf(max_storm, shape, loc=loc, scale=scale)
            return_period = round((1 / (1 - cdf_max_storm)) / 2, 2)
            log_cdf_max_storm = -np.log(1 - cdf_max_storm)
            log_cdf_max.at[event, f'cluster_{cluster_n}'] = log_cdf_max_storm
            max_event.at[event, f'cluster_{cluster_n}'] = max_storm
            print(f"Return Period of Max {event} Wind Gust: {return_period:.2f} years for cluster {cluster_n} with wind speed of {max_storm:.2f} m/s")

            x_vals = dataset_wind_speed #np.linspace(min(dataset_wind_speed), max(dataset_wind_speed), 1000)
            cdf_vals = technic.cdf(x_vals, shape, loc=loc, scale=scale)

            # Transform CDF to -log10(1-CDF)
            cdf_vals_transformed = -np.log10(1 - cdf_vals)

            # save both in CDF and transformed CDF folders
            pd.DataFrame(cdf_vals).to_csv(f'data/climatology_dm_winter_per_cluster{without_storms}/GEV_CDF/GEV_CDF_cluster_{cluster_n}.csv')
            pd.DataFrame(cdf_vals_transformed).to_csv(f'data/climatology_dm_winter_per_cluster{without_storms}/GEV_CDF_transformed/GEV_CDF_cluster_{cluster_n}.csv')

        # Optional: Plot the transformed CDF
        # plt.plot(x_vals, cdf_vals_transformed, label="GEV Transformed CDF (-log10(1-CDF)")

    return log_cdf_max, max_event

def main_joblib_parallel_processing(path, cluster_csv, months, days, tif_15, n_jobs=-1):
    # Collect all storm files
    files = glob.glob(os.path.join(path, '*.tif'))
    name_tif = [os.path.basename(file) for file in files]
    # Remove 'max_all_storms.tif'
    if 'max_all_storms.tif' in name_tif:
        name_tif.remove('max_all_storms.tif')

    # Load cluster data
    cluster_data_original = pd.read_csv(cluster_csv)

    if from_scratch == True:
        # Initialize datasets_winter
        datasets_winter = initialize_datasets_winter(months, days)
    else:
        datasets_winter = None

    # Run tasks in parallel using joblib
    results = Parallel(n_jobs=n_jobs)(
        delayed(process_storm_file)(file.split('.')[0], cluster_data_original, datasets_winter, tif_15) 
        for file in name_tif
    )
    # Separate log_cdf_max and max_event results
    log_cdf_max_list, max_event_list = zip(*results)
    
    # Combine results across all events
    log_cdf_max_combined = pd.concat(log_cdf_max_list, axis=0)
    max_event_combined = pd.concat(max_event_list, axis=0)

    return log_cdf_max_combined, max_event_combined

# Prepare data for processing
tif_15 = xr.open_dataset('pre_processing/cluster/number_of_clusters/cleaned_cluster_15_v2.tif', engine='rasterio')
tif_15 = tif_15.rename({'x': 'longitude', 'y': 'latitude'})

path = '/Users/fabienaugsburger/Documents/GitHub/master-project-cleaned/data/time_series_rasters_storms_15h/'
cluster_csv = 'pre_processing/cluster/number_of_clusters/cleaned_cluster_15_v2.csv'
months = [10, 11, 12, 1, 2, 3]  # Winter months
days = range(1, 32)  # Days from 01 to 31

# Process all storms in parallel
log_cdf_max_combined, max_event_combined = main_joblib_parallel_processing(path, cluster_csv, months, days, tif_15, n_jobs=-1)

Processing event: 9_C3S_STORM_TRACKS_ERA5
Processing event: 35_C3S_STORM_TRACKS_ERA5
Processing event: 82_C3S_STORM_TRACKS_ERA5
Processing event: 1_C3S_STORM_TRACKS_ERA5
Processing event: 13_C3S_STORM_TRACKS_ERA5
Processing event: 95_AIDEN 
Processing event: 7_HERTA
Processing event: 28_C3S_STORM_TRACKS_ERA5
Processing event: 92_CIARA2
Processing event: 79_C3S_STORM_TRACKS_ERA5
Return Period of Max 82_C3S_STORM_TRACKS_ERA5 Wind Gust: 4.84 years for cluster 0 with wind speed of 24.37 m/s
Return Period of Max 82_C3S_STORM_TRACKS_ERA5 Wind Gust: 2.13 years for cluster 1 with wind speed of 20.72 m/s
Return Period of Max 28_C3S_STORM_TRACKS_ERA5 Wind Gust: 47.54 years for cluster 0 with wind speed of 31.54 m/s
Return Period of Max 82_C3S_STORM_TRACKS_ERA5 Wind Gust: 2.15 years for cluster 2 with wind speed of 22.54 m/s
Return Period of Max 1_C3S_STORM_TRACKS_ERA5 Wind Gust: 6.63 years for cluster 0 with wind speed of 25.46 m/s
Return Period of Max 28_C3S_STORM_TRACKS_ERA5 Wind Gust: 3.88 ye

In [31]:
# Reset index and split 'Unnamed: 0' into 'storm_number' and 'storm_name'
log_cdf_max_all = log_cdf_max_combined.reset_index()
log_cdf_max_all[['storm_number', 'storm_name']] = log_cdf_max_all['index'].str.split('_', n=1, expand=True)

max_event_all = max_event_combined.reset_index()
max_event_all[['storm_number', 'storm_name']] = max_event_all['index'].str.split('_', n=1, expand=True)

# Drop unnecessary columns
log_cdf_max_all = log_cdf_max_all.drop(columns=['index'])

max_event_all = max_event_all.drop(columns=['index'])

# Reorder columns to place 'storm_number' and 'storm_name' at the beginning
cols = ['storm_number', 'storm_name'] + [col for col in log_cdf_max_all.columns if col not in ['storm_number', 'storm_name']]
log_cdf_max_all = log_cdf_max_all[cols]

cols = ['storm_number', 'storm_name'] + [col for col in max_event_all.columns if col not in ['storm_number', 'storm_name']]
max_event_all = max_event_all[cols]

# match the storms witht the time series 
storm_time_series = pd.read_csv('data/time_series_1h_non_EU/2m_dewpoint_temperature/2m_dewpoint_temperature_max.csv')['storm_index'].astype(int)
# Change type of 'storm_number' to integer
log_cdf_max_all['storm_number'] = log_cdf_max_all['storm_number'].astype(int)
log_cdf_max_all = log_cdf_max_all[log_cdf_max_all['storm_number'].isin(storm_time_series)]

max_event_all['storm_number'] = max_event_all['storm_number'].astype(int)
max_event_all = max_event_all[max_event_all['storm_number'].isin(storm_time_series)]

# Save the combined dataframe to multiple locations
log_cdf_max_all.to_csv(f'data/climatology_dm_winter_per_cluster{without_storms}/GEV_CDF_max/log_cdf_max_combined.csv', index=False)
log_cdf_max_all.to_csv(f'pre_processing/nestedMLR/log_cdf_max_combined{without_storms}.csv', index=False)

max_event_all.to_csv(f'data/climatology_dm_winter_per_cluster{without_storms}/EVENT_max/max_event_combined.csv', index=False)
max_event_all.to_csv(f'pre_processing/nestedMLR/max_event_combined{without_storms}.csv', index=False)
'''
# Load storm splits for training, validation, and testing
storm_training_test_valid = pd.read_csv(
    'pre_processing/time_series_i10fg_before_eu/storm_training_test_valid.csv'
)

# Filter and save data for training, validation, and testing
for split, split_name in zip(
    ['training', 'validation', 'test'],
    ['log_cdf_max_training', 'log_cdf_max_validation', 'log_cdf_max_test']
):
    split_df = log_cdf_max_all[log_cdf_max_all['storm_number'].isin(storm_training_test_valid[split])]
    split_df = split_df.drop(columns=['storm_name'])
    
    # Save the split data to multiple locations
    split_df.to_csv(f'data/climatology_dm_winter_per_cluster{without_storms}/GEV_CDF_max/{split_name}.csv', index=False)
    split_df.to_csv(f'pre_processing/nestedMLR/{split_name}{without_storms}.csv', index=False)'''

"\n# Load storm splits for training, validation, and testing\nstorm_training_test_valid = pd.read_csv(\n    'pre_processing/time_series_i10fg_before_eu/storm_training_test_valid.csv'\n)\n\n# Filter and save data for training, validation, and testing\nfor split, split_name in zip(\n    ['training', 'validation', 'test'],\n    ['log_cdf_max_training', 'log_cdf_max_validation', 'log_cdf_max_test']\n):\n    split_df = log_cdf_max_all[log_cdf_max_all['storm_number'].isin(storm_training_test_valid[split])]\n    split_df = split_df.drop(columns=['storm_name'])\n    \n    # Save the split data to multiple locations\n    split_df.to_csv(f'data/climatology_dm_winter_per_cluster{without_storms}/GEV_CDF_max/{split_name}.csv', index=False)\n    split_df.to_csv(f'pre_processing/nestedMLR/{split_name}{without_storms}.csv', index=False)"