In [9]:
import os
import sys
import glob
import xarray as xr
import numpy as np
from scipy.stats import genextreme, weibull_min
from joblib import Parallel, delayed
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.distributions.empirical_distribution import ECDF

operating_system = 'mac'

# Repository checkout per machine. Any value other than 'win' or 'curnagl'
# falls back to the macOS checkout.
_repo_root_by_system = {
    'win': 'C:/Users/fabau/OneDrive/Documents/GitHub/master-project-cleaned/',
    'curnagl': '/work/FAC/FGSE/IDYST/tbeucler/default/fabien/repos/cleaner_version/',
}
os.chdir(
    _repo_root_by_system.get(
        operating_system,
        '/Users/fabienaugsburger/Documents/GitHub/master-project-cleaned/',
    )
)

# Add the path to the custom library.
# os.path.abspath resolves against the current working directory, so these
# lines must run after the os.chdir call above.
custom_library_path = os.path.abspath('util/processing/')
sys.path.append(custom_library_path)

# Imported only after util/processing/ has been appended to sys.path
# (presumably where the module lives - verify).
import time_series

# Second project folder, appended before importing the GEV helpers below.
custom_library_path_2 = os.path.abspath('util/gev')
sys.path.append(custom_library_path_2)

# data_processing is used both as a module (process_daily_climatology) and
# through the directly imported pickle helpers.
import data_processing
from data_processing import depickle, save_to_pickle

In [25]:
# TAKES 17 MINUTES TO RUN
# Suffix spliced into the climatology input folder name and the per-cluster
# output folder / file names built below; '' selects the un-suffixed folders.
without_storms = ''

def initialize_datasets_winter(months, days):
    """Load the pickled daily winter climatology for every (month, day) pair.

    Pairs are visited month by month, and within a month in the order given
    by ``days``. A pair without a pickle on disk (for example day 31 of
    month 11) is reported on stdout and skipped.

    Parameters
    ----------
    months : iterable
        Month identifiers used in the pickle file names.
    days : iterable
        Day identifiers used in the pickle file names; iterated once per month.

    Returns
    -------
    list
        The objects returned by ``depickle``, in visiting order.
    """
    loaded = []
    month_day_pairs = ((month, day) for month in months for day in days)
    for month, day in month_day_pairs:
        pickle_path = f'data/climatology/daily_winter_season{without_storms}/climatology_europe_winter_{month}_{day}.pkl'
        try:
            loaded.append(depickle(pickle_path))
        except FileNotFoundError:
            print(f'No data for month {month} and day {day}')
    return loaded

def process_storm_file(event, cluster_data_original, datasets_winter, tif_15):
    """Score one storm against the per-cluster winter gust climatology.

    For each of the 15 clusters (ids 0-14) the function:

    1. collects the maximum of ``i10fg`` over the cluster for every dataset
       in ``datasets_winter``,
    2. fits a GEV distribution (``scipy.stats.genextreme``) to those maxima,
    3. evaluates the storm's maximum ``wind_speed_None`` in that cluster
       under the fitted distribution, and
    4. writes the fitted CDF, its ``-log10(1 - CDF)`` transform, the
       empirical CDF and the raw maxima to CSV.

    Parameters
    ----------
    event : str
        Storm identifier; also the stem of the storm raster
        ``data/time_series_rasters_storms_15h/{event}.tif``.
    cluster_data_original : pandas.DataFrame
        Cluster table forwarded unchanged to
        ``data_processing.process_daily_climatology``.
    datasets_winter : iterable of iterables
        Nested collection whose inner items support ``.where(mask)`` and
        expose an ``'i10fg'`` variable.
    tif_15 : mapping-like
        Must expose ``'band_data'``, which is compared with the cluster id
        to build the spatial mask.

    Returns
    -------
    pandas.DataFrame
        One row indexed by ``event`` with columns ``cluster_0`` ...
        ``cluster_14`` holding ``-ln(1 - CDF(max storm gust))``.

    Notes
    -----
    NOTE(review): the fitted curves depend only on the climatology and the
    cluster, not on ``event``, so every call writes identical per-cluster
    files. When this function runs in parallel the workers overwrite each
    other's files; consider computing the fits once outside the per-storm
    loop.
    """
    print(f"Processing event: {event}")

    # Storm data
    storm = data_processing.process_daily_climatology(
        f'data/time_series_rasters_storms_15h/{event}.tif',
        cluster_data_original,
        event
    )

    technic = genextreme
    output_root = f'data/climatology_dm_winter_per_cluster{without_storms}'

    # Make sure the output folders exist so to_csv does not fail on a fresh checkout.
    for subfolder in ('GEV_CDF', 'GEV_CDF_transformed', 'ECDF'):
        os.makedirs(f'{output_root}/{subfolder}', exist_ok=True)

    # Collected per cluster and turned into a single-row frame at the end,
    # instead of growing an empty DataFrame cell by cell.
    log_cdf_by_cluster = {}

    for cluster_n in range(0, 15):
        # The mask depends only on the cluster, so build it once per cluster
        # rather than once per daily dataset.
        cluster_mask = tif_15['band_data'] == cluster_n
        dataset_wind_speed = [
            ds.where(cluster_mask)['i10fg'].max().values
            for dataset in datasets_winter
            for ds in dataset
        ]

        # Fit the GEV distribution
        shape, loc, scale = technic.fit(dataset_wind_speed)
        storm_subset = storm[storm['cluster_n'] == cluster_n]['wind_speed_None']
        max_storm = storm_subset.max()

        # Exceedance probability of the storm maximum. The survival function
        # is evaluated directly: computing 1 - cdf loses all precision once
        # the CDF rounds to 1.0 in the extreme upper tail.
        sf_max_storm = technic.sf(max_storm, shape, loc=loc, scale=scale)
        # NOTE(review): the division by 2 is kept from the original code; its
        # rationale is not visible here - TODO confirm.
        return_period = round((1 / sf_max_storm) / 2, 2)
        # Natural log here (equals -log(1 - CDF)); the saved curve below uses log10.
        log_cdf_max_storm = -technic.logsf(max_storm, shape, loc=loc, scale=scale)
        log_cdf_by_cluster[f'cluster_{cluster_n}'] = log_cdf_max_storm
        print(f"Return Period of Max {event} Wind Gust: {return_period:.2f} years for cluster {cluster_n}")

        x_vals = np.linspace(min(dataset_wind_speed), max(dataset_wind_speed), 1000)
        cdf_vals = technic.cdf(x_vals, shape, loc=loc, scale=scale)

        # Transform CDF to -log10(1-CDF), via logsf for accuracy in the tail
        cdf_vals_transformed = -technic.logsf(x_vals, shape, loc=loc, scale=scale) / np.log(10)

        # save both in CDF and transformed CDF folders
        pd.DataFrame(cdf_vals).to_csv(f'data/climatology_dm_winter_per_cluster{without_storms}/GEV_CDF/GEV_CDF_cluster_{cluster_n}.csv')
        pd.DataFrame(cdf_vals_transformed).to_csv(f'data/climatology_dm_winter_per_cluster{without_storms}/GEV_CDF_transformed/GEV_CDF_cluster_{cluster_n}.csv')

        # calculate the ECDF of the data
        ecdf = ECDF(dataset_wind_speed)

        # save the ECDF
        pd.DataFrame(ecdf.y).to_csv(f'data/climatology_dm_winter_per_cluster{without_storms}/ECDF/ECDF_cluster_{cluster_n}.csv')

        # save the dataset_wind_speed
        pd.DataFrame(dataset_wind_speed).to_csv(f'data/climatology_dm_winter_per_cluster{without_storms}/wind_data_cluster_{cluster_n}.csv')

    return pd.DataFrame([log_cdf_by_cluster], index=[event])

def main_joblib_parallel_processing(path, cluster_csv, months, days, tif_15, n_jobs=-1):
    """Run ``process_storm_file`` for every storm raster in ``path`` in parallel.

    Parameters
    ----------
    path : str
        Folder scanned for ``*.tif`` storm rasters. The aggregate raster
        ``max_all_storms.tif`` is ignored.
    cluster_csv : str
        CSV file with the cluster table, read with ``pandas.read_csv``.
    months, days : iterable
        Forwarded to ``initialize_datasets_winter`` to load the climatology.
    tif_15 : object
        Cluster raster dataset forwarded to every ``process_storm_file`` call.
    n_jobs : int, default -1
        Number of joblib workers.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-storm results, one row per storm, in sorted
        file-name order.

    Raises
    ------
    ValueError
        If ``path`` contains no storm raster to process.
    """
    # Collect all storm files. glob returns them in arbitrary order, so sort
    # to make the row order of the combined table reproducible.
    tif_paths = sorted(glob.glob(os.path.join(path, '*.tif')))

    # Event name = file name without its extension. splitext keeps names that
    # contain extra dots intact, whereas split('.')[0] would truncate them.
    events = [
        os.path.splitext(os.path.basename(tif_path))[0]
        for tif_path in tif_paths
        if os.path.basename(tif_path) != 'max_all_storms.tif'
    ]

    # Fail early and clearly, before the expensive climatology load, instead
    # of hitting pd.concat's "No objects to concatenate" at the very end.
    if not events:
        raise ValueError(f"No storm rasters (*.tif) found in '{path}'")

    # Load cluster data
    cluster_data_original = pd.read_csv(cluster_csv)

    # Initialize datasets_winter
    datasets_winter = initialize_datasets_winter(months, days)

    # Run tasks in parallel using joblib
    results = Parallel(n_jobs=n_jobs)(
        delayed(process_storm_file)(event, cluster_data_original, datasets_winter, tif_15)
        for event in events
    )

    # Combine results
    return pd.concat(results)

# Prepare data for processing
# Cluster raster; its spatial dimensions are renamed to longitude/latitude.
tif_15 = xr.open_dataset('pre_processing/cluster/number_of_clusters/cleaned_cluster_15_v1.tif', engine='rasterio')
tif_15 = tif_15.rename({'x': 'longitude', 'y': 'latitude'})

# Relative to the repository root selected by os.chdir at the top of the
# notebook, like every other path here. The previous absolute macOS path
# broke the 'win' and 'curnagl' configurations.
path = 'data/time_series_rasters_storms_15h/'
cluster_csv = 'pre_processing/cluster/number_of_clusters/cleaned_cluster_15_v1.csv'
months = [10, 11, 12, 1, 2, 3]  # Winter months
days = range(1, 32)  # Days from 01 to 31; non-existent dates are skipped by the loader

# Process all storms in parallel
log_cdf_max_combined = main_joblib_parallel_processing(path, cluster_csv, months, days, tif_15, n_jobs=-1)

No data for month 11 and day 31
No data for month 2 and day 30
No data for month 2 and day 31
Processing event: 9_C3S_STORM_TRACKS_ERA5
Processing event: 35_C3S_STORM_TRACKS_ERA5
Return Period of Max 9_C3S_STORM_TRACKS_ERA5 Wind Gust: 2.22 years for cluster 0
Processing event: 82_C3S_STORM_TRACKS_ERA5
Return Period of Max 35_C3S_STORM_TRACKS_ERA5 Wind Gust: 1.56 years for cluster 0
Return Period of Max 9_C3S_STORM_TRACKS_ERA5 Wind Gust: 1.75 years for cluster 1
Processing event: 1_C3S_STORM_TRACKS_ERA5
Return Period of Max 35_C3S_STORM_TRACKS_ERA5 Wind Gust: 1.02 years for cluster 1
Return Period of Max 9_C3S_STORM_TRACKS_ERA5 Wind Gust: 0.73 years for cluster 2
Return Period of Max 82_C3S_STORM_TRACKS_ERA5 Wind Gust: 4.84 years for cluster 0
Processing event: 13_C3S_STORM_TRACKS_ERA5
Return Period of Max 9_C3S_STORM_TRACKS_ERA5 Wind Gust: 3.46 years for cluster 3
Return Period of Max 35_C3S_STORM_TRACKS_ERA5 Wind Gust: 2.40 years for cluster 2
Return Period of Max 82_C3S_STORM_TRACKS_



Return Period of Max 61_EMMA Wind Gust: 0.69 years for cluster 2
Return Period of Max 80_C3S_STORM_TRACKS_ERA5 Wind Gust: 20.46 years for cluster 0
Return Period of Max 19_C3S_STORM_TRACKS_ERA5 Wind Gust: 1.23 years for cluster 4
Processing event: 11_C3S_STORM_TRACKS_ERA5
Return Period of Max 85_C3S_STORM_TRACKS_ERA5 Wind Gust: 9.84 years for cluster 10
Return Period of Max 65_XYNTHIA Wind Gust: 4.27 years for cluster 6
Return Period of Max 6_WIEBKE Wind Gust: 41.33 years for cluster 8
Return Period of Max 80_C3S_STORM_TRACKS_ERA5 Wind Gust: 59.06 years for cluster 1
Return Period of Max 45_C3S_STORM_TRACKS_ERA5 Wind Gust: 0.68 years for cluster 13
Return Period of Max 19_C3S_STORM_TRACKS_ERA5 Wind Gust: 0.96 years for cluster 5
Return Period of Max 61_EMMA Wind Gust: 26.36 years for cluster 3
Return Period of Max 85_C3S_STORM_TRACKS_ERA5 Wind Gust: 3591.98 years for cluster 11
Return Period of Max 65_XYNTHIA Wind Gust: 4.44 years for cluster 7
Return Period of Max 6_WIEBKE Wind Gust: 



Return Period of Max 74_C3S_STORM_TRACKS_ERA5 Wind Gust: 0.56 years for cluster 13
Return Period of Max 30_C3S_STORM_TRACKS_ERA5 Wind Gust: 0.65 years for cluster 1
Return Period of Max 5_VIVIAN Wind Gust: 2.54 years for cluster 7
Processing event: 69_C3S_STORM_TRACKS_ERA5
Return Period of Max 4_C3S_STORM_TRACKS_ERA5 Wind Gust: 71.26 years for cluster 11
Return Period of Max 87_C3S_STORM_TRACKS_ERA5 Wind Gust: 22.51 years for cluster 3
Return Period of Max 74_C3S_STORM_TRACKS_ERA5 Wind Gust: 0.58 years for cluster 14
Return Period of Max 16_C3S_STORM_TRACKS_ERA5 Wind Gust: 0.66 years for cluster 9
Return Period of Max 21_LORE Wind Gust: 1.79 years for cluster 5
Return Period of Max 30_C3S_STORM_TRACKS_ERA5 Wind Gust: 4.98 years for cluster 2
Return Period of Max 5_VIVIAN Wind Gust: 1.93 years for cluster 8
Return Period of Max 4_C3S_STORM_TRACKS_ERA5 Wind Gust: 56.06 years for cluster 12
Return Period of Max 87_C3S_STORM_TRACKS_ERA5 Wind Gust: 1.69 years for cluster 4
Return Period of 



Return Period of Max 86_C3S_STORM_TRACKS_ERA5 Wind Gust: 12.14 years for cluster 0
Return Period of Max 69_C3S_STORM_TRACKS_ERA5 Wind Gust: 66.21 years for cluster 2
Return Period of Max 5_VIVIAN Wind Gust: 1.54 years for cluster 11
Return Period of Max 30_C3S_STORM_TRACKS_ERA5 Wind Gust: 0.99 years for cluster 5
Return Period of Max 16_C3S_STORM_TRACKS_ERA5 Wind Gust: 0.54 years for cluster 13
Return Period of Max 87_C3S_STORM_TRACKS_ERA5 Wind Gust: 4.77 years for cluster 7
Return Period of Max 21_LORE Wind Gust: 0.58 years for cluster 9
Processing event: 42_ORATIA-TORA
Return Period of Max 86_C3S_STORM_TRACKS_ERA5 Wind Gust: 1.66 years for cluster 1
Return Period of Max 5_VIVIAN Wind Gust: 3.39 years for cluster 12
Return Period of Max 69_C3S_STORM_TRACKS_ERA5 Wind Gust: 1.15 years for cluster 3
Return Period of Max 30_C3S_STORM_TRACKS_ERA5 Wind Gust: 10.64 years for cluster 6
Return Period of Max 16_C3S_STORM_TRACKS_ERA5 Wind Gust: 0.70 years for cluster 14
Return Period of Max 87_C

In [26]:
# Move the event labels out of the index (e.g. '65_XYNTHIA') and split each
# one at its first underscore into 'storm_number' and 'storm_name'
id_columns = ['storm_number', 'storm_name']
log_cdf_max_all = log_cdf_max_combined.reset_index()
log_cdf_max_all[id_columns] = log_cdf_max_all['index'].str.split('_', n=1, expand=True)

# The raw label column is no longer needed
log_cdf_max_all = log_cdf_max_all.drop(columns=['index'])

# Identifier columns first, remaining columns after in their existing order
cols = id_columns + [col for col in log_cdf_max_all.columns if col not in id_columns]
log_cdf_max_all = log_cdf_max_all[cols]

# Write the combined table to both output locations
for combined_csv in (
    f'data/climatology_dm_winter_per_cluster{without_storms}/GEV_CDF_max/log_cdf_max_combined.csv',
    f'pre_processing/nestedMLR/log_cdf_max_combined{without_storms}.csv',
):
    log_cdf_max_all.to_csv(combined_csv, index=False)

# Integer storm numbers are needed to match against the split table below
log_cdf_max_all['storm_number'] = log_cdf_max_all['storm_number'].astype(int)

# Table with one column of storm numbers per split (training / validation / test)
storm_training_test_valid = pd.read_csv('pre_processing/time_series_i10fg_before_eu/storm_training_test_valid.csv')

# Select the storms of each split and write them to both output locations
for split in ('training', 'validation', 'test'):
    split_name = f'log_cdf_max_{split}'
    in_split = log_cdf_max_all['storm_number'].isin(storm_training_test_valid[split])
    split_df = log_cdf_max_all.loc[in_split].drop(columns=['storm_name'])

    for split_csv in (
        f'data/climatology_dm_winter_per_cluster{without_storms}/GEV_CDF_max/{split_name}.csv',
        f'pre_processing/nestedMLR/{split_name}{without_storms}.csv',
    ):
        split_df.to_csv(split_csv, index=False)