In [1]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.multioutput import MultiOutputRegressor
from collections import defaultdict
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from math import sqrt
from sklearn.feature_selection import SequentialFeatureSelector
# explicitly require this experimental feature
from sklearn.experimental import enable_halving_search_cv # noqa
# now you can import normally from model_selection
from sklearn.model_selection import HalvingGridSearchCV

operating_system = 'mac'

if operating_system == 'win':
    os.chdir('C:/Users/fabau/OneDrive/Documents/GitHub/master-project-cleaned/')
elif operating_system == 'curnagl':
    os.chdir('/work/FAC/FGSE/IDyST/tbeucler/default/fabien/repos/cleaner_version/')
else:
    os.chdir('/Users/fabienaugsburger/Documents/GitHub/master-project-cleaned/')

# Add the path to the custom library
custom_library_path = os.path.abspath('util/processing/')
sys.path.append(custom_library_path)
custom_library_path = os.path.abspath('util/gev/')
sys.path.append(custom_library_path)
custom_library_path = os.path.abspath('util/feature_selection/')
sys.path.append(custom_library_path)
custom_library_path = os.path.abspath('util/ml/')
sys.path.append(custom_library_path)

import extraction_squares, pre_processing_data, data_processing, selection_vars, sensitivity_test

'''if operating_system == 'curnagl':
    name_of_variable= pd.read_csv('/work/FAC/FGSE/IDyST/tbeucler/default/fabien/repos/curnagl/DATASETS/variable_list_80_mean.csv')
    path_data = '/work/FAC/FGSE/IDyST/tbeucler/default/fabien/repos/curnagl/DATASETS'
else:'''
name_of_variable_20 = pd.read_csv('ml_scripts/feature_selection/corr_timeseries/corr_inst_max_20.csv')['Unnamed: 0']#('data/variable_list_levels.csv')
name_of_variable_30 = pd.read_csv('ml_scripts/feature_selection/corr_timeseries/corr_inst_max_30.csv')['Unnamed: 0']#('data/variable_list_levels.csv')
name_of_variable_40 = pd.read_csv('ml_scripts/feature_selection/corr_timeseries/corr_inst_max_40.csv')['Unnamed: 0']#('data/variable_list_levels.csv')

path_data = 'data'

storm_dates = pd.read_csv('pre_processing/tracks/storm_dates.csv')
#path_tracks_1h_non_EU = 'pre_processing/tracks/ALL_TRACKS/tracks_1h_non_EU'
#dataset = 'datasets_1h'
#dataset_non_EU = 'datasets_1h_non_EU'
levels = pd.read_csv('data/levels.csv')

In [6]:
# import the all_loadings data
all_ts = pd.read_csv('data/time_series_1h_non_EU/all_time_series.csv')
all_loadings = pd.read_csv('ml_scripts/nestedMLR/all_loadings_1000.csv')

'''# Extract variable names and storm data
#variables = all_loadings['variable']  # First column
storm_data = all_loadings.iloc[:, 1:]  # All columns from the second onward

# Transpose storm data and set variable names as columns
transposed_data = storm_data.T
transposed_data.columns = variables'''


# extract the storm number
storm_numbers = all_ts['storm_index'].copy()
transposed_data = all_ts
transposed_data_loadings = all_loadings

# Extract the base names from transposed_data columns by removing "_step_x"
base_names_in_data = set(col.split('_step_')[0] for col in transposed_data.columns)
base_names_in_data_loadings = set(col.split('_step_')[0] for col in transposed_data_loadings.columns)

# Filter the columns based on base names from name_of_variable_20
columns_to_select_20 = [col for col in transposed_data.columns if col.split('_step_')[0] in name_of_variable_20.tolist()]
# Select the corresponding columns from the dataframe
transposed_data_20 = transposed_data[columns_to_select_20]
# Filter the columns based on base names from name_of_variable_30
columns_to_select_30 = [col for col in transposed_data.columns if col.split('_step_')[0] in name_of_variable_30.tolist()]
# Select the corresponding columns from the dataframe
transposed_data_30 = transposed_data[columns_to_select_30]
# Filter the columns based on base names from name_of_variable_40
columns_to_select_40 = [col for col in transposed_data.columns if col.split('_step_')[0] in name_of_variable_40.tolist()]
# Select the corresponding columns from the dataframe
transposed_data_40 = transposed_data[columns_to_select_40]
# Select all the data
all_columns = transposed_data.columns
transposed_data_all = transposed_data[all_columns]

# For the loadings
all_columns_loadings = transposed_data_loadings.columns
transposed_data_loadings_all = transposed_data_loadings[all_columns_loadings]

# add the storm number to the transposed data
transposed_data_20['storm_number'] = storm_numbers.astype(int)
transposed_data_30['storm_number'] = storm_numbers.astype(int)
transposed_data_40['storm_number'] = storm_numbers.astype(int)
transposed_data_all['storm_number'] = storm_numbers.astype(int)
transposed_data_loadings_all['storm_number'] = storm_numbers.astype(int)

# set storm number as first column
cols_20 = transposed_data_20.columns.tolist()
cols_20 = cols_20[-1:] + cols_20[:-1]
transposed_data_20 = transposed_data_20[cols_20]
#cols_20 = cols_20[1:]

cols_30 = transposed_data_30.columns.tolist()
cols_30 = cols_30[-1:] + cols_30[:-1]
transposed_data_30 = transposed_data_30[cols_30]
#cols_30 = cols_30[1:]

cols_40 = transposed_data_40.columns.tolist()
cols_40 = cols_40[-1:] + cols_40[:-1]
transposed_data_40 = transposed_data_40[cols_40]
#cols_40 = cols_40[1:]

cols_all = transposed_data_all.columns.tolist()
cols_all = cols_all[-1:] + cols_all[:-1]
transposed_data_all = transposed_data_all[cols_all]

cols_loadings_all = transposed_data_loadings_all.columns.tolist()
cols_loadings_all = cols_loadings_all[-1:] + cols_loadings_all[:-1]
transposed_data_loadings_all = transposed_data_loadings_all[cols_loadings_all]


# Add storm number to the original data

'''original_data = transposed_data.copy()
original_columns = transposed_data.columns
original_data['storm_number'] = original_data['storm_number'].astype(int)'''

# Add PCA numbers to each variable to differentiate modes
'''
# Count how many times each variable appears in the column names
variable_counts_20 = transposed_data_20.columns.value_counts()
# Create a mapping with PCA numbers appended to each variable
updated_columns_20 = []
pca_tracker_20 = {}
# for 20 variables
for var in transposed_data_20.columns:
    if var not in pca_tracker_20:
        pca_tracker_20[var] = 1
    else:
        pca_tracker_20[var] += 1
    # Append PCA number to the variable name
    updated_columns_20.append(f"{var}_PCA_{pca_tracker_20[var]}")
# Update the column names
transposed_data_20.columns = updated_columns_20
# rename the first column to storm_number
transposed_data_20 = transposed_data_20.rename(columns={'storm_number_PCA_1': 'storm_number'})
transposed_data_20['storm_number'] = transposed_data_20['storm_number'].astype(int)

# for 30 variables
updated_columns_30 = []
pca_tracker_30 = {}
for var in transposed_data_30.columns:
    if var not in pca_tracker_30:
        pca_tracker_30[var] = 1
    else:
        pca_tracker_30[var] += 1
    # Append PCA number to the variable name
    updated_columns_30.append(f"{var}_PCA_{pca_tracker_30[var]}")
# Update the column names
transposed_data_30.columns = updated_columns_30
# rename the first column to storm_number
transposed_data_30 = transposed_data_30.rename(columns={'storm_number_PCA_1': 'storm_number'})
transposed_data_30['storm_number'] = transposed_data_30['storm_number'].astype(int)

# for 40 variables
updated_columns_40 = []
pca_tracker_40 = {}
for var in transposed_data_40.columns:
    if var not in pca_tracker_40:
        pca_tracker_40[var] = 1
    else:
        pca_tracker_40[var] += 1
    # Append PCA number to the variable name
    updated_columns_40.append(f"{var}_PCA_{pca_tracker_40[var]}")
# Update the column names
transposed_data_40.columns = updated_columns_40
# rename the first column to storm_number
transposed_data_40 = transposed_data_40.rename(columns={'storm_number_PCA_1': 'storm_number'})
transposed_data_40['storm_number'] = transposed_data_40['storm_number'].astype(int)'''

# load the actual y values

y_all_cdf = pd.read_csv('data/climatology_dm_winter_per_cluster/GEV_CDF_max/log_cdf_max_combined.csv')
y_all_max = pd.read_csv('data/climatology_dm_winter_per_cluster/EVENT_max/max_event_combined.csv')

# Extract storm indices
storm_indices = transposed_data_20['storm_number'].to_numpy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transposed_data_20['storm_number'] = storm_numbers.astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transposed_data_30['storm_number'] = storm_numbers.astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transposed_data_40['storm_number'] = storm_numbers.astype(int)


In [7]:
seeds = [42, 1996, 45319, 43709, 19961106, 28012025, 15012025, 2019, 111194, 19052024]

for seed in seeds:
    # separate the data in training and testing
    storm_index_training, storm_index_test, storm_index_validation = extraction_squares.split_storm_numbers(storm_indices, 0.12, seed, 'number')

    # order the index of the storms

    storm_index_training.sort()
    storm_index_test.sort()
    storm_index_validation.sort()

    # add +1 to the storm index to match the storm index in the storm_dates dataframe (it's actually storm index for this set, so +1 is needed)
    #storm_index_training = [x+1 for x in storm_index_training]
    #storm_index_test = [x+1 for x in storm_index_test]
    #storm_index_validation = [x+1 for x in storm_index_validation]

    print("Storm Training:", storm_index_training)
    print("Storm Test:", storm_index_test)
    print("Storm Valid:", storm_index_validation) 

    # remove the variable convective_rain_rate and vertical_velocity
    #columns_to_drop = transposed_data.columns[transposed_data.columns.str.startswith(('convective_rain_rate', 'vertical_velocity'))]
    #transposed_data = transposed_data.drop(columns=columns_to_drop)

    # Update the column names
    #updated_columns = transposed_data.columns

    X_train_pca_20 = selection_vars.prepare_training_data(transposed_data_20, storm_index_training, cols_20)
    X_test_pca_20 = selection_vars.prepare_training_data(transposed_data_20, storm_index_test, cols_20)
    X_validation_pca_20 = selection_vars.prepare_training_data(transposed_data_20, storm_index_validation, cols_20)

    X_train_pca_30 = selection_vars.prepare_training_data(transposed_data_30, storm_index_training, cols_30)
    X_test_pca_30 = selection_vars.prepare_training_data(transposed_data_30, storm_index_test, cols_30)
    X_validation_pca_30 = selection_vars.prepare_training_data(transposed_data_30, storm_index_validation, cols_30)

    X_train_pca_40 = selection_vars.prepare_training_data(transposed_data_40, storm_index_training, cols_40)
    X_test_pca_40 = selection_vars.prepare_training_data(transposed_data_40, storm_index_test, cols_40)
    X_validation_pca_40 = selection_vars.prepare_training_data(transposed_data_40, storm_index_validation, cols_40)

    X_train_all = selection_vars.prepare_training_data(transposed_data_all, storm_index_training, cols_all)
    X_test_all = selection_vars.prepare_training_data(transposed_data_all, storm_index_test, cols_all)
    X_validation_all = selection_vars.prepare_training_data(transposed_data_all, storm_index_validation, cols_all)

    X_train_all_loadings = selection_vars.prepare_training_data(transposed_data_loadings_all, storm_index_training, cols_loadings_all)
    X_test_all_loadings = selection_vars.prepare_training_data(transposed_data_loadings_all, storm_index_test, cols_loadings_all)
    X_validation_all_loadings = selection_vars.prepare_training_data(transposed_data_loadings_all, storm_index_validation, cols_loadings_all)

    X_train_pca_20.to_csv(f'ml_scripts/new_feature_selection/seed_{seed}/X_train_ts_20.csv', index=False)
    X_test_pca_20.to_csv(f'ml_scripts/new_feature_selection/seed_{seed}/X_test_ts_20.csv', index=False)
    X_validation_pca_20.to_csv(f'ml_scripts/new_feature_selection/seed_{seed}/X_validation_ts_20.csv', index=False)

    X_train_pca_30.to_csv(f'ml_scripts/new_feature_selection/seed_{seed}/X_train_ts_30.csv', index=False)
    X_test_pca_30.to_csv(f'ml_scripts/new_feature_selection/seed_{seed}/X_test_ts_30.csv', index=False)
    X_validation_pca_30.to_csv(f'ml_scripts/new_feature_selection/seed_{seed}/X_validation_ts_30.csv', index=False)

    X_train_pca_40.to_csv(f'ml_scripts/new_feature_selection/seed_{seed}/X_train_ts_40.csv', index=False)
    X_test_pca_40.to_csv(f'ml_scripts/new_feature_selection/seed_{seed}/X_test_ts_40.csv', index=False)
    X_validation_pca_40.to_csv(f'ml_scripts/new_feature_selection/seed_{seed}/X_validation_ts_40.csv', index=False)

    X_train_all.to_csv(f'ml_scripts/new_feature_selection/seed_{seed}/X_train_ts_all.csv', index=False)
    X_test_all.to_csv(f'ml_scripts/new_feature_selection/seed_{seed}/X_test_ts_all.csv', index=False)
    X_validation_all.to_csv(f'ml_scripts/new_feature_selection/seed_{seed}/X_validation_ts_all.csv', index=False)

    X_train_all_loadings.to_csv(f'ml_scripts/new_feature_selection/seed_{seed}/X_train_all.csv', index=False)
    X_test_all_loadings.to_csv(f'ml_scripts/new_feature_selection/seed_{seed}/X_test_all.csv', index=False)
    X_validation_all_loadings.to_csv(f'ml_scripts/new_feature_selection/seed_{seed}/X_validation_all.csv', index=False)

    # load the actual y values
'''
    y_train_cdf = selection_vars.process_y_data(y_all_cdf, storm_index_training)
    y_test_cdf = selection_vars.process_y_data(y_all_cdf, storm_index_test)
    y_validation_cdf = selection_vars.process_y_data(y_all_cdf, storm_index_validation)

    y_train_max = selection_vars.process_y_data(y_all_max, storm_index_training)
    y_test_max = selection_vars.process_y_data(y_all_max, storm_index_test)
    y_validation_max = selection_vars.process_y_data(y_all_max, storm_index_validation)'''

To keep 50 storms in the training set, storms 45 and 87 are removed from the test set.
Storm Training: [1, 2, 3, 5, 7, 8, 11, 12, 13, 16, 19, 26, 27, 31, 32, 34, 39, 43, 45, 46, 49, 50, 51, 53, 54, 56, 60, 61, 62, 63, 64, 65, 67, 68, 69, 71, 72, 73, 76, 77, 78, 79, 80, 81, 82, 85, 87, 89, 90, 95]
Storm Test: [6, 29, 38, 48, 66, 86, 93]
Storm Valid: [21, 33, 44, 47, 58, 83]
To keep 50 storms in the training set, storms 45 and 87 are removed from the test set.
Storm Training: [1, 2, 5, 7, 8, 11, 12, 13, 16, 19, 21, 26, 27, 31, 32, 33, 34, 39, 43, 45, 46, 47, 49, 50, 51, 56, 58, 60, 61, 62, 63, 64, 65, 67, 68, 69, 71, 72, 73, 77, 78, 79, 81, 82, 83, 85, 87, 89, 90, 95]
Storm Test: [6, 29, 38, 48, 66, 86, 93]
Storm Valid: [3, 44, 53, 54, 76, 80]
To keep 50 storms in the training set, storms 45 and 87 are removed from the test set.
Storm Training: [1, 2, 3, 5, 7, 8, 11, 12, 13, 16, 19, 26, 27, 31, 33, 34, 39, 43, 44, 45, 46, 47, 49, 50, 51, 53, 54, 56, 58, 60, 61, 62, 63, 64, 67, 68, 69, 71

'\n    y_train_cdf = selection_vars.process_y_data(y_all_cdf, storm_index_training)\n    y_test_cdf = selection_vars.process_y_data(y_all_cdf, storm_index_test)\n    y_validation_cdf = selection_vars.process_y_data(y_all_cdf, storm_index_validation)\n\n    y_train_max = selection_vars.process_y_data(y_all_max, storm_index_training)\n    y_test_max = selection_vars.process_y_data(y_all_max, storm_index_test)\n    y_validation_max = selection_vars.process_y_data(y_all_max, storm_index_validation)'