In [2]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.multioutput import MultiOutputRegressor
from collections import defaultdict
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from math import sqrt
from sklearn.feature_selection import SequentialFeatureSelector
# explicitly require this experimental feature
from sklearn.experimental import enable_halving_search_cv # noqa
# now you can import normally from model_selection
from sklearn.model_selection import HalvingGridSearchCV

operating_system = 'mac'

if operating_system == 'win':
    os.chdir('C:/Users/fabau/OneDrive/Documents/GitHub/master-project-cleaned/')
elif operating_system == 'curnagl':
    os.chdir('/work/FAC/FGSE/IDyST/tbeucler/default/fabien/repos/cleaner_version/')
else:
    os.chdir('/Users/fabienaugsburger/Documents/GitHub/master-project-cleaned/')

# Add the path to the custom library
custom_library_path = os.path.abspath('util/processing/')
sys.path.append(custom_library_path)
custom_library_path = os.path.abspath('util/gev/')
sys.path.append(custom_library_path)
custom_library_path = os.path.abspath('util/feature_selection/')
sys.path.append(custom_library_path)
custom_library_path = os.path.abspath('util/ml/')
sys.path.append(custom_library_path)

import extraction_squares, pre_processing_data, data_processing, selection_vars, sensitivity_test

'''if operating_system == 'curnagl':
    name_of_variable= pd.read_csv('/work/FAC/FGSE/IDyST/tbeucler/default/fabien/repos/curnagl/DATASETS/variable_list_80_mean.csv')
    path_data = '/work/FAC/FGSE/IDyST/tbeucler/default/fabien/repos/curnagl/DATASETS'
else:'''
name_of_variable_20 = pd.read_csv('ml_scripts/feature_selection/corr_timeseries/corr_inst_max_20.csv')['Unnamed: 0']#('data/variable_list_levels.csv')
name_of_variable_30 = pd.read_csv('ml_scripts/feature_selection/corr_timeseries/corr_inst_max_30.csv')['Unnamed: 0']#('data/variable_list_levels.csv')
name_of_variable_40 = pd.read_csv('ml_scripts/feature_selection/corr_timeseries/corr_inst_max_40.csv')['Unnamed: 0']#('data/variable_list_levels.csv')

path_data = 'data'

storm_dates = pd.read_csv('pre_processing/tracks/storm_dates.csv')
#path_tracks_1h_non_EU = 'pre_processing/tracks/ALL_TRACKS/tracks_1h_non_EU'
#dataset = 'datasets_1h'
#dataset_non_EU = 'datasets_1h_non_EU'
levels = pd.read_csv('data/levels.csv')

In [3]:
# import the all_loadings data
all_loadings = pd.read_csv('ml_scripts/nestedMLR/all_loadings_1000.csv')

# Extract variable names and storm data
variables = all_loadings['variable']  # First column
storm_data = all_loadings.iloc[:, 1:]  # All columns from the second onward

# Transpose storm data and set variable names as columns
transposed_data = storm_data.T
transposed_data.columns = variables

# Optionally reset index to name storms
transposed_data.index.name = 'storm_number'
transposed_data.reset_index(inplace=True)

# extract the storm number
storm_numbers = transposed_data['storm_number'].copy()

# Extract variables most correlated with the target and leaving the storm number
columns_to_select_20 = [col for col in name_of_variable_20.tolist() if col in transposed_data.columns]
transposed_data_20 = transposed_data[columns_to_select_20]
columns_to_select_30 = [col for col in name_of_variable_30.tolist() if col in transposed_data.columns]
transposed_data_30 = transposed_data[columns_to_select_30]
columns_to_select_40 = [col for col in name_of_variable_40.tolist() if col in transposed_data.columns]
transposed_data_40 = transposed_data[columns_to_select_40]

# add the storm number to the transposed data
transposed_data_20['storm_number'] = storm_numbers
transposed_data_30['storm_number'] = storm_numbers
transposed_data_40['storm_number'] = storm_numbers

'''original_data = transposed_data.copy()
original_columns = transposed_data.columns
original_data['storm_number'] = original_data['storm_number'].astype(int)'''

# Add PCA numbers to each variable to differentiate modes

# Count how many times each variable appears in the column names
variable_counts_20 = transposed_data_20.columns.value_counts()
# Create a mapping with PCA numbers appended to each variable
updated_columns_20 = []
pca_tracker_20 = {}
# for 20 variables
for var in transposed_data_20.columns:
    if var not in pca_tracker_20:
        pca_tracker_20[var] = 1
    else:
        pca_tracker_20[var] += 1
    # Append PCA number to the variable name
    updated_columns_20.append(f"{var}_PCA_{pca_tracker_20[var]}")
# Update the column names
transposed_data_20.columns = updated_columns_20
# rename the first column to storm_number
transposed_data_20 = transposed_data_20.rename(columns={'storm_number_PCA_1': 'storm_number'})
transposed_data_20['storm_number'] = transposed_data_20['storm_number'].astype(int)

# for 30 variables
updated_columns_30 = []
pca_tracker_30 = {}
for var in transposed_data_30.columns:
    if var not in pca_tracker_30:
        pca_tracker_30[var] = 1
    else:
        pca_tracker_30[var] += 1
    # Append PCA number to the variable name
    updated_columns_30.append(f"{var}_PCA_{pca_tracker_30[var]}")
# Update the column names
transposed_data_30.columns = updated_columns_30
# rename the first column to storm_number
transposed_data_30 = transposed_data_30.rename(columns={'storm_number_PCA_1': 'storm_number'})
transposed_data_30['storm_number'] = transposed_data_30['storm_number'].astype(int)

# for 40 variables
updated_columns_40 = []
pca_tracker_40 = {}
for var in transposed_data_40.columns:
    if var not in pca_tracker_40:
        pca_tracker_40[var] = 1
    else:
        pca_tracker_40[var] += 1
    # Append PCA number to the variable name
    updated_columns_40.append(f"{var}_PCA_{pca_tracker_40[var]}")
# Update the column names
transposed_data_40.columns = updated_columns_40
# rename the first column to storm_number
transposed_data_40 = transposed_data_40.rename(columns={'storm_number_PCA_1': 'storm_number'})
transposed_data_40['storm_number'] = transposed_data_40['storm_number'].astype(int)

# load the actual y values

y_all_cdf = pd.read_csv('data/climatology_dm_winter_per_cluster/GEV_CDF_max/log_cdf_max_combined.csv')
y_all_max = pd.read_csv('data/climatology_dm_winter_per_cluster/EVENT_max/max_event_combined.csv')

# Extract storm indices
storm_indices = transposed_data_20['storm_number'].to_numpy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transposed_data_20['storm_number'] = storm_numbers
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transposed_data_30['storm_number'] = storm_numbers
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transposed_data_40['storm_number'] = storm_numbers


In [3]:
seeds = [42, 1996, 45319, 43709]

for seed in seeds:
    # separate the data in training and testing
    storm_index_training, storm_index_test, storm_index_validation = extraction_squares.split_storm_numbers(storm_indices, 0.12, seed, 'number')

    # order the index of the storms

    storm_index_training.sort()
    storm_index_test.sort()
    storm_index_validation.sort()

    # add +1 to the storm index to match the storm index in the storm_dates dataframe (it's actually storm index for this set, so +1 is needed)
    #storm_index_training = [x+1 for x in storm_index_training]
    #storm_index_test = [x+1 for x in storm_index_test]
    #storm_index_validation = [x+1 for x in storm_index_validation]

    print("Storm Training:", storm_index_training)
    print("Storm Test:", storm_index_test)
    print("Storm Valid:", storm_index_validation) 

    # remove the variable convective_rain_rate and vertical_velocity
    #columns_to_drop = transposed_data.columns[transposed_data.columns.str.startswith(('convective_rain_rate', 'vertical_velocity'))]
    #transposed_data = transposed_data.drop(columns=columns_to_drop)

    # Update the column names
    #updated_columns = transposed_data.columns

    X_train_pca_20 = selection_vars.prepare_training_data(transposed_data_20, storm_index_training, updated_columns_20)
    X_test_pca_20 = selection_vars.prepare_training_data(transposed_data_20, storm_index_test, updated_columns_20)
    X_validation_pca_20 = selection_vars.prepare_training_data(transposed_data_20, storm_index_validation, updated_columns_20)

    X_train_pca_30 = selection_vars.prepare_training_data(transposed_data_30, storm_index_training, updated_columns_30)
    X_test_pca_30 = selection_vars.prepare_training_data(transposed_data_30, storm_index_test, updated_columns_30)
    X_validation_pca_30 = selection_vars.prepare_training_data(transposed_data_30, storm_index_validation, updated_columns_30)

    X_train_pca_40 = selection_vars.prepare_training_data(transposed_data_40, storm_index_training, updated_columns_40)
    X_test_pca_40 = selection_vars.prepare_training_data(transposed_data_40, storm_index_test, updated_columns_40)
    X_validation_pca_40 = selection_vars.prepare_training_data(transposed_data_40, storm_index_validation, updated_columns_40)

    # without the PCA in the names :

    '''X_train_original = prepare_training_data(original_data, storm_index_training, original_columns)
    X_validation_original = prepare_training_data(original_data, storm_index_validation, original_columns)
    X_test_original = prepare_training_data(original_data, storm_index_test, original_columns)

    X_train_original = X_train_original[columns_to_select]
    X_validation_original = X_validation_original[columns_to_select]
    X_test_original = X_test_original[columns_to_select]'''

    # load the actual y values

    y_train_cdf = selection_vars.process_y_data(y_all_cdf, storm_index_training)
    y_test_cdf = selection_vars.process_y_data(y_all_cdf, storm_index_test)
    y_validation_cdf = selection_vars.process_y_data(y_all_cdf, storm_index_validation)

    y_train_max = selection_vars.process_y_data(y_all_max, storm_index_training)
    y_test_max = selection_vars.process_y_data(y_all_max, storm_index_test)
    y_validation_max = selection_vars.process_y_data(y_all_max, storm_index_validation)

        # Example usage
    param_grid_xgb = {
        'n_estimators': [50, 100, 200, 500],
        'max_depth': [3, 5, 10, 20, 40],
        'learning_rate': np.linspace(0.05, 0.2, 4)
    }

    results = sensitivity_test.process_xgboost_workflow(
        X_train_pca={
            20: X_train_pca_20,
            30: X_train_pca_30,
            40: X_train_pca_40,
        },
        X_validation_pca={
            20: X_validation_pca_20,
            30: X_validation_pca_30,
            40: X_validation_pca_40,
        },
        y_train={
            'cdf': y_train_cdf,
            'max': y_train_max,
        },
        y_validation={
            'cdf': y_validation_cdf,
            'max': y_validation_max,
        },
        variable_counts=[20, 30, 40],
        target_types=['cdf'], # 'max' is out of scope for now
        param_grid=param_grid_xgb
    )

To keep 50 storms in the training set, storms 45 and 87 are removed from the test set.
Storm Training: [1, 2, 3, 5, 7, 8, 11, 12, 13, 16, 19, 26, 27, 31, 32, 34, 39, 43, 45, 46, 49, 50, 51, 53, 54, 56, 60, 61, 62, 63, 64, 65, 67, 68, 69, 71, 72, 73, 76, 77, 78, 79, 80, 81, 82, 85, 87, 89, 90, 95]
Storm Test: [6, 29, 38, 48, 66, 86, 93]
Storm Valid: [21, 33, 44, 47, 58, 83]
y is already a numpy array
Selected features: Index(['mean_surface_latent_heat_flux_std_PCA_2',
       'mean_surface_latent_heat_flux_std_PCA_3', 'geopotential_500_max_PCA_1',
       'surface_pressure_max_PCA_2', 'geopotential_1000_std_PCA_2'],
      dtype='object')
Target: cdf, Variables: 20
RMSE: 1.6723358439700844, MAE: 1.1508057535607916
Selected Features: Index(['mean_surface_latent_heat_flux_std_PCA_2',
       'mean_surface_latent_heat_flux_std_PCA_3', 'geopotential_500_max_PCA_1',
       'surface_pressure_max_PCA_2', 'geopotential_1000_std_PCA_2'],
      dtype='object')
------------------------------
y is alre

KeyboardInterrupt: 

In [None]:
# Default usage
param_grid_xgb = {
    'n_estimators': [50, 100, 200, 500],
    'max_depth': [3, 5, 10, 20, 40],
    'learning_rate': np.linspace(0.05, 0.2, 4)
}

results = sensitivity_test.process_xgboost_workflow(
    X_train_pca={
        20: X_train_pca_20,
        30: X_train_pca_30,
        40: X_train_pca_40,
    },
    X_validation_pca={
        20: X_validation_pca_20,
        30: X_validation_pca_30,
        40: X_validation_pca_40,
    },
    y_train={
        'cdf': y_train_cdf,
        'max': y_train_max,
    },
    y_validation={
        'cdf': y_validation_cdf,
        'max': y_validation_max,
    },
    variable_counts=[20, 30, 40],
    target_types=['cdf', 'max'],
    param_grid=param_grid_xgb
)

n_iterations: 2
n_required_iterations: 4
n_possible_iterations: 2
min_resources_: 10
max_resources_: 50
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 80
n_resources: 10
Fitting 5 folds for each of 80 candidates, totalling 400 fits
----------
iter: 1
n_candidates: 27
n_resources: 30
Fitting 5 folds for each of 27 candidates, totalling 135 fits
y is already a numpy array
Selected features: Index(['mean_sea_level_pressure_min_PCA_1', 'surface_pressure_mean_PCA_2',
       'geopotential_500_max_PCA_1', 'mean_sea_level_pressure_max_PCA_2',
       'convective_precipitation_mean_PCA_1'],
      dtype='object')
Target: cdf, Variables: 20
RMSE: 2.013381796243133, MAE: 1.3409397456685754
Selected Features: Index(['mean_sea_level_pressure_min_PCA_1', 'surface_pressure_mean_PCA_2',
       'geopotential_500_max_PCA_1', 'mean_sea_level_pressure_max_PCA_2',
       'convective_precipitation_mean_PCA_1'],
      dtype='object')
------------------------------
n_iterations: 2
n_re

KeyboardInterrupt: 

In [3]:
seeds = [42, 1996, 45319, 43709, 19961106, 28012025, 15012025, 2019, 111194, 19052024]
for seed in seeds:
    print(f"\nProcessing for seed: {seed}")

    for count in [20, 30, 40]:
    
        # Split the data
        storm_index_training, storm_index_test, storm_index_validation = extraction_squares.split_storm_numbers(
            storm_indices, 0.12, seed, 'number'
        )
        storm_index_training.sort()
        storm_index_test.sort()
        storm_index_validation.sort()
        
        print("Storm Training:", storm_index_training)
        print("Storm Test:", storm_index_test)
        print("Storm Validation:", storm_index_validation)

        X_train = selection_vars.prepare_training_data(
            locals()[f'transposed_data_{count}'], storm_index_training, locals()[f'updated_columns_{count}'])
        X_test = selection_vars.prepare_training_data(
            locals()[f'transposed_data_{count}'], storm_index_test, locals()[f'updated_columns_{count}'])
        X_validation = selection_vars.prepare_training_data(
            locals()[f'transposed_data_{count}'], storm_index_validation, locals()[f'updated_columns_{count}'])
        
        # Load the actual y values
        y_train_cdf = selection_vars.process_y_data(y_all_cdf, storm_index_training)
        y_test_cdf = selection_vars.process_y_data(y_all_cdf, storm_index_test)
        y_validation_cdf = selection_vars.process_y_data(y_all_cdf, storm_index_validation)

        y_train_max = selection_vars.process_y_data(y_all_max, storm_index_training)
        y_test_max = selection_vars.process_y_data(y_all_max, storm_index_test)
        y_validation_max = selection_vars.process_y_data(y_all_max, storm_index_validation)

        # Save the data
        X_train.to_csv(f'data/sensitivity_test/X_train_{count}_{seed}.csv', index=False)
        X_test.to_csv(f'data/sensitivity_test/X_test_{count}_{seed}.csv', index=False)
        X_validation.to_csv(f'data/sensitivity_test/X_validation_{count}_{seed}.csv', index=False)

        pd.DataFrame(y_train_cdf).to_csv(f'data/sensitivity_test/y_train_cdf_{seed}.csv', index=False)
        pd.DataFrame(y_test_cdf).to_csv(f'data/sensitivity_test/y_test_cdf_{seed}.csv', index=False)
        pd.DataFrame(y_validation_cdf).to_csv(f'data/sensitivity_test/y_validation_cdf_{seed}.csv', index=False)

        pd.DataFrame(y_train_max).to_csv(f'data/sensitivity_test/y_train_max_{seed}.csv', index=False)
        pd.DataFrame(y_test_max).to_csv(f'data/sensitivity_test/y_test_max_{seed}.csv', index=False)
        pd.DataFrame(y_validation_max).to_csv(f'data/sensitivity_test/y_validation_max_{seed}.csv', index=False)

        print(f"Data saved for count: {count} and seed: {seed}")
    


Processing for seed: 42
To keep 50 storms in the training set, storms 45 and 87 are removed from the test set.
Storm Training: [1, 2, 3, 5, 7, 8, 11, 12, 13, 16, 19, 26, 27, 31, 32, 34, 39, 43, 45, 46, 49, 50, 51, 53, 54, 56, 60, 61, 62, 63, 64, 65, 67, 68, 69, 71, 72, 73, 76, 77, 78, 79, 80, 81, 82, 85, 87, 89, 90, 95]
Storm Test: [6, 29, 38, 48, 66, 86, 93]
Storm Validation: [21, 33, 44, 47, 58, 83]
Data saved for count: 20 and seed: 42
To keep 50 storms in the training set, storms 45 and 87 are removed from the test set.
Storm Training: [1, 2, 3, 5, 7, 8, 11, 12, 13, 16, 19, 26, 27, 31, 32, 34, 39, 43, 45, 46, 49, 50, 51, 53, 54, 56, 60, 61, 62, 63, 64, 65, 67, 68, 69, 71, 72, 73, 76, 77, 78, 79, 80, 81, 82, 85, 87, 89, 90, 95]
Storm Test: [6, 29, 38, 48, 66, 86, 93]
Storm Validation: [21, 33, 44, 47, 58, 83]
Data saved for count: 30 and seed: 42
To keep 50 storms in the training set, storms 45 and 87 are removed from the test set.
Storm Training: [1, 2, 3, 5, 7, 8, 11, 12, 13, 16,

In [4]:
from collections import defaultdict

def sensitivity_analysis(seeds, split_function, process_workflow, data_dict):
    sensitivity_results = defaultdict(list)

    for seed in seeds:
        print(f"\nProcessing for seed: {seed}")
        
        # Split the data
        storm_index_training, storm_index_test, storm_index_validation = split_function(
            data_dict['storm_indices'], 0.12, seed, 'number'
        )
        storm_index_training.sort()
        storm_index_test.sort()
        storm_index_validation.sort()
        
        print("Storm Training:", storm_index_training)
        print("Storm Test:", storm_index_test)
        print("Storm Validation:", storm_index_validation)
        
        # Prepare PCA datasets
        X_train_pca = {count: selection_vars.prepare_training_data(
            data_dict[f'transposed_data_{count}'], storm_index_training, data_dict[f'updated_columns_{count}']
        ) for count in [20, 30, 40]}
        
        X_validation_pca = {count: selection_vars.prepare_training_data(
            data_dict[f'transposed_data_{count}'], storm_index_validation, data_dict[f'updated_columns_{count}']
        ) for count in [20, 30, 40]}

        # Prepare y data
        y_train = {
            'cdf': data_dict['selection_vars'].process_y_data(data_dict['y_all_cdf'], storm_index_training),
            'max': data_dict['selection_vars'].process_y_data(data_dict['y_all_max'], storm_index_training),
        }
        y_validation = {
            'cdf': data_dict['selection_vars'].process_y_data(data_dict['y_all_cdf'], storm_index_validation),
            'max': data_dict['selection_vars'].process_y_data(data_dict['y_all_max'], storm_index_validation),
        }

        # Process the workflow for this seed
        results = process_workflow(
            X_train_pca=X_train_pca,
            X_validation_pca=X_validation_pca,
            y_train=y_train,
            y_validation=y_validation,
            variable_counts=[20, 30, 40],
            target_types=['cdf', 'max'],  # or ['cdf', 'max']
            param_grid=data_dict['param_grid']
        )

        # Collect selected features for each variable count and target type
        for key, res in results.items():
            sensitivity_results[key].append(set(res['selected_features']))

    return sensitivity_results

In [5]:
# TAKES 71 MINUTES TO RUN (with 10 seeds) otherwise 24 minutes with 4 seeds

# Define seeds
seeds = [42, 1996, 45319, 43709, 19961106, 28012025, 15012025, 2019, 111194, 19052024]

# or generate random seeds
#seeds = np.random.randint(0, 100000, 10).tolist()
print_info = 'yes'

# Define data and required functions in a dictionary for modularity
data_dict = {
    'storm_indices': storm_indices,
    'transposed_data_20': transposed_data_20,
    'transposed_data_30': transposed_data_30,
    'transposed_data_40': transposed_data_40,
    'updated_columns_20': updated_columns_20,
    'updated_columns_30': updated_columns_30,
    'updated_columns_40': updated_columns_40,
    'selection_vars': selection_vars,
    'y_all_cdf': y_all_cdf,
    'y_all_max': y_all_max,
    'param_grid': {
        'n_estimators': [5, 10, 20, 50, 100, 200, 500],
        'max_depth': [1, 2, 3, 5, 10, 20, 40],
        'learning_rate': np.linspace(0.05, 0.2, 8)
    }
}

# Run sensitivity analysis
sensitivity_results = sensitivity_analysis(
    seeds=seeds,
    split_function=extraction_squares.split_storm_numbers,
    process_workflow=sensitivity_test.process_xgboost_workflow,  # Assuming this is the earlier provided function
    data_dict=data_dict
)

# Analyze the sensitivity
for key, selected_features in sensitivity_results.items():
    union_features = set.union(*selected_features)
    intersection_features = set.intersection(*selected_features)
    print(f"\nTarget and Variables: {key}")
    print(f"Selected Features Union: {union_features}")
    print(f"Selected Features Intersection: {intersection_features}")
    print(f"Variability: {len(union_features) - len(intersection_features)}")
    print('-' * 50)


Processing for seed: 42
To keep 50 storms in the training set, storms 45 and 87 are removed from the test set.
Storm Training: [1, 2, 3, 5, 7, 8, 11, 12, 13, 16, 19, 26, 27, 31, 32, 34, 39, 43, 45, 46, 49, 50, 51, 53, 54, 56, 60, 61, 62, 63, 64, 65, 67, 68, 69, 71, 72, 73, 76, 77, 78, 79, 80, 81, 82, 85, 87, 89, 90, 95]
Storm Test: [6, 29, 38, 48, 66, 86, 93]
Storm Validation: [21, 33, 44, 47, 58, 83]
y is already a numpy array
Target: cdf, Variables: 20
RMSE Before Tuning: 1.6723358439700844, MAE Before Tuning: 1.1508057535607916
RMSE After Tuning: 1.1320325711600632, MAE After Tuning: 0.8733677808698516
R2 Before Tuning: -5.938676235281977, R2 After Tuning: -0.9791489956587122
Relative Variance on validation set: 0.7341266511692504
Relative Variance on training set: 1.2214691375806768
Residuals before tuning: [[ 1.75762396e-01 -9.76407129e-01 -3.85941587e-01  3.87975267e-01
   1.53126661e+00  8.58302988e-01  4.94792376e-01 -5.99794963e-01
  -3.77176906e-03  2.75567629e-02  7.5268237

KeyboardInterrupt: 

In [5]:
# Collect data for the DataFrame
results_data = []

for key, feature_sets in sensitivity_results.items():
    target_type, var_count = key.split('_')  # Extract target type and variable count
    all_features = [features for features in feature_sets]  # List of feature sets across seeds

    # Compute union and intersection
    union_features = set.union(*all_features)
    intersection_features = set.intersection(*all_features)
    variability = len(union_features) - len(intersection_features)
    consistency_score = len(intersection_features) / len(union_features) if len(union_features) > 0 else 0

    # Append data to results list
    results_data.append({
        'Target Type': target_type,
        'Variable Count': var_count,
        'All Features': all_features,
        #'Union Features': union_features,
        #'Intersection Features': intersection_features,
        #'Variability': variability,
        #'Consistency Score': consistency_score
    })

# Create a DataFrame
results_df = pd.DataFrame(results_data)

results_cdf = results_df[results_df['Target Type'] == 'cdf']
results_max = results_df[results_df['Target Type'] == 'max']

In [7]:
version = 'v4'

# Extract the results for each dataset (20-30-40 vars) variables
results_cdf_20 = pd.DataFrame(results_cdf['All Features'][0])
results_cdf_30 = pd.DataFrame(results_cdf['All Features'][1])
results_cdf_40 = pd.DataFrame(results_cdf['All Features'][2])

# collapse into a single list
results_cdf_20 = [item for sublist in results_cdf['All Features'][0] for item in sublist]
results_cdf_30 = [item for sublist in results_cdf['All Features'][1] for item in sublist]
results_cdf_40 = [item for sublist in results_cdf['All Features'][2] for item in sublist]

# combine the 3 lists into one
results_cdf_all_vars = results_cdf_20 + results_cdf_30 + results_cdf_40

# count the number of times each variable appears in the list
results_cdf_count = pd.Series(results_cdf_all_vars).value_counts()

# repeat the same for the max dataset
results_max_20 = pd.DataFrame(results_max['All Features'][3])
results_max_30 = pd.DataFrame(results_max['All Features'][4])
results_max_40 = pd.DataFrame(results_max['All Features'][5])

# collapse into a single list
results_max_20 = [item for sublist in results_max['All Features'][3] for item in sublist]
results_max_30 = [item for sublist in results_max['All Features'][4] for item in sublist]
results_max_40 = [item for sublist in results_max['All Features'][5] for item in sublist]

# combine the 3 lists into one
results_max_all_vars = results_max_20 + results_max_30 + results_max_40

# count the number of times each variable appears in the list
results_max_count = pd.Series(results_max_all_vars).value_counts()

# create a new list with the 2 preivous results_target_all_vars and count each variable
results_target_all_vars = results_cdf_all_vars + results_max_all_vars
results_target_count = pd.Series(results_target_all_vars).value_counts()

# export the 3 lists to a csv file
results_cdf_count.to_csv(f'pre_processing/feature_selection/fs_mls_ts_tests/cdf_count_{version}.csv')
results_max_count.to_csv(f'pre_processing/feature_selection/fs_mls_ts_tests/max_count_{version}.csv')
results_target_count.to_csv(f'pre_processing/feature_selection/fs_mls_ts_tests/both_target_count_{version}.csv')

In [9]:
# select the top 10 % of the variables

percentage = 0.5
results_cdf_count_10 = results_cdf_count[results_cdf_count > percentage*len(seeds)]
results_max_count_10 = results_max_count[results_max_count > percentage*len(seeds)]
results_target_count_10 = results_target_count[results_target_count > percentage*len(seeds)]

# export the 3 lists to a csv file

results_cdf_count_10.to_csv(f'pre_processing/feature_selection/fs_mls_ts_tests/cdf_count_50_{version}.csv')
results_max_count_10.to_csv(f'pre_processing/feature_selection/fs_mls_ts_tests/max_count_50_{version}.csv')
results_target_count_10.to_csv(f'pre_processing/feature_selection/fs_mls_ts_tests/both_target_count_50_{version}.csv')

# selection based on RMSE and MAE

In [8]:
from collections import defaultdict

def combined_workflow_with_best_model(seeds, split_function, data_dict, param_grid_og, print_info='yes'):
    def process_workflow(X_train_pca, X_validation_pca, y_train, y_validation, variable_counts, target_types):
        results = {}
        best_models = {target_type: {'model': None, 'rmse': float('inf'), 'mae': float('inf')} for target_type in target_types}

        for target_type in target_types:  # e.g., ['cdf', 'max']
            for var_count in variable_counts:  # e.g., [20, 30, 40]
                for ml in ['xgboost', 'random_forest']:
                    # Convert PCA data to numpy
                    X_train_np = X_train_pca[var_count].to_numpy()
                    X_validation_np = X_validation_pca[var_count].to_numpy()

                    # Initialize and train model
                    if ml == 'xgboost':
                        model = XGBRegressor(random_state=42, n_jobs=-1)
                    elif ml == 'random_forest':
                        model = RandomForestRegressor(random_state=42, n_jobs=-1)

                    model.fit(X_train_np, y_train[target_type])

                    # Predictions
                    predictions = model.predict(X_validation_np)

                    # Metrics before tuning
                    rmse = np.sqrt(mean_squared_error(y_validation[target_type], predictions))
                    mae = mean_absolute_error(y_validation[target_type], predictions)

                    # Update best model if applicable
                    if rmse < best_models[target_type]['rmse'] or (
                        rmse == best_models[target_type]['rmse'] and mae < best_models[target_type]['mae']):
                        best_models[target_type] = {
                            'model': model,
                            'rmse': rmse,
                            'mae': mae,
                            'var_count': var_count,
                            'ml': ml,
                            'params': None  # Params will be updated after tuning
                        }

                    # Hyperparameter tuning
                    if ml == 'random_forest':
                        param_grid = {
                            'n_estimators': [5, 10, 20, 50, 100, 200, 500],
                            'max_depth': [1, 2, 3, 5, 10, 20, 40]
                        }
                    else:
                        param_grid = param_grid_og

                    search = HalvingGridSearchCV(model, param_grid, cv=5, n_jobs=-1, verbose=0)
                    search.fit(X_train_np, y_train[target_type])

                    # Get best model after tuning
                    best_model = search.best_estimator_

                    # Predictions after tuning
                    tuned_predictions = best_model.predict(X_validation_np)

                    # Metrics after tuning
                    rmse_tuned = np.sqrt(mean_squared_error(y_validation[target_type], tuned_predictions))
                    mae_tuned = mean_absolute_error(y_validation[target_type], tuned_predictions)

                    # Update best model if applicable
                    if rmse_tuned < best_models[target_type]['rmse'] or (
                        rmse_tuned == best_models[target_type]['rmse'] and mae_tuned < best_models[target_type]['mae']):
                        best_models[target_type] = {
                            'model': best_model,
                            'rmse': rmse_tuned,
                            'mae': mae_tuned,
                            'var_count': var_count,
                            'ml': ml,
                            'params': search.best_params_
                        }

                    # Feature Selection
                    selected_vars = data_dict['selection_vars'].feature_selection(
                        X_train_pca[var_count],
                        X_train_np,
                        y_train[target_type],
                        best_model
                    )

                    # Store results
                    results[f'{target_type}_{var_count}'] = {
                        'model': best_model,
                        'rmse_after_tuning': rmse_tuned,
                        'mae_after_tuning': mae_tuned,
                        'best_params': search.best_params_,
                        'selected_features': selected_vars,
                    }

                    if print_info == 'yes':
                        print(f"Target: {target_type}, Variables: {var_count}")
                        print(f"ML Model: {ml}")
                        print(f"RMSE After Tuning: {rmse_tuned}, MAE After Tuning: {mae_tuned}")
                        print(f"Best Params: {search.best_params_}")
                        print(f"Selected Features: {selected_vars}")
                        print('-' * 30)

        return results, best_models

    sensitivity_results = defaultdict(list)
    overall_best_models = {target_type: {'model': None, 'rmse': float('inf'), 'mae': float('inf')} for target_type in ['cdf', 'max']}

    for seed in seeds:
        print(f"\nProcessing for seed: {seed}")

        # Split the data
        storm_index_training, storm_index_test, storm_index_validation = split_function(
            data_dict['storm_indices'], 0.12, seed, 'number'
        )
        storm_index_training.sort()
        storm_index_test.sort()
        storm_index_validation.sort()

        # Prepare PCA datasets
        X_train_pca = {count: data_dict['selection_vars'].prepare_training_data(
            data_dict[f'transposed_data_{count}'], storm_index_training, data_dict[f'updated_columns_{count}']
        ) for count in [20, 30, 40]}

        X_validation_pca = {count: data_dict['selection_vars'].prepare_training_data(
            data_dict[f'transposed_data_{count}'], storm_index_validation, data_dict[f'updated_columns_{count}']
        ) for count in [20, 30, 40]}

        # Prepare y data
        y_train = {
            'cdf': data_dict['selection_vars'].process_y_data(data_dict['y_all_cdf'], storm_index_training),
            'max': data_dict['selection_vars'].process_y_data(data_dict['y_all_max'], storm_index_training),
        }
        y_validation = {
            'cdf': data_dict['selection_vars'].process_y_data(data_dict['y_all_cdf'], storm_index_validation),
            'max': data_dict['selection_vars'].process_y_data(data_dict['y_all_max'], storm_index_validation),
        }

        # Process the workflow for this seed
        results, best_models = process_workflow(
            X_train_pca=X_train_pca,
            X_validation_pca=X_validation_pca,
            y_train=y_train,
            y_validation=y_validation,
            variable_counts=[20, 30, 40],
            target_types=['cdf', 'max']
        )

        # Update sensitivity results
        for key, res in results.items():
            sensitivity_results[key].append(set(res['selected_features']))

        # Update overall best models
        for target_type, model_info in best_models.items():
            if model_info['rmse'] < overall_best_models[target_type]['rmse'] or (
                model_info['rmse'] == overall_best_models[target_type]['rmse'] and
                model_info['mae'] < overall_best_models[target_type]['mae']):
                overall_best_models[target_type] = model_info

    return sensitivity_results, overall_best_models

In [12]:
# Define seeds
seeds = [42, 1996, 45319, 43709, 19961106, 28012025, 15012025, 2019, 111194, 19052024]

# or generate random seeds
# seeds = np.random.randint(0, 100000, 10).tolist()
print_info = 'yes'

# Define data and required functions in a dictionary for modularity
data_dict = {
    'storm_indices': storm_indices,
    'transposed_data_20': transposed_data_20,
    'transposed_data_30': transposed_data_30,
    'transposed_data_40': transposed_data_40,
    'updated_columns_20': updated_columns_20,
    'updated_columns_30': updated_columns_30,
    'updated_columns_40': updated_columns_40,
    'selection_vars': selection_vars,
    'y_all_cdf': y_all_cdf,
    'y_all_max': y_all_max,
    'param_grid': {
        'n_estimators': [5, 10, 20, 50, 100, 200, 500],
        'max_depth': [1, 2, 3, 5, 10, 20, 40],
        'learning_rate': np.linspace(0.05, 0.2, 8)
    }
}

# Run sensitivity analysis and extract best models
sensitivity_results, overall_best_models = combined_workflow_with_best_model(
    seeds=seeds,
    split_function=extraction_squares.split_storm_numbers,
    data_dict=data_dict,
    param_grid_og=data_dict['param_grid'],
    print_info=print_info
)

# Analyze sensitivity results
for key, selected_features in sensitivity_results.items():
    union_features = set.union(*selected_features)
    intersection_features = set.intersection(*selected_features)
    print(f"\nTarget and Variables: {key}")
    print(f"Selected Features Union: {union_features}")
    print(f"Selected Features Intersection: {intersection_features}")
    print(f"Variability: {len(union_features) - len(intersection_features)}")
    print('-' * 50)

# Display best models
print("\nBest Models Summary:")
for target_type, model_info in overall_best_models.items():
    print(f"\nTarget Type: {target_type}")
    print(f"Best Model: {model_info['ml']} with Variable Count: {model_info['var_count']}")
    print(f"RMSE: {model_info['rmse']}, MAE: {model_info['mae']}")
    print(f"Best Params: {model_info['params']}")
    print('-' * 50)


Processing for seed: 42
To keep 50 storms in the training set, storms 45 and 87 are removed from the test set.
y is already a numpy array
Target: cdf, Variables: 20
ML Model: xgboost
RMSE After Tuning: 1.1320325711600632, MAE After Tuning: 0.8733677808698516
Best Params: {'learning_rate': 0.15714285714285717, 'max_depth': 1, 'n_estimators': 10}
Selected Features: Index(['mean_surface_latent_heat_flux_std_PCA_3',
       'surface_latent_heat_flux_std_PCA_3',
       'mean_sea_level_pressure_max_PCA_2', 'surface_pressure_max_PCA_2',
       'geopotential_1000_std_PCA_2'],
      dtype='object')
------------------------------
y is already a numpy array
Target: cdf, Variables: 20
ML Model: random_forest
RMSE After Tuning: 1.0084506667883624, MAE After Tuning: 0.7687004366559859
Best Params: {'max_depth': 1, 'n_estimators': 500}
Selected Features: Index(['mean_sea_level_pressure_min_PCA_1', 'geopotential_500_mean_PCA_2',
       'surface_latent_heat_flux_min_PCA_1',
       'mean_surface_latent_

KeyboardInterrupt: 

# selection based on r2 score

In [8]:
import pickle
def save_to_pickle(data,savepath):

    with open(savepath, 'wb') as handle:

        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return None

def combined_workflow_with_best_model(seeds, split_function, data_dict, param_grid_og, print_info='yes'):
    def process_workflow(X_train_pca, X_validation_pca, X_test_pca, y_train, y_validation, y_test, variable_counts, target_types, seed):
        results = {}
        best_models = {target_type: {'model': None, 'r2': float('-inf')} for target_type in target_types}
        # create folder for the seed
        os.makedirs(f'pre_processing/new_feature_selection/seed_{seed}', exist_ok=True)
        for target_type in target_types:  # e.g., ['cdf', 'max']
            for var_count in variable_counts:  # e.g., [20, 30, 40]
                                    # save the x_train and x_validation
                pd.DataFrame(X_train_pca[var_count]).to_csv(f'ml_scripts/new_feature_selection/seed_{seed}/X_train_{var_count}.csv', index=False) 
                pd.DataFrame(X_validation_pca[var_count]).to_csv(f'ml_scripts/new_feature_selection/seed_{seed}/X_validation_{var_count}.csv', index=False)
                pd.DataFrame(X_test_pca[var_count]).to_csv(f'ml_scripts/new_feature_selection/seed_{seed}/X_test_{var_count}.csv', index=False)

                # save y_train and y_validation
                pd.DataFrame(y_train[target_type]).to_csv(f'ml_scripts/new_feature_selection/seed_{seed}/y_train_{target_type}.csv', index=False)
                pd.DataFrame(y_validation[target_type]).to_csv(f'ml_scripts/new_feature_selection/seed_{seed}/y_validation_{target_type}.csv', index=False)
                pd.DataFrame(y_test[target_type]).to_csv(f'ml_scripts/new_feature_selection/seed_{seed}/y_test_{target_type}.csv', index=False)

                if 1+1 == 2:

                    for ml in ['xgboost', 'random_forest']:
                        # create folder for the model
                        os.makedirs(f'ml_scripts/new_feature_selection/seed_{seed}/model_{ml}', exist_ok=True)

                        # Convert PCA data to numpy
                        X_train_np = X_train_pca[var_count].to_numpy()
                        X_validation_np = X_validation_pca[var_count].to_numpy()

                        # Initialize and train model
                        if ml == 'xgboost':
                            model = XGBRegressor(random_state=42, n_jobs=-1)
                        elif ml == 'random_forest':
                            model = RandomForestRegressor(random_state=42, n_jobs=-1)

                        model.fit(X_train_np, y_train[target_type])

                        # Predictions
                        predictions = model.predict(X_validation_np)
                        perdictions_train = model.predict(X_train_np)

                        # Calculate R²
                        y_mean = np.mean(y_validation[target_type])
                        ss_res = np.sum((y_validation[target_type] - predictions) ** 2)
                        ss_tot = np.sum((y_validation[target_type] - y_mean) ** 2)
                        r2 = 1 - (ss_res / ss_tot)

                        y_mean_train = np.mean(y_train[target_type])
                        ss_res_train = np.sum((y_train[target_type] - perdictions_train) ** 2)
                        ss_tot_train = np.sum((y_train[target_type] - y_mean_train) ** 2)
                        r2_train = 1 - (ss_res_train / ss_tot_train)

                        ss_tot_test = np.sum((y_test[target_type] - y_mean) ** 2)

                        # Update best model if R² is higher
                        if r2 > best_models[target_type]['r2']:
                            best_models[target_type] = {
                                'model': model,
                                'r2': r2,
                                'r2_train': r2_train,
                                'var_count': var_count,
                                'ml': ml,
                                'params': None  # Params will be updated after tuning
                            }

                        # Hyperparameter tuning
                        if ml == 'random_forest':
                            param_grid = {
                                'n_estimators': [5, 10, 20, 50, 100, 200, 500],
                                'max_depth': [1, 2, 3, 5, 10, 20, 40]
                            }
                        else:
                            param_grid = param_grid_og

                        search = HalvingGridSearchCV(model, param_grid, cv=5, n_jobs=-1, verbose=0)
                        search.fit(X_train_np, y_train[target_type])

                        # Get best model after tuning
                        best_model = search.best_estimator_

                        # save the model with savepickle
                        save_to_pickle(best_model, f'pre_processing/new_feature_selection/seed_{seed}/model_{ml}/model_{target_type}_{var_count}.pkl')

                        # Predictions after tuning
                        tuned_predictions = best_model.predict(X_validation_np)
                        tuned_predictions_train = best_model.predict(X_train_np)
                        tuned_predictions_test = best_model.predict(X_test_pca[var_count].to_numpy())

                        # Calculate R² after tuning
                        ss_res_tuned = np.sum((y_validation[target_type] - tuned_predictions) ** 2)
                        r2_tuned = 1 - (ss_res_tuned / ss_tot)

                        ss_res_tuned_train = np.sum((y_train[target_type] - tuned_predictions_train) ** 2)
                        r2_tuned_train = 1 - (ss_res_tuned_train / ss_tot_train)

                        ss_res_tuned_test = np.sum((y_test[target_type] - tuned_predictions_test) ** 2)
                        r2_tuned_test = 1 - (ss_res_tuned_test / ss_tot_test)

                        # Update best model if R² after tuning is higher
                        if r2_tuned > best_models[target_type]['r2']:
                            best_models[target_type] = {
                                'model': best_model,
                                'r2': r2_tuned,
                                'r2_train': r2_tuned_train,
                                'r2_test': r2_tuned_test,
                                'var_count': var_count,
                                'ml': ml,
                                'params': search.best_params_
                            }

                        # Feature Selection
                        selected_vars = data_dict['selection_vars'].feature_selection(
                            X_train_pca[var_count],
                            X_train_np,
                            y_train[target_type],
                            best_model
                        )

                        # Store results
                        results[f'{target_type}_{var_count}'] = {
                            'model': best_model,
                            'r2_after_tuning': r2_tuned,
                            'r2_after_tuning_train': r2_tuned_train,
                            'r2_after_tuning_test': r2_tuned_test,
                            'best_params': search.best_params_,
                            'selected_features': selected_vars,
                        }

                        if print_info == 'yes':
                            print(f"Target: {target_type}, Variables: {var_count}")
                            print(f"ML Model: {ml}")
                            print(f"R² After Tuning: {r2_tuned}")
                            print(f"R² Train After Tuning: {r2_tuned_train}")
                            print(f"R² Test After Tuning: {r2_tuned_test}")
                            print(f"Best Params: {search.best_params_}")
                            print(f"Selected Features: {selected_vars}")
                            print('-' * 30)

        return results, best_models

    sensitivity_results = defaultdict(list)
    overall_best_models = {target_type: {'model': None, 'r2': float('-inf')} for target_type in ['cdf', 'max']}

    for seed in seeds:
        print(f"\nProcessing for seed: {seed}")

        # Split the data
        storm_index_training, storm_index_test, storm_index_validation = split_function(
            data_dict['storm_indices'], 0.12, seed, 'number'
        )
        storm_index_training.sort()
        storm_index_test.sort()
        storm_index_validation.sort()

        # Prepare PCA datasets
        X_train_pca = {count: data_dict['selection_vars'].prepare_training_data(
            data_dict[f'transposed_data_{count}'], storm_index_training, data_dict[f'updated_columns_{count}']
        ) for count in [20, 30, 40]}

        X_validation_pca = {count: data_dict['selection_vars'].prepare_training_data(
            data_dict[f'transposed_data_{count}'], storm_index_validation, data_dict[f'updated_columns_{count}']
        ) for count in [20, 30, 40]}

        X_test_pca = {count: data_dict['selection_vars'].prepare_training_data(
            data_dict[f'transposed_data_{count}'], storm_index_test, data_dict[f'updated_columns_{count}']
        ) for count in [20, 30, 40]}

        # Prepare y data
        y_train = {
            'cdf': data_dict['selection_vars'].process_y_data(data_dict['y_all_cdf'], storm_index_training),
            'max': data_dict['selection_vars'].process_y_data(data_dict['y_all_max'], storm_index_training),
        }
        y_validation = {
            'cdf': data_dict['selection_vars'].process_y_data(data_dict['y_all_cdf'], storm_index_validation),
            'max': data_dict['selection_vars'].process_y_data(data_dict['y_all_max'], storm_index_validation),
        }
        y_test = {
            'cdf': data_dict['selection_vars'].process_y_data(data_dict['y_all_cdf'], storm_index_test),
            'max': data_dict['selection_vars'].process_y_data(data_dict['y_all_max'], storm_index_test),
        }

        # Process the workflow for this seed
        results, best_models = process_workflow(
            X_train_pca=X_train_pca,
            X_validation_pca=X_validation_pca,
            X_test_pca=X_test_pca,
            y_train=y_train,
            y_validation=y_validation,
            y_test=y_test,
            variable_counts=[20, 30, 40],
            target_types=['cdf', 'max'],
            seed=seed
        )

        # Update sensitivity results
        for key, res in results.items():
            sensitivity_results[key].append(set(res['selected_features']))

        # Update overall best models
        for target_type, model_info in best_models.items():
            if model_info['r2'] > overall_best_models[target_type]['r2']:
                overall_best_models[target_type] = model_info

    return sensitivity_results, overall_best_models

In [9]:
# Define seeds
seeds = [42, 1996, 45319, 43709, 19961106, 28012025, 15012025, 2019, 111194, 19052024]

# or generate random seeds
# seeds = np.random.randint(0, 100000, 10).tolist()
print_info = 'yes'

# Define data and required functions in a dictionary for modularity
data_dict = {
    'storm_indices': storm_indices,
    'transposed_data_20': transposed_data_20,
    'transposed_data_30': transposed_data_30,
    'transposed_data_40': transposed_data_40,
    'updated_columns_20': updated_columns_20,
    'updated_columns_30': updated_columns_30,
    'updated_columns_40': updated_columns_40,
    'selection_vars': selection_vars,
    'y_all_cdf': y_all_cdf,
    'y_all_max': y_all_max,
    'param_grid': {
        'n_estimators': [5, 10, 20, 50, 100, 200, 500],
        'max_depth': [1, 2, 3, 5, 10, 20, 40],
        'learning_rate': np.linspace(0.05, 0.2, 8)
    }
}

# Run sensitivity analysis and extract best models
sensitivity_results, overall_best_models = combined_workflow_with_best_model(
    seeds=seeds,
    split_function=extraction_squares.split_storm_numbers,
    data_dict=data_dict,
    param_grid_og=data_dict['param_grid'],
    print_info=print_info
)

# Analyze sensitivity results
for key, selected_features in sensitivity_results.items():
    union_features = set.union(*selected_features)
    intersection_features = set.intersection(*selected_features)
    print(f"\nTarget and Variables: {key}")
    print(f"Selected Features Union: {union_features}")
    print(f"Selected Features Intersection: {intersection_features}")
    print(f"Variability: {len(union_features) - len(intersection_features)}")
    print('-' * 50)

# Display best models
print("\nBest Models Summary:")
for target_type, model_info in overall_best_models.items():
    print(f"\nTarget Type: {target_type}")
    print(f"Best Model: {model_info['ml']} with Variable Count: {model_info['var_count']}")
    print(f"R² Score: {model_info['r2']}")
    print(f"R² Train Score: {model_info['r2_train']}")
    print(f"R² Test Score: {model_info['r2_test']}")
    print(f"Best Params: {model_info['params']}")
    print('-' * 50)


Processing for seed: 42
To keep 50 storms in the training set, storms 45 and 87 are removed from the test set.

Processing for seed: 1996
To keep 50 storms in the training set, storms 45 and 87 are removed from the test set.

Processing for seed: 45319
To keep 50 storms in the training set, storms 45 and 87 are removed from the test set.

Processing for seed: 43709
To keep 50 storms in the training set, storms 45 and 87 are removed from the test set.

Processing for seed: 19961106
To keep 50 storms in the training set, storms 45 and 87 are removed from the test set.

Processing for seed: 28012025
To keep 50 storms in the training set, storms 45 and 87 are removed from the test set.

Processing for seed: 15012025
To keep 50 storms in the training set, storms 45 and 87 are removed from the test set.

Processing for seed: 2019
To keep 50 storms in the training set, storms 45 and 87 are removed from the test set.

Processing for seed: 111194
To keep 50 storms in the training set, storms 4

KeyError: 'ml'

In [17]:
# Collect data for the DataFrame
results_data = []

for key, feature_sets in sensitivity_results.items():
    target_type, var_count = key.split('_')  # Extract target type and variable count
    all_features = [features for features in feature_sets]  # List of feature sets across seeds

    # Compute union and intersection
    union_features = set.union(*all_features)
    intersection_features = set.intersection(*all_features)
    variability = len(union_features) - len(intersection_features)
    consistency_score = len(intersection_features) / len(union_features) if len(union_features) > 0 else 0

    # Append data to results list
    results_data.append({
        'Target Type': target_type,
        'Variable Count': var_count,
        'All Features': all_features,
        #'Union Features': union_features,
        #'Intersection Features': intersection_features,
        #'Variability': variability,
        #'Consistency Score': consistency_score
    })

# Create a DataFrame
results_df = pd.DataFrame(results_data)

results_cdf = results_df[results_df['Target Type'] == 'cdf']
results_max = results_df[results_df['Target Type'] == 'max']

version = 'v5'

# Extract the results for each dataset (20-30-40 vars) variables
results_cdf_20 = pd.DataFrame(results_cdf['All Features'][0])
results_cdf_30 = pd.DataFrame(results_cdf['All Features'][1])
results_cdf_40 = pd.DataFrame(results_cdf['All Features'][2])

# collapse into a single list
results_cdf_20 = [item for sublist in results_cdf['All Features'][0] for item in sublist]
results_cdf_30 = [item for sublist in results_cdf['All Features'][1] for item in sublist]
results_cdf_40 = [item for sublist in results_cdf['All Features'][2] for item in sublist]

# combine the 3 lists into one
results_cdf_all_vars = results_cdf_20 + results_cdf_30 + results_cdf_40

# count the number of times each variable appears in the list
results_cdf_count = pd.Series(results_cdf_all_vars).value_counts()

# repeat the same for the max dataset
results_max_20 = pd.DataFrame(results_max['All Features'][3])
results_max_30 = pd.DataFrame(results_max['All Features'][4])
results_max_40 = pd.DataFrame(results_max['All Features'][5])

# collapse into a single list
results_max_20 = [item for sublist in results_max['All Features'][3] for item in sublist]
results_max_30 = [item for sublist in results_max['All Features'][4] for item in sublist]
results_max_40 = [item for sublist in results_max['All Features'][5] for item in sublist]

# combine the 3 lists into one
results_max_all_vars = results_max_20 + results_max_30 + results_max_40

# count the number of times each variable appears in the list
results_max_count = pd.Series(results_max_all_vars).value_counts()

# create a new list with the 2 preivous results_target_all_vars and count each variable
results_target_all_vars = results_cdf_all_vars + results_max_all_vars
results_target_count = pd.Series(results_target_all_vars).value_counts()

# export the 3 lists to a csv file
results_cdf_count.to_csv(f'pre_processing/feature_selection/fs_mls_ts_tests/cdf_count_{version}.csv')
results_max_count.to_csv(f'pre_processing/feature_selection/fs_mls_ts_tests/max_count_{version}.csv')
results_target_count.to_csv(f'pre_processing/feature_selection/fs_mls_ts_tests/both_target_count_{version}.csv')