In [1]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
import pandas as pd
import matplotlib.pyplot as plt
import optuna
import os
import sys

operating_system = 'mac'

if operating_system == 'win':
    os.chdir('C:/Users/fabau/OneDrive/Documents/GitHub/master-project-cleaned/')
elif operating_system == 'curnagl':
    os.chdir('/work/FAC/FGSE/IDYST/tbeucler/default/fabien/repos/cleaner_version/')
else:
    os.chdir('/Users/fabienaugsburger/Documents/GitHub/master-project-cleaned/')

util_perso = os.path.abspath('util/processing')
sys.path.append(util_perso)
util_perso = os.path.abspath('util/gev')
sys.path.append(util_perso)

from extraction_squares import split_storm_numbers
from data_processing import depickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def r2_score(y_true, y_pred):
    #y_mean_train = np.mean(y_true)
    #ss_res_train = np.sum(y_true - y_pred) ** 2
    #ss_tot_train = np.sum(y_true - y_mean_train) ** 2
    #r2_train = 1 - (ss_res_train / ss_tot_train)

    return 1 - np.sum((y_true-y_pred)**2)/np.sum((y_true-np.mean(y_true))**2)

In [9]:
models = ['random_forest', 'xgboost']
seeds = [42, 1996, 45319, 43709, 19961106, 28012025, 15012025, 2019, 111194, 19052024]
nvars = [20, 30, 40]
output = ['cdf', 'max']

r2_scores = pd.DataFrame(columns=['seed', 'nvar', 'model', 
                                  'r2_train_cdf', 'r2_validation_cdf', 'r2_test_cdf',
                                  'r2_train_max', 'r2_validation_max', 'r2_test_max',
                                  'rmse_train_cdf', 'rmse_validation_cdf', 'rmse_test_cdf',
                                  'rmse_train_max', 'rmse_validation_max', 'rmse_test_max'])

for seed in seeds:
    for nvar in nvars:
        for model in models:
            # Initialize a dictionary for a single row
            r2_temp = {'seed': seed, 'nvar': nvar, 'model': model}

            for out in output:
                # Load the model
                ml = depickle(f'ml_scripts/new_feature_selection/seed_{seed}/model_{model}/model_{out}_{nvar}.pkl')

                # Load the data
                X_train = pd.read_csv(f'ml_scripts/new_feature_selection/seed_{seed}/X_train_{nvar}.csv').to_numpy()
                y_train = pd.read_csv(f'ml_scripts/new_feature_selection/seed_{seed}/y_train_{out}.csv')
                X_validation = pd.read_csv(f'ml_scripts/new_feature_selection/seed_{seed}/X_validation_{nvar}.csv').to_numpy()
                y_validation = pd.read_csv(f'ml_scripts/new_feature_selection/seed_{seed}/y_validation_{out}.csv')
                X_test = pd.read_csv(f'ml_scripts/new_feature_selection/seed_{seed}/X_test_{nvar}.csv').to_numpy()
                y_test = pd.read_csv(f'ml_scripts/new_feature_selection/seed_{seed}/y_test_{out}.csv')

                # Make predictions and flatten
                y_train_pred = ml.predict(X_train).flatten()
                y_train = y_train.to_numpy().flatten()
                y_validation_pred = ml.predict(X_validation).flatten()
                y_validation = y_validation.to_numpy().flatten()
                y_test_pred = ml.predict(X_test).flatten()
                y_test = y_test.to_numpy().flatten()

                # Compute R² scores
                r2_train = r2_score(y_train, y_train_pred)
                r2_validation = r2_score(y_validation, y_validation_pred)
                r2_test = r2_score(y_test, y_test_pred)

                # Compute RMSE scores
                rmse_train = np.sqrt(np.mean((y_train - y_train_pred) ** 2))
                rmse_validation = np.sqrt(np.mean((y_validation - y_validation_pred) ** 2))
                rmse_test = np.sqrt(np.mean((y_test - y_test_pred) ** 2))

                # Update the dictionary with the R² values based on `out`
                if out == 'cdf':
                    r2_temp.update({
                        'r2_train_cdf': r2_train,
                        'r2_validation_cdf': r2_validation,
                        'r2_test_cdf': r2_test,
                        'rmse_train_cdf': rmse_train,
                        'rmse_validation_cdf': rmse_validation,
                        'rmse_test_cdf': rmse_test
                    })
                elif out == 'max':
                    r2_temp.update({
                        'r2_train_max': r2_train,
                        'r2_validation_max': r2_validation,
                        'r2_test_max': r2_test,
                        'rmse_train_max': rmse_train,
                        'rmse_validation_max': rmse_validation,
                        'rmse_test_max': rmse_test
                    })

            # Append the row to the DataFrame
            r2_scores = pd.concat([r2_scores, pd.DataFrame([r2_temp])], axis=0, ignore_index=True)

r2_scores.to_csv('ml_scripts/new_feature_selection/r2_scores.csv', index=False)

  r2_scores = pd.concat([r2_scores, pd.DataFrame([r2_temp])], axis=0, ignore_index=True)
