In [6]:
import glob
import os

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

# Define the folders and the features to be processed
# Maps a folder label to the directory path that is handed to process_folder.
base_directories = {'non_norm': './non_norm/', 'norm': './norm/'}
# Feature names: process_folder uses each one as a sub-directory name and,
# when normalization is requested, as a key into `config`.
features = ['NR', 'PR', 'SR', 'SFST', 'WS', 'NR_Benefit', 'PR_Benefit', 'SR_Benefit', 'SFST_Benefit', 'WS_Benefit']

# Configuration for prediction classes (used for normalization)
# Per-feature bounds. normalize_predictions reads only "min" and "max";
# "lower" and "higher" are not read anywhere in this section.
# NOTE(review): "NR" is the only entry whose "lower" (2.06) is below its
# "min" (4.10) - confirm these values against their source.
config = {
    "WS": {"min": 1.58, "max": 8.61, "lower": 3.07, "higher": 6.17},
    "PR": {"min": 2.07, "max": 10.0, "lower": 3.66, "higher": 6.11},
    "NR": {"min": 4.10, "max": 10.0, "lower": 2.06, "higher": 4.42},
    "SR": {"min": 2.29, "max": 10.0, "lower": 3.02, "higher": 6.67},
    "SFST": {"min": 0.0, "max": 7.71, "lower": 1.05, "higher": 6.51},
    "WS_Benefit": {"min": 0.08, "max": 10.0, "lower": 2.65, "higher": 6.50},
    "PR_Benefit": {"min": 0.49, "max": 10.0, "lower": 3.29, "higher": 6.68},
    "NR_Benefit": {"min": 0.71, "max": 10.0, "lower": 4.10, "higher": 7.76},
    "SR_Benefit": {"min": 0.49, "max": 8.79, "lower": 2.94, "higher": 6.19},
    "SFST_Benefit": {"min": 0.0, "max": 7.19, "lower": 1.86, "higher": 5.30}
}

# Function to normalize predictions and actual values
def normalize_predictions(predicted, actual, feature):
    """Rescale predicted and actual values of *feature* onto a 0-10 scale.

    Values are mapped linearly so that the feature's configured ``min``
    lands on 0 and its ``max`` lands on 10; anything falling outside that
    interval after the mapping is clipped to [0, 10].

    Args:
        predicted: Predicted values (scalar or array-like supporting
            arithmetic, e.g. a pandas Series).
        actual: Actual values, same kind of object as ``predicted``.
        feature: Key into the module-level ``config`` mapping.

    Returns:
        A ``(predicted_normalized, actual_normalized)`` tuple.

    Raises:
        KeyError: If ``feature`` has no entry in ``config``.
        ValueError: If the configured ``min`` and ``max`` are equal, which
            would otherwise divide by zero and yield inf/NaN values.
    """
    bounds = config[feature]
    min_val = bounds["min"]
    span = bounds["max"] - min_val
    if span == 0:
        raise ValueError(
            f"Cannot normalize feature '{feature}': configured min and max are equal ({min_val})."
        )

    def _rescale(values):
        # Same arithmetic for both inputs: shift, scale to 0-10, then clip.
        return np.clip((values - min_val) / span * 10, 0, 10)

    return _rescale(predicted), _rescale(actual)

# Function to process a single folder (either 'norm' or 'non_norm')
def process_folder(base_directory, features, normalize_flag=True):
    """Print the lowest-MSE model per feature and across all features.

    For each feature, every ``*.csv`` file in ``<base_directory>/<feature>/``
    is read. Each file must provide ``Model``, ``Actual`` and ``Predicted``
    columns. Actual and predicted values are clipped to [0, 10] before the
    MSE is computed for each model. Results are printed; nothing is returned.

    Args:
        base_directory: Folder containing one sub-folder per feature.
        features: Iterable of feature names (sub-folder names). When
            ``normalize_flag`` is true, each name must also be a key of the
            module-level ``config`` mapping.
        normalize_flag: When true, models are additionally ranked by the MSE
            of the values rescaled through ``normalize_predictions``.
    """
    # Overall winners, stored as (mse, model, dataset, feature).
    overall_best = (float('inf'), None, None, None)
    overall_best_normalized = (float('inf'), None, None, None)

    for feature in features:
        print(f"Processing feature '{feature}' in folder '{base_directory}'")

        output_csv_directory = os.path.join(base_directory, feature)
        # glob order depends on the filesystem; sorting makes ties between
        # equal MSE values resolve the same way on every machine.
        csv_files = sorted(glob.glob(os.path.join(output_csv_directory, '*.csv')))

        # Per-feature winners, stored as (mse, model, dataset).
        best = (float('inf'), None, None)
        best_normalized = (float('inf'), None, None)

        for csv_file in csv_files:
            data = pd.read_csv(csv_file)
            dataset = _extract_relevant_part(os.path.basename(csv_file))

            # sort=False keeps models in order of first appearance; rows with
            # a missing model name are left out by groupby instead of
            # producing an empty slice.
            for model, model_data in data.groupby('Model', sort=False):
                actual = model_data['Actual'].clip(lower=0, upper=10)
                predicted = model_data['Predicted'].clip(lower=0, upper=10)

                mse = mean_squared_error(actual, predicted)
                if mse < best[0]:
                    best = (mse, model, dataset)

                if normalize_flag:
                    predicted_norm, actual_norm = normalize_predictions(predicted, actual, feature)
                    normalized_mse = mean_squared_error(actual_norm, predicted_norm)
                    if normalized_mse < best_normalized[0]:
                        best_normalized = (normalized_mse, model, dataset)

        # No model was evaluated (no CSV files, or only empty ones): say so
        # rather than reporting 'None' with an infinite MSE as a result.
        if best[1] is None:
            print(f"No model results found for feature '{feature}' in '{output_csv_directory}'; skipping.\n")
            continue

        best_mse, best_model, best_dataset = best
        if best_mse < overall_best[0]:
            overall_best = (best_mse, best_model, best_dataset, feature)

        if normalize_flag and best_normalized[0] < overall_best_normalized[0]:
            overall_best_normalized = best_normalized + (feature,)

        print(f"The best model for feature '{feature}' is '{best_model}' with an MSE of {best_mse:.2f} on the dataset '{best_dataset}'.")
        if normalize_flag:
            best_normalized_mse, best_normalized_model, best_normalized_dataset = best_normalized
            print(f"The best normalized model for feature '{feature}' is '{best_normalized_model}' with a normalized MSE of {best_normalized_mse:.2f} on the dataset '{best_normalized_dataset}'.")
        print(f"Finished processing feature: {feature}\n")

    overall_best_mse, overall_best_model, overall_best_dataset, overall_best_feature = overall_best
    if overall_best_model is None:
        print(f"\nNo model results found in folder '{base_directory}'.")
        return

    print(f"\nThe overall best model across all features is '{overall_best_model}' with an MSE of {overall_best_mse:.2f} on the dataset '{overall_best_dataset}' from the feature '{overall_best_feature}'.")

    if normalize_flag:
        (overall_best_normalized_mse, overall_best_normalized_model,
         overall_best_normalized_dataset, overall_best_normalized_feature) = overall_best_normalized
        print(f"\nThe overall best normalized model across all features is '{overall_best_normalized_model}' with a normalized MSE of {overall_best_normalized_mse:.2f} on the dataset '{overall_best_normalized_dataset}' from the feature '{overall_best_normalized_feature}'.")


def _extract_relevant_part(filename):
    """Return the third underscore-separated token of *filename*, or "Unknown"."""
    parts = filename.split('_')
    # At least four tokens are required; index 2 depends on the file-naming
    # scheme and may need adjusting if that scheme changes.
    if len(parts) > 3:
        return parts[2]
    return "Unknown"

# Run the analysis for each results folder. Normalized MSE is only
# requested for 'non_norm'; 'norm' is processed without it.
for folder_key, wants_normalized_mse in (('non_norm', True), ('norm', False)):
    process_folder(base_directories[folder_key], features, normalize_flag=wants_normalized_mse)


Processing feature 'NR' in folder './non_norm/'
The best model for feature 'NR' is 'AdaBoostRegressor' with an MSE of 0.17 on the dataset 'interpolated.csv'.
The best normalized model for feature 'NR' is 'AdaBoostRegressor' with a normalized MSE of 0.48 on the dataset 'interpolated.csv'.
Finished processing feature: NR

Processing feature 'PR' in folder './non_norm/'
The best model for feature 'PR' is 'AdaBoostRegressor' with an MSE of 0.12 on the dataset 'custom'.
The best normalized model for feature 'PR' is 'AdaBoostRegressor' with a normalized MSE of 0.19 on the dataset 'custom'.
Finished processing feature: PR

Processing feature 'SR' in folder './non_norm/'
The best model for feature 'SR' is 'AdaBoostRegressor' with an MSE of 0.37 on the dataset 'bfill'.
The best normalized model for feature 'SR' is 'AdaBoostRegressor' with a normalized MSE of 0.62 on the dataset 'bfill'.
Finished processing feature: SR

Processing feature 'SFST' in folder './non_norm/'
The best model for feature