In [3]:
import pandas as pd
import numpy as np
from collections import Counter
import ast

# Configuration for all functions.
# Each key is a function name; the processing loop iterates over these keys,
# uses the key to build the input CSV file name, and passes "lower"/"higher"
# to classify() as the class cut-offs.
# "min"/"max" are not read by any code visible in this file (presumably meant
# for normalize() -- TODO confirm).
# NOTE(review): "NR" has min (4.10) above its own "lower" cut-off (2.06), and
# 4.10 is also NR_Benefit's "lower" value -- possibly a transcription slip; verify.
config = {
    "WS": {"min": 1.58, "max": 8.61, "lower": 3.07, "higher": 6.17},
    "PR": {"min": 2.07, "max": 10.0, "lower": 3.66, "higher": 6.11},
    "NR": {"min": 4.10, "max": 10.0, "lower": 2.06, "higher": 4.42},
    "SR": {"min": 2.29, "max": 10.0, "lower": 3.02, "higher": 6.67},
    "SFST": {"min": 0.0, "max": 7.71, "lower": 1.05, "higher": 6.51},
    "WS_Benefit": {"min": 0.08, "max": 10.0, "lower": 2.65, "higher": 6.50},
    "PR_Benefit": {"min": 0.49, "max": 10.0, "lower": 3.29, "higher": 6.68},
    "NR_Benefit": {"min": 0.71, "max": 10.0, "lower": 4.10, "higher": 7.76},
    "SR_Benefit": {"min": 0.49, "max": 8.79, "lower": 2.94, "higher": 6.19},
    "SFST_Benefit": {"min": 0.0, "max": 7.19, "lower": 1.86, "higher": 5.30}
}

# Define specific features for each function.
# Maps a function name (the same keys as `config`) to the list of feature
# identifiers that may appear in the report: the processing loop keeps only the
# selected features of the top model / ensemble that are listed here before
# printing them. A function missing from this mapping falls back to the top
# model's own feature list (see the `.get(function, top_features)` call).
feature_selection = {
    'PR': ['OF22', 'OF26', 'OF27', 'F17', 'F20', 'F21', 'F23', 'F24', 'F28', 'F29', 'F31', 'F33', 'F34', 'F35', 'F36', 'F38', 'F43', 'F44', 'F45', 'F49', 'F63'],
    'NR': ['OF16', 'OF18', 'OF22', 'OF25', 'OF26', 'OF27', 'F1', 'F3_a', 'F3_b', 'F3_c', 'F3_d', 'F3_e', 'F3_f', 'F3_g', 'F6', 'F17', 'F18', 'F20', 'F21', 'F22', 'F23', 'F24', 'F28', 'F31', 'F33', 'F34', 'F36', 'F43', 'F44', 'F45', 'F48', 'F49', 'F54', 'S5'],
    'SR': ['OF22', 'OF26', 'OF27', 'F9', 'F17', 'F20', 'F22', 'F28', 'F29', 'F31', 'F33', 'F34', 'F35', 'F36', 'F43', 'F44', 'F45', 'F49', 'S5'],
    'WS': ['OF22', 'OF26', 'F3_a', 'F3_b', 'F3_c', 'F3_d', 'F3_e', 'F3_f', 'F3_g', 'F20', 'F21', 'F22', 'F28', 'F31', 'F43', 'F44', 'F45', 'F48', 'F49'],
    'SFST': ['F1', 'F3_a', 'F3_b', 'F3_c', 'F3_d', 'F3_e', 'F3_f', 'F3_g', 'F14', 'F17', 'F21', 'F24', 'F25', 'F29', 'F31', 'F33', 'F34', 'F43', 'F47', 'F48'],
    'PR_Benefit': ['OF18', 'OF19', 'OF20', 'OF21', 'OF22', 'OF23', 'OF24', 'F41', 'F48', 'F50', 'F52'],
    'NR_Benefit': ['OF9', 'OF10', 'OF11', 'OF19', 'OF20', 'OF21', 'OF22', 'OF23', 'OF24', 'F13', 'F41', 'F50', 'F51', 'F52'],
    'SR_Benefit': ['OF18', 'OF19', 'OF20', 'OF21', 'OF22', 'OF23', 'OF24', 'F24', 'F28', 'F41', 'F50', 'F52', 'F55', 'S4'],
    'WS_Benefit': ['OF8', 'OF17', 'OF18', 'OF23', 'OF24', 'F51'],
    'SFST_Benefit': ['OF18', 'OF22', 'OF25', 'OF27', 'OF28', 'F50'],
}

# Normalize function
def normalize(values, min_val, max_val):
    """Linearly rescale ``values`` so that min_val maps to 0 and max_val to 10.

    ``values`` may be anything supporting subtraction and division by a
    number (a scalar, or an array-like such as a NumPy array). No clipping
    is done, so inputs outside [min_val, max_val] fall outside [0, 10].
    """
    offset = values - min_val
    span = max_val - min_val
    fraction = offset / span
    return 10 * fraction

# Classify function
def classify(value, lower, higher):
    """Return the class label of ``value`` relative to two cut-offs.

    "lower" when value < lower, "moderate" when lower <= value <= higher
    (both bounds inclusive), and "higher" otherwise.
    """
    if value < lower:
        return "lower"
    return "moderate" if value <= higher else "higher"

# Calculate accuracy function
def calculate_accuracy(actual, predicted):
    """Return the share of positions where two comma-separated label strings agree.

    Both arguments are strings such as "lower,moderate,higher". Labels are
    compared pairwise in order; the pairing stops at the shorter sequence,
    while the denominator is always the number of labels in ``actual``.
    """
    expected_labels = actual.split(',')
    guessed_labels = predicted.split(',')
    matches = 0
    for expected, guessed in zip(expected_labels, guessed_labels):
        if expected == guessed:
            matches += 1
    return matches / len(expected_labels)

# Voting system for ensemble learning
def voting_classification(predictions):
    """Return the majority label for each position across several models.

    ``predictions`` is an iterable of equally ordered label sequences, one per
    model. For every position the most frequent label wins; on a tie the label
    that occurs first among the models (in the given order) is chosen.
    """
    winners = []
    for ballots in zip(*predictions):
        tally = Counter(ballots)
        winning_label = tally.most_common(1)[0][0]
        winners.append(winning_label)
    return winners

# Process each function defined in config.
#
# For every function in `config` and every result variant ('non_norm', 'norm',
# 'non_norm_post') this reads test_results_<function>_<variant>.csv, keeps the
# models that use at most five features, turns the comma-separated 'Actual' and
# 'Predicted' values into class labels with the function's cut-offs, and prints
# accuracy figures for the lowest-MSE model and for a majority vote over the
# five lowest-MSE models.


def _parse_features(raw):
    """Return the feature names stored in one 'Selected Features' cell.

    The cell is first read as a Python literal (e.g. "['F1', 'F2']"); if that
    fails it is treated as a plain ', '-separated string.
    """
    try:
        return ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        return raw.split(', ')


def _classify_cell(cell, lower, higher):
    """Classify every value of a comma-separated numeric string; return the labels comma-joined."""
    return ','.join(classify(float(val), lower, higher) for val in cell.split(','))


def _category_accuracy(actual_labels, predicted_labels, categories):
    """Return {category: hit rate}, or None for a category absent from ``actual_labels``.

    The hit rate is the number of positions where actual and predicted both
    equal the category, divided by the number of actual items in that category.
    """
    accuracies = {}
    for category in categories:
        total = actual_labels.count(category)
        if total > 0:
            hits = sum(1 for a, p in zip(actual_labels, predicted_labels) if a == p == category)
            accuracies[category] = hits / total
        else:
            accuracies[category] = None
    return accuracies


def _print_category_accuracies(prefix, accuracies, categories):
    """Print one '<prefix> Accuracy for <category>' line per category ('N/A' for None)."""
    for category in categories:
        acc = accuracies[category]
        if acc is not None:
            print(f"{prefix} Accuracy for '{category}': {acc:.2%}")
        else:
            print(f"{prefix} Accuracy for '{category}': N/A")


categories = ["lower", "moderate", "higher"]

for function, params in config.items():
    print("==========================================================================")
    # The cut-offs depend only on the function, so look them up once per function.
    lower, higher = params['lower'], params['higher']

    for norm_type in ['non_norm', 'norm', 'non_norm_post']:
        test_csv_file = f"test_results_{function}_{norm_type}.csv"
        data = pd.read_csv(test_csv_file)

        # Filter out results with more than 5 features. The explicit copy makes
        # `data` an independent frame, so the column assignments below do not
        # trigger SettingWithCopyWarning.
        data = data[data['Number of Features'] <= 5].copy()

        if data.empty:
            # Without this guard the `.iloc[0]` lookups below raise IndexError.
            print(f"\n{function} ({norm_type}) Results:")
            print("No models with at most 5 features; skipping.")
            continue

        # No rescaling happens here: despite their names these columns are plain
        # copies of 'Actual' / 'Predicted' and are classified as-is.
        data['Normalized_Actual'] = data['Actual']
        data['Normalized_Predicted'] = data['Predicted']

        data['Classified_Actual'] = data['Normalized_Actual'].apply(
            lambda cell: _classify_cell(cell, lower, higher))
        data['Classified_Predicted'] = data['Normalized_Predicted'].apply(
            lambda cell: _classify_cell(cell, lower, higher))

        # Identify the top model based on lowest MSE
        top_model = data.nsmallest(1, 'MSE').iloc[0]

        # Extract the top model's features and keep only those listed for this function
        top_features = _parse_features(top_model['Selected Features'])
        specific_features = feature_selection.get(function, top_features)
        filtered_top_features = [feature for feature in top_features if feature in specific_features]

        # Overall accuracy for the top model
        top_model_accuracy = calculate_accuracy(top_model['Classified_Actual'], top_model['Classified_Predicted'])
        top_actual = top_model['Classified_Actual'].split(',')
        top_predicted = top_model['Classified_Predicted'].split(',')

        # Ensemble: majority vote over the (up to) five lowest-MSE models. The
        # best model's actual labels serve as ground truth, which presumes every
        # row was evaluated on the same samples -- TODO confirm.
        top_5_models = data.nsmallest(5, 'MSE')
        ensemble_predictions = np.array(
            [cell.split(',') for cell in top_5_models['Classified_Predicted']])
        ensemble_actual = top_5_models.iloc[0]['Classified_Actual'].split(',')

        ensemble_votes = voting_classification(ensemble_predictions)
        ensemble_accuracy = calculate_accuracy(','.join(ensemble_actual), ','.join(ensemble_votes))

        # Per-category accuracies for top model and ensemble
        top_model_category_accuracy = _category_accuracy(top_actual, top_predicted, categories)
        ensemble_category_accuracy = _category_accuracy(ensemble_actual, ensemble_votes, categories)

        # Union of the features used by any ensemble member
        ensemble_features = set()
        for raw_features in top_5_models['Selected Features']:
            ensemble_features.update(_parse_features(raw_features))

        # Filter ensemble features based on specific features
        filtered_ensemble_features = [feature for feature in ensemble_features if feature in specific_features]

        # Print results
        print(f"\n{function} ({norm_type}) Results:")
        print(f"Top Model MSE: {top_model['MSE']:.4f}")
        print(f"Features used in Top Model: {', '.join(sorted(filtered_top_features))}")
        print(f"Overall Accuracy of Top Model: {top_model_accuracy:.2%}")
        _print_category_accuracies("Top Model", top_model_category_accuracy, categories)

        print(f"\nEnsemble Accuracy: {ensemble_accuracy:.2%}, Ensemble Features: {', '.join(sorted(filtered_ensemble_features))}")
        _print_category_accuracies("Ensemble", ensemble_category_accuracy, categories)



WS (non_norm) Results:
Top Model MSE: 0.0838
Features used in Top Model: F22, F28, F31, F43
Overall Accuracy of Top Model: 90.48%
Top Model Accuracy for 'lower': 92.86%
Top Model Accuracy for 'moderate': 90.48%
Top Model Accuracy for 'higher': 85.71%

Ensemble Accuracy: 90.48%, Ensemble Features: F22, F28, F31, F43, F44, F45
Ensemble Accuracy for 'lower': 92.86%
Ensemble Accuracy for 'moderate': 90.48%
Ensemble Accuracy for 'higher': 85.71%

WS (norm) Results:
Top Model MSE: 0.8815
Features used in Top Model: F22, F28, F31, F43, F44
Overall Accuracy of Top Model: 95.24%
Top Model Accuracy for 'lower': 100.00%
Top Model Accuracy for 'moderate': 75.00%
Top Model Accuracy for 'higher': 100.00%

Ensemble Accuracy: 90.48%, Ensemble Features: F22, F28, F31, F43, F44, F45
Ensemble Accuracy for 'lower': 95.24%
Ensemble Accuracy for 'moderate': 62.50%
Ensemble Accuracy for 'higher': 100.00%

WS (non_norm_post) Results:
Top Model MSE: 0.1665
Features used in Top Model: F22, F28, F31, F43
Overal