In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import glob
from sklearn.metrics import confusion_matrix, accuracy_score
import seaborn as sns
import numpy as np

# List of result columns
result_columns_list = ['PR', 'NR', 'SR', 'WS', 'SFST', 'PR_Benefit', 'NR_Benefit', 'SR_Benefit', 'WS_Benefit', 'SFST_Benefit']
output_plots_directory = './output_plots/'
ensemble_plots_directory = './ensemble_plots/'
os.makedirs(output_plots_directory, exist_ok=True)
os.makedirs(ensemble_plots_directory, exist_ok=True)

# Function to extract the relevant part of the filename for the title
def extract_title_part(filename):
    """Return the portion of ``filename`` that precedes its first underscore.

    When the name contains no underscore the whole string is returned
    unchanged. Adapt the separator handling here if the filename format
    changes.
    """
    prefix, _, _ = filename.partition('_')
    return prefix

# Function to plot confusion matrix for each best model
def plot_confusion_matrix(ax, model_name, csv_filename, model_data, accuracy):
    """Draw one model's confusion matrix as an annotated heatmap on ``ax``.

    Args:
        ax: Matplotlib axes that receives the heatmap.
        model_name: Name of the model, shown in the title.
        csv_filename: Identifier of the source CSV; only the part before
            its first underscore is shown in the title.
        model_data: Table providing the 'Actual' and 'Predicted' columns.
        accuracy: Accuracy value printed verbatim under the title.
    """
    # The matrix is always laid out over these three classes, even if some
    # of them never occur in the data.
    class_labels = [0, 1, 2]
    matrix = confusion_matrix(
        model_data['Actual'],
        model_data['Predicted'],
        labels=class_labels,
    )

    source_tag = extract_title_part(csv_filename)
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_labels,
        yticklabels=class_labels,
        ax=ax,
    )
    ax.set_title(f'Confusion Matrix for {model_name} ({source_tag})\nAccuracy: {accuracy}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')

# For every result column: collect the predictions of the best model of each
# output CSV, plot the confusion matrices of the two most accurate models next
# to an ensemble of the (up to) five most accurate ones, and additionally save
# the ensemble matrix on its own.
for result_column in result_columns_list:
    print(f"Processing result column: {result_column}")
    # Directory containing the output CSV files
    output_csv_directory = f'../Training/Results/{result_column}/'

    # Read the file containing the best models info
    best_models_info = pd.read_csv(f'../Training/Results/best_models_info{result_column}.csv')

    # Maps (model_name, csv_basename, accuracy) -> rows of that model's predictions
    best_model_data_dict = {}

    # Sorted so that ties in accuracy are resolved the same way on every run;
    # glob itself returns files in an unspecified order.
    csv_files = sorted(glob.glob(os.path.join(output_csv_directory, '*.csv')))

    for csv_file in csv_files:
        # Extract the part of the filename to match with best_models_info
        csv_file_basename = os.path.basename(csv_file).replace('output_o', 'o').replace(f'_{result_column}.csv', '')

        best_model_row = best_models_info[best_models_info['csv_file'] == csv_file_basename]
        if best_model_row.empty:
            # Not listed in best_models_info: no need to read the file at all
            continue

        best_model_name = best_model_row['model_name'].values[0]
        accuracy = best_model_row['accuracy'].values[0]

        data = pd.read_csv(csv_file)
        best_model_data = data[data['Model'] == best_model_name]
        if best_model_data.empty:
            # The best model has no rows in this CSV
            continue
        best_model_data_dict[(best_model_name, csv_file_basename, accuracy)] = best_model_data

    if not best_model_data_dict:
        # Nothing to plot or to ensemble; carry on with the next column
        # instead of failing on an empty ranking.
        print(f"No best-model data found for {result_column}; skipping plots")
        continue

    # Rank once by accuracy (descending) and slice for both uses
    ranked_models = sorted(best_model_data_dict.items(), key=lambda item: item[0][2], reverse=True)
    top_2_models = ranked_models[:2]
    top_5_models = ranked_models[:5]
    num_models = len(top_2_models)

    # One subplot per top model plus one for the ensemble. squeeze=False
    # always returns a 2-D array, so ravel() yields a flat sequence of axes
    # for any number of models.
    fig, axes = plt.subplots(1, num_models + 1, figsize=(10 * (num_models + 1), 8), squeeze=False)
    axes = axes.ravel()

    for ax, ((model_name, csv_filename, accuracy), model_data) in zip(axes, top_2_models):
        plot_confusion_matrix(ax, model_name, csv_filename, model_data, accuracy)

    # Calculate ensemble predictions.
    # NOTE(review): this assumes every model's rows refer to the same samples
    # in the same order as the first model's 'Actual' column - confirm
    # against the code that writes the CSV files.
    ensemble_actual = top_5_models[0][1]['Actual']
    member_predictions = [model_data['Predicted'].values for _, model_data in top_5_models]
    if len({len(predictions) for predictions in member_predictions}) != 1:
        raise ValueError(
            f"Cannot build ensemble for {result_column}: "
            "models have different numbers of predictions"
        )
    # Element-wise mean of the predicted labels, rounded to the nearest class
    # (np.round rounds exact halves to the nearest even integer).
    ensemble_predicted = np.round(np.mean(member_predictions, axis=0)).astype(int)

    # Calculate ensemble accuracy
    ensemble_accuracy = accuracy_score(ensemble_actual, ensemble_predicted)

    # Add the ensemble approach to the plot (always the last subplot)
    cm_ensemble = confusion_matrix(ensemble_actual, ensemble_predicted, labels=[0, 1, 2])
    sns.heatmap(cm_ensemble, annot=True, fmt='d', cmap='Blues', xticklabels=[0, 1, 2], yticklabels=[0, 1, 2], ax=axes[-1])
    axes[-1].set_xlabel('Predicted')
    axes[-1].set_ylabel('Actual')
    # Report the number of models actually combined, which may be below five
    axes[-1].set_title(f'Ensemble (Top {len(top_5_models)} models) (Accuracy: {ensemble_accuracy:.4f})')

    fig.tight_layout(pad=3.0)
    plot_filename = os.path.join(output_plots_directory, f"top_2_and_ensemble_confusion_matrices_{result_column}.png")
    fig.savefig(plot_filename)
    plt.close(fig)

    print(f"Top 2 confusion matrices and ensemble matrix saved for {result_column} to {output_plots_directory}")

    # Save the ensemble confusion matrix separately with no axis, tight layout
    ensemble_fig = plt.figure(figsize=(8, 8))
    sns.heatmap(cm_ensemble, annot=True, fmt='d', cmap='Blues', xticklabels=[0, 1, 2], yticklabels=[0, 1, 2], cbar=False)
    plt.axis('off')
    ensemble_fig.tight_layout(pad=3.0)
    ensemble_plot_filename = os.path.join(ensemble_plots_directory, f"ensemble_confusion_matrix_{result_column}.png")
    ensemble_fig.savefig(ensemble_plot_filename, bbox_inches='tight', pad_inches=0)
    plt.close(ensemble_fig)

    print(f"Ensemble confusion matrix saved separately for {result_column} to {ensemble_plots_directory}")

Processing result column: PR
Top 2 confusion matrices and ensemble matrix saved for PR to ./output_plots/
Ensemble confusion matrix saved separately for PR to ./ensemble_plots/
Processing result column: NR
Top 2 confusion matrices and ensemble matrix saved for NR to ./output_plots/
Ensemble confusion matrix saved separately for NR to ./ensemble_plots/
Processing result column: SR
Top 2 confusion matrices and ensemble matrix saved for SR to ./output_plots/
Ensemble confusion matrix saved separately for SR to ./ensemble_plots/
Processing result column: WS
Top 2 confusion matrices and ensemble matrix saved for WS to ./output_plots/
Ensemble confusion matrix saved separately for WS to ./ensemble_plots/
Processing result column: SFST
Top 2 confusion matrices and ensemble matrix saved for SFST to ./output_plots/
Ensemble confusion matrix saved separately for SFST to ./ensemble_plots/
Processing result column: PR_Benefit
Top 2 confusion matrices and ensemble matrix saved for PR_Benefit to ./o