In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Suppress warnings
# NOTE(review): called with only the "ignore" action (no category, module or message
# filter), this silences every warning raised for the rest of the kernel session,
# including deprecation and data-handling warnings from the plotting/data libraries.
# Consider narrowing it to the specific warning that motivated it -- TODO confirm which.
import warnings
warnings.filterwarnings("ignore")

# Source: the concatenated results files, with probabilities already joined to the analysis data.
# Only the 'all_features' runs of the hand-picked combinations below are kept.
selected_combination_ids = [33, 286, 365, 368, 425, 434, 439, 449, 452, 487, 480]

# Bookkeeping / duplicated join columns that the plots below never read.
unused_columns = [
    'Unnamed: 0',
    "record_id.1", "record_id.2",
    "id.1", "id.2", "id.3", "id.4",
    "model_file", "result_file", "metadata", "created_at", "latest",
]

data = (
    pd.read_csv('../results.csv')
    .loc[lambda frame: frame['tag'] == 'all_features']
    .loc[lambda frame: frame['combination_id'].isin(selected_combination_ids)]
    .drop(columns=unused_columns)
    .rename(columns={"index": "case_index"})
)

data

In [None]:
# Human-readable label for each combination: "model\nresampler\nscorer".
#
# groupby() yields its keys sorted by value, whereas Series.unique() returns values in
# order of first appearance. Pairing the two positionally (zip) therefore attaches the
# wrong label to an id whenever the frame is not already sorted by combination_id, or
# when an id occurs with more than one model/resampler/scorer combination. The lookup is
# keyed on the combination_id carried inside each group key instead, so an id can only
# ever map to a label built from its own rows.
combination_keys = [key for key, _ in data.groupby(['combination_id', 'model', 'resampler', 'scorer'])]

# `names` and `ids` keep their original contents in case later cells use them.
names = ["\n".join(key[1:]) for key in combination_keys]
ids = data['combination_id'].unique()

# key[0] is the combination_id; key[1:] is (model, resampler, scorer).
id_lookup = {key[0]: label for key, label in zip(combination_keys, names)}
id_lookup

In [None]:
os.makedirs('density_plots', exist_ok=True)

# One standalone KDE figure of `proba` per (combination_id, tag) group, written to
# density_plots/<combination_id>.png. The y axis is log-scaled, the x axis is linear.
for (combo_id, _tag), subset in data.groupby(['combination_id', 'tag']):
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.kdeplot(subset['proba'], log_scale=(False, True), ax=ax)
    ax.set_title(f"{id_lookup[combo_id]}")
    fig.savefig(f'density_plots/{combo_id}.png', dpi=500)
    # Release the figure so memory does not grow with the number of groups.
    plt.close(fig)
    
# Overlay of the per-combination KDE curves of `proba` on one set of axes
# (linear x axis, log-scaled y axis).
# NOTE(review): combination 439 is left out of the overlay; the reason is not recorded
# in this notebook -- TODO confirm and document.
overlay_excluded_id = 439

fig, ax = plt.subplots(figsize=(10, 6))
for (combo_id, _tag), subset in data.groupby(['combination_id', 'tag']):
    if combo_id == overlay_excluded_id:
        continue
    sns.kdeplot(subset['proba'], label=id_lookup[combo_id], log_scale=(False, True), ax=ax)

# Each curve is given a label above, but labels are only rendered once a legend is drawn;
# without it the overlaid curves cannot be told apart. Skip the call when nothing was
# plotted so an empty selection does not produce an empty legend box.
handles, _labels = ax.get_legend_handles_labels()
if handles:
    ax.legend(fontsize='small')
    
    

# Labelled Density Histograms 

In [4]:
# Random Under Sampler
from imblearn.under_sampling import RandomUnderSampler

def label_confusion(predicted, actual):
    """Name the confusion-matrix cell for a single prediction.

    The first word ('True'/'False') reflects whether `predicted` equals `actual`;
    the second ('Positive'/'Negative') reflects whether `predicted` equals 1.
    """
    # Keyed by (prediction matches actual, prediction is the positive class).
    outcomes = {
        (True, True): 'True Positive',
        (True, False): 'True Negative',
        (False, True): 'False Positive',
        (False, False): 'False Negative',
    }
    return outcomes[(bool(predicted == actual), bool(predicted == 1))]
    
os.makedirs('labelled_density_plots', exist_ok=True)

# One fixed colour per outcome. Passing palette='tab10' alone lets seaborn hand out
# colours in the order labels first appear in each group, so the same outcome could be
# drawn in different colours from one figure to the next.
outcome_order = ['True Positive', 'True Negative', 'False Positive', 'False Negative']
outcome_palette = dict(zip(outcome_order, sns.color_palette('tab10', len(outcome_order))))

# Axis window applied to the raw-count figures only (values carried over unchanged).
count_xlim = (0.15, 1)
count_ylim = (0, 250)

# Two stacked histograms of `proba` per (combination_id, tag) group, coloured by
# confusion-matrix outcome:
#   <combination_id>_normalized.png   -- each predicted class re-weighted to equal total mass
#   <combination_id>_unnormalized.png -- plain counts per bin
for (combo_id, _tag), group in data.groupby(['combination_id', 'tag']):
    predicted = group['predicted']

    # Per-row weight = 1 / (number of predicted classes * size of the row's class).
    # Every predicted class then sums to the same share and all weights sum to 1. For a
    # 0/1 prediction column with both classes present this equals weighting class 0 by
    # mean(predicted) and class 1 by 1 - mean(predicted) and rescaling to sum 1, but it
    # stays finite when a group contains a single predicted class (where that rescaling
    # would divide 0 by 0 and yield NaN weights).
    class_sizes = predicted.map(predicted.value_counts())

    # assign() builds a new frame, so nothing is written into the groupby slice, and the
    # row-wise labelling is computed once per group rather than once per rendering.
    labelled = group.assign(
        label=group.apply(lambda row: label_confusion(row['predicted'], row['actual']), axis=1),
        weight=1.0 / (class_sizes * predicted.nunique()),
    )

    # Only list outcomes that occur in this group; colours still come from the fixed palette.
    present_outcomes = [outcome for outcome in outcome_order if outcome in set(labelled['label'])]

    for normalize in [True, False]:
        fig, ax = plt.subplots(figsize=(10, 6))

        if normalize:
            sns.histplot(labelled, x='proba', bins=100, hue='label', hue_order=present_outcomes,
                         weights='weight', palette=outcome_palette, multiple="stack", ax=ax)
            ax.set_ylabel('Density by Class')
        else:
            sns.histplot(labelled, x='proba', bins=100, hue='label', hue_order=present_outcomes,
                         palette=outcome_palette, multiple="stack", ax=ax)
            ax.set_xlim(*count_xlim)
            ax.set_ylim(*count_ylim)
            # histplot's default stat is 'count', so this axis shows observations per bin.
            ax.set_ylabel('Count')

        ax.set_xlabel('Predicted Probability')
        ax.set_title(f"{id_lookup[combo_id]}")
        fig.savefig(f'labelled_density_plots/{combo_id}_{"normalized" if normalize else "unnormalized"}.png', dpi=500)
        # Release the figure so memory does not grow with the number of groups.
        plt.close(fig)
        

    