In [None]:
import os
import warnings
from pathlib import Path
from textwrap import wrap

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm

# Suppress warnings
warnings.filterwarnings("ignore")


def to_title_case(input_string):
    """Turn a snake_case identifier into a human-readable title.

    The input is lower-cased, split on underscores, and every piece is
    capitalised and joined with single spaces. Afterwards the substrings
    "Tsh"/"tsh", "Fpr" and "Fnr" are upper-cased wherever they occur, so the
    acronyms TSH, FPR and FNR are rendered in capitals.

    Note that the acronym substitution is a plain substring replacement: it
    also fires inside longer words (e.g. "outshine" becomes "OuTSHine").
    """
    pieces = []
    for part in input_string.lower().split("_"):
        pieces.append(part.capitalize())
    titled = " ".join(pieces)

    # Applied in this order, on the already title-cased text.
    acronym_fixes = (
        ("Tsh", "TSH"),
        ("tsh", "TSH"),
        ("Fpr", "FPR"),
        ("Fnr", "FNR"),
    )
    for needle, acronym in acronym_fixes:
        titled = titled.replace(needle, acronym)

    return titled

def note(description):
    """Register a text-only entry on the current matplotlib axes.

    An empty series is plotted with a blank format string, carrying
    `description` as its label. The label starts with a newline and is wrapped
    at 35 characters (existing whitespace is left untouched by the wrapper).
    The text is presumably meant to show up when the legend is drawn.
    """
    wrapped_lines = wrap(description, 35, replace_whitespace=False)
    legend_text = "\n" + '\n'.join(wrapped_lines)
    plt.plot([], [], ' ', label=legend_text)


sns.set_style("whitegrid")

# Metric columns rendered as heatmaps (one figure per metric) by the plotting
# loop in the last cell of this notebook.
metrics = ["sensitivity", "specificity", "f10", "f1", "fnr", "fpr", "fold", "precision", "recall"]

file_path = '../analysis.csv'  # Adjust this path as necessary
data = pd.read_csv(file_path)

# Restrict the analysis to the tags of interest. Other tags that have been
# used with this notebook (add them to the list to include them):
#   'initial_optimized_features',
#   'initial_optimized_features_synthetic_powers',
#   'initial_optimized_features_synthetic_ratios',
#   'initial_optimized_features_synthetic_powers_ratios',
#   'balanced_bagging_test'
data = data[data['tag'].isin(['basic_ratios',
                              'original_data',
                              'all_features'])]

# Keep a single cross-validation scheme. The explicit copy detaches the result
# from the frame read from disk, so the column assignments below write to an
# independent frame instead of a filtered slice (which raises pandas'
# SettingWithCopyWarning -- silenced here by the global warnings filter).
data = data[data['cross_validator'] == 'StratifiedKFold_5'].copy()

data['fold'] = data['fold'].astype(int)

# Precision and recall are recomputed from the confusion-matrix counts.
# A zero denominator produces NaN/inf instead of raising.
data['precision'] = data['true_positives'] / (data['true_positives'] + data['false_positives'])
data['recall'] = data['true_positives'] / (data['true_positives'] + data['false_negatives'])

# F-beta score with beta = 10: (1 + b^2) * P * R / (b^2 * P + R).
data['f10'] = (1+(10**2))*((data['precision'] * data['recall'])/(((10**2) * data['precision']) + data['recall']))

# Newest runs first; re-bind instead of sorting in place so re-running the
# cell never mutates a frame another name may still point at.
data = data.sort_values(by='created_at', ascending=False)

# Cell output: the resamplers present after filtering.
data['resampler'].unique()

In [None]:
# Per-combination means over the 'original_data' runs, ranked best-first:
# lowest FNR, ties broken by highest PPV. The frame is indexed by the
# (averaged) combination_id.
original_rows = data.loc[
    data['tag'] == 'original_data',
    ['tag', 'model', 'scorer', 'resampler', 'fold', 'fnr', 'ppv', 'f1', 'combination_id'],
]
combination_means = original_rows.groupby(['tag', 'model', 'scorer', 'resampler']).mean()

good_df = (
    combination_means
    .reset_index()
    .set_index('combination_id')
    .sort_values(by=['fnr', 'ppv'], ascending=[True, False])
)

# To inspect only the combinations with no false negatives:
# good_df[good_df['fnr'] == 0.0]
good_df.head(20)
    



In [None]:
# Build the LaTeX table of the ten best 'original_data' combinations
# (lowest mean FNR, ties broken by highest mean PPV) together with the
# standard deviation of their PPV.
summary_columns = ['tag', 'model', 'scorer', 'resampler', 'fold', 'fnr', 'ppv', 'f1', 'combination_id']
original_runs = data[data['tag'] == 'original_data']

# Mean of every numeric column per combination, top ten after ranking.
selected_combinations = (
    original_runs[summary_columns]
    .groupby(['tag', 'model', 'scorer', 'resampler'])
    .mean()
    .reset_index()
    .set_index('combination_id')
    .sort_values(by=['fnr', 'ppv'], ascending=[True, False])
    .round(3)
    .head(10)
)

# Standard deviation of the same metrics, restricted to the selected
# combinations and indexed by combination_id so it can be joined to the means.
std = (
    original_runs[original_runs['combination_id'].isin(selected_combinations.index)][summary_columns]
    .groupby(['tag', 'model', 'scorer', 'resampler', 'combination_id'])
    .std()
    .reset_index()
    .round(3)
    .head(10)[['fnr', 'ppv', 'model', 'scorer', 'resampler', 'combination_id']]
    .set_index('combination_id')
)

latex_table = (
    std.join(selected_combinations[['fnr', 'ppv']], lsuffix='_std', rsuffix='_mean')
    .sort_values(by=['fnr_mean', 'ppv_mean'], ascending=[True, False])
    .head(10)[['model', 'scorer', 'resampler', 'fnr_mean', 'ppv_mean', 'ppv_std']]
    .to_latex(index=False, float_format='%.3f')
)

# Textual clean-up of the generated LaTeX, applied strictly in this order
# (later rules see the output of earlier ones). Raw strings are used for
# everything containing LaTeX backslashes: sequences such as "\l" or "\%" are
# invalid escapes in a normal string literal and emit a SyntaxWarning on
# recent Python versions.
latex_replacements = [
    ('_mean', r'$_\mu$'),
    ('_std', r'$_\sigma$'),
    ('fnr', 'FNR'),
    ('ppv', 'PPV'),
    ('resampler', 'Resampler'),
    ('model', 'Model'),
    ('scorer', 'Scorer'),
    ('Gaussian_FR025_NL001_NPR2', r'GN$\left(25\%, 1\%, 2\right)$'),
    ('Gaussian_FR025_NL001_NPR5', r'GN$\left(25\%, 1\%, 5\right)$'),
    ('Gaussian_FR025_NL005_NPR5', r'GN$\left(25\%, 5\%, 5\right)$'),
    ('fp_fn', 'FPFN'),
    ('f10_score_pos', 'F10'),
    ('f1_score_pos', 'F1'),
    ('recall_score', 'Recall'),
    ('0.000', '0.0'),
]
for raw_text, pretty_text in latex_replacements:
    latex_table = latex_table.replace(raw_text, pretty_text)

print(latex_table)

In [None]:
# Synthetic (not measured) data for a grouped bar chart with ten positions.
indices = np.arange(10)

# 4**x for x = 1.0, 1.5, ..., 5.5, scaled down by 200 -> grows exponentially.
values_increasing = (4 ** np.arange(1, 6, 0.5)) / 200

# 2**x for x = 5.0, 5.5, ..., 9.5, scaled down by 100.
# NOTE(review): despite its name this series also GROWS with the index; if a
# decreasing series was intended it would need to be reversed -- confirm.
values_decreasing = (2 ** np.arange(5, 10, 0.5)) / 100

# Width of a single bar; the two series sit side by side around each index.
width = 0.35

fig, ax = plt.subplots()

bars1 = ax.bar(indices - width/2, values_increasing, width, label='FNR')
bars2 = ax.bar(indices + width/2, values_decreasing, width, label='PPV')

# Axis labels, title, one tick per bar group, and the legend.
ax.set_xlabel('Index')
ax.set_ylabel('Values')
ax.set_title('FNR and PPV of different combinations sorted by FNR then PPV')
ax.set_xticks(indices)
ax.legend()

plt.show()

In [None]:
# Plot the rolling-mean FNR and PPV of every 'original_data' combination,
# ranked by mean FNR, and save the figure for the paper.

plt.figure(figsize=(10, 6))
df = (
    data[data['tag'] == 'original_data'][['tag', 'model', 'scorer', 'resampler', 'fold', 'fnr', 'ppv', 'combination_id']]
    .groupby(['tag', 'model', 'scorer', 'resampler'])
    .mean()[['fnr', 'ppv']]
    .reset_index()
    .sort_values(by=['fnr'], ascending=[True])
    .reset_index()
)

# Mark the last rank whose mean FNR is exactly zero with a vertical line.
# If no combination reaches zero FNR there is nothing to mark; the guard keeps
# the cell from failing with an IndexError on the empty selection.
zero_fnr_positions = df.index[df['fnr'] == 0]
if len(zero_fnr_positions) > 0:
    zero_fnr_index = zero_fnr_positions[-1]
    plt.axvline(x=zero_fnr_index, color='g', linestyle='--')
else:
    zero_fnr_index = None

rolling_window = 20

# FNR: centred rolling mean.
rolling_mean_fnr = df['fnr'].rolling(window=rolling_window, center=True).mean()
rolling_mean_fnr.plot(kind='line', color='r', linestyle='--', label='FNR (smoothed)')
# NOTE(review): this is a single scalar (the 10% quantile of the whole smoothed
# series), so the shaded band is the mean +/- a constant. The PPV band below
# uses rolling 10%/90% quantiles instead -- confirm which one is intended.
rolling_ci_fnr = rolling_mean_fnr.quantile(0.1)

plt.fill_between(rolling_mean_fnr.index, rolling_mean_fnr - rolling_ci_fnr, rolling_mean_fnr + rolling_ci_fnr, color='r', alpha=0.1)

# PPV: centred rolling mean with a band between the rolling 10% and 90% quantiles.
rolling_mean_ppv = df['ppv'].rolling(window=rolling_window, center=True).mean()
rolling_mean_ppv.plot(kind='line', color='b', linestyle='--', label='PPV (smoothed)')
rolling_ci_ppv_lower = df['ppv'].rolling(window=rolling_window, center=True).quantile(0.10)
rolling_ci_ppv_upper = df['ppv'].rolling(window=rolling_window, center=True).quantile(0.90)
# Smooth the band edges with a second, half-width rolling mean.
rolling_ci_ppv_lower = rolling_ci_ppv_lower.rolling(window=rolling_window//2, center=True).mean()
rolling_ci_ppv_upper = rolling_ci_ppv_upper.rolling(window=rolling_window//2, center=True).mean()

plt.fill_between(rolling_mean_ppv.index,  rolling_ci_ppv_lower,  rolling_ci_ppv_upper, color='b', alpha=0.1)

# Y ticks as percentages from 0% to 100% in steps of 10%.
plt.yticks(np.arange(0, 1.1, 0.1), [f'{i*100:.0f}%' for i in np.arange(0, 1.1, 0.1)])

# One x tick every 40 ranks.
plt.xticks(np.arange(0, len(df), 40))
plt.xlabel('Combination Rank by FNR')
plt.legend()
plt.tight_layout()
plt.savefig('../../../paper/layering/images/original_fnr_ppv_relationship.jpg', dpi=1000, format='jpg')


In [None]:
# Line plot restricted to the combinations whose mean FNR is exactly zero,
# ordered by descending PPV. Relies on `df` as built by the preceding
# FNR/PPV plotting cell.
zero_fnr_rows = df[df['fnr'] == 0]
new_def = zero_fnr_rows.sort_values(by='ppv', ascending=False).reset_index()

plt.figure(figsize=(5, 3))
# FNR in red, PPV in blue, drawn in that order.
for column_name, line_colour, legend_name in (('fnr', 'r', 'FNR'), ('ppv', 'b', 'PPV')):
    new_def[column_name].plot(kind='line', color=line_colour, label=legend_name)

plt.xlabel('Combination Rank by FNR')

# Y ticks as percentages: 0%, 5%, 10%, 15%, 20%.
tick_positions = np.arange(0, 0.25, 0.05)
plt.yticks(tick_positions, [f'{i*100:.0f}%' for i in tick_positions])

plt.legend()
plt.tight_layout()
plt.savefig('../../../paper/layering/images/original_fnr_ppv_relationship_acceptable.eps', dpi=1000, format='eps')



In [None]:
# LOESS-smoothed FNR and PPV of every 'original_data' combination, ranked by
# mean FNR. matplotlib, numpy and statsmodels (`sm`) come from the import cell
# at the top of the notebook.

plt.figure(figsize=(10, 6))
df = (
    data[data['tag'] == 'original_data'][['tag', 'model', 'scorer', 'resampler', 'fold', 'fnr', 'ppv', 'combination_id']]
    .groupby(['tag', 'model', 'scorer', 'resampler'])
    .mean()[['fnr', 'ppv']]
    .reset_index()
    .sort_values(by=['fnr'], ascending=[True])
    .reset_index()
)

# Mark the last rank whose mean FNR is exactly zero with a vertical line.
# If no combination reaches zero FNR there is nothing to mark; the guard keeps
# the cell from failing with an IndexError on the empty selection.
zero_fnr_positions = df.index[df['fnr'] == 0]
if len(zero_fnr_positions) > 0:
    zero_fnr_index = zero_fnr_positions[-1]
    plt.axvline(x=zero_fnr_index, color='g', linestyle='--')
else:
    zero_fnr_index = None

lowess = sm.nonparametric.lowess

# Each result is indexed below as column 0 = rank (x) and column 1 = smoothed
# value (y). `frac` controls the smoothing level.
smoothed_fnr = lowess(df['fnr'], df.index, frac=0.1)
plt.plot(smoothed_fnr[:, 0], smoothed_fnr[:, 1], color='r', label='FNR (smoothed)')

smoothed_ppv = lowess(df['ppv'], df.index, frac=0.1)
plt.plot(smoothed_ppv[:, 0], smoothed_ppv[:, 1], color='b', label='PPV (smoothed)')

# One x tick every 40 ranks.
plt.xticks(np.arange(0, len(df), 40))
plt.xlabel('Combination Rank (sorted by FNR)')
plt.ylabel('Value')
plt.title('FNR and PPV of different combinations sorted by FNR then PPV (LOESS smoothed, 0.1 frac)')
plt.legend()
plt.tight_layout()
# plt.savefig('../../../paper/layering/images/original_fnr_ppv_relationship.png', dpi=500)

In [None]:
# (340, 383, 423)
# NOTE(review): the meaning of the numbers above is not evident from this
# notebook -- confirm or remove.
#
# For every (scorer, tag) pair, write one heatmap per metric of the mean value
# per model x resampler to `<tag>/<scorer>/mean/`, and one heatmap of the
# standard deviation to `<tag>/<scorer>/std/`.
for scorer in data['scorer'].unique():
    for tag in data['tag'].unique():
        print(f"Generating plots for {tag} - {scorer}")
        std_dir = Path(f'{tag}/{scorer}/std')
        mean_dir = Path(f'{tag}/{scorer}/mean')
        os.makedirs(mean_dir, exist_ok=True)
        os.makedirs(std_dir, exist_ok=True)

        # The tag/scorer filter does not depend on the metric, so it is
        # computed once here instead of copying and re-filtering the whole
        # frame for every metric.
        subset = data[(data['tag'] == tag) & (data['scorer'] == scorer)]

        for metric in metrics:
            # Raw per-run values (kept in `var` for the std) and their mean
            # per model/resampler pair.
            var = subset[["model", "resampler", metric]].copy()
            df = var.groupby(["model", "resampler"]).mean().reset_index()

            # Nothing was run for this tag/scorer pair.
            if len(df) == 0:
                continue

            plt.figure(figsize=(13, 9))
            sns.heatmap(data=df.pivot(index="model", columns="resampler", values=metric),
                        annot=True,
                        fmt=".3f",
                        cmap="YlGnBu",
                        cbar_kws={'label': to_title_case(metric)},
            )
            plt.title(f'Performance by {to_title_case(metric)} - Tag: {to_title_case(tag)} - Optimized using: {to_title_case(scorer)}')
            plt.xlabel("Resampler")
            plt.ylabel("Learner")
            plt.tight_layout()
            plt.savefig(mean_dir / f'{metric}_performance.png', dpi=400)
            plt.savefig(mean_dir / f'{metric}_performance.eps', dpi=1000)
            plt.close()  # Close the plot to free memory

            # Standard deviation of the same metric. Both group-bys use the
            # same keys on the same rows, so after reset_index() the rows of
            # the std frame line up with those of `df`.
            plt.figure(figsize=(15, 9))
            plt.title(f"{tag} - {to_title_case(metric)} Performance Standard Deviation")
            df['std'] = var.groupby(["model", "resampler"]).std().reset_index()[metric]
            sns.heatmap(data=df.pivot(index="model", columns="resampler", values="std"), annot=True, fmt=".3f", cmap="YlGnBu", cbar_kws={'label': to_title_case(metric)})
            plt.tight_layout()
            plt.savefig(std_dir / f'{metric}_performance_std.png')
            plt.close()
