In [None]:
import glob
import pandas as pd
import numpy as np

import os
import os.path as osp
import re

In [None]:
# Name of the adversarial attack to analyse; it selects the `attack_<name>` results folder below.
attack = "fgsm"

In [None]:
# Root of the result tree for the selected attack: <cwd>/results/attack_<attack>.
base_path = osp.join(os.getcwd(), "results")
attack_folder = "attack_" + attack
# Seed identifiers "111", "222", ..., "555".
seeds = list(map(str, range(111, 556, 111)))
results_folder = osp.join(base_path, attack_folder)

In [None]:
# Algorithm keys, in the order used to arrange the algorithm columns of the result tables.
algorithms = ["draft","noise","fgsm","pgd","aae","crownibp"]
# Dataset names to leave out — NOTE(review): no cell in the metric-table part of the notebook reads this list.
exclude_datasets = ["Dialysis","divorce","Pbc3","vlbw"]

In [None]:
# Metric file to aggregate; one of: CI, IBS, NegLL
metric = "CI"

# CI is the only metric ranked in descending order (largest value first).
ascending = metric != "CI"

In [None]:
def get_metric_excel_paths(base_path,metric):
    """Return the paths of all ``<metric>.xlsx`` files below ``base_path``.

    Only files located at ``base_path/results_*/<any folder>/seed_*/`` are
    matched. The paths are returned in the order ``glob`` produces them.
    """
    search_pattern = os.path.join(base_path, "results_*", "*", "seed_*", f"{metric}.xlsx")
    return glob.glob(search_pattern)

def read_metric_data(file_path):
    """Load a two-column metric workbook and name its first column ``epsilon``.

    Args:
        file_path: Path of the Excel file to read.

    Returns:
        pd.DataFrame: The sheet with columns ``['epsilon', <original name of
        the second column>]``.

    Raises:
        ValueError: If the sheet does not have exactly two columns.
    """
    frame = pd.read_excel(file_path)
    n_columns = frame.shape[1]
    if n_columns != 2:
        raise ValueError(f"Unexpected format in {file_path}. Expected exactly two columns.")

    # Only the first header is replaced; the value column keeps its own name.
    frame.columns = ['epsilon', frame.columns[-1]]
    return frame

def extract_metadata_from_path(path):
    """Read dataset, algorithm, attack method and seed off a metric file path.

    The path is split on ``os.sep`` and is expected to end in
    ``attack_<attack>/results_<algorithm>/<dataset>/seed_<seed>/<file>``.

    Returns:
        tuple: ``(dataset, algorithm, attack_method, seed)`` with ``seed`` as int.
    """
    segments = path.split(os.sep)
    attack_dir, algorithm_dir, dataset, seed_dir = (segments[k] for k in (-5, -4, -3, -2))
    return (
        dataset,
        algorithm_dir.replace("results_", ""),
        attack_dir.replace("attack_", ""),
        int(seed_dir.replace("seed_", "")),
    )

def create_aggregated_dataframe(base_path,metric="CI"):
    """Average a metric over all seeds for every (dataset, algorithm) pair.

    Args:
        base_path: Folder handed to ``get_metric_excel_paths`` to locate the
            ``<metric>.xlsx`` files.
        metric: Stem of the metric file to aggregate. Defaults to "CI".

    Returns:
        pd.DataFrame: Indexed by epsilon, with MultiIndex columns
        ``('Dataset', 'Algorithm')`` sorted by dataset and then algorithm.
        Each cell is the mean over the seed files found for that pair.

    Raises:
        ValueError: If no metric file is found under ``base_path``, or (from
            ``read_metric_data``) if a file does not have exactly two columns.
    """
    metric_files = get_metric_excel_paths(base_path,metric)
    if not metric_files:
        # Without this guard pd.concat fails with "No objects to concatenate",
        # which says nothing about the missing files.
        raise ValueError(f"No {metric}.xlsx files found under {base_path}.")

    # (dataset, algorithm) -> one epsilon-indexed frame per seed file
    frames_by_pair = {}
    for file_path in metric_files:
        dataset, algorithm, _attack_method, _seed = extract_metadata_from_path(file_path)
        metric_data = read_metric_data(file_path)
        frames_by_pair.setdefault((dataset, algorithm), []).append(
            metric_data.set_index('epsilon')
        )

    # Concatenating along the columns aligns the seeds on epsilon before averaging.
    averaged = [
        pd.concat(frames, axis=1).mean(axis=1).to_frame(name=pair)
        for pair, frames in frames_by_pair.items()
    ]

    final_df = pd.concat(averaged, axis=1)

    # Sort the columns by dataset and then by algorithm for a clean MultiIndex
    final_df = final_df.sort_index(axis=1, level=[0, 1])
    final_df.columns = pd.MultiIndex.from_tuples(final_df.columns, names=['Dataset', 'Algorithm'])

    return final_df


In [None]:
# Seed-averaged metric table; the level-1 (algorithm) columns are arranged to follow `algorithms`.
final_df = create_aggregated_dataframe(results_folder,metric).reindex(columns=algorithms, level=1)

In [None]:
# Display name for each algorithm key; `algorithms_renamed` keeps the same order as the dict.
rename_dict = {"draft":"DRAFT","noise":"Noise","fgsm":"FGSM","pgd":"PGD","aae":"AAE-Cox","crownibp":"SAWAR"}
algorithms_renamed = list(rename_dict.values())

In [None]:
# Swap the algorithm keys in the column level for their display names.
final_df = final_df.rename(columns=rename_dict, level=1)

In [None]:
final_df

In [None]:
final_df[['zinc']]

In [None]:
# Turn empty-string cells into NaN so the whole table can be cast to float.
# `np.nan` replaces the `np.NaN` alias, which was removed in NumPy 2.0, and `mask`
# builds a new frame instead of assigning into `final_df` in place.
final_df = final_df.mask(final_df == '', np.nan).astype(float)
final_df

In [None]:
# Total number of missing cells in the table (vectorised; `applymap` is deprecated in pandas >= 2.1).
final_df.isna().sum().sum()

In [None]:
# Rank the algorithms of each dataset against one another, separately for every epsilon.
# Rank 1 goes to the largest value when `ascending` is False and to the smallest otherwise;
# NaNs are ranked last and ties share their average rank.
# The table is transposed so the grouping runs over rows: `groupby(..., axis=1)` is
# deprecated since pandas 2.1.
best_per_dataset_rank = (
    final_df.T
    .groupby(level=0)
    .rank(na_option='bottom', method="average", ascending=ascending)
    .T
    .reindex(columns=algorithms_renamed, level=1)
)
best_per_dataset_rank

In [None]:
# best_per_dataset_rank.stack(1)

In [None]:
# Average rank of each algorithm over all datasets: the algorithm level is moved into the rows,
# the dataset columns are averaged, and the algorithms are spread back into columns.
# Rows are ordered by epsilon, largest first.
best_per_dataset_avg_rank = best_per_dataset_rank.stack(level=1).mean(1).unstack(1).sort_values(by="epsilon",ascending=False)
best_per_dataset_avg_rank

In [None]:
# For every epsilon and dataset, find the (Dataset, Algorithm) column holding the best value:
# the maximum when `ascending` is False, the minimum otherwise.
# Grouping is done on the transposed table because `groupby(..., axis=1)` is deprecated
# since pandas 2.1.
grouped_by_dataset = final_df.T.groupby(level=0)
if not ascending:
    best_per_dataset = grouped_by_dataset.idxmax().T
else:
    best_per_dataset = grouped_by_dataset.idxmin().T
# Each cell is a (Dataset, Algorithm) tuple; show the algorithm name only.
best_per_dataset.applymap(lambda x: x[1])

In [None]:
def calculate_percent_change_from_draft(df, baseline="DRAFT"):
    """
    Calculate the percent change of every column relative to its dataset's baseline column.

    Args:
        df (pd.DataFrame): DataFrame whose columns are a two-level MultiIndex where
            level 0 is 'Dataset' and level 1 is 'Algorithm'.
        baseline (str): Level-1 label of the reference algorithm. Defaults to "DRAFT".

    Returns:
        pd.DataFrame: A copy of ``df`` in which every non-baseline column holds
        ``(value - baseline) / baseline * 100`` and every baseline column is 0.

    Raises:
        KeyError: If a dataset has no ``baseline`` column.
    """
    percent_change_df = df.copy()
    dataset_of_column = df.columns.get_level_values(0)

    # Walk the columns that actually exist. `df.columns.levels` may list labels that
    # are no longer in use, and not every dataset has every algorithm; indexing such
    # a (dataset, algorithm) pair raises KeyError.
    for dataset in dataset_of_column.unique():
        draft_column = df[(dataset, baseline)]

        for column in df.columns[dataset_of_column == dataset]:
            if column[1] == baseline:
                # The baseline is 0% change from itself.
                percent_change_df[column] = 0
            else:
                percent_change_df[column] = (df[column] - draft_column) / draft_column * 100

    return percent_change_df

In [None]:
# Percent change of every algorithm relative to DRAFT, with the algorithm columns ordered as in `algorithms_renamed`.
percent_change_df = calculate_percent_change_from_draft(final_df).reindex(columns=algorithms_renamed, level=1)

In [None]:
# Mean percent change over all datasets: one row per epsilon (largest first), one column per algorithm.
percentage_change_mean =  percent_change_df.stack(level=1).mean(1).unstack(1).sort_values(by="epsilon",ascending=False)
percentage_change_mean

In [None]:
# Export every summary table to a single workbook, one sheet per table.
excel_name = os.path.join(results_folder,f"{metric}_all.xlsx")
sheets = [
    (metric, final_df.round(3)),
    ("rank", best_per_dataset_rank),
    ("average_rank", best_per_dataset_avg_rank),
    # Cells hold (Dataset, Algorithm) tuples; only the algorithm name is written.
    ("best", best_per_dataset.applymap(lambda x: x[1])),
    ("%", percentage_change_mean),
]
with pd.ExcelWriter(excel_name) as writer:
    for sheet_name, table in sheets:
        table.to_excel(writer,sheet_name=sheet_name)

In [None]:
# Print the average-rank table as LaTeX, with epsilon shown to two decimals.
# Note: this overwrites the index of `best_per_dataset_avg_rank` in place (values become strings).
best_per_dataset_avg_rank.index = ["{:.2f}".format(float(x)) for x in np.round(best_per_dataset_avg_rank.index.tolist(),2)]
best_per_dataset_avg_rank.index.name = r"$\epsilon$"
print(best_per_dataset_avg_rank.applymap(lambda x: str(np.round(x,2))).to_latex(index=True))

In [None]:
# Same average-rank table, transposed so that the algorithms become the rows.
print(best_per_dataset_avg_rank.applymap(lambda x: str(np.round(x,2))).T.to_latex(index=True))

In [None]:
# Print the full metric table as LaTeX. Rows are labelled with epsilon shown to two
# decimals; note that this overwrites the index of `final_df` in place.
final_df.index = ["{:.2f}".format(float(x)) for x in np.round(final_df.index.tolist(),2)]
final_df.index.name = r"$\epsilon$"

rounded_cells = final_df.applymap(lambda x: np.round(x,3))
if metric == "NegLL":
    # NegLL values are printed in scientific notation.
    latex_cells = rounded_cells.applymap(lambda x: "{:.2e}".format(x))
else:
    latex_cells = rounded_cells.applymap(str)
print(latex_cells.to_latex(index=True,multicolumn_format="c"))

In [None]:
# First element (the dataset) of every column tuple — one entry per column, so names repeat.
dataset_names = np.array(list(map(np.array,final_df.columns)))[:,0]

In [None]:
dataset_names

## LONG TABLE

In [None]:
final_df.T

In [None]:
# Print the transposed table (one row per (dataset, algorithm) column of `final_df`) as LaTeX.
if metric == "NegLL":
    # Values are rounded to 3 decimals, stringified, then parsed back to float for scientific notation.
    print(final_df.T.applymap(lambda x: str(np.round(x,3))).applymap(lambda x: "{:.2e}".format(float(x))).to_latex(index=True,multicolumn_format="c"))
else:
    print(final_df.T.applymap(lambda x: str(np.round(x,3))).to_latex(index=True,multicolumn_format="c"))

## PDF PICTURES

In [None]:
import matplotlib.pyplot as plt
import os
import glob
import numpy as np
# import pandas as pd
import re
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
import pandas
import modin.pandas as pd

In [None]:
def scrape_excel_data(root_folder,excel_name,seed_key):
    """
    Scrape Excel sheets from a ``<algorithm>/<dataset>/<seed>`` folder tree into a dictionary.

    Args:
        root_folder (str): Folder whose sub-folders are the per-algorithm result folders.
        excel_name (str): File name of the workbook to read inside each seed folder.
        seed_key (str): Only seed folders whose name contains this text are read.

    Returns:
        dict: Nested dictionary ``{dataset: {algorithm: {seed_id: DataFrame}}}``.
              ``algorithm`` is the text after the last "_" of the algorithm folder name
              and ``seed_id`` the text after the last "_" of the seed folder name.
              An entry exists for every dataset/algorithm folder pair, even when no
              workbook was read for it. Empty if ``root_folder`` is not a directory.
    """
    data_dict = {}

    if not os.path.isdir(root_folder):
        return data_dict

    # Listings are sorted so the key order (and anything laid out from it, such as
    # figure rows) does not depend on the file system's listing order.
    for algorithm_folder in tqdm(sorted(os.listdir(root_folder))):
        algorithm_path = os.path.join(root_folder, algorithm_folder)
        if not os.path.isdir(algorithm_path):
            continue

        # The dictionary is keyed by this suffix, not by the folder name.
        algo = algorithm_folder.split("_")[-1]

        for dataset_folder in sorted(os.listdir(algorithm_path)):
            dataset_path = os.path.join(algorithm_path, dataset_folder)
            if not os.path.isdir(dataset_path):
                continue

            # setdefault keeps an existing entry. The previous membership test looked
            # up the folder name instead of `algo`, so it never matched and the entry
            # was reset whenever two folders shared the same suffix.
            seed_frames = data_dict.setdefault(dataset_folder, {}).setdefault(algo, {})

            for seed_folder in sorted(os.listdir(dataset_path)):
                if seed_key not in seed_folder:
                    continue

                excel_file_path = os.path.join(dataset_path, seed_folder, excel_name)
                if not os.path.isfile(excel_file_path):
                    continue

                seed_id = seed_folder.split('_')[-1]
                seed_frames[seed_id] = pd.read_excel(excel_file_path,engine='openpyxl')

    return data_dict


In [None]:
# Attack whose result folder is read for the figures (same value as set at the top of the notebook).
attack = "fgsm"

In [None]:
# Workbook read from every seed folder for the perturbation-curve figure.
excel_name = "population_curves_attacked_test.xlsx"

In [None]:
# Seed whose curves are plotted; also used as the substring filter on seed folder names.
seed_interest = "222"

In [None]:
# Result tree for the selected attack: <cwd>/results/attack_<attack> (repeats the path setup from the top of the notebook).
base_path = osp.join(os.getcwd(),"results")
attack_folder =  f"attack_{attack}"
# Seed identifiers "111" ... "555".
seeds = [str(i*111) for i in range(1,6)]
results_folder = os.path.join(base_path, attack_folder)

In [None]:
# Algorithm keys; their order defines the column order of the perturbation-curve figure.
algorithms = ["draft","noise","fgsm","pgd","aae","crownibp"]
# Dataset names removed from the distribution-curve file list further down.
exclude_datasets = ["Dialysis","divorce","Pbc3","vlbw"]

In [None]:
# {dataset: {algorithm: {seed_id: DataFrame}}}, read only from seed folders whose name contains `seed_interest`.
data_dict = scrape_excel_data(results_folder,excel_name,seed_interest)

In [None]:
# Output path of the perturbation-curve figure. Despite its name, `save_folder` is the PDF file path, not a folder.
cwd = os.getcwd()
save_folder = osp.join(cwd,"results",f"attack_{attack}","perturb_curves.pdf")

In [None]:
print(save_folder)

In [None]:
# Grid shape of the perturbation-curve figure: one row per dataset, one column per algorithm.
n_rows = len(data_dict)
# The plotting loop addresses columns by position in `algorithms`, so the width is taken
# from that list instead of from one hard-coded dataset ("stagec"), which raises KeyError
# when that dataset is missing.
n_cols = len(algorithms)

# Display name for each algorithm key.
rename_dict = {"draft":"DRAFT","noise":"Noise","fgsm":"FGSM","pgd":"PGD","aae":"AAE-Cox","crownibp":"SAWAR"}

In [None]:
from copy import deepcopy

In [None]:
# Perturbation-curve figure: one row per dataset, one column per algorithm. Every panel
# shows the two base curves (solid) and the perturbed curves (dashed) over time.

SMALL_SIZE = 80
# rc settings are applied before the axes are created; tick-label sizes set afterwards
# only affect axes created later, so a fresh kernel used to render default-sized ticks.
plt.rc('xtick', labelsize=SMALL_SIZE//2)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE//2)
plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=SMALL_SIZE)     # fontsize of the x and y labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=SMALL_SIZE)   # fontsize of the figure title

# squeeze=False keeps `axes` two-dimensional even with a single row or column.
fig,axes = plt.subplots(n_rows,n_cols,figsize=(30,64),sharey=True,squeeze=False)

for i,key_dataset in enumerate(data_dict.keys()):
    for j,key_al in enumerate(algorithms):
        ax = axes[i][j]

        # Work on a copy: popping "t" below must not alter the frame stored in `data_dict`.
        df_population = deepcopy(data_dict[key_dataset][key_al][seed_interest])

        t = df_population.pop("t")
        # The first two remaining columns are the base curves; they are renamed for the legend.
        base_models = df_population.iloc[:,:2]
        base_models.columns = [col.split("_")[0] for col in base_models.columns]
        base_models = base_models.rename(columns={"kmf":"KMC","St":"NN"})

        # Of the remaining columns, keep the four that precede the last one.
        df_population= df_population.iloc[:,2:].iloc[:,-5:-1]

        # NOTE(review): eval() executes text taken from the workbook headers; replace it
        # with float() if the part after "=" is always a plain number.
        df_population.columns = [r"$\epsilon$={:.2f}".format(eval(col.split("=")[1])) for col in df_population.columns]

        # Each curve is drawn exactly once. Column 0 used to be drawn twice, which gave
        # its dashed curves different colours from the other columns.
        base1 = ax.plot(t,base_models.iloc[:,0],linewidth=3,c="b")
        base2 = ax.plot(t,base_models.iloc[:,1],linewidth=3,c="r")
        perturb = ax.plot(t,df_population,'--',linewidth=3)

        if j == 0:
            ax.set_ylabel(f"{key_dataset}\n S(t)" ,fontsize=SMALL_SIZE//1.5)
        ax.set_xlabel("t",fontsize=SMALL_SIZE//1.5)

# Column titles: display name of each algorithm.
for ax, col in zip(axes[0], algorithms):
    ax.set_title(rename_dict.get(col, col),fontsize=SMALL_SIZE//1.5)

labels = base_models.columns.tolist() + df_population.columns.tolist()

# ax.plot returns a list of lines, so the three lists are joined into one flat handle
# list. Handles and labels are both passed by keyword: positional handles combined with
# `labels=` are not a supported call form for legend().
handles = base1 + base2 + perturb
fig.legend(handles=handles, labels=labels,
           loc="upper center",ncols=5,bbox_to_anchor=(.5,1.06),prop={'size':SMALL_SIZE//1.7})

plt.tight_layout(pad=0)
plt.savefig(save_folder,dpi=1600,bbox_inches="tight")
plt.show()

### DIST PLOTS

In [None]:
# Collect the distribution-curve workbooks:
#   results/<attack>/<algorithm folder>/<dataset folder>/curve_distributions_test.xlsx
# NOTE(review): the metric tables above read from results/attack_<attack>; confirm that
# results/<attack> is the intended folder here.
# os.path.join replaces the hard-coded backslash so the path also works off Windows.
results_folder = os.path.join("results", attack)
img_name = "curve_distributions_test"

excels = []
for folder in os.listdir(results_folder):
    glob_search = os.path.join(results_folder,folder,"*",f"{img_name}.xlsx")
    excels.extend(glob.glob(glob_search))

# Drop every path that mentions an excluded dataset. A new list is built because
# removing items from a list while iterating over it skips the item after each removal.
kept_excels = []
for exceli in excels:
    matched = [dataset for dataset in exclude_datasets if dataset in exceli]
    if matched:
        print("remove ",matched[0])
    else:
        kept_excels.append(exceli)
excels = kept_excels

In [None]:
excels = np.sort(excels)
# Sorted paths are grouped by algorithm folder, so a column-major reshape gives one row
# per dataset and one column per algorithm folder, in alphabetical order.
# The width comes from the column names: `len(algorithms)-1` is 5 for the six-entry
# `algorithms` list and does not match the four names given here.
# NOTE(review): assumes only these four algorithm folders contain the workbook and that
# each covers the same datasets — confirm against the results folder.
dist_columns = ["crownibp","fgsm","noise","pgd"]
files = pd.DataFrame(excels.reshape(-1,len(dist_columns),order="F"),columns=dist_columns).reindex(["noise","fgsm","pgd","crownibp"],axis=1).values

In [None]:
# Output path of the distribution-curve figure. `results_folder` is left alone: the
# chained assignment used to overwrite it with this PDF path, so re-running the cell
# that lists `results_folder` would fail.
save_folder = os.path.join(r"results",attack,"dist_curves.pdf")

In [None]:
import seaborn as sns

In [None]:
# Distribution-curve figure: one row per dataset. Column 0 shows the baseline curves
# and the following columns show the robust curves of each file in `files`.

SMALL_SIZE = 80
# rc settings are applied before the axes are created so the tick sizes take effect
# on the first run.
plt.rc('xtick', labelsize=SMALL_SIZE//2)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE//2)
plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=SMALL_SIZE)     # fontsize of the x and y labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=SMALL_SIZE)   # fontsize of the figure title

# The grid is sized from `files` itself. `n_rows`/`n_cols` describe the
# perturbation-curve figure and do not account for excluded datasets.
n_dist_rows, n_dist_cols = files.shape
fig,axes = plt.subplots(n_dist_rows,n_dist_cols+1,figsize=(30,64),sharey=True,squeeze=False)

curve_styles = [('Average S(t)','b'),('Confidence','r'),('Confidence','r')]

for i in range(n_dist_rows):
    for j in range(n_dist_cols):
        fileij = files[i][j]
        # The dataset is the folder holding the workbook; split on either path separator.
        dataset = re.split(r"[\\/]", fileij)[-2]

        df_population = pd.read_excel(fileij)
        t = df_population.pop("t")

        robust_df = df_population.iloc[:,["robust" in col for col in df_population.columns]]
        baseline_df = df_population.iloc[:,["baseline" in col for col in df_population.columns]]

        # Robust curves go in column j+1; the baseline curves are drawn once per row, in column 0.
        panels = [(axes[i][j+1], robust_df)]
        if j == 0:
            panels.append((axes[i][0], baseline_df))

        for ax, curves in panels:
            # Columns 0, 1, 2 of `curves` are drawn as mean (blue) and the two bounds (red).
            for k, (label, colour) in enumerate(curve_styles):
                sns.lineplot(x=t, y=curves.iloc[:,k], label=label, linewidth=3.0, ax=ax,c=colour,legend=False)
            mean_line, q95_line, q05_line = ax.get_lines()[:3]
            # Shade the area between the two bound curves.
            band = ax.fill_between(mean_line.get_xdata(), q95_line.get_ydata(), q05_line.get_ydata(), color='blue', alpha=.3)
            ax.set_xlabel("t",fontsize=SMALL_SIZE//1.5)

        if j == 0:
            axes[i][0].set_ylabel(f"{dataset}\n S(t)" ,fontsize=SMALL_SIZE//1.5)

# Column 0 holds the baseline; the rest follow the column order of `files`
# (noise, fgsm, pgd, crownibp). Zipping against `algorithms` mislabelled the panels.
column_titles = ["DRAFT","Noise","FGSM","PGD","SAWAR"]
for ax, col in zip(axes[0], column_titles):
    ax.set_title(col,fontsize=SMALL_SIZE//1.5)

labels = ["S(t)","Credible Interval","$Q_{95},Q_{05}$"]
# Legend entries need artists (line, shaded band, line), not the Axes objects that
# sns.lineplot returns; handles and labels are both passed by keyword.
fig.legend(handles=[mean_line, band, q95_line], labels=labels,
           loc="upper center",ncols=4,bbox_to_anchor=(.5,1.05),prop={'size':SMALL_SIZE})

plt.tight_layout(pad=0)
plt.savefig(save_folder,dpi=1600,bbox_inches="tight")
plt.show()

In [None]:
# Distribution-curve figure for three selected rows of `files`.
# Only `save_folder` is set; the chained assignment used to overwrite `results_folder`
# with this PDF path.
save_folder = os.path.join(r"results",attack,"dist_curves_subset.pdf")
file_subset = files[[-7,-2,-1],:]

SMALL_SIZE = 20
# rc settings are applied before the axes are created so they take effect on the first run.
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)

# The grid is sized from the selected files rather than from `n_cols`, which
# describes the perturbation-curve figure.
n_subset_rows, n_subset_cols = file_subset.shape
fig,axes = plt.subplots(n_subset_rows,n_subset_cols+1,figsize=(30,20),sharey=True,squeeze=False)

curve_styles = [('Average S(t)','b'),('Confidence','r'),('Confidence','r')]

for i in range(n_subset_rows):
    for j in range(n_subset_cols):
        fileij = file_subset[i][j]
        # The dataset is the folder holding the workbook; split on either path separator.
        dataset = re.split(r"[\\/]", fileij)[-2]

        df_population = pd.read_excel(fileij)
        t = df_population.pop("t")

        robust_df = df_population.iloc[:,["robust" in col for col in df_population.columns]]
        baseline_df = df_population.iloc[:,["baseline" in col for col in df_population.columns]]

        # Robust curves go in column j+1; the baseline curves are drawn once per row, in column 0.
        panels = [(axes[i][j+1], robust_df)]
        if j == 0:
            panels.append((axes[i][0], baseline_df))

        for ax, curves in panels:
            # Columns 0, 1, 2 of `curves` are drawn as mean (blue) and the two bounds (red).
            for k, (label, colour) in enumerate(curve_styles):
                sns.lineplot(x=t, y=curves.iloc[:,k], label=label, linewidth=3.0, ax=ax,c=colour,legend=False)
            mean_line, q95_line, q05_line = ax.get_lines()[:3]
            # Shade the area between the two bound curves.
            band = ax.fill_between(mean_line.get_xdata(), q95_line.get_ydata(), q05_line.get_ydata(), color='blue', alpha=.3)
            ax.set_xlabel("t",fontsize=20)

        if j == 0:
            axes[i][0].set_ylabel(f"S(t) {dataset}" ,fontsize=30)

# Column 0 holds the baseline; the rest follow the column order of `files`
# (noise, fgsm, pgd, crownibp). Zipping against `algorithms` mislabelled the panels.
column_titles = ["DRAFT","Noise","FGSM","PGD","SAWAR"]
for ax, col in zip(axes[0], column_titles):
    ax.set_title(col,fontsize=30)

labels = ["S(t)","Credible Interval","$Q_{95},Q_{05}$"]
# Legend entries need artists (line, shaded band, line), not the Axes objects that
# sns.lineplot returns; handles and labels are both passed by keyword.
fig.legend(handles=[mean_line, band, q95_line], labels=labels,
           loc="upper center",ncols=4,fontsize=30,bbox_to_anchor=(.5,1.05))

plt.tight_layout()
plt.savefig(save_folder,dpi=1600,bbox_inches="tight")
plt.show()