In [1]:
from google.colab import drive
import os

# Mount Google Drive inside the Colab VM, then switch the working directory to the
# project folder so the relative result-folder names used by later cells resolve.
drive.mount('/content/drive')
# NOTE(review): hard-coded, user-specific Drive location — adjust when running elsewhere.
active_directory = '/content/drive/MyDrive/Desktop/DP_Finetuning_Harnet_Submission'
os.chdir(active_directory)

Mounted at /content/drive


## IMPORT LIBRARIES

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import json
import pickle
from warnings import filterwarnings
from pandas.errors import SettingWithCopyWarning

# Silence pandas' SettingWithCopyWarning for the whole notebook.
filterwarnings("ignore", category=SettingWithCopyWarning)
# NOTE(review): this hides every UserWarning raised by any library, not just known-noisy ones.
filterwarnings('ignore', category=UserWarning)

In [3]:
def fold_plot(fold_train_accuracies, fold_val_accuracies, val_subjects):
  """Draw one train-vs-validation accuracy panel per fold on a two-column grid.

  The three arguments are zipped together, so entry ``k`` of each describes
  fold ``k``: the training accuracy curve, the validation accuracy curve, and
  the validation subjects shown in the panel title.

  Returns the matplotlib figure; grid cells without a fold are switched off.
  """
  import matplotlib.pyplot as plt

  folds = list(zip(fold_train_accuracies, fold_val_accuracies, val_subjects))
  num_folds = len(folds)

  # Two panels per row; ceiling division gives the number of rows needed.
  ncols = 2
  nrows = -(-num_folds // ncols)

  fig, axes = plt.subplots(nrows, ncols, figsize=(14, 5 * nrows))
  fig.suptitle('Training vs. Validation Accuracy Across Folds', fontsize=16, y=1.02)
  axes = axes.flatten()

  # The grid always has at least as many axes as folds, so zip() visits every fold.
  for fold_no, (ax, (train_curve, val_curve, subjects)) in enumerate(zip(axes, folds), start=1):
      ax.plot(train_curve, label='Train Accuracy', color='royalblue')
      ax.plot(val_curve, label='Validation Accuracy', color='darkorange')
      ax.set_title(f'Fold: {fold_no} for  Validation Subjects {subjects}')
      ax.set_xlabel('Epoch')
      ax.set_ylabel('Accuracy')
      ax.legend()
      ax.grid(True, linestyle='--', alpha=0.6)

  # With an odd number of folds the last grid cell has nothing to show.
  for spare_ax in axes[num_folds:]:
      spare_ax.axis('off')

  # Leave a little room at the top for the figure title.
  fig.tight_layout(rect=[0, 0, 1, 0.98])

  return fig


def plot_epochs(train_accs, val_accs, val_subj):
  """Plot training and validation accuracy per epoch for a single run.

  Epochs are numbered from 1 to ``len(train_accs)`` and every epoch gets its
  own x tick. ``val_subj`` is only used in the title. Returns the figure.
  """
  import matplotlib.pyplot as plt

  epochs = np.arange(1, len(train_accs) + 1)
  fig, ax = plt.subplots()

  curves = (
      (train_accs, 'Train Accuracy', 'royalblue'),
      (val_accs, 'Validation Accuracy', 'darkorange'),
  )
  for values, label, color in curves:
      ax.plot(epochs, values, label=label, color=color)

  ax.set_title(f'Training vs. Validation Accuracy for Validation Subjects {val_subj}')
  ax.set_xlabel('Epoch')
  ax.set_ylabel('Accuracy')
  ax.set_xticks(epochs)
  ax.legend()
  ax.grid(True, linestyle='--', alpha=0.6)
  return fig


def remove_module_prefix(state_dict):
    """Return a copy of ``state_dict`` whose keys have ``_module`` wrapper levels removed.

    A key is treated as a dotted path. Every path component that is exactly
    ``_module`` and is followed by another component is dropped, so
    ``"_module.fc.weight"`` becomes ``"fc.weight"``. Components that merely end
    in ``_module`` (``"attention_module.proj"``) are left untouched; the previous
    substring replacement turned those into ``"attentionproj"``.

    NOTE(review): the ``_module.`` level is presumably added by a model wrapper
    such as Opacus' GradSampleModule — confirm against the training code.

    Parameters
    ----------
    state_dict : mapping of str to any
        Parameter names mapped to their values; it is not modified.

    Returns
    -------
    dict
        New dictionary with cleaned keys and the same value objects.
    """
    wrapper = "_module"

    def _clean(key):
        # The last component is always kept: the old ``replace("_module.", "")``
        # only ever removed a ``_module`` that was followed by a dot.
        *parents, leaf = key.split(".")
        return ".".join([part for part in parents if part != wrapper] + [leaf])

    return {_clean(k): v for k, v in state_dict.items()}


def conversion_of_table(series_df:pd.Series, experiment_name:str):
  """Turn a Series with two-part index keys into a one-row table tagged with an experiment.

  The first two parts of every index key are joined with ``-`` to form the
  column names, the Series values become the single data row, and an
  ``Experiment`` column holding ``experiment_name`` is appended. The input
  Series is copied, not modified.
  """
  flat = series_df.copy()
  flat.index = [key[0] + '-' + key[1] for key in flat.index]

  # reset_index() moves the labels into the first column; after transposing they
  # form the first row, which is promoted to the header and then dropped.
  table = flat.reset_index().T
  header_row = table.iloc[0]
  table.columns = header_row
  table = table[1:].reset_index(drop=True)

  table["Experiment"] = experiment_name
  return table

def find_defined_params(value:str):
  """
  Extract the underscore-separated numeric settings that precede an 8-digit token in ``value``.

  The four-field form is tried first (ordering: noise scale, window length in sec,
  sample rate in Hz, overlap in %), then the three-field form without a noise scale
  (like the CH case). Returns the fields as a list of strings, or None when neither
  form is found.
  """
  import re

  # Order matters: the four-field pattern must get the first chance to match.
  candidate_patterns = (
      r'([0-9.]+_[0-9]+_[0-9]+_[0-9.]+)_\d{8}',
      r'([0-9.]+_[0-9]+_[0-9.]+)_\d{8}',
  )

  for pattern in candidate_patterns:
      hit = re.search(pattern, value)
      if hit is not None:
          return hit.group(1).split('_')

  return None


## NON-PRIVATE BASELINE RESULTS (NOISE SCALE 0.0, NO DP)

In [4]:
# Folder (relative to the working directory) holding the non-private run outputs.
file_dir = 'downsampled_results_noDP'
file_names = os.listdir(file_dir)

all_file_paths = [os.path.join(file_dir, file_name) for file_name in file_names]

# Classify on the base name rather than on f.split('/')[1], so the cell does not
# depend on the path separator or on how deep file_dir is.
fold_file_paths = [f for f in all_file_paths if os.path.basename(f).startswith('fold')]
summary_file_paths = [f for f in all_file_paths if os.path.basename(f).startswith('summary')]

# Pair each fold-details file with the summary file of the same run. The run key is
# the file name with its literal prefix removed. str.lstrip() is deliberately not
# used: it strips a *set of characters*, so it would also eat the start of a run
# key that happens to begin with one of those characters.
fold_prefix, summary_prefix = 'fold_details_', 'summary_df_'

summary_by_key = {}
for s in summary_file_paths:
  s_name = os.path.basename(s)
  s_key = s_name[len(summary_prefix):] if s_name.startswith(summary_prefix) else s_name
  summary_by_key[s_key] = s

map_file_list = []
for f in fold_file_paths:
  f_name = os.path.basename(f)
  f_key = f_name[len(fold_prefix):] if f_name.startswith(fold_prefix) else f_name
  if f_key in summary_by_key:
    map_file_list.append((f, summary_by_key[f_key]))

if not map_file_list:
  raise ValueError(f"No matching fold/summary file pairs found in '{file_dir}'")

# Per-fold statistics copied from the summary file.
stat_cols = ['Test_Subjects', 'Test_Losses', 'Best_Val_Losses', 'Test_Accuracies', 'Best_Val_Accuracies', 'Test_F1s', 'Best_Val_F1s', 'Best_Train_Losses', 'Best_Train_Accuracies', 'Best_Val_Epochs']

fine_tune_strategy = None

all_final_dfs = []

for f, s in map_file_list:
  # The fine-tuning strategy is encoded in the file-name suffix; both files must agree.
  f_stem, s_stem = f.split('.csv')[0], s.split('.csv')[0]
  if f_stem.endswith('all_layers') and s_stem.endswith('all_layers'):
    fine_tune_strategy = 'Full Model'
  elif f_stem.endswith('4-2-2') and s_stem.endswith('4-2-2'):
    fine_tune_strategy = 'Classifier Head'
  else:
    raise ValueError('Invalid file name')

  summary_df = pd.read_csv(s)
  fold_details_df = pd.read_csv(f)

  # Non-private file names carry three settings: window length, sample rate, overlap.
  params = find_defined_params(f)
  if params is None or len(params) != 3:
    raise ValueError(f"Could not read window length, sample rate and overlap from '{f}'")
  w_length, s_rate, o_lap = params

  final_dfs = []
  for i in range(len(fold_details_df)):
    # Row i of the summary file is taken to describe the same fold as row i of the
    # fold-details file.
    stats_s = summary_df[stat_cols].loc[i]
    best_val_epoch = stats_s['Best_Val_Epochs']
    # Reshape the row (a Series) into a one-row frame whose columns are the stat names.
    stats_sT = stats_s.reset_index().T
    stats_sT.columns = stats_sT.iloc[0]
    stats_sT = stats_sT[1:].reset_index(drop=True)

    # NOTE(review): eval() executes text read from the CSV. Emptying __builtins__ does
    # not make this safe against a crafted file — only run it on results you produced.
    safe_dict = eval(fold_details_df['Epoch_Results'][i], {"array": np.array, "np": np, "__builtins__": {}})

    # 'val_subjects' is handled separately: it is joined into one label for the fold
    # instead of being used as a per-epoch column.
    new_safe_dict = {k: v for k, v in safe_dict.items() if k != 'val_subjects'}
    fold_df = pd.DataFrame(new_safe_dict)
    fold_df['val_subjects'] = '-'.join(safe_dict['val_subjects'])

    # Keep only the epoch that achieved the best validation result.
    eps_df = fold_df[fold_df['epoch'] == best_val_epoch][['val_subjects']].reset_index(drop=True)

    fin_df = pd.concat([stats_sT, eps_df], axis=1)
    final_dfs.append(fin_df)

  final_df = pd.concat(final_dfs, axis=0).reset_index(drop=True)
  final_df['Fine_Tune_Strategy'] = fine_tune_strategy

  # A run must use a single batch size and a single learning rate across its folds.
  for hp_col, hp_label in (("Batch_Size", "Batch size"), ("Learning_Rate", "Learning rate")):
    if fold_details_df[hp_col].nunique() != 1:
      raise ValueError(f"{hp_label} is not unique")
    final_df[hp_col] = fold_details_df[hp_col].unique()[0]

  final_df["window_length"] = float(w_length)
  final_df["sample_rate"] = float(s_rate)
  final_df["overlap"] = float(o_lap)
  final_df["Experiment"] = "Non-private baseline"

  all_final_dfs.append(final_df)

all_final_df = pd.concat(all_final_dfs, axis=0).reset_index(drop=True)

# Average performances across the test sets
agg_0_df_ft_sb = all_final_df.groupby(['Experiment', 'Fine_Tune_Strategy', 'Test_Subjects', 'Batch_Size', 'Learning_Rate', 'window_length', 'sample_rate', 'overlap'])[['Test_Accuracies', 'Best_Val_Accuracies', 'Test_F1s', 'Best_Val_F1s', 'Best_Val_Epochs']].aggregate(['mean', 'std'])


# Average performances according to fine-tune strategies
agg_df_fts = agg_0_df_ft_sb.reset_index()
# Flatten the (metric, statistic) column pairs into names such as 'Test_F1s_mean'.
agg_df_fts.columns = [c[0] if c[1] == '' else c[0] + '_' + c[1] for c in agg_df_fts.columns]
desc_cols = [col for col in agg_df_fts.columns if col.endswith(('mean', 'std'))]
gr_agg_df_fts = agg_df_fts.groupby(['Experiment', 'Fine_Tune_Strategy', 'Batch_Size', 'Learning_Rate', 'window_length', 'sample_rate', 'overlap'])[desc_cols].aggregate(['mean'])
gr_agg_df_fts.columns = gr_agg_df_fts.columns.droplevel(1)



## FOR DP RESULTS BASED ON FINE-TUNING STRATEGIES (DP-SGD)

In [7]:
# Folder (relative to the working directory) holding the DP-SGD run outputs.
file_dir = 'downsampled_results_DP'
file_names = os.listdir(file_dir)

all_file_paths = [os.path.join(file_dir, file_name) for file_name in file_names]

# Classify on the base name rather than on f.split('/')[1], so the cell does not
# depend on the path separator or on how deep file_dir is.
fold_file_paths = [f for f in all_file_paths if os.path.basename(f).startswith('fold')]
summary_file_paths = [f for f in all_file_paths if os.path.basename(f).startswith('summary')]

# Pair each fold-details file with the summary file of the same run. The run key is
# the file name with its literal prefix removed. str.lstrip() is deliberately not
# used: it strips a *set of characters*, so it would also eat the start of a run
# key that happens to begin with one of those characters.
fold_prefix, summary_prefix = 'fold_details_', 'summary_df_'

summary_by_key = {}
for s in summary_file_paths:
  s_name = os.path.basename(s)
  s_key = s_name[len(summary_prefix):] if s_name.startswith(summary_prefix) else s_name
  summary_by_key[s_key] = s

map_file_list = []
for f in fold_file_paths:
  f_name = os.path.basename(f)
  f_key = f_name[len(fold_prefix):] if f_name.startswith(fold_prefix) else f_name
  if f_key in summary_by_key:
    map_file_list.append((f, summary_by_key[f_key]))

if not map_file_list:
  raise ValueError(f"No matching fold/summary file pairs found in '{file_dir}'")

# Per-fold statistics copied from the summary file.
stat_cols = ['Test_Subjects', 'Test_Losses', 'Best_Val_Losses', 'Test_Accuracies', 'Best_Val_Accuracies', 'Test_F1s', 'Best_Val_F1s', 'Best_Train_Losses', 'Best_Train_Accuracies', 'Best_Val_Epochs']

fine_tune_strategy = None

all_final_dfs = []

for f, s in map_file_list:
  # The fine-tuning strategy is encoded in the file-name suffix; both files must agree.
  f_stem, s_stem = f.split('.csv')[0], s.split('.csv')[0]
  if f_stem.endswith('all_layers') and s_stem.endswith('all_layers'):
    fine_tune_strategy = 'Full Model'
  elif f_stem.endswith('4-2-2') and s_stem.endswith('4-2-2'):
    fine_tune_strategy = 'Classifier Head'
  else:
    raise ValueError('Invalid file name')

  summary_df = pd.read_csv(s)
  fold_details_df = pd.read_csv(f)

  # DP file names carry four settings: noise scale, window length, sample rate, overlap.
  params = find_defined_params(f)
  if params is None or len(params) != 4:
    raise ValueError(f"Could not read noise scale, window length, sample rate and overlap from '{f}'")
  ns, w_length, s_rate, o_lap = params

  final_dfs = []
  for i in range(len(fold_details_df)):
    # Row i of the summary file is taken to describe the same fold as row i of the
    # fold-details file.
    stats_s = summary_df[stat_cols].loc[i]
    best_val_epoch = stats_s['Best_Val_Epochs']
    # Reshape the row (a Series) into a one-row frame whose columns are the stat names.
    stats_sT = stats_s.reset_index().T
    stats_sT.columns = stats_sT.iloc[0]
    stats_sT = stats_sT[1:].reset_index(drop=True)

    # NOTE(review): eval() executes text read from the CSV. Emptying __builtins__ does
    # not make this safe against a crafted file — only run it on results you produced.
    safe_dict = eval(fold_details_df['Epoch_Results'][i], {"array": np.array, "np": np, "__builtins__": {}})

    # 'val_subjects' is handled separately: it is joined into one label for the fold
    # instead of being used as a per-epoch column.
    new_safe_dict = {k: v for k, v in safe_dict.items() if k != 'val_subjects'}
    fold_df = pd.DataFrame(new_safe_dict)
    fold_df['val_subjects'] = '-'.join(safe_dict['val_subjects'])

    # Keep only the epoch that achieved the best validation result, together with
    # the noise scale and the privacy budget recorded for that epoch.
    eps_df = fold_df[fold_df['epoch'] == best_val_epoch][['val_subjects', 'noise_scale', 'achieved_epsilon']].reset_index(drop=True)

    fin_df = pd.concat([stats_sT, eps_df], axis=1)
    final_dfs.append(fin_df)

  final_df = pd.concat(final_dfs, axis=0).reset_index(drop=True)
  final_df['Fine_Tune_Strategy'] = fine_tune_strategy

  # A run must use a single batch size and a single learning rate across its folds.
  for hp_col, hp_label in (("Batch_Size", "Batch size"), ("Learning_Rate", "Learning rate")):
    if fold_details_df[hp_col].nunique() != 1:
      raise ValueError(f"{hp_label} is not unique")
    final_df[hp_col] = fold_details_df[hp_col].unique()[0]

  final_df["window_length"] = float(w_length)
  final_df["sample_rate"] = float(s_rate)
  final_df["overlap"] = float(o_lap)
  final_df["Experiment"] = f"Gradient Perturbation (σ={ns})"

  all_final_dfs.append(final_df)

all_final_df = pd.concat(all_final_dfs, axis=0).reset_index(drop=True)

# Average performances across the test sets
dsc_cols = ['Test_Accuracies', 'Best_Val_Accuracies', 'Test_F1s', 'Best_Val_F1s', 'Best_Val_Epochs', 'achieved_epsilon']
agg_dp_df_ft_sb = all_final_df.groupby(['Experiment', 'Fine_Tune_Strategy', 'noise_scale', 'Test_Subjects', 'Batch_Size', 'Learning_Rate', 'window_length', 'sample_rate', 'overlap'])[dsc_cols].aggregate(['mean', 'std'])


# Average performances according to fine-tune strategies
agg_dp_df_fts = agg_dp_df_ft_sb.reset_index()
# Flatten the (metric, statistic) column pairs into names such as 'Test_F1s_mean'.
agg_dp_df_fts.columns = [c[0] if c[1] == '' else c[0] + '_' + c[1] for c in agg_dp_df_fts.columns]
desc_cols = [col for col in agg_dp_df_fts.columns if col.endswith(('mean', 'std'))]
gr_agg_dp_df_fts = agg_dp_df_fts.groupby(['Experiment', 'Fine_Tune_Strategy', 'noise_scale', 'Batch_Size', 'Learning_Rate', 'window_length', 'sample_rate', 'overlap'])[desc_cols].aggregate(['mean'])
gr_agg_dp_df_fts.columns = gr_agg_dp_df_fts.columns.droplevel(1)



## COMPARISON

In [9]:
# Stack the non-private and the DP summaries into one comparison table.
pb_df = pd.concat(
    [gr_agg_df_fts.reset_index(), gr_agg_dp_df_fts.reset_index()],
    axis=0,
    ignore_index=True,
)

# Every column whose name mentions F1 is rescaled by a factor of 100.
f1s_pb_cols = [col for col in pb_df.columns if 'F1' in col]
pb_df[f1s_pb_cols] = pb_df[f1s_pb_cols] * 100

# These columns only exist for the DP runs; the baseline rows get 0 instead of NaN.
dp_only_cols = ['noise_scale', 'achieved_epsilon_mean', 'achieved_epsilon_std']
pb_df[dp_only_cols] = pb_df[dp_only_cols].fillna(0)

In [11]:
#pb_df.to_excel('pb_df.xlsx', index=False)

## VISUALIZATION OF TEST SCORES FOR THE FINE-TUNED CLASSIFIER HEAD AND FULL MODEL

In [18]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.colors import qualitative

def get_symbol(method):
    """Return the Plotly marker symbol used for a fine-tuning strategy.

    'Classifier Head' is drawn with an x marker; every other strategy gets an
    open (unfilled) circle.
    """
    if method == 'Classifier Head':
        return 'x'
    return 'circle-open'

def plot_validation_utility_vs_epsilon_plotly(df):
    """Show test accuracy and test macro-F1 of the DP runs against the achieved ε.

    NOTE(review): despite its name this version reads the ``Test_*`` columns. A
    function with the same name that reads the ``Best_Val_*`` columns is defined
    further down the notebook and replaces this one once its cell has run.

    Parameters
    ----------
    df : pandas.DataFrame
        Aggregated results. The columns read are ``Experiment``,
        ``Fine_Tune_Strategy``, ``noise_scale``, ``achieved_epsilon_mean``,
        ``Test_Accuracies_mean``/``_std`` and ``Test_F1s_mean``/``_std``. Rows whose
        ``Experiment`` is ``'Non-private baseline'`` provide the horizontal
        reference lines; all other rows are plotted as points with error bars.

    Returns
    -------
    None
        The figure (two stacked panels sharing the x axis) is displayed with
        ``fig.show()``.
    """
    # --- Split private runs from the non-private baselines ---
    is_baseline = df['Experiment'] == 'Non-private baseline'
    private_df = df[~is_baseline].copy()

    # One (accuracy, F1) pair per strategy; (None, None) when the baseline is missing.
    baselines = {}
    for strategy in ('Classifier Head', 'Full Model'):
        rows = df[is_baseline & (df['Fine_Tune_Strategy'] == strategy)]
        if rows.empty:
            print(f"Warning: {strategy} non-private baseline not found.")
            baselines[strategy] = (None, None)
        else:
            baselines[strategy] = (
                rows['Test_Accuracies_mean'].iloc[0],
                rows['Test_F1s_mean'].iloc[0],
            )
    bch_acc, bch_f1 = baselines['Classifier Head']
    bfm_acc, bfm_f1 = baselines['Full Model']

    # One colour per noise scale, cycling through the palette if there are more
    # scales than colours.
    private_df['Noise_Scale'] = private_df['noise_scale']
    scales = sorted(private_df['Noise_Scale'].unique())
    palette = qualitative.Plotly
    color_map = {s: palette[i % len(palette)] for i, s in enumerate(scales)}

    # Both panels use the same x range: the ε span padded by 10% on each side.
    all_x = private_df['achieved_epsilon_mean']
    x_min, x_max = all_x.min(), all_x.max()
    x_pad = 0.1 * (x_max - x_min)
    x_range = [x_min - x_pad, x_max + x_pad]

    # Both panels also share one y range covering accuracy, F1 and every baseline.
    all_y_acc = private_df['Test_Accuracies_mean']
    all_y_f1 = private_df['Test_F1s_mean']
    y_min = min(all_y_acc.min(), all_y_f1.min())
    y_max = max(all_y_acc.max(), all_y_f1.max())
    for baseline in (bch_acc, bfm_acc, bch_f1, bfm_f1):
        if baseline is not None:
            y_min = min(y_min, baseline)
            y_max = max(y_max, baseline)

    # More padding below than above (20% vs 10%) — the point labels are shifted downwards.
    y_range = [y_min - 0.2 * (y_max - y_min), y_max + 0.1 * (y_max - y_min)]

    # 2×1 subplots (vertical layout)
    fig = make_subplots(
        rows=2, cols=1,
        shared_xaxes=True,
        subplot_titles=("Test Accuracy", "Test Macro F1"),
        vertical_spacing=0.1
    )

    panels = [
        ("Accuracy", 1, 'Test_Accuracies_mean', 'Test_Accuracies_std', (bch_acc, bfm_acc)),
        ("F1",       2, 'Test_F1s_mean',       'Test_F1s_std',       (bch_f1,  bfm_f1)),
    ]
    for metric, row, y_mean_col, y_std_col, baseline_val in panels:
        # Plotly names the first axis pair 'x'/'y' and the following ones 'x2'/'y2', ...
        axis_id = '' if row == 1 else str(row)

        # One trace per (strategy, noise scale) combination.
        for (method, scale), group in private_df.groupby(['Fine_Tune_Strategy', 'Noise_Scale']):
            x = group['achieved_epsilon_mean']
            y = group[y_mean_col]
            err = group[y_std_col]

            # Put the labels of the two strategies on opposite sides of the marker
            # so they do not overlap.
            if method == 'Classifier Head':
                xshift, xanchor = +5, 'left'
                yshift, yanchor = -15, 'bottom'
            else:
                xshift, xanchor = -5, 'right'
                yshift, yanchor = -10, 'bottom'

            fig.add_trace(
                go.Scatter(
                    x=x, y=y,
                    error_y=dict(type='data', array=err, thickness=1.5, width=4),
                    mode='markers+lines',
                    marker=dict(symbol=get_symbol(method), size=10, color=color_map[scale]),
                    line=dict(dash='solid', width=1),
                    # ':g' keeps fractional noise scales (0.5 -> "0.5") while still
                    # printing whole numbers without a trailing ".0"; int() would
                    # truncate 0.5 to 0.
                    name=f"{method} (σ={float(scale):g})",
                    showlegend=(row == 1)  # only show each legend entry once
                ),
                row=row, col=1
            )
            # Value label next to every point.
            for xi, yi in zip(x, y):
                fig.add_annotation(
                    x=xi, y=yi,
                    text=f"{yi:.1f}%",
                    showarrow=False,
                    bgcolor='rgba(255,255,255,0.8)',
                    borderwidth=1,
                    borderpad=3,
                    xshift=xshift,
                    yshift=yshift,
                    xanchor=xanchor,
                    yanchor=yanchor,
                    font=dict(size=14),
                    xref=f"x{axis_id}", yref=f"y{axis_id}"
                )

        # Baseline reference lines, each with an arrow label at the right edge of the panel.
        bch, bfm = baseline_val
        baseline_specs = [
            # (value, line dash, colour, label, label position, arrow tail y-offset)
            (bch, "dot",  "black", "Classifier Head (Non-private)", "top right",    -20),
            (bfm, "dash", "grey",  "Full Model (Non-private)",      "bottom right",  20),
        ]
        for value, dash, colour, label, position, arrow_dy in baseline_specs:
            if value is None:
                continue
            fig.add_hline(
                y=value,
                line_dash=dash,
                line_color=colour,
                annotation_text=label,
                annotation_position=position,
                annotation_font_size=14,
                annotation_font_color=colour,
                row=row, col=1
            )
            fig.add_annotation(
                xref=f"x{axis_id} domain",  # x is a fraction of the panel width
                yref=f"y{axis_id}",
                x=1,
                y=value,
                text=f"{value:.2f}%",
                showarrow=True,
                arrowhead=2,
                ax=40, ay=arrow_dy, arrowcolor=colour,
                font=dict(size=14, color=colour)
            )

        # Axis titles and grid with the standardized ranges.
        fig.update_xaxes(title_text="Privacy Budget ε", row=row, col=1, showgrid=True, range=x_range)
        y_title = "Accuracy (%)" if metric == "Accuracy" else "F1 Score (%)"
        fig.update_yaxes(title_text=y_title, row=row, col=1, showgrid=True, range=y_range)

    # Enlarge only the subplot titles; they are annotations too and are recognised
    # by their text.
    for annotation in fig['layout']['annotations']:
        if annotation['text'] in ['Test Accuracy', 'Test Macro F1']:
            annotation['font'] = dict(size=18)

    # Horizontal legend centred below the figure.
    fig.update_layout(
        height=1000, width=800,
        margin=dict(b=120),
        legend=dict(
            orientation="h",
            x=0.5,
            y=-0.1,
            xanchor="center",
            yanchor="top",
            font=dict(size=14)
        )
    )

    fig.show()

# Usage: draw the figure for the combined non-private / DP comparison table.
plot_validation_utility_vs_epsilon_plotly(pb_df)

In [None]:
# Non-private reference scores: test accuracy and test F1 for each fine-tuning strategy.
non_private_rows = pb_df[pb_df['Experiment'] == 'Non-private baseline']

baseline_classifier_head = non_private_rows[non_private_rows['Fine_Tune_Strategy'] == 'Classifier Head']
baseline_classifier_head_acc, baseline_classifier_head_f1 = (
    baseline_classifier_head[metric_col].iloc[0]
    for metric_col in ('Test_Accuracies_mean', 'Test_F1s_mean')
)


baseline_full_model = non_private_rows[non_private_rows['Fine_Tune_Strategy'] == 'Full Model']
baseline_full_model_acc, baseline_full_model_f1 = (
    baseline_full_model[metric_col].iloc[0]
    for metric_col in ('Test_Accuracies_mean', 'Test_F1s_mean')
)

## VALIDATION SCORES

In [19]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.colors import qualitative

def get_symbol(method):
    """Map a fine-tuning strategy name to its Plotly marker symbol.

    NOTE(review): an identical ``get_symbol`` is already defined in an earlier
    cell of this notebook; this definition replaces it.
    """
    # x marker for the classifier head, unfilled circle for anything else
    is_head = method == 'Classifier Head'
    return 'x' if is_head else 'circle-open'

def plot_validation_utility_vs_epsilon_plotly(df):
    """Show best validation accuracy and macro-F1 of the DP runs against the achieved ε.

    NOTE(review): a function with this same name is defined in an earlier cell
    (it plots the ``Test_*`` columns); this definition replaces it.

    Parameters
    ----------
    df : pandas.DataFrame
        Aggregated results. The columns read are ``Experiment``,
        ``Fine_Tune_Strategy``, ``noise_scale``, ``achieved_epsilon_mean``,
        ``Best_Val_Accuracies_mean``/``_std`` and ``Best_Val_F1s_mean``/``_std``.
        Rows whose ``Experiment`` is ``'Non-private baseline'`` provide the
        horizontal reference lines; all other rows are plotted as points with
        error bars.

    Returns
    -------
    None
        The figure (two side-by-side panels) is displayed with ``fig.show()``.
    """
    # --- Split private runs from the non-private baselines ---
    is_baseline = df['Experiment'] == 'Non-private baseline'
    private_df = df[~is_baseline].copy()

    # One (accuracy, F1) pair per strategy; (None, None) when the baseline is missing.
    baselines = {}
    for strategy in ('Classifier Head', 'Full Model'):
        rows = df[is_baseline & (df['Fine_Tune_Strategy'] == strategy)]
        if rows.empty:
            print(f"Warning: {strategy} non-private baseline not found.")
            baselines[strategy] = (None, None)
        else:
            baselines[strategy] = (
                rows['Best_Val_Accuracies_mean'].iloc[0],
                rows['Best_Val_F1s_mean'].iloc[0],
            )
    bch_acc, bch_f1 = baselines['Classifier Head']
    bfm_acc, bfm_f1 = baselines['Full Model']

    # One colour per noise scale, cycling through the palette if there are more
    # scales than colours.
    private_df['Noise_Scale'] = private_df['noise_scale']
    scales = sorted(private_df['Noise_Scale'].unique())
    palette = qualitative.Plotly
    color_map = {s: palette[i % len(palette)] for i, s in enumerate(scales)}

    # 1×2 subplots (side by side)
    fig = make_subplots(
        rows=1, cols=2,
        shared_xaxes=True,
        subplot_titles=("Validation Accuracy", "Validation Macro F1"),
        horizontal_spacing=0.08
    )

    panels = [
        ("Accuracy", 1, 'Best_Val_Accuracies_mean', 'Best_Val_Accuracies_std', (bch_acc, bfm_acc)),
        ("F1",       2, 'Best_Val_F1s_mean',       'Best_Val_F1s_std',       (bch_f1,  bfm_f1)),
    ]
    for metric, col, y_mean_col, y_std_col, baseline_val in panels:
        # Plotly names the first axis pair 'x'/'y' and the following ones 'x2'/'y2', ...
        axis_id = '' if col == 1 else str(col)

        # One trace per (strategy, noise scale) combination.
        for (method, scale), group in private_df.groupby(['Fine_Tune_Strategy', 'Noise_Scale']):
            x = group['achieved_epsilon_mean']
            y = group[y_mean_col]
            err = group[y_std_col]

            # Put the labels of the two strategies on opposite sides of the marker
            # so they do not overlap.
            if method == 'Classifier Head':
                xshift, xanchor = +5, 'left'
            else:
                xshift, xanchor = -5, 'right'
            yshift, yanchor = -10, 'bottom'

            fig.add_trace(
                go.Scatter(
                    x=x, y=y,
                    error_y=dict(type='data', array=err, thickness=1.5, width=4),
                    mode='markers+lines',
                    marker=dict(symbol=get_symbol(method), size=10, color=color_map[scale]),
                    line=dict(dash='solid', width=1),
                    # ':g' keeps fractional noise scales (0.5 -> "0.5") while still
                    # printing whole numbers without a trailing ".0"; int() would
                    # truncate 0.5 to 0.
                    name=f"{method} (σ={float(scale):g})",
                    showlegend=(col == 1)  # only show each legend entry once
                ),
                row=1, col=col
            )
            # Value label next to every point.
            for xi, yi in zip(x, y):
                fig.add_annotation(
                    x=xi, y=yi,
                    text=f"{yi:.1f}%",
                    showarrow=False,
                    bgcolor='rgba(255,255,255,0.8)',
                    borderwidth=1,
                    borderpad=3,
                    xshift=xshift,
                    yshift=yshift,
                    xanchor=xanchor,
                    yanchor=yanchor,
                    font=dict(size=14),
                    xref=f"x{col}", yref=f"y{col}"
                )

        # Baseline reference lines, each with an arrow label at the right edge of the panel.
        bch, bfm = baseline_val
        baseline_specs = [
            # (value, line dash, colour, label, label position, arrow tail y-offset)
            (bch, "dot",  "black", "Classifier Head (Non-private)", "bottom right",  20),
            (bfm, "dash", "grey",  "Full Model (Non-private)",      "top right",    -20),
        ]
        for value, dash, colour, label, position, arrow_dy in baseline_specs:
            if value is None:
                continue
            fig.add_hline(
                y=value,
                line_dash=dash,
                line_color=colour,
                annotation_text=label,
                annotation_position=position,
                annotation_font_size=14,
                annotation_font_color=colour,  # label colour matches the line
                row=1, col=col
            )
            fig.add_annotation(
                xref=f"x{axis_id} domain",  # x is a fraction of the panel width
                yref=f"y{axis_id}",
                x=1,                        # right edge of the panel
                y=value,                    # exactly at the baseline
                text=f"{value:.1f}%",
                showarrow=True,
                arrowhead=2,
                ax=40, ay=arrow_dy, arrowcolor=colour,
                font=dict(size=14, color=colour)
            )

        # Axis titles and grid.
        fig.update_xaxes(title_text="Privacy Budget ε", row=1, col=col, showgrid=True)
        y_title = "Accuracy (%)" if metric == "Accuracy" else "F1 Score (%)"
        fig.update_yaxes(title_text=y_title, row=1, col=col, showgrid=True)

    # Horizontal legend below the plots, right-aligned with the figure.
    fig.update_layout(
        height=600, width=1500,
        margin=dict(b=120),    # extra space at the bottom for the legend
        legend=dict(
            orientation="h",
            x=1,               # legend's right edge sits at the far right
            y=-0.15,           # legend's top edge sits just below the plot area
            xanchor="right",
            yanchor="top",
            font=dict(size=14)
        )
    )

    fig.show()

# Usage: draw the validation-score figure for the combined comparison table.
plot_validation_utility_vs_epsilon_plotly(pb_df)


## THE RELATION BETWEEN NOISE SCALE AND EPSILON

In [None]:
import plotly.graph_objects as go
import pandas as pd

def plot_noise_vs_epsilon(df):
    """Plot the achieved privacy budget ε against the noise scale σ, one line per strategy.

    Parameters
    ----------
    df : pandas.DataFrame
        Aggregated results. The columns read are ``Experiment``,
        ``Fine_Tune_Strategy``, ``noise_scale`` and ``achieved_epsilon_mean``.
        Rows whose ``Experiment`` is ``'Non-private baseline'`` are ignored.

    Returns
    -------
    plotly.graph_objects.Figure
        The figure; the caller decides when to show it.

    Raises
    ------
    KeyError
        If a strategy other than 'Classifier Head' or 'Full Model' is present.
    """
    # Only the DP runs have a meaningful noise scale / ε pair.
    dp_df = df[df['Experiment'] != 'Non-private baseline'].copy()
    # Keep σ as a float: casting to int would truncate fractional noise scales
    # (0.5 -> 0, 1.5 -> 1) and place distinct runs on the same x position.
    dp_df['Noise_Scale'] = dp_df['noise_scale'].astype(float)
    dp_df['Fine-Tune Strategy'] = dp_df['Fine_Tune_Strategy']

    # Colour, line style and marker for each strategy.
    method_config = {
        'Classifier Head': {
            'color': '#1f77b4',  # Blue
            'line_dash': 'solid',
            'symbol': 'circle'
        },
        'Full Model': {
            'color': '#ff7f0e',  # Orange
            'line_dash': 'dash',
            'symbol': 'x'
        }
    }

    fig = go.Figure()

    # One line+marker trace per strategy, ordered by noise scale.
    for method, group in dp_df.groupby('Fine-Tune Strategy'):
        cfg = method_config[method]
        group = group.sort_values('Noise_Scale')

        fig.add_trace(
            go.Scatter(
                x=group['Noise_Scale'],
                y=group['achieved_epsilon_mean'],
                mode='lines+markers',
                name=method,
                line=dict(color=cfg['color'], width=2, dash=cfg['line_dash']),
                marker=dict(color=cfg['color'], size=8, symbol=cfg['symbol']),
                hovertemplate=(
                    f"<b>{method}</b><br>"
                    "Noise σ: %{x}<br>"
                    "ε: %{y:.3f}<extra></extra>"
                )
            )
        )

    # Per point: a dotted stem down to the x axis and a value label coloured like
    # its strategy.
    for _, row in dp_df.iterrows():
        method = row['Fine-Tune Strategy']
        ann_color = method_config[method]['color']
        x = row['Noise_Scale']
        y = row['achieved_epsilon_mean']

        fig.add_shape(
            type="line",
            x0=x, x1=x,
            y0=0, y1=y,
            line=dict(color="gray", width=1, dash="dot"),
            opacity=0.6
        )

        # Labels of the two strategies go on opposite sides of the marker so they
        # do not overlap.
        if method == 'Classifier Head':
            xshift, xanchor = +10, 'left'
        else:
            xshift, xanchor = -10, 'right'

        fig.add_annotation(
            x=x,
            y=y,
            text=f"{y:.2f}",
            showarrow=False,
            xshift=xshift,
            yshift=0,
            xanchor=xanchor,
            yanchor='bottom',
            font=dict(size=12, color=ann_color, family="Arial Black"),
            bgcolor='rgba(255,255,255,0.8)',
            bordercolor=ann_color,
            borderwidth=1,
            borderpad=3
        )

    # Show a tick at every noise scale that actually occurs, and nowhere else.
    unique_scales = sorted(dp_df['Noise_Scale'].unique())

    title_font = {'size': 16, 'family': 'Arial', 'color': '#2F2F2F'}
    # Settings shared by both axes.
    axis_style = {
        'tickfont': {'size': 12, 'family': 'Arial'},
        'showgrid': True,
        'gridwidth': 1,
        'gridcolor': 'rgba(128,128,128,0.3)',
        'showline': True,
        'linewidth': 2,
        'linecolor': '#2F2F2F',
        'mirror': False
    }

    fig.update_layout(
        title={
            'text': "Privacy Budget ε vs Noise Scale σ",
            'font': {'size': 20, 'family': 'Arial', 'color': '#2F2F2F'},
            'x': 0.5, 'xanchor': 'center',
            'pad': {'t': 20}
        },
        xaxis={
            'title': {'text': "Noise Scale σ", 'font': title_font},
            'tickmode': 'array',
            'tickvals': unique_scales,
            # ':g' prints whole numbers without a trailing ".0" and keeps fractions.
            'ticktext': [f"{s:g}" for s in unique_scales],
            **axis_style
        },
        yaxis={
            'title': {'text': "Privacy Budget ε", 'font': title_font},
            'rangemode': 'tozero',
            **axis_style
        },
        width=1000,
        height=600,
        margin=dict(l=80, r=50, t=80, b=80),
        plot_bgcolor='white',
        paper_bgcolor='white',
        legend=dict(
            orientation='v',
            font=dict(size=14, family='Arial'),
            x=1, y=1.02, xanchor='right', yanchor='middle',
            bgcolor='rgba(255,255,255,0.8)',
            bordercolor='rgba(128,128,128,0.3)',
            borderwidth=1
        )
    )

    return fig

# Usage: build the ε-vs-σ figure from the comparison table and display it.
fig = plot_noise_vs_epsilon(pb_df)
fig.show()


## UTILITY LOSS

In [None]:
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np

def plot_accuracy_utility_loss_by_method(df):
    """
    Plot the relative test-accuracy loss of DP runs against their non-private baseline.

    Every row whose 'Experiment' differs from 'Non-private baseline' is compared with
    the baseline row that has the same 'Fine_Tune_Strategy':

        loss % = 100 * (baseline_accuracy - dp_accuracy) / baseline_accuracy

    One line per strategy is drawn against 'achieved_epsilon_mean', each point is
    annotated with its loss, and the y-axis is fixed to the range [0, 70].

    Args:
        df: DataFrame with the columns 'Experiment', 'Fine_Tune_Strategy',
            'Test_Accuracies_mean', 'achieved_epsilon_mean' and 'noise_scale'.

    Returns:
        The plotly figure, or None (after printing "No baseline found.") when df
        contains no 'Non-private baseline' row.

    Raises:
        ValueError: if a DP row uses a strategy that has no baseline row.
    """
    is_baseline = df['Experiment'] == 'Non-private baseline'
    baseline_df = df[is_baseline]
    if baseline_df.empty:
        print("No baseline found.")
        return

    # Baseline accuracy keyed by strategy. If a strategy has several baseline rows
    # the first one is used, which is what the former `.values[0]` lookup did.
    baseline_by_strategy = (
        baseline_df.drop_duplicates(subset='Fine_Tune_Strategy')
        .set_index('Fine_Tune_Strategy')['Test_Accuracies_mean']
    )

    dp_df = df[~is_baseline].copy().reset_index(drop=True)
    has_baseline = dp_df['Fine_Tune_Strategy'].isin(baseline_by_strategy.index)
    if not has_baseline.all():
        missing = sorted(dp_df.loc[~has_baseline, 'Fine_Tune_Strategy'].astype(str).unique())
        raise ValueError(f"No non-private baseline row for strategies: {missing}")

    # Vectorised per-strategy lookup instead of a row-by-row loop.
    baseline_acc = dp_df['Fine_Tune_Strategy'].map(baseline_by_strategy)
    dp_df['Accuracy_Loss_%'] = 100 * (baseline_acc - dp_df['Test_Accuracies_mean']) / baseline_acc
    dp_df = dp_df.rename(columns={'Fine_Tune_Strategy': 'Fine-Tune Strategy'})

    # Colors and markers per method; strategies not listed here use the fallback
    # style instead of raising KeyError.
    method_config = {
        'Classifier Head': {'color': '#2E86AB', 'symbol': 'circle', 'line_dash': 'solid'},
        'Full Model': {'color': '#F24236', 'symbol': 'square', 'line_dash': 'solid'},
    }
    fallback_config = {'color': '#6C757D', 'symbol': 'diamond', 'line_dash': 'solid'}

    fig = go.Figure()

    for method, group in dp_df.groupby('Fine-Tune Strategy'):
        group = group.sort_values('achieved_epsilon_mean')
        config = method_config.get(method, fallback_config)

        # The noise scale is formatted with :g so fractional values are not
        # truncated (1.0 -> "1", 0.5 -> "0.5").
        hover_text = [
            f"Method: {method}<br>"
            f"Privacy Budget (ε): {row['achieved_epsilon_mean']:.3f}<br>"
            f"Accuracy Utility Loss: {row['Accuracy_Loss_%']:.1f}%<br>"
            f"Test Accuracy Score: {row['Test_Accuracies_mean']:.3f}<br>"
            f"Noise Scale: {row['noise_scale']:g}"
            for _, row in group.iterrows()
        ]

        fig.add_trace(
            go.Scatter(
                x=group['achieved_epsilon_mean'],
                y=group['Accuracy_Loss_%'],
                mode='lines+markers',
                name=method,
                line=dict(color=config['color'], width=3, dash=config['line_dash']),
                marker=dict(
                    color=config['color'],
                    size=12,
                    symbol=config['symbol'],
                    line=dict(color='white', width=2)
                ),
                hovertemplate='%{hovertext}<extra></extra>',
                hovertext=hover_text,
                showlegend=True
            )
        )

        # Value labels: to the right of the point for 'Classifier Head', to the
        # left for every other method, so the two curves' labels do not collide.
        label_right = method == 'Classifier Head'
        for eps, loss in zip(group['achieved_epsilon_mean'], group['Accuracy_Loss_%']):
            fig.add_annotation(
                x=eps,
                y=loss,
                text=f"{loss:.1f}%",
                showarrow=False,
                yshift=0,
                xshift=10 if label_right else -10,
                xanchor='left' if label_right else 'right',
                font=dict(size=16, color=config['color'], family="Arial Black"),
                bgcolor='rgba(255,255,255,0.8)',
                bordercolor=config['color'],
                borderwidth=1,
                borderpad=3,
                yanchor='bottom'
            )

    # One x tick per achieved epsilon, labelled with two decimals.
    epsilon_values = sorted(dp_df['achieved_epsilon_mean'].unique())

    axis_title_font = {'size': 16, 'family': 'Arial', 'color': '#2F2F2F'}
    axis_frame = {
        'showgrid': True,
        'gridwidth': 1,
        'gridcolor': 'rgba(128,128,128,0.3)',
        'showline': True,
        'linewidth': 2,
        'linecolor': '#2F2F2F',
        'mirror': False,
    }

    fig.update_layout(
        title={
            'text': "Test Accuracy Utility Loss vs Privacy Budget",
            'font': {'size': 20, 'family': 'Arial', 'color': '#2F2F2F'},
            'x': 0.5,
            'xanchor': 'center',
            'pad': {'t': 20}
        },
        xaxis={
            'title': {'text': "Privacy Budget ε", 'font': axis_title_font},
            'tickmode': 'array',
            'tickvals': epsilon_values,
            'ticktext': [f"{e:.2f}" for e in epsilon_values],
            'tickangle': 270,
            'automargin': True,
            'tickfont': {'size': 12, 'family': 'Arial'},
            **axis_frame
        },
        yaxis={
            'title': {'text': "Accuracy Utility Loss (%)", 'font': axis_title_font},
            'tickfont': {'size': 14, 'family': 'Arial'},
            'range': [0, 70],
            **axis_frame
        },
        plot_bgcolor='white',
        paper_bgcolor='white',
        width=1000,
        height=600,
        margin=dict(l=80, r=50, t=80, b=80),
        legend={
            'orientation': 'v',
            'font': {'size': 14, 'family': 'Arial'},
            'x': 1,
            'y': 1.02,
            'yanchor': 'middle',
            'xanchor': 'right',
            'bgcolor': 'rgba(255,255,255,0.8)',
            'bordercolor': 'rgba(128,128,128,0.3)',
            'borderwidth': 1,
        },
        hovermode='closest'
    )

    # Horizontal reference line at y=0 (no utility loss).
    fig.add_hline(
        y=0,
        line_dash="dot",
        line_color="rgba(128,128,128,0.5)",
        line_width=1
    )

    # Optional: Save as high-quality images for thesis
    # fig.write_image("accuracy_utility_loss_plot.png", width=1200, height=800, scale=2)
    # fig.write_html("accuracy_utility_loss_plot.html")

    return fig



In [None]:
# The returned figure is the cell's last expression, so the notebook displays it.
plot_accuracy_utility_loss_by_method(pb_df)

In [None]:
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np

def plot_f1_utility_loss_by_method(df):
    """
    Plot the relative test-F1 loss of DP runs against their non-private baseline.

    Every row whose 'Experiment' differs from 'Non-private baseline' is compared with
    the baseline row that has the same 'Fine_Tune_Strategy':

        loss % = 100 * (baseline_f1 - dp_f1) / baseline_f1

    One line per strategy is drawn against 'achieved_epsilon_mean', each point is
    annotated with its loss, and the y-axis is fixed to the range [0, 70].

    Args:
        df: DataFrame with the columns 'Experiment', 'Fine_Tune_Strategy',
            'Test_F1s_mean', 'achieved_epsilon_mean' and 'noise_scale'.

    Returns:
        The plotly figure, or None (after printing "No baseline found.") when df
        contains no 'Non-private baseline' row.

    Raises:
        ValueError: if a DP row uses a strategy that has no baseline row.
    """
    is_baseline = df['Experiment'] == 'Non-private baseline'
    baseline_df = df[is_baseline]
    if baseline_df.empty:
        print("No baseline found.")
        return

    # Baseline F1 keyed by strategy. If a strategy has several baseline rows the
    # first one is used, which is what the former `.values[0]` lookup did.
    baseline_by_strategy = (
        baseline_df.drop_duplicates(subset='Fine_Tune_Strategy')
        .set_index('Fine_Tune_Strategy')['Test_F1s_mean']
    )

    dp_df = df[~is_baseline].copy().reset_index(drop=True)
    has_baseline = dp_df['Fine_Tune_Strategy'].isin(baseline_by_strategy.index)
    if not has_baseline.all():
        missing = sorted(dp_df.loc[~has_baseline, 'Fine_Tune_Strategy'].astype(str).unique())
        raise ValueError(f"No non-private baseline row for strategies: {missing}")

    # Vectorised per-strategy lookup instead of a row-by-row loop.
    baseline_f1 = dp_df['Fine_Tune_Strategy'].map(baseline_by_strategy)
    dp_df['F1_Loss_%'] = 100 * (baseline_f1 - dp_df['Test_F1s_mean']) / baseline_f1
    dp_df = dp_df.rename(columns={'Fine_Tune_Strategy': 'Fine-Tune Strategy'})

    # Colors and markers per method; strategies not listed here use the fallback
    # style instead of raising KeyError.
    method_config = {
        'Classifier Head': {'color': '#2E86AB', 'symbol': 'circle', 'line_dash': 'solid'},
        'Full Model': {'color': '#F24236', 'symbol': 'square', 'line_dash': 'solid'},
    }
    fallback_config = {'color': '#6C757D', 'symbol': 'diamond', 'line_dash': 'solid'}

    fig = go.Figure()

    for method, group in dp_df.groupby('Fine-Tune Strategy'):
        group = group.sort_values('achieved_epsilon_mean')
        config = method_config.get(method, fallback_config)

        # The noise scale is formatted with :g so fractional values are not
        # truncated (1.0 -> "1", 0.5 -> "0.5").
        hover_text = [
            f"Method: {method}<br>"
            f"Privacy Budget (ε): {row['achieved_epsilon_mean']:.3f}<br>"
            f"F1 Utility Loss: {row['F1_Loss_%']:.1f}%<br>"
            f"Test F1 Score: {row['Test_F1s_mean']:.3f}<br>"
            f"Noise Scale: {row['noise_scale']:g}"
            for _, row in group.iterrows()
        ]

        fig.add_trace(
            go.Scatter(
                x=group['achieved_epsilon_mean'],
                y=group['F1_Loss_%'],
                mode='lines+markers',
                name=method,
                line=dict(color=config['color'], width=3, dash=config['line_dash']),
                marker=dict(
                    color=config['color'],
                    size=12,
                    symbol=config['symbol'],
                    line=dict(color='white', width=2)
                ),
                hovertemplate='%{hovertext}<extra></extra>',
                hovertext=hover_text,
                showlegend=True
            )
        )

        # Value labels use opposite vertical offsets/anchors for 'Classifier Head'
        # and for every other method, so the two curves' labels do not collide.
        is_head = method == 'Classifier Head'
        for eps, loss in zip(group['achieved_epsilon_mean'], group['F1_Loss_%']):
            fig.add_annotation(
                x=eps,
                y=loss,
                text=f"{loss:.1f}%",
                showarrow=False,
                yshift=-15 if is_head else 15,
                font=dict(size=16, color=config['color'], family="Arial Black"),
                bgcolor='rgba(255,255,255,0.8)',
                bordercolor=config['color'],
                borderwidth=1,
                borderpad=3,
                yanchor='bottom' if is_head else 'top'
            )

    # One x tick per achieved epsilon, labelled with two decimals.
    epsilon_values = sorted(dp_df['achieved_epsilon_mean'].unique())

    axis_title_font = {'size': 16, 'family': 'Arial', 'color': '#2F2F2F'}
    axis_frame = {
        'showgrid': True,
        'gridwidth': 1,
        'gridcolor': 'rgba(128,128,128,0.3)',
        'showline': True,
        'linewidth': 2,
        'linecolor': '#2F2F2F',
        'mirror': False,
    }

    fig.update_layout(
        title={
            'text': "Test F1 Score Utility Loss vs Privacy Budget",
            'font': {'size': 20, 'family': 'Arial', 'color': '#2F2F2F'},
            'x': 0.5,
            'xanchor': 'center',
            'pad': {'t': 20}
        },
        xaxis={
            'title': {'text': "Privacy Budget ε", 'font': axis_title_font},
            'tickmode': 'array',
            'tickvals': epsilon_values,
            'ticktext': [f"{e:.2f}" for e in epsilon_values],
            'tickangle': 270,
            'automargin': True,
            'tickfont': {'size': 12, 'family': 'Arial'},
            **axis_frame
        },
        yaxis={
            'title': {'text': "F1 Utility Loss (%)", 'font': axis_title_font},
            'tickfont': {'size': 14, 'family': 'Arial'},
            'range': [0, 70],
            **axis_frame
        },
        plot_bgcolor='white',
        paper_bgcolor='white',
        width=1000,
        height=600,
        margin=dict(l=80, r=50, t=80, b=80),
        legend={
            'orientation': 'v',
            'font': {'size': 14, 'family': 'Arial'},
            'x': 1,
            'y': 1.02,
            'yanchor': 'middle',
            'xanchor': 'right',
            'bgcolor': 'rgba(255,255,255,0.8)',
            'bordercolor': 'rgba(128,128,128,0.3)',
            'borderwidth': 1,
        },
        hovermode='closest'
    )

    # Horizontal reference line at y=0 (no utility loss).
    fig.add_hline(
        y=0,
        line_dash="dot",
        line_color="rgba(128,128,128,0.5)",
        line_width=1
    )

    # Optional: Save as high-quality images for thesis
    # fig.write_image("f1_utility_loss_plot.png", width=1200, height=800, scale=2)
    # fig.write_html("f1_utility_loss_plot.html")

    return fig



In [None]:
# Build the F1 utility-loss figure from pb_df; as the cell's last expression
# the returned figure is displayed by the notebook.
plot_f1_utility_loss_by_method(pb_df)

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np

def plot_utility_loss_side_by_side(df):
    """
    Draw accuracy and F1 utility loss vs. privacy budget as two side-by-side panels.

    Each DP row (any 'Experiment' other than 'Non-private baseline') is compared
    with the baseline row of the SAME 'Fine_Tune_Strategy':

        loss % = 100 * (baseline - dp) / baseline

    for 'Test_Accuracies_mean' (left panel) and 'Test_F1s_mean' (right panel).
    Using a per-strategy baseline keeps this figure consistent with
    plot_accuracy_utility_loss_by_method / plot_f1_utility_loss_by_method; taking
    only the first baseline row would measure every strategy against one
    strategy's baseline.

    Args:
        df: DataFrame with the columns 'Experiment', 'Fine_Tune_Strategy',
            'Test_Accuracies_mean', 'Test_F1s_mean', 'achieved_epsilon_mean'
            and 'noise_scale'.

    Returns:
        The plotly figure, or None (after printing "No baseline found.") when df
        contains no 'Non-private baseline' row.

    Raises:
        ValueError: if a DP row uses a strategy that has no baseline row.
    """
    # --- 1) Per-strategy baselines and DP dataframe ---
    is_baseline = df['Experiment'] == 'Non-private baseline'
    baseline = df[is_baseline]
    if baseline.empty:
        print("No baseline found.")
        return

    # First baseline row per strategy, indexed by strategy name.
    baseline_by_strategy = (
        baseline.drop_duplicates(subset='Fine_Tune_Strategy')
        .set_index('Fine_Tune_Strategy')
    )

    dp_df = df[~is_baseline].copy()
    has_baseline = dp_df['Fine_Tune_Strategy'].isin(baseline_by_strategy.index)
    if not has_baseline.all():
        missing = sorted(dp_df.loc[~has_baseline, 'Fine_Tune_Strategy'].astype(str).unique())
        raise ValueError(f"No non-private baseline row for strategies: {missing}")

    baseline_acc = dp_df['Fine_Tune_Strategy'].map(baseline_by_strategy['Test_Accuracies_mean'])
    baseline_f1 = dp_df['Fine_Tune_Strategy'].map(baseline_by_strategy['Test_F1s_mean'])
    dp_df['Accuracy_Loss_%'] = 100 * (baseline_acc - dp_df['Test_Accuracies_mean']) / baseline_acc
    dp_df['F1_Loss_%'] = 100 * (baseline_f1 - dp_df['Test_F1s_mean']) / baseline_f1
    dp_df['Fine-Tune Strategy'] = dp_df['Fine_Tune_Strategy']

    # --- 2) Plot config per method (unknown strategies use the fallback) ---
    method_config = {
        'Classifier Head': dict(color='#2E86AB', symbol='circle', line_dash='solid'),
        'Full Model':       dict(color='#F24236', symbol='square', line_dash='solid')
    }
    fallback_config = dict(color='#6C757D', symbol='diamond', line_dash='solid')

    # --- 3) Create 1×2 subplots ---
    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=(
            "Test Accuracy Utility Loss vs Privacy Budget",
            "Test F1 Score Utility Loss vs Privacy Budget"
        ),
        shared_xaxes=True
    )

    # (subplot column, loss column, hover loss label, score column, hover score label)
    panels = (
        (1, 'Accuracy_Loss_%', 'Acc Loss', 'Test_Accuracies_mean', 'Test Acc'),
        (2, 'F1_Loss_%', 'F1 Loss', 'Test_F1s_mean', 'Test F1'),
    )

    # --- 4) Add traces & annotations for each method, in each panel ---
    for method, group in dp_df.groupby('Fine-Tune Strategy'):
        cfg = method_config.get(method, fallback_config)
        group = group.sort_values('achieved_epsilon_mean')
        is_head = method == 'Classifier Head'

        for col, loss_col, loss_label, score_col, score_label in panels:
            # Noise scale uses :g so fractional values are not truncated.
            hover = [
                f"Method: {method}<br>"
                f"ε: {row['achieved_epsilon_mean']:.3f}<br>"
                f"{loss_label}: {row[loss_col]:.1f}%<br>"
                f"{score_label}: {row[score_col]:.3f}<br>"
                f"Noise: {row['noise_scale']:g}"
                for _, row in group.iterrows()
            ]

            fig.add_trace(
                go.Scatter(
                    x=group['achieved_epsilon_mean'],
                    y=group[loss_col],
                    mode='lines+markers',
                    name=method,
                    line=dict(color=cfg['color'], width=3, dash=cfg['line_dash']),
                    marker=dict(symbol=cfg['symbol'], size=10, line=dict(width=2, color='white')),
                    hovertemplate='%{hovertext}<extra></extra>',
                    hovertext=hover,
                    showlegend=(col == 1)  # legend entry only once per method
                ),
                row=1, col=col
            )

            # Accuracy labels are shifted sideways, F1 labels vertically, with
            # opposite directions for 'Classifier Head' and the other methods.
            if col == 1:
                placement = dict(
                    xshift=10 if is_head else -10,
                    xanchor='left' if is_head else 'right'
                )
            else:
                placement = dict(yshift=-15 if is_head else 15)

            for eps, loss in zip(group['achieved_epsilon_mean'], group[loss_col]):
                fig.add_annotation(
                    x=eps, y=loss,
                    text=f"{loss:.1f}%",
                    showarrow=False,
                    font=dict(color=cfg['color'], family="Arial Black", size=11),
                    bgcolor="rgba(255,255,255,0.8)",
                    bordercolor=cfg['color'],
                    borderwidth=1,
                    row=1, col=col,
                    **placement
                )

    # --- 5) Shared epsilon ticks, axis titles, styling ---
    eps_vals = sorted(dp_df['achieved_epsilon_mean'].unique())
    # More decimals for smaller epsilons so neighbouring ticks stay distinguishable.
    tick_labels = [
        f"{e:.3f}" if e < 0.1 else (f"{e:.2f}" if e < 1 else f"{e:.1f}")
        for e in eps_vals
    ]

    for c in (1, 2):
        fig.update_xaxes(
            title_text="Privacy Budget ε",
            tickmode='array',
            tickvals=eps_vals,
            ticktext=tick_labels,
            tickangle=270,
            automargin=True,
            showgrid=True,
            gridcolor='rgba(128,128,128,0.3)',
            row=1, col=c
        )

    for c, y_title in ((1, "Accuracy Utility Loss (%)"), (2, "F1 Utility Loss (%)")):
        fig.update_yaxes(
            title_text=y_title,
            rangemode='tozero',
            showgrid=True,
            gridcolor='rgba(128,128,128,0.3)',
            row=1, col=c
        )

    # reference line y=0 across the whole figure
    fig.add_hline(y=0, line_dash="dot", line_color="rgba(128,128,128,0.5)", line_width=1)

    # overall layout
    fig.update_layout(
        width=1200, height=600,
        plot_bgcolor='white', paper_bgcolor='white',
        margin=dict(l=80, r=50, t=100, b=80),
        legend=dict(
            orientation='v', x=1.02, y=1,
            bordercolor='rgba(128,128,128,0.3)', borderwidth=1,
            bgcolor='rgba(255,255,255,0.8)',
            font=dict(size=14, family="Arial")
        ),
        hovermode='closest'
    )

    return fig


In [None]:
# Render the two-panel utility-loss figure. plot_utility_loss_side_by_side returns
# None when pb_df has no 'Non-private baseline' row; fig.show() would fail then.
fig = plot_utility_loss_side_by_side(pb_df)
fig.show()

In [None]:
pb_df

Unnamed: 0,Experiment,Fine_Tune_Strategy,Batch_Size,Learning_Rate,window_length,sample_rate,overlap,Test_Accuracies_mean,Test_Accuracies_std,Best_Val_Accuracies_mean,Best_Val_Accuracies_std,Test_F1s_mean,Test_F1s_std,Best_Val_F1s_mean,Best_Val_F1s_std,Best_Val_Epochs_mean,Best_Val_Epochs_std,noise_scale,achieved_epsilon_mean,achieved_epsilon_std
0,Non-private baseline,Classifier Head,32,0.001,10.0,30.0,50.0,73.503192,1.691858,74.926598,2.493415,70.986642,2.328764,72.837373,2.951468,14.583333,4.883283,0.0,0.0,0.0
1,Non-private baseline,Full Model,32,0.001,10.0,30.0,50.0,71.563451,6.602758,75.174231,5.454711,71.041675,6.430119,73.568855,5.493255,11.0,4.878454,0.0,0.0,0.0
2,Gradient Perturbation (σ=1.0),Classifier Head,32,0.001,10.0,30.0,50.0,62.334792,5.565727,61.783914,3.922921,58.182543,6.045221,57.48955,3.808732,15.5,4.803493,1.0,3.589667,0.625577
3,Gradient Perturbation (σ=1.0),Full Model,32,0.001,10.0,30.0,50.0,56.88392,6.560545,60.379261,9.152984,45.044123,7.08893,48.81507,9.304615,6.75,2.712395,1.0,2.41675,0.440078
4,Gradient Perturbation (σ=10.0),Classifier Head,32,0.001,10.0,30.0,50.0,33.245751,6.214186,32.52985,7.022338,27.377,3.936223,27.467152,6.376315,19.25,0.971688,10.0,0.225,0.006018
5,Gradient Perturbation (σ=10.0),Full Model,32,0.001,10.0,30.0,50.0,33.490441,5.283362,33.81234,7.687917,22.600931,5.094552,23.043753,7.040945,15.583333,3.7762,10.0,0.1995,0.026086
6,Gradient Perturbation (σ=5.0),Classifier Head,32,0.001,10.0,30.0,50.0,44.538263,2.737179,44.68076,5.132395,38.997956,1.657922,39.692593,3.937954,19.083333,1.221688,5.0,0.471833,0.017153
7,Gradient Perturbation (σ=5.0),Full Model,32,0.001,10.0,30.0,50.0,42.706462,4.232548,43.891286,3.615455,29.965763,4.429104,31.260912,3.571137,15.083333,4.344729,5.0,0.413333,0.068248


In [None]:
#pb_df.to_excel("pb_df.xlsx")

## Paired Sample T-Test

In [None]:
import pandas as pd

ch_summary_path = 'downsampled_results_DP/summary_df_DP_CH_NS_1.0_10_30_50.0_20250616_Split_4-2-2.csv'
# NOTE(review): the full-model path also contains "DP_CH" and differs from the
# classifier-head path only by the "_all_layers" suffix; confirm it is the intended file.
fm_summary_path = 'downsampled_results_DP/summary_df_DP_CH_NS_1.0_10_30_50.0_20250616_Split_4-2-2_all_layers.csv'

selected_cols = ['Test_Subjects', 'Test_Accuracies', 'Test_F1s']


def _load_fold_scores(summary_path, suffix, columns=tuple(selected_cols)):
    """Read a per-fold summary CSV and keep only `columns` (in file order), renamed with `suffix`."""
    summary_df = pd.read_csv(summary_path)
    kept = [col for col in summary_df.columns if col in columns]
    return summary_df[kept].add_suffix(suffix)


ch_1_summary_df = _load_fold_scores(ch_summary_path, '_ch')
ch_1_selected_cols = list(ch_1_summary_df.columns)

fm_1_summary_df = _load_fold_scores(fm_summary_path, '_fm')
fm_1_selected_cols = list(fm_1_summary_df.columns)

# The two frames are joined by row position and the full-model subject column is
# then dropped, so every row must describe the same test subjects in both files.
# Otherwise the scores would be silently paired (and later grouped) incorrectly.
if ch_1_summary_df['Test_Subjects_ch'].tolist() != fm_1_summary_df['Test_Subjects_fm'].tolist():
    raise ValueError(
        "Classifier-head and full-model summaries do not list the same test subjects "
        "row by row; the results cannot be paired by position."
    )

merge_ch_fm_df = (
    pd.concat([ch_1_summary_df, fm_1_summary_df], axis=1)
    .drop(columns=['Test_Subjects_fm'])
    .rename(columns={'Test_Subjects_ch': 'Test_Subjects'})
)

In [None]:
merge_ch_fm_df

Unnamed: 0,Test_Subjects,Test_Accuracies_ch,Test_F1s_ch,Test_Accuracies_fm,Test_F1s_fm
0,['subject102' 'subject107'],65.384615,0.629835,61.126374,0.476814
1,['subject102' 'subject107'],68.818681,0.649467,63.598901,0.502295
2,['subject102' 'subject107'],67.307692,0.643445,67.71978,0.571583
3,['subject105' 'subject108'],55.450875,0.49976,52.220727,0.377913
4,['subject105' 'subject108'],52.759085,0.501758,45.89502,0.357283
5,['subject105' 'subject108'],68.909825,0.671129,53.162853,0.418531
6,['subject103' 'subject106'],59.971711,0.521545,49.787836,0.41275
7,['subject103' 'subject106'],70.29703,0.660135,69.448373,0.603129
8,['subject103' 'subject106'],67.326733,0.621189,64.922207,0.509273
9,['subject101' 'subject104'],49.929478,0.457967,41.748942,0.268284


In [None]:
# Row-wise score differences, Classifier Head minus Full Model.
# In the merge_ch_fm_df table displayed above, accuracies are on a 0-100 scale while
# F1 values are fractions, so the two series are on different scales at this point.
# NOTE(review): neither variable is referenced by the later cells visible here;
# confirm whether they are still needed.
diff_acc = merge_ch_fm_df['Test_Accuracies_ch'] - merge_ch_fm_df['Test_Accuracies_fm']
diff_f1 = merge_ch_fm_df['Test_F1s_ch'] - merge_ch_fm_df['Test_F1s_fm']

In [None]:
from typing import List, Tuple, Dict

def paired_t_test_cv_results(strategy_a_scores: List[float],
                           strategy_b_scores: List[float],
                           strategy_a_name: str = "Strategy A",
                           strategy_b_name: str = "Strategy B",
                           alpha: float = 0.05,
                           normality_alpha: float = 0.05) -> Dict:
    """
    Compare two strategies on paired k-fold cross-validation scores.

    The paired differences (A - B) are first checked for normality with the
    Shapiro–Wilk test. If its p-value exceeds `normality_alpha`, a paired t-test
    is applied; otherwise the Wilcoxon signed-rank test is used. The confidence
    interval and Cohen's d are always computed from the differences with the
    t distribution, whichever test was selected.

    Args:
        strategy_a_scores: List of CV scores for strategy A (e.g., classifier head)
        strategy_b_scores: List of CV scores for strategy B (e.g., baseline)
        strategy_a_name: Name of strategy A
        strategy_b_name: Name of strategy B
        alpha: Significance level for the test and the confidence interval (default 0.05)
        normality_alpha: Threshold on the Shapiro–Wilk p-value above which the
            differences are treated as normal (default 0.05)

    Returns:
        Dictionary with test results and statistics. 'test_used' names the test
        that produced 'p_value'; 't_statistic' holds that test's statistic, i.e.
        the Wilcoxon statistic (not a t value) when 'test_used' is
        'Wilcoxon signed-rank test'. 'shapiro_p_value' is the normality p-value.

    Raises:
        ValueError: if the two score lists differ in length.
    """
    from scipy import stats
    import numpy as np

    # Convert to numpy arrays
    scores_a = np.asarray(strategy_a_scores, dtype=float)
    scores_b = np.asarray(strategy_b_scores, dtype=float)

    # Check if arrays have same length
    if len(scores_a) != len(scores_b):
        raise ValueError("Both strategies must have the same number of CV scores")

    # Calculate differences (A - B)
    differences = scores_a - scores_b

    _, p_shapiro = stats.shapiro(differences)
    print(f"Shapiro–Wilk p-value for normality: {p_shapiro:.3f}")

    if p_shapiro > normality_alpha:
        # Normality not rejected: go ahead with the paired t-test.
        test_used = 'paired t-test'
        t_statistic, p_value = stats.ttest_rel(scores_a, scores_b)
        print(f'Paired t-test is applied: {p_value:.3f}')
    else:
        # Non-parametric fallback; the statistic stored below is Wilcoxon's.
        test_used = 'Wilcoxon signed-rank test'
        t_statistic, p_value = stats.wilcoxon(scores_a, scores_b)
        print(f'Wilcoxon signed-rank test is applied: {p_value:.3f}')

    # Calculate confidence interval for the mean difference
    mean_diff = np.mean(differences)
    std_diff = np.std(differences, ddof=1)
    se_diff = stats.sem(differences)
    dof = len(differences) - 1
    t_critical = stats.t.ppf(1 - alpha/2, dof)
    ci_lower = mean_diff - t_critical * se_diff
    ci_upper = mean_diff + t_critical * se_diff

    # Effect size (Cohen's d for paired samples)
    cohens_d = mean_diff / std_diff

    # Determine significance
    is_significant = p_value < alpha

    results = {
        'strategy_a_name': strategy_a_name,
        'strategy_b_name': strategy_b_name,
        'strategy_a_mean': np.mean(scores_a),
        'strategy_b_mean': np.mean(scores_b),
        'strategy_a_std': np.std(scores_a, ddof=1),
        'strategy_b_std': np.std(scores_b, ddof=1),
        'mean_difference': mean_diff,
        'std_difference': std_diff,
        't_statistic': t_statistic,
        'p_value': p_value,
        'degrees_of_freedom': dof,
        'confidence_interval': (ci_lower, ci_upper),
        'cohens_d': cohens_d,
        'is_significant': is_significant,
        'alpha': alpha,
        'n_folds': len(scores_a),
        'test_used': test_used,
        'shapiro_p_value': p_shapiro
    }

    return results



def interpret_results(results: Dict) -> str:
    """
    Render the dictionary produced by paired_t_test_cv_results as a text report.

    The confidence level shown is derived from results['alpha'] (e.g. alpha=0.01
    gives "99% Confidence Interval"). If the optional key 'test_used' names a
    Wilcoxon test, the statistic is labelled "W-statistic"; otherwise, including
    when the key is absent, it is labelled "t-statistic".

    Args:
        results: Dictionary with the keys 'strategy_a_name', 'strategy_b_name',
            'strategy_a_mean', 'strategy_b_mean', 'strategy_a_std',
            'strategy_b_std', 'mean_difference', 'confidence_interval',
            't_statistic', 'p_value', 'degrees_of_freedom', 'cohens_d',
            'is_significant' and 'alpha'; 'test_used' is optional.

    Returns:
        The report as a single newline-joined string.
    """
    name_a = results['strategy_a_name']
    name_b = results['strategy_b_name']
    alpha = results['alpha']
    ci_low, ci_high = results['confidence_interval']
    cohens_d = results['cohens_d']

    # :g drops the trailing ".0" and float noise, so alpha=0.05 prints "95".
    confidence_pct = f"{100 * (1 - alpha):g}"
    used_wilcoxon = 'wilcoxon' in str(results.get('test_used', '')).lower()
    stat_label = "W-statistic" if used_wilcoxon else "t-statistic"

    interpretation = [
        # Basic results
        "Statistical Test Results:",
        "=" * 50,
        f"{name_a}: {results['strategy_a_mean']:.4f} ± {results['strategy_a_std']:.4f}",
        f"{name_b}: {results['strategy_b_mean']:.4f} ± {results['strategy_b_std']:.4f}",
        f"Mean Difference: {results['mean_difference']:.4f}",
        f"{confidence_pct}% Confidence Interval: [{ci_low:.4f}, {ci_high:.4f}]",
        "",
        # Test statistics
        "Test Statistics:",
        f"{stat_label}: {results['t_statistic']:.4f}",
        f"p-value: {results['p_value']:.6f}",
        f"Degrees of freedom: {results['degrees_of_freedom']}",
        f"Cohen's d (effect size): {cohens_d:.4f}",
        "",
    ]

    # Significance interpretation; the sign of the mean difference (A - B)
    # decides which strategy is reported as better.
    if results['is_significant']:
        better, worse = (name_a, name_b) if results['mean_difference'] > 0 else (name_b, name_a)
        interpretation.append(f"✓ SIGNIFICANT DIFFERENCE (p < {alpha})")
        interpretation.append(f"  {better} is significantly better than {worse}")
    else:
        interpretation.append(f"✗ NO SIGNIFICANT DIFFERENCE (p ≥ {alpha})")
        interpretation.append("  Cannot conclude one strategy is significantly better than the other")

    # Effect size interpretation (thresholds 0.2 / 0.5 / 0.8 on |d|)
    abs_d = abs(cohens_d)
    if abs_d < 0.2:
        effect_size = "negligible"
    elif abs_d < 0.5:
        effect_size = "small"
    elif abs_d < 0.8:
        effect_size = "medium"
    else:
        effect_size = "large"

    interpretation.append("")
    interpretation.append("Effect Size Interpretation:")
    interpretation.append(f"Cohen's d = {cohens_d:.4f} indicates a {effect_size} effect size")

    return "\n".join(interpretation)

In [None]:
from typing import List
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def visualize_comparison(
    strategy_a_scores: List[float],
    strategy_b_scores: List[float],
    strategy_a_name: str = "Strategy A",
    strategy_b_name: str = "Strategy B",
    metric_comparison: str = "Test Accuracy"
) -> None:
    """
    Show an interactive Plotly comparison of two strategies, fold by fold.

    The figure has two vertically stacked panels: the top one plots both
    strategies' scores per fold, the bottom one shows the per-fold difference
    (A - B) as bars, green where the difference is positive and red otherwise.
    The figure is rendered with fig.show(); nothing is returned.
    """
    fold_ids = [k + 1 for k in range(len(strategy_a_scores))]
    deltas = np.asarray(strategy_a_scores) - np.asarray(strategy_b_scores)
    bar_colors = np.where(deltas > 0, 'green', 'red').tolist()

    # Two rows, one column: line chart above, bar chart below.
    figure = make_subplots(
        rows=2, cols=1,
        subplot_titles=('Fold-by-Fold Comparison', 'Performance Differences by Fold'),
        specs=[[{"type": "scatter"}],
               [{"type": "bar"}]]
    )

    # Top panel: one line per strategy, distinguished by marker symbol.
    line_specs = (
        (strategy_a_scores, strategy_a_name, 'circle'),
        (strategy_b_scores, strategy_b_name, 'square'),
    )
    for scores, label, symbol in line_specs:
        figure.add_trace(
            go.Scatter(
                x=fold_ids, y=scores,
                mode='lines+markers', name=label,
                line=dict(width=2), marker=dict(size=8, symbol=symbol)
            ),
            row=1, col=1
        )

    # Bottom panel: signed differences with their value printed on each bar.
    figure.add_trace(
        go.Bar(
            x=fold_ids, y=deltas,
            marker=dict(color=bar_colors, opacity=0.7),
            name='Difference',
            text=[f'{delta:.3f}' for delta in deltas],
            textposition='auto'
        ),
        row=2, col=1
    )
    figure.add_hline(
        y=0, row=2, col=1,
        line_dash="dash", line_color="black", opacity=0.5
    )

    # Title, size and a horizontal legend above the plots.
    figure.update_layout(
        title=f'Statistical Comparison of {metric_comparison}: {strategy_a_name} vs {strategy_b_name}',
        height=1000,
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        ),
        margin=dict(t=100, b=50)
    )

    # Axis titles and grids.
    y_titles = {
        1: "Score",
        2: f"Difference ({strategy_a_name} - {strategy_b_name})",
    }
    for panel_row, y_title in y_titles.items():
        figure.update_yaxes(title_text=y_title, row=panel_row, col=1)
        figure.update_xaxes(
            title_text="Fold", row=panel_row, col=1, showgrid=True, gridcolor='lightgray'
        )
    figure.update_yaxes(showgrid=True, gridcolor='lightgray')

    figure.show()


In [None]:
# Put F1 on the same 0-100 scale as the accuracy columns. A column is scaled only
# while it still holds fractions (max <= 1), so re-running this cell does not
# multiply the values by 100 a second time.
for _f1_col in ("Test_F1s_ch", "Test_F1s_fm"):
    if merge_ch_fm_df[_f1_col].max() <= 1.0:
        merge_ch_fm_df[_f1_col] = merge_ch_fm_df[_f1_col] * 100

In [None]:
# Average the per-row scores within each test-subject group, giving one row per
# 'Test_Subjects' value; these group means are the paired observations used below.
grouped_scores = merge_ch_fm_df.groupby('Test_Subjects').mean()
grouped_scores

Unnamed: 0_level_0,Test_Accuracies_ch,Test_F1s_ch,Test_Accuracies_fm,Test_F1s_fm
Test_Subjects,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
['subject101' 'subject104'],57.263752,52.788086,51.574988,39.190801
['subject102' 'subject107'],67.17033,64.091579,64.148352,51.689709
['subject103' 'subject106'],65.865158,60.095617,61.386139,50.838404
['subject105' 'subject108'],59.039928,55.754889,50.4262,38.457576


In [None]:
# Paired comparison of test accuracy on the per-subject-group means:
# strategy A = Classifier Head, strategy B = Full Model, so differences are CH - FM.
paired_t_test_results = paired_t_test_cv_results(strategy_a_scores = grouped_scores['Test_Accuracies_ch'],
                         strategy_b_scores = grouped_scores['Test_Accuracies_fm'],
                         strategy_a_name='Classifier Head',
                         strategy_b_name='Full Model')

print(interpret_results(paired_t_test_results))

Shapiro–Wilk p-value for normality: 0.826
Paired t-test is applied: 0.019
Statistical Test Results:
Classifier Head: 62.3348 ± 4.9132
Full Model: 56.8839 ± 6.9024
Mean Difference: 5.4509
95% Confidence Interval: [1.6737, 9.2281]

Test Statistics:
t-statistic: 4.5926
p-value: 0.019397
Degrees of freedom: 3
Cohen's d (effect size): 2.2963

✓ SIGNIFICANT DIFFERENCE (p < 0.05)
  Classifier Head is significantly better than Full Model

Effect Size Interpretation:
Cohen's d = 2.2963 indicates a large effect size


In [None]:
# Same paired comparison for the test F1 scores (A = Classifier Head, B = Full Model).
paired_t_test_results_f1 = paired_t_test_cv_results(strategy_a_scores = grouped_scores['Test_F1s_ch'],
                         strategy_b_scores = grouped_scores['Test_F1s_fm'],
                         strategy_a_name='Classifier Head',
                         strategy_b_name='Full Model')

print(interpret_results(paired_t_test_results_f1))

Shapiro–Wilk p-value for normality: 0.956
Paired t-test is applied: 0.004
Statistical Test Results:
Classifier Head: 58.1825 ± 4.9521
Full Model: 45.0441 ± 7.1968
Mean Difference: 13.1384
95% Confidence Interval: [7.8519, 18.4250]

Test Statistics:
t-statistic: 7.9092
p-value: 0.004213
Degrees of freedom: 3
Cohen's d (effect size): 3.9546

✓ SIGNIFICANT DIFFERENCE (p < 0.05)
  Classifier Head is significantly better than Full Model

Effect Size Interpretation:
Cohen's d = 3.9546 indicates a large effect size


In [None]:
# Strategy order is reversed relative to the t-test cells: here A = Full Model and
# B = Classifier Head, so the plotted differences are Full Model - Classifier Head.
visualize_comparison(grouped_scores['Test_Accuracies_fm'],
                     grouped_scores['Test_Accuracies_ch'],
                    strategy_a_name='Full Model',
                    strategy_b_name='Classifier Head'
                    )

In [None]:
# F1 version of the comparison plot; as above, A = Full Model and
# B = Classifier Head, so the plotted differences are Full Model - Classifier Head.
visualize_comparison(grouped_scores['Test_F1s_fm'],
                     grouped_scores['Test_F1s_ch'],
                    strategy_a_name='Full Model',
                    strategy_b_name='Classifier Head',
                     metric_comparison="Test F1 Scores"
                    )

In [None]:
import scipy
print(scipy.__version__)

1.15.3


In [None]:
!python --version

Python 3.11.13
