In [1]:
import wandb 
import dill
import pandas as pd
import os 
import numpy as np 

In [2]:
def download_runs(project_name):
    if not os.path.exists(
        f"./results_data/data_{project_name}.pkl"
    ):
        project_details = wandb.Api().runs(f"lucacorbucci/{project_name}")
        project_data = {}
        for run in project_details:
            print("Downloading run ", run.id)
            run_df = pd.DataFrame(
                wandb.Api().run(f"lucacorbucci/{project_name}/{run.id}").scan_history()
            )
            if run.name not in project_data:
                project_data[run.name] = []
            project_data[run.name].append(run_df)
        with open(
            f"./results_data/data_{project_name}.pkl", "wb"
        ) as f:
            dill.dump(project_data, f)
    else:
        with open(
            f"./results_data/data_{project_name}.pkl", "rb"
        ) as f:
            project_data = dill.load(f)
    return project_data

# Explanation Metrics

In [3]:
project_data = download_runs(project_name="tango_explanation_metrics")
project_name = "tango_explanation_metrics"

In [4]:
methods = ["dt", "svm", "logistic", "lime", "shap", "lore", "lore_genetic"]
datasets = ["adult", "house16", "letter", "dutch", "covertype", "shuttle"]
top_k = [3, 5, 8, 10, 20]

In [5]:
metrics = {}

for dataset in datasets:
    metrics[dataset] = {}   
    for method in methods: 
        metrics[dataset][method] = {}
        if f"{method}_{dataset}" in project_data:
            results = project_data[f"{method}_{dataset}"][0]
            if "faithfulness" in results.columns:
                faithfulness = round(float(results["faithfulness"]), 3)
                faithfulness_std = round(float(results["faithfulness_std"]), 3)
                metrics[dataset][method]["Faithfulness"] = f"{faithfulness} $\pm$ {faithfulness_std}"
            if "stability" in results.columns:
                stability = round(float(results["stability"]), 3)
                stability_std = round(float(results["stability_std"]), 3)
                metrics[dataset][method]["stability"] = f"{stability} $\pm$ {stability_std}"
            for k in top_k:
                if f"robustness_top_{k}" in results.columns:
                    robustness = round(float(results[f"robustness_top_{k}"]), 3)
                    robustness_std = round(float(results[f"robustness_std_top_{k}"]), 3)
                    metrics[dataset][method][f"robustness_top_{k}"] = f"{robustness} $\pm$ {robustness_std}"

  stability = round(float(results["stability"]), 3)
  stability_std = round(float(results["stability_std"]), 3)
  robustness = round(float(results[f"robustness_top_{k}"]), 3)
  robustness_std = round(float(results[f"robustness_std_top_{k}"]), 3)
  faithfulness = round(float(results["faithfulness"]), 3)
  faithfulness_std = round(float(results["faithfulness_std"]), 3)


In [6]:
import pandas as pd

# Initialize an empty list to store the rows
rows = []

top_k_table = [5,10,20]
# Iterate over the datasets and methods to extract the metrics
for dataset in datasets:
    for method in methods:
        row = {
            'Dataset': dataset,
            'Method': method,
            'Stability': metrics[dataset][method].get('stability', '-'),
            'Faithfulness': metrics[dataset][method].get('Faithfulness', '-')
        }
        for k in top_k_table:
            row[f"Robustness K={k}"] = metrics[dataset][method].get(f'robustness_top_{k}', '-')
        rows.append(row)

# Create a dataframe from the rows
df_metrics = pd.DataFrame(rows)


# Map method names to their display names
method_mapping = {
    'dt': 'Decision Tree',
    'svm': 'SVM',
    'logistic': 'Logistic Regr.',
    'lime': 'LIME',
    'shap': 'SHAP',
    'lore': 'Lore (Random)',
    'lore_genetic': 'Lore (Genetic)'
}

# Map method names to their display names
dataset_name_mapping = {
    'adult': 'Adult',
    'house16': 'House 16',
    'letter': 'Letter',
    'dutch': 'Dutch',
    'covertype': 'Covertype',
    'shuttle': 'Shuttle'
}

# Apply the mapping to the Method column
df_metrics["Method"] = df_metrics["Method"].map(method_mapping)

df_metrics["Dataset"] = df_metrics["Dataset"].map(dataset_name_mapping)

# Sort the DataFrame by Dataset and Method
df_metrics = df_metrics.sort_values(by=['Dataset', 'Method'])

# Create custom method order for better visualization
method_order = {
    'Decision Tree': 1, 
    'SVM': 2, 
    'Logistic Regr.': 3, 
    'LIME': 4, 
    'SHAP': 5, 
    'Lore (Random)': 6, 
    'Lore (Genetic)': 7
}

# Create a new column for sorting by custom method order
df_metrics['method_order'] = df_metrics['Method'].map(method_order)

# Sort by Dataset first, then by the custom method order
df_metrics = df_metrics.sort_values(by=['Dataset', 'method_order'])

# Drop the helper column
df_metrics = df_metrics.drop(columns=['method_order'])

df_metrics.head(9)


Unnamed: 0,Dataset,Method,Stability,Faithfulness,Robustness K=5,Robustness K=10,Robustness K=20
0,Adult,Decision Tree,0.876 $\pm$ 0.193,-,0.542 $\pm$ 0.154,0.529 $\pm$ 0.139,0.512 $\pm$ 0.13
1,Adult,SVM,0.856 $\pm$ 0.263,0.009 $\pm$ 0.112,0.295 $\pm$ 0.108,0.292 $\pm$ 0.099,0.288 $\pm$ 0.093
2,Adult,Logistic Regr.,0.409 $\pm$ 0.284,-0.025 $\pm$ 0.214,0.224 $\pm$ 0.137,0.219 $\pm$ 0.126,0.214 $\pm$ 0.12
3,Adult,LIME,0.056 $\pm$ 0.021,0.063 $\pm$ 0.178,0.054 $\pm$ 0.009,0.054 $\pm$ 0.007,0.054 $\pm$ 0.005
4,Adult,SHAP,0.408 $\pm$ 0.195,0.515 $\pm$ 0.16,0.264 $\pm$ 0.112,0.257 $\pm$ 0.096,0.251 $\pm$ 0.086
5,Adult,Lore (Random),0.31 $\pm$ 0.218,-,0.309 $\pm$ 0.175,0.314 $\pm$ 0.171,0.314 $\pm$ 0.171
6,Adult,Lore (Genetic),0.548 $\pm$ 0.172,-,0.253 $\pm$ 0.104,0.252 $\pm$ 0.084,0.252 $\pm$ 0.071
28,Covertype,Decision Tree,0.903 $\pm$ 0.156,-,0.572 $\pm$ 0.117,0.558 $\pm$ 0.104,0.545 $\pm$ 0.095
29,Covertype,SVM,0.586 $\pm$ 0.355,-0.033 $\pm$ 0.241,0.187 $\pm$ 0.089,0.18 $\pm$ 0.078,0.173 $\pm$ 0.07


In [7]:
print(df_metrics.to_latex(index=False))

\begin{tabular}{lllllll}
\toprule
Dataset & Method & Stability & Faithfulness & Robustness K=5 & Robustness K=10 & Robustness K=20 \\
\midrule
Adult & Decision Tree & 0.876 $\pm$ 0.193 & - & 0.542 $\pm$ 0.154 & 0.529 $\pm$ 0.139 & 0.512 $\pm$ 0.13 \\
Adult & SVM & 0.856 $\pm$ 0.263 & 0.009 $\pm$ 0.112 & 0.295 $\pm$ 0.108 & 0.292 $\pm$ 0.099 & 0.288 $\pm$ 0.093 \\
Adult & Logistic Regr. & 0.409 $\pm$ 0.284 & -0.025 $\pm$ 0.214 & 0.224 $\pm$ 0.137 & 0.219 $\pm$ 0.126 & 0.214 $\pm$ 0.12 \\
Adult & LIME & 0.056 $\pm$ 0.021 & 0.063 $\pm$ 0.178 & 0.054 $\pm$ 0.009 & 0.054 $\pm$ 0.007 & 0.054 $\pm$ 0.005 \\
Adult & SHAP & 0.408 $\pm$ 0.195 & 0.515 $\pm$ 0.16 & 0.264 $\pm$ 0.112 & 0.257 $\pm$ 0.096 & 0.251 $\pm$ 0.086 \\
Adult & Lore (Random) & 0.31 $\pm$ 0.218 & - & 0.309 $\pm$ 0.175 & 0.314 $\pm$ 0.171 & 0.314 $\pm$ 0.171 \\
Adult & Lore (Genetic) & 0.548 $\pm$ 0.172 & - & 0.253 $\pm$ 0.104 & 0.252 $\pm$ 0.084 & 0.252 $\pm$ 0.071 \\
Covertype & Decision Tree & 0.903 $\pm$ 0.156 & - & 0.572 $

In [8]:
# Prepare dataframe for custom LaTeX output
df_grouped = df_metrics.groupby('Dataset')

# Start building the LaTeX table
latex_output = "\\begin{tabular}{" + "l" * len(df_metrics.columns) + "}\n"
latex_output += "\\toprule\n"

# Add headers
latex_output += " & ".join(df_metrics.columns) + " \\\\\n"
latex_output += "\\midrule\n"

# Add rows with midrules between datasets
datasets = df_metrics['Dataset'].unique()
for i, dataset in enumerate(datasets):
    group = df_grouped.get_group(dataset)
    
    # Convert group dataframe to LaTeX rows
    rows_latex = group.to_latex(index=False, header=False)
    
    # Extract just the rows part (not headers or table structure)
    rows_only = "\n".join(rows_latex.split("\n")[3:-3])
    
    latex_output += rows_only
    
    # Add midrule if not the last dataset
    if i < len(datasets) - 1:
        latex_output += "\\midrule\n"

latex_output += "\\bottomrule\n\\end{tabular}"

print(latex_output)

\begin{tabular}{lllllll}
\toprule
Dataset & Method & Stability & Faithfulness & Robustness K=5 & Robustness K=10 & Robustness K=20 \\
\midrule
Adult & Decision Tree & 0.876 $\pm$ 0.193 & - & 0.542 $\pm$ 0.154 & 0.529 $\pm$ 0.139 & 0.512 $\pm$ 0.13 \\
Adult & SVM & 0.856 $\pm$ 0.263 & 0.009 $\pm$ 0.112 & 0.295 $\pm$ 0.108 & 0.292 $\pm$ 0.099 & 0.288 $\pm$ 0.093 \\
Adult & Logistic Regr. & 0.409 $\pm$ 0.284 & -0.025 $\pm$ 0.214 & 0.224 $\pm$ 0.137 & 0.219 $\pm$ 0.126 & 0.214 $\pm$ 0.12 \\
Adult & LIME & 0.056 $\pm$ 0.021 & 0.063 $\pm$ 0.178 & 0.054 $\pm$ 0.009 & 0.054 $\pm$ 0.007 & 0.054 $\pm$ 0.005 \\
Adult & SHAP & 0.408 $\pm$ 0.195 & 0.515 $\pm$ 0.16 & 0.264 $\pm$ 0.112 & 0.257 $\pm$ 0.096 & 0.251 $\pm$ 0.086 \\
Adult & Lore (Random) & 0.31 $\pm$ 0.218 & - & 0.309 $\pm$ 0.175 & 0.314 $\pm$ 0.171 & 0.314 $\pm$ 0.171 \\
Adult & Lore (Genetic) & 0.548 $\pm$ 0.172 & - & 0.253 $\pm$ 0.104 & 0.252 $\pm$ 0.084 & 0.252 $\pm$ 0.071 \\\midrule
Covertype & Decision Tree & 0.903 $\pm$ 0.156 & - &

In [9]:
import os

import matplotlib.pyplot as plt

def plot_robustness_per_dataset(df_metrics):
    # Create the plots directory if it doesn't exist
    if not os.path.exists('plots'):
        os.makedirs('plots')
    
    # Get the unique datasets
    datasets = df_metrics['Dataset'].unique()
    top_k = [3, 5, 8, 10, 20]
    
    for dataset in datasets:
        plt.figure(figsize=(10, 6))
        subset = df_metrics[df_metrics['Dataset'] == dataset]
        
        for method in subset['Method'].unique():
            robustness_values = []
            for k in top_k:
                robustness_value = subset[subset['Method'] == method][f'Robustness\\_top\\_{k}'].values[0]
                robustness_values.append(float(robustness_value.split(' ')[0]))
            
            plt.plot(top_k, robustness_values, marker='o', markersize=10, label=method)
        
        plt.title(f'Robustness per K for {dataset}', fontsize=25)
        plt.xlabel('K', fontsize=25)
        plt.ylabel('Robustness', fontsize=25)
        plt.xticks(top_k, fontsize=20)
        plt.yticks(fontsize=20)
        plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=3, fontsize=20)
        plt.grid(True)
        
        # Save the plot
        plt.savefig(f'plots/robustness_{dataset}.png', bbox_inches='tight')
        plt.close()

# Call the function to plot the robustness per dataset
plot_robustness_per_dataset(df_metrics)

KeyError: 'Robustness\\_top\\_3'

<Figure size 1000x600 with 0 Axes>

# Fidelity

In [29]:
project_data = download_runs(project_name="tango_eval")
project_name = "tango_eval"

In [30]:
project_data_comparison = download_runs(project_name="comparison_tango")
project_name = "comparison_tango"

Downloading run  atw9xiir
Downloading run  fqhezqc6
Downloading run  zro78ljz
Downloading run  pqhq8893
Downloading run  jfyrc466
Downloading run  ew09kxhc
Downloading run  ow7abfwn
Downloading run  w970mb4d
Downloading run  ashjhyxu
Downloading run  coj85sk5
Downloading run  6l8ingce
Downloading run  s9cuiccy
Downloading run  rxc8lhce
Downloading run  s067qrl9
Downloading run  kktf0dw2
Downloading run  azpa5w6i
Downloading run  tby0gao2
Downloading run  e8q3p408
Downloading run  sx8gjzeq
Downloading run  ka1gffmv
Downloading run  66ti7ajw
Downloading run  vps9w5h0
Downloading run  iurmhuh9
Downloading run  vuy3wdbc
Downloading run  jhhmlzkc
Downloading run  2t9mifpk
Downloading run  7m6kfd3j
Downloading run  c8wei8fz
Downloading run  26bojefp
Downloading run  j6olyyp0
Downloading run  i7de0qa9
Downloading run  7ltu9s3i
Downloading run  xf9z8ml0
Downloading run  9sdp0bdq
Downloading run  h62sp4yz
Downloading run  gm6caoms
Downloading run  mgw1jr72
Downloading run  lukwrutc
Downloading 

In [31]:
methods = ["dt", "svm", "logistic"]
datasets = ["house16", "letter", "dutch", "adult", "covertype", "shuttle"]

In [32]:
metrics = {}

for dataset in datasets:
    metrics[dataset] = {}   
    for method in methods: 
        metrics[dataset][method] = {}
        results = project_data[f"{method}_{dataset}"][0]
        fidelity_list = []
        if "Fidelity" in results.columns:
            fidelity_list = [float(f) for f in results["Fidelity"].values]
        
        fidelity = round(np.mean(fidelity_list), 3)
        fidelity_std = round(np.std(fidelity_list), 3)
        metrics[dataset][method]["Fidelity"] = f"{fidelity} $\pm$ {fidelity_std}"

In [33]:
import pandas as pd

# Initialize an empty list to store the rows
rows = []

# Iterate over the datasets and methods to extract the metrics
for dataset in datasets:
    for method in methods:
        row = {
            'Dataset': dataset,
            'Method': method,
            'Fidelity': metrics[dataset][method].get('Fidelity', '-'),
        }
        rows.append(row)



In [34]:
methods = ["lime", "lore", "lore_genetic"]
datasets = ["house16", "letter", "dutch", "adult", "covertype", "shuttle"]

In [35]:
for dataset in datasets:
    for method in methods: 
        print(method, dataset)
        metrics[dataset][method] = {}
        results = project_data_comparison[f"{method}_{dataset}"]
        fidelity_list = []
        for result in results: 
            if "fidelity" in result.columns:
                fidelity_list.append(float(result["fidelity"]))
            
        
        fidelity = round(np.mean(fidelity_list), 3)
        fidelity_std = round(np.std(fidelity_list), 3)
        metrics[dataset][method]["Fidelity"] = f"{fidelity} $\pm$ {fidelity_std}"
        print(fidelity, fidelity_std)

lime house16
0.874 0.001
lore house16
0.533 0.0
lore_genetic house16
0.61 0.0
lime letter
0.04 0.0
lore letter
0.04 0.001
lore_genetic letter
0.044 0.0
lime dutch
0.896 0.001
lore dutch
0.498 0.001
lore_genetic dutch
0.501 0.0
lime adult
0.901 0.0
lore adult
0.668 0.001
lore_genetic adult
0.636 0.0
lime covertype
0.711 0.001
lore covertype
0.36 0.001
lore_genetic covertype
0.394 0.0
lime shuttle
0.801 0.0
lore shuttle
0.659 0.0
lore_genetic shuttle
0.663 0.0


  fidelity_list.append(float(result["fidelity"]))


In [36]:
metrics

{'house16': {'dt': {'Fidelity': '0.896 $\\pm$ 0.0'},
  'svm': {'Fidelity': '0.743 $\\pm$ 0.0'},
  'logistic': {'Fidelity': '0.962 $\\pm$ 0.0'},
  'lime': {'Fidelity': '0.874 $\\pm$ 0.001'},
  'lore': {'Fidelity': '0.533 $\\pm$ 0.0'},
  'lore_genetic': {'Fidelity': '0.61 $\\pm$ 0.0'}},
 'letter': {'dt': {'Fidelity': '0.708 $\\pm$ 0.0'},
  'svm': {'Fidelity': '0.861 $\\pm$ 0.0'},
  'logistic': {'Fidelity': '0.843 $\\pm$ 0.0'},
  'lime': {'Fidelity': '0.04 $\\pm$ 0.0'},
  'lore': {'Fidelity': '0.04 $\\pm$ 0.001'},
  'lore_genetic': {'Fidelity': '0.044 $\\pm$ 0.0'}},
 'dutch': {'dt': {'Fidelity': '0.995 $\\pm$ 0.0'},
  'svm': {'Fidelity': '0.996 $\\pm$ 0.0'},
  'logistic': {'Fidelity': '0.997 $\\pm$ 0.0'},
  'lime': {'Fidelity': '0.896 $\\pm$ 0.001'},
  'lore': {'Fidelity': '0.498 $\\pm$ 0.001'},
  'lore_genetic': {'Fidelity': '0.501 $\\pm$ 0.0'}},
 'adult': {'dt': {'Fidelity': '0.95 $\\pm$ 0.0'},
  'svm': {'Fidelity': '0.522 $\\pm$ 0.0'},
  'logistic': {'Fidelity': '0.894 $\\pm$ 0.0'},
  

In [37]:
# Iterate over the datasets and methods to extract the metrics
for dataset in datasets:
    for method in methods:
        row = {
            'Dataset': dataset,
            'Method': method,
            'Fidelity': metrics[dataset][method].get('Fidelity', '-'),
        }
        rows.append(row)


In [38]:
# Create a dataframe from the rows
df_metrics = pd.DataFrame(rows)

df_metrics = df_metrics.sort_values(by=['Dataset', 'Method'])
df_metrics.head(10)


# Map method names to their display names
method_mapping = {
    'dt': 'Decision Tree',
    'svm': 'SVM',
    'logistic': 'Logistic Regr.',
    'lime': 'LIME',
    'shap': 'SHAP',
    'lore': 'Lore (Random)',
    'lore_genetic': 'Lore (Genetic)'
}

# Map method names to their display names
dataset_name_mapping = {
    'adult': 'Adult',
    'house16': 'House 16',
    'letter': 'Letter',
    'dutch': 'Dutch',
    'covertype': 'Covertype',
    'shuttle': 'Shuttle'
}

# Apply the mapping to the Method column
df_metrics["Method"] = df_metrics["Method"].map(method_mapping)

df_metrics["Dataset"] = df_metrics["Dataset"].map(dataset_name_mapping)

# Sort the DataFrame by Dataset and Method
df_metrics = df_metrics.sort_values(by=['Dataset', 'Method'])

# Create custom method order for better visualization
method_order = {
    'Decision Tree': 1, 
    'SVM': 2, 
    'Logistic Regr.': 3, 
    'LIME': 4, 
    'SHAP': 5, 
    'Lore (Random)': 6, 
    'Lore (Genetic)': 7
}

# Create a new column for sorting by custom method order
df_metrics['method_order'] = df_metrics['Method'].map(method_order)

# Sort by Dataset first, then by the custom method order
df_metrics = df_metrics.sort_values(by=['Dataset', 'method_order'])

# Drop the helper column
df_metrics = df_metrics.drop(columns=['method_order'])

df_metrics.head(9)

Unnamed: 0,Dataset,Method,Fidelity
9,Adult,Decision Tree,0.95 $\pm$ 0.0
10,Adult,SVM,0.522 $\pm$ 0.0
11,Adult,Logistic Regr.,0.894 $\pm$ 0.0
27,Adult,LIME,0.901 $\pm$ 0.0
28,Adult,Lore (Random),0.668 $\pm$ 0.001
29,Adult,Lore (Genetic),0.636 $\pm$ 0.0
12,Covertype,Decision Tree,0.834 $\pm$ 0.0
13,Covertype,SVM,0.6 $\pm$ 0.0
14,Covertype,Logistic Regr.,0.872 $\pm$ 0.0


In [39]:
# sort df_metrics by dataset name

# remove index 
df_metrics = df_metrics.reset_index(drop=True)

In [40]:
# Prepare dataframe for custom LaTeX output
df_grouped = df_metrics.groupby('Dataset')

# Start building the LaTeX table
latex_output = "\\begin{tabular}{" + "l" * len(df_metrics.columns) + "}\n"
latex_output += "\\toprule\n"

# Add headers
latex_output += " & ".join(df_metrics.columns) + " \\\\\n"
latex_output += "\\midrule\n"

# Add rows with midrules between datasets
datasets = df_metrics['Dataset'].unique()
for i, dataset in enumerate(datasets):
    group = df_grouped.get_group(dataset)
    
    # Convert group dataframe to LaTeX rows
    rows_latex = group.to_latex(index=False, header=False)
    
    # Extract just the rows part (not headers or table structure)
    rows_only = "\n".join(rows_latex.split("\n")[3:-3])
    
    latex_output += rows_only
    
    # Add midrule if not the last dataset
    if i < len(datasets) - 1:
        latex_output += "\\midrule\n"

latex_output += "\\bottomrule\n\\end{tabular}"

print(latex_output)

\begin{tabular}{lll}
\toprule
Dataset & Method & Fidelity \\
\midrule
Adult & Decision Tree & 0.95 $\pm$ 0.0 \\
Adult & SVM & 0.522 $\pm$ 0.0 \\
Adult & Logistic Regr. & 0.894 $\pm$ 0.0 \\
Adult & LIME & 0.901 $\pm$ 0.0 \\
Adult & Lore (Random) & 0.668 $\pm$ 0.001 \\
Adult & Lore (Genetic) & 0.636 $\pm$ 0.0 \\\midrule
Covertype & Decision Tree & 0.834 $\pm$ 0.0 \\
Covertype & SVM & 0.6 $\pm$ 0.0 \\
Covertype & Logistic Regr. & 0.872 $\pm$ 0.0 \\
Covertype & LIME & 0.711 $\pm$ 0.001 \\
Covertype & Lore (Random) & 0.36 $\pm$ 0.001 \\
Covertype & Lore (Genetic) & 0.394 $\pm$ 0.0 \\\midrule
Dutch & Decision Tree & 0.995 $\pm$ 0.0 \\
Dutch & SVM & 0.996 $\pm$ 0.0 \\
Dutch & Logistic Regr. & 0.997 $\pm$ 0.0 \\
Dutch & LIME & 0.896 $\pm$ 0.001 \\
Dutch & Lore (Random) & 0.498 $\pm$ 0.001 \\
Dutch & Lore (Genetic) & 0.501 $\pm$ 0.0 \\\midrule
House 16 & Decision Tree & 0.896 $\pm$ 0.0 \\
House 16 & SVM & 0.743 $\pm$ 0.0 \\
House 16 & Logistic Regr. & 0.962 $\pm$ 0.0 \\
House 16 & LIME & 0.874 