In [None]:
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import ptitprince # For raincloud plots
import numpy as np
from scipy import stats # For the statistical test
from statsmodels.stats.multitest import multipletests # For Bonferroni correction

# Apply one global seaborn theme up front: white-grid background with all
# font sizes scaled by 1.1.
sns.set_theme(style="whitegrid", font_scale=1.1)

In [None]:
# --- Configuration ---
# *** UPDATE THIS PATH ***
# Location of the JSON file holding the per-run, per-layer TCAV statistics.
# NOTE(review): this is an absolute, machine-specific path; anyone else running
# the notebook has to edit it before this cell will succeed.
STATS_FILE_PATH = "/Volumes/T9/Concepts/Target/circling_eeg/tcav/mg_leadership_l/results/tcav_statistics_all_runs.json"
# ---

# Load the raw data.
# The encoding is given explicitly so the read does not depend on the
# platform's default locale encoding.
try:
    with open(STATS_FILE_PATH, 'r', encoding='utf-8') as f:
        raw_data = json.load(f)
except FileNotFoundError:
    # Report which path was tried, then re-raise so that the notebook stops
    # here instead of failing later on an undefined `raw_data`.
    print(f"ERROR: File not found: {STATS_FILE_PATH!r}. Please check STATS_FILE_PATH.")
    raise
else:
    print(f"Successfully loaded data from {STATS_FILE_PATH}")

# Flatten the nested run -> layers -> metrics structure into long form:
# one record per (run, layer) pair. A metric missing from the JSON becomes NaN.
processed_data = []
for run_entry in raw_data:
    run_label = run_entry['run_id']
    processed_data.extend(
        {
            "run": run_label,
            "layer": int(layer_key),  # layer keys arrive as strings, e.g. "0" -> 0
            "cav_accuracy": layer_metrics.get('cav_accuracy', np.nan),
            "tcav_score": layer_metrics.get('tcav_score', np.nan),
        }
        for layer_key, layer_metrics in run_entry['layers'].items()
    )

df = pd.DataFrame(processed_data)

# Numerically sorted layer ids; reused by the plotting and statistics cells.
layer_order = sorted(df['layer'].unique())

# Store 'layer' as an ordered categorical so plots keep the numeric layer order.
df = df.assign(
    layer=pd.Categorical(df['layer'], categories=layer_order, ordered=True)
)

print(f"\nData processed into DataFrame with {len(df)} rows.")
print("DataFrame head:", df.head(), sep="\n")

In [None]:
# Raincloud plot of the CAV classifier accuracy, one cloud per layer.
print("Generating CAV Accuracy plot...")

fig, ax = plt.subplots(figsize=(16, 8))

# Take the run count from the data rather than hard-coding it, so the title
# stays correct whatever results file was loaded.
n_runs = df['run'].nunique()

# Generate the RainCloud plot
ptitprince.RainCloud(
    data=df,
    x='layer',
    y='cav_accuracy',
    order=layer_order,
    palette="Set2",
    bw=0.2,         # Controls the smoothness of the "cloud"
    width_viol=.8,  # Width of the violin plot
    ax=ax,
    orient="v",
    move=.2,        # Move the rain/dots to the side
    point_size=3,
    point_alpha=0.5
)

# Dashed reference line at 0.5, labelled as the random-chance baseline.
# It is drawn before ax.legend() so that its label appears in the legend.
ax.axhline(0.5, ls='--', color='black', lw=1.5, label='Random Chance (0.5)')

# Titles, labels and axis range
ax.set_title(
    f"Distribution of CAV Classifier Accuracies Across Layers ({n_runs} Runs)",
    fontsize=18,
    pad=20,
)
ax.set_xlabel("Transformer Block (Layer)", fontsize=14)
ax.set_ylabel("CAV Classifier Accuracy", fontsize=14)
ax.set_ylim(0, 1.05)  # Accuracy is between 0 and 1; small headroom above 1
ax.legend()

plt.show()

In [None]:
# Raincloud plot of the TCAV score, one cloud per layer.
print("Generating TCAV Score plot...")

fig, ax = plt.subplots(figsize=(16, 8))

# Take the run count from the data rather than hard-coding it, so the title
# stays correct whatever results file was loaded.
n_runs = df['run'].nunique()

# Generate the RainCloud plot
ptitprince.RainCloud(
    data=df,
    x='layer',
    y='tcav_score',
    order=layer_order,
    palette="Blues",
    bw=0.2,         # Controls the smoothness of the "cloud"
    width_viol=.8,  # Width of the violin plot
    ax=ax,
    orient="v",
    move=.2,        # Move the rain/dots to the side
    point_size=3,
    point_alpha=0.5
)

# Dashed reference line at 0.5, labelled as the random baseline.
# It is drawn before ax.legend() so that its label appears in the legend.
ax.axhline(0.5, ls='--', color='red', lw=1.5, label='Random Baseline (0.5)')

# Titles, labels and axis range
ax.set_title(
    f"Distribution of TCAV Scores Across Layers ({n_runs} Runs) - Sanity Check",
    fontsize=18,
    pad=20,
)
ax.set_xlabel("Transformer Block (Layer)", fontsize=14)
ax.set_ylabel("TCAV Score", fontsize=14)
ax.set_ylim(0, 1.05)  # TCAV score is also between 0 and 1; small headroom above 1
ax.legend()

plt.show()

In [None]:
print("--- Performing Statistical Analysis ---")

# For every layer, test H0: "mean TCAV score == 0.5" with a one-sample t-test.
# The test is TWO-sided, so a layer flagged as significant may lie either
# above or below 0.5; check `mean_tcav_score` for the direction.
results = []

for layer in layer_order:
    # Select this layer's rows once and reuse them for both metrics.
    layer_rows = df[df['layer'] == layer]
    layer_scores = layer_rows['tcav_score'].dropna()
    layer_acc = layer_rows['cav_accuracy'].dropna().mean()

    if len(layer_scores) > 1:
        t_stat, p_val = stats.ttest_1samp(
            a=layer_scores,
            popmean=0.5,
            alternative='two-sided'
        )
        mean_score = layer_scores.mean()
    else:
        # Fewer than two scores: the t-test is skipped and the layer is
        # recorded with NaN statistics so it still shows up in the table.
        t_stat, p_val, mean_score = np.nan, np.nan, np.nan

    results.append({
        'layer': layer,
        'mean_tcav_score': mean_score,
        'mean_cav_accuracy': layer_acc,
        'p_value': p_val,
        't_statistic': t_stat,
        'n_runs': len(layer_scores)
    })

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Defaults for layers that could not be tested. Initialising the columns here
# keeps 'significant' strictly boolean; a NaN left in that column would be
# truthy in a plain `if row['significant']:` check.
results_df['p_corrected'] = np.nan
results_df['significant'] = False

# Bonferroni correction for multiple comparisons. The number of tests is the
# number of layers that produced a valid p-value, not a fixed layer count.
p_values = results_df['p_value'].dropna()
if not p_values.empty:
    reject, p_corrected, _, _ = multipletests(
        p_values,
        alpha=0.05,  # Family-wise significance threshold
        method='bonferroni'
    )

    # Write the corrected values back onto the rows they were computed from.
    results_df.loc[p_values.index, 'p_corrected'] = p_corrected
    results_df.loc[p_values.index, 'significant'] = reject

# Print the final results table
print("Statistical Test Results (Two-sided one-sample t-test vs 0.5, Bonferroni corrected):")
print(results_df.to_string(index=False, float_format="%.6f"))

In [None]:
# --- Bar plot of the mean TCAV score per layer, with significance stars ---

print("\nGenerating TCAV Score bar plot with significance...")

fig, ax = plt.subplots(figsize=(16, 8))

# One bar per layer, taken from results_df (per-layer means and significance).
sns.barplot(
    data=results_df,
    x='layer',
    y='mean_tcav_score',
    order=layer_order,  # Use the same layer order as the other plots
    palette="Blues",    # Use the same color palette as the TCAV raincloud plot
    ax=ax
)

# Dashed reference line at 0.5, labelled as the random baseline
ax.axhline(0.5, ls='--', color='red', lw=1.5, label='Random Baseline (0.5)')

# --- Add Significance Stars ---
y_offset = 0.01  # Small vertical gap between the top of the bar and the star

for i, layer in enumerate(layer_order):
    # Get the corresponding row from the results DataFrame
    result_row = results_df[results_df['layer'] == layer].iloc[0]

    is_significant = result_row['significant']
    bar_height = result_row['mean_tcav_score']

    # Skip untested layers explicitly: a NaN in 'significant' would be truthy
    # in a bare `if`, and a NaN bar height gives the star nowhere to sit.
    if pd.isna(is_significant) or not bool(is_significant) or pd.isna(bar_height):
        continue

    ax.text(
        x=i,                         # Position of this layer within layer_order
        y=bar_height + y_offset,     # Just above the bar
        s='*',
        ha='center',
        va='bottom',                 # Star sits above the y-coordinate
        fontsize=20,
        color='black'
    )

# --- Final Plot Customization ---
# The stars come from a two-sided test, so they mark layers whose mean differs
# from 0.5 in either direction; the title states that rather than "> 0.5".
ax.set_title("Mean TCAV Scores Across Layers (* = Significantly Different from 0.5)", fontsize=18, pad=20)
ax.set_xlabel("Transformer Block (Layer)", fontsize=14)
ax.set_ylabel("Mean TCAV Score", fontsize=14)
ax.set_ylim(0, 1.05)  # Same y-range as the other plots
ax.legend()

# Save through the figure handle so the file is always written from this figure.
fig.savefig("tcav_score_bar_plot_with_stars.png", bbox_inches='tight', dpi=300)
print("Bar plot with significance saved as 'tcav_score_bar_plot_with_stars.png'")

# Display the plot
plt.show()