# Experimental Results Analysis

This notebook analyzes the outcomes from the neuro-symbolic experimentation framework, evaluating syntax reliability, logic accuracy and overall pipeline performance using data generated from automated SHACL synthesis and validation runs.

In [None]:
# Standard library imports
import os

# Third-party imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix

## Setup and Configuration

This section sets the configuration parameters for plots and the output directory for saved figures. The libraries needed for data analysis and visualization are imported in the cell above.

In [None]:
# Configuration for plot aesthetics and output directory
plt.rcParams['figure.dpi'] = 120

# Directory for autosaving images
IMAGE_DIR = "Thesis/Images"

# Create the output directory up front: the plotting cells write into it with
# savefig, which fails if the folder is missing (e.g. on a fresh checkout).
# exist_ok=True keeps re-running this cell harmless.
os.makedirs(IMAGE_DIR, exist_ok=True)

## Data Loading and Preparation

Load the experimental results from the CSV file generated by the experimentation framework, and prepare the data for analysis.

In [None]:
# Master results table produced by the experimentation runs; the file name is
# kept in one named constant so the data source is easy to spot and change.
RESULTS_CSV = "Master_Results.csv"
df = pd.read_csv(RESULTS_CSV)

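Feature engineering: derive a run-level table (one row per `Run ID`) from the scenario-level results and group it by experiment configuration (document, model, prompting strategy).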

In [None]:
# Columns that describe the pipeline run itself; scenario-specific columns are
# deliberately left out so the result can be collapsed to one row per run.
pipeline_columns = [
    "Run ID",
    "Document Name",
    "Prompts",
    "Model Name",
    "Service Graph Hash",
    "SHACL Graph Hash",
    "SHACL Valid Syntax",
    "SHACL Error Type",
    "SHACL Error Message",
    "Successfully Executed",
]

# Run-level view: keep the first row seen for every Run ID
runs_df = df.loc[:, pipeline_columns].drop_duplicates(subset="Run ID")

# Overall number of runs, reused later for context in plot titles
total_n_runs = runs_df.shape[0]

# One group per experiment configuration (document, model, prompting strategy)
configuration_keys = ["Document Name", "Model Name", "Prompts"]
runs_grouped = runs_df.groupby(configuration_keys)

# Last expression of the cell: shows how many runs each configuration has
runs_grouped.size()

## Syntax Validity

Evaluate the reliability of SHACL code generation by measuring the percentage of runs that produce syntactically valid code and analyze common error types.

In [None]:
# Share of runs per configuration whose generated SHACL parsed successfully.
# Summing the "SHACL Valid Syntax" flags counts the valid runs in each group
# (assumes the column holds booleans / 0-1 values — TODO confirm against the CSV).
total_runs = runs_grouped.size()
valid_syntax_runs = runs_grouped["SHACL Valid Syntax"].sum()
syntax_success_rate = (valid_syntax_runs / total_runs) * 100

# Combine into a summary dataframe (group keys become ordinary columns)
syntax_stats = pd.DataFrame({
    "Total Runs": total_runs,
    "Valid Syntax Runs": valid_syntax_runs,
    "Syntax Success Rate (%)": syntax_success_rate,
}).reset_index()

# One facet per document, bars grouped by prompting strategy and coloured by model
g = sns.catplot(
    data=syntax_stats,
    kind="bar",
    x="Prompts",
    y="Syntax Success Rate (%)",
    hue="Model Name",
    col="Document Name",
    palette="viridis",
    height=5,
    aspect=1.5,
    order=["Default", "ZeroShot", "Reflexion"],  # Fixed x-axis order
)

# Polish the plot
g.despine(left=True)
g.set_axis_labels("Prompting Strategy", "Syntax Success Rate (%)")
g.set_titles("Document: {col_name}")
# Set the limit on every facet explicitly instead of relying on pyplot's
# "current axes"; the headroom above 100 leaves space for the bar labels.
g.set(ylim=(0, 110))

# Add percentage labels on bars
for ax in g.axes.flat:
    for container in ax.containers:
        ax.bar_label(container, fmt='%.1f%%')

# Make room for the figure-level title, then save and display
g.figure.subplots_adjust(top=0.85)
g.figure.suptitle("Syntax Reliability")
g.figure.savefig(f"{IMAGE_DIR}/Syntax Reliability.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Error rates are reported per (document, model) pair
doc_model_keys = ["Document Name", "Model Name"]

# Denominator: number of runs for each document/model pair
runs_totals = runs_df.groupby(doc_model_keys).size().reset_index(name='Total')

# Numerator: number of runs per error type within each pair
error_counts = (
    runs_df
    .groupby(doc_model_keys + ["SHACL Error Type"])
    .size()
    .reset_index(name='Count')
)

# Attach the totals, express each count as a percentage of all runs of the
# pair, and drop the 'VALID' rows so that only real errors remain
error_stats = error_counts.merge(runs_totals, on=doc_model_keys)
error_stats['Percentage'] = (error_stats['Count'] / error_stats['Total']) * 100
error_stats = error_stats.loc[error_stats['SHACL Error Type'] != 'VALID']

# One facet per document; bars per error type, coloured by model
g = sns.catplot(
    data=error_stats,
    kind="bar",
    x="SHACL Error Type",
    y="Percentage",
    hue="Model Name",
    col="Document Name",
    palette="viridis",
    height=5,
    aspect=1,
    sharey=True,  # Same y-scale in every facet so they can be compared directly
)

# Cosmetics
g.despine(left=True)
g.set_axis_labels("Error Category", "Error Rate (% of Total Runs)")
g.set_titles("{col_name}")

# Percentage label on top of every bar
for facet_ax in g.axes.flat:
    for bars in facet_ax.containers:
        facet_ax.bar_label(bars, fmt='%.1f%%', padding=3)

# Figure-level title (with room reserved for it)
g.figure.subplots_adjust(top=0.85)
g.figure.suptitle("Syntax Error Rates by Type")

# Save and display
g.figure.savefig(f"{IMAGE_DIR}/Syntax Error Analysis.png", dpi=300, bbox_inches='tight')
plt.show()

## Logic Validity

Assess the accuracy of the generated SHACL constraints by checking if they correctly identify expected violations across all test scenarios.

In [None]:
# Filter to only runs where the pipeline executed successfully.
# read_csv can deliver this column either as real booleans or as text,
# depending on what else the column contains, and a boolean True never equals
# the string 'True'. Comparing the string form works for both representations.
executed_mask = df["Successfully Executed"].astype(str) == "True"
valid_syntax_df = df[executed_mask].copy()

# Check if each scenario matched the expected violation count exactly
valid_syntax_df['Strict_Scenario_Match'] = (
    valid_syntax_df['Actual Violation Count'] == valid_syntax_df['Expected Violation Count']
)

# Aggregate by run: a run has "perfect logic" only if every scenario matched
run_logic_df = (
    valid_syntax_df
    .groupby(['Run ID', 'Document Name', 'Model Name', 'Prompts'])['Strict_Scenario_Match']
    .all()
    .reset_index(name='Perfect_Logic')
)

# Success rate per configuration: the mean of a boolean column is the share of True
logic_stats = (
    run_logic_df
    .groupby(['Document Name', 'Model Name', 'Prompts'])['Perfect_Logic']
    .mean()
    .reset_index()
)
logic_stats['Logic Success Rate (%)'] = logic_stats['Perfect_Logic'] * 100

# Create bar plot for logic success rates
g = sns.catplot(
    data=logic_stats,
    kind="bar",
    x="Prompts",
    y='Logic Success Rate (%)',
    hue="Model Name",
    col="Document Name",
    palette="magma",
    height=5,
    aspect=1.5,
    order=["Default", "ZeroShot", "Reflexion"],  # Fixed x-axis order
)

# Polish the plot
g.despine(left=True)
g.set_axis_labels("Prompting Strategy", "Logic Success Rate (%)")
g.set_titles("Document: {col_name}")
# Apply the limit to every facet explicitly; headroom above 100 is for bar labels
g.set(ylim=(0, 110))

# Add percentage labels
for ax in g.axes.flat:
    for container in ax.containers:
        ax.bar_label(container, fmt='%.1f%%')

# Make room for the figure-level title, then save and display
g.figure.subplots_adjust(top=0.85)
g.figure.suptitle("Logic Reliability: % of Flawless Runs (All Scenarios Correct)")
g.figure.savefig(f"{IMAGE_DIR}/Logic Reliability.png", dpi=300, bbox_inches='tight')
plt.show()

## Recommender System Performance Indicators

Compute standard machine learning metrics, such as the confusion matrix, to evaluate the pipeline's feasibility as a recommender system for citizen eligibility.

In [None]:
# Filter to only valid (successfully executed) runs.
# The column may arrive from read_csv as real booleans or as text; a boolean
# True never equals the string 'True', so compare the string form instead.
executed_mask = df["Successfully Executed"].astype(str) == "True"
valid_df = df[executed_mask].copy()

# Define ground truth and predictions for recommender accuracy
y_true = valid_df['Expected Violation Count'] == 0  # True if citizen is eligible (no expected violations)
y_pred = valid_df['Actual Violation Count'] == 0    # True if system recommends eligibility (no actual violations)

# Compute confusion matrix; rows = ground truth, columns = prediction
cm = confusion_matrix(y_true, y_pred, labels=[False, True])  # Labels: [Ineligible, Eligible]

# Cell captions, listed in the same row-major order as cm.flatten()
group_names = ['True Negative\n(Correct Rejection)', 'False Positive\n(Bad Recommendation)',
               'False Negative\n(Missed Opportunity)', 'True Positive\n(Correct Recommendation)']

# Absolute count and share of all evaluated scenarios for every cell.
# If nothing was evaluated the shares are undefined, so show "n/a" rather than
# dividing by zero.
total_cases = cm.sum()
group_counts = [f"{value:0.0f}" for value in cm.flatten()]
group_percentages = [
    f"{value / total_cases:.2%}" if total_cases else "n/a"
    for value in cm.flatten()
]

labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names, group_counts, group_percentages)]
labels = np.asarray(labels).reshape(2, 2)

# Plot the confusion matrix on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=labels,
    fmt='',  # labels are already formatted strings
    cmap='Blues',
    xticklabels=['Predicted Ineligible', 'Predicted Eligible'],
    yticklabels=['Actually Ineligible', 'Actually Eligible'],
    ax=ax,
)

ax.set_title("Overall Pipeline Feasibility: Recommender Accuracy")
ax.set_ylabel("Ground Truth (Citizen Status)")
ax.set_xlabel("System Output (Recommendation)")
fig.savefig(f"{IMAGE_DIR}/Recommender Confusion Matrix.png", dpi=300, bbox_inches='tight')
plt.show()

## Overall Reliability

Categorize and visualize the distribution of run outcomes to understand the overall performance and failure modes of the experimentation pipeline.

In [None]:
# Define a function to categorize the outcome of each run
def categorize_run_outcome(group):
    """Classify a single run into one outcome category.

    Parameters
    ----------
    group : pandas.DataFrame
        All scenario rows belonging to one Run ID. The run-level fields
        ('Successfully Executed', 'SHACL Error Type') are read from the first
        row only (presumably they are identical on every row of a run —
        verify against the data).

    Returns
    -------
    str
        "Python Kernel Crash"      - the execution flag is neither True nor False
        "<PREFIX> Syntax Error"    - error type is not "VALID"; PREFIX is the
                                     part of the error type before the first '_'
        "Unknown Syntax Error"     - error type is missing / not a string
        "Perfect Run"              - every scenario's actual violation count
                                     equals the expected one
        "Logic Failure"            - at least one scenario's counts differ
    """
    # Check if pipeline crashed: compare the string form so that both real
    # booleans and "True"/"False" text are recognised as a recorded result.
    execution_flag = str(group['Successfully Executed'].iloc[0])
    if execution_flag not in ("True", "False"):
        return "Python Kernel Crash"

    # Check if SHACL compiled
    syntax_error_type = group['SHACL Error Type'].iloc[0]
    if syntax_error_type != "VALID":
        # A blank CSV cell is read as NaN (a float), which has no .split();
        # report it as an unknown syntax error instead of raising.
        if not isinstance(syntax_error_type, str):
            return "Unknown Syntax Error"
        return f"{syntax_error_type.split('_')[0]} Syntax Error"

    # Check if all logic tests passed
    counts_match = (group['Actual Violation Count'] == group['Expected Violation Count']).all()
    return "Perfect Run" if counts_match else "Logic Failure"

# Label every run: the scenario rows of each Run ID are handed to
# categorize_run_outcome, which returns one outcome string per run
run_outcomes = (
    df.groupby("Run ID")
    .apply(categorize_run_outcome, include_groups=False)
)

# Frequency table of the outcomes as a two-column frame
outcome_counts = (
    run_outcomes.value_counts()
    .rename_axis("Outcome")
    .reset_index(name="Count")
)

# Horizontal bar chart of the outcome distribution
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=outcome_counts,
    y="Outcome",
    x="Count",
    hue="Outcome",
    # NOTE: colours are handed out along the palette in the order the outcomes
    # appear in outcome_counts, not according to what the outcome means.
    palette="coolwarm_r",
    ax=ax,
)

# Title and axis labels
ax.set_title(f"Pipeline Outcome Distribution (N={total_n_runs})")
ax.set_xlabel("Number of Runs")
ax.set_ylabel("")

# Print the count next to every bar
for bars in ax.containers:
    ax.bar_label(bars, padding=3)

fig.savefig(f"{IMAGE_DIR}/Outcome Distribution.png", dpi=300, bbox_inches='tight')
plt.show()