In [1]:
import pandas as pd
import numpy as np
from tabulate import tabulate

def parse_and_print_results(csv_file_path):
    """
    Parse the test log CSV and print a comprehensive performance table.
    
    Args:
        csv_file_path (str): Path to the CSV file containing test results
    """
    # Read the CSV file
    df = pd.read_csv(csv_file_path)
    
    print("=" * 80)
    print("BAYESIAN NETWORK MODEL PERFORMANCE ANALYSIS")
    print("=" * 80)
    print(f"Total tests: {len(df)}")
    print(f"Models tested: {', '.join(df['model'].unique())}")
    print(f"Test types: {', '.join(df['test_type'].unique())}")
    print(f"Question categories: {', '.join(df['question_set_name'].unique())}")
    print(f"Network size: {df['network_size'].iloc[0]} nodes")
    print("=" * 80)
    
    # Create summary statistics
    summary_data = []
    
    # Group by model and question type
    for model in df['model'].unique():
        model_data = df[df['model'] == model]
        
        # Raw model performance
        raw_scores = model_data['raw_model_score'].values
        raw_avg = np.mean(raw_scores)
        raw_std = np.std(raw_scores)
        
        # BayMin framework performance  
        baymin_scores = model_data['baymin_score'].values
        baymin_avg = np.mean(baymin_scores)
        baymin_std = np.std(baymin_scores)
        
        # Performance improvement
        improvement = baymin_avg - raw_avg
        
        summary_data.append({
            'Model': model,
            'Framework': 'Raw Model',
            'Avg Score': f"{raw_avg:.3f}",
            'Std Dev': f"{raw_std:.3f}",
            'Total Tests': len(raw_scores),
            'Improvement': f"{improvement:+.3f}"
        })
        
        summary_data.append({
            'Model': model,
            'Framework': 'BayMin',
            'Avg Score': f"{baymin_avg:.3f}",
            'Std Dev': f"{baymin_std:.3f}",
            'Total Tests': len(baymin_scores),
            'Improvement': f"{improvement:+.3f}"
        })
    
    # Print overall summary table
    print("\nOVERALL PERFORMANCE SUMMARY")
    print("-" * 80)
    summary_df = pd.DataFrame(summary_data)
    print(tabulate(summary_df, headers='keys', tablefmt='grid', stralign='center'))
    
    # Detailed breakdown by question type
    print("\n\nDETAILED BREAKDOWN BY QUESTION TYPE")
    print("=" * 80)
    
    for question_type in df['question_set_name'].unique():
        print(f"\n{question_type.upper().replace('_', ' ')} QUESTIONS")
        print("-" * 50)
        
        type_data = df[df['question_set_name'] == question_type]
        type_summary = []
        
        for model in type_data['model'].unique():
            model_type_data = type_data[type_data['model'] == model]
            
            raw_avg = np.mean(model_type_data['raw_model_score'].values)
            baymin_avg = np.mean(model_type_data['baymin_score'].values)
            improvement = baymin_avg - raw_avg
            
            type_summary.append({
                'Model': model,
                'Raw Avg': f"{raw_avg:.3f}",
                'BayMin Avg': f"{baymin_avg:.3f}",
                'Improvement': f"{improvement:+.3f}",
                'Tests': len(model_type_data)
            })
        
        type_df = pd.DataFrame(type_summary)
        print(tabulate(type_df, headers='keys', tablefmt='grid', stralign='center'))
    
    # Performance by test type (elementary vs numerical)
    print("\n\nPERFORMANCE BY TEST TYPE")
    print("=" * 80)
    
    for test_type in df['test_type'].unique():
        print(f"\n{test_type.upper().replace('_', ' ')}")
        print("-" * 30)
        
        test_data = df[df['test_type'] == test_type]
        test_summary = []
        
        for model in test_data['model'].unique():
            model_test_data = test_data[test_data['model'] == model]
            
            raw_avg = np.mean(model_test_data['raw_model_score'].values)
            baymin_avg = np.mean(model_test_data['baymin_score'].values)
            improvement = baymin_avg - raw_avg
            
            test_summary.append({
                'Model': model,
                'Raw Avg': f"{raw_avg:.3f}",
                'BayMin Avg': f"{baymin_avg:.3f}",
                'Improvement': f"{improvement:+.3f}",
                'Tests': len(model_test_data)
            })
        
        test_df = pd.DataFrame(test_summary)
        print(tabulate(test_df, headers='keys', tablefmt='grid', stralign='center'))
    
    # Runtime analysis
    print("\n\nRUNTIME ANALYSIS")
    print("=" * 80)
    
    runtime_summary = []
    for model in df['model'].unique():
        model_data = df[df['model'] == model]
        
        raw_runtime = model_data['raw_model_runtime'].values
        baymin_runtime = model_data['baymin_runtime'].values
        
        runtime_summary.append({
            'Model': model,
            'Raw Avg Runtime (s)': f"{np.mean(raw_runtime):.2f}",
            'BayMin Avg Runtime (s)': f"{np.mean(baymin_runtime):.2f}",
            'Runtime Overhead': f"{np.mean(baymin_runtime) - np.mean(raw_runtime):+.2f}s"
        })
    
    runtime_df = pd.DataFrame(runtime_summary)
    print(tabulate(runtime_df, headers='keys', tablefmt='grid', stralign='center'))
    
    print("\n" + "=" * 80)
    print("ANALYSIS COMPLETE")
    print("=" * 80)

# Test the function
csv_path = "benchmarking/results/latest/test_log.csv"
parse_and_print_results(csv_path)


BAYESIAN NETWORK MODEL PERFORMANCE ANALYSIS
Total tests: 540
Models tested: gpt-oss:latest, llama3.1:70b, qwen3:8b
Test types: elementary_test, numerical_test
Question categories: dependency, common_cause, common_effect, blocked_evidence, probability, evidence_change_relationship
Network size: 5 nodes

OVERALL PERFORMANCE SUMMARY
--------------------------------------------------------------------------------
+----+----------------+-------------+-------------+-----------+---------------+---------------+
|    |     Model      |  Framework  |   Avg Score |   Std Dev |   Total Tests |   Improvement |
|  0 | gpt-oss:latest |  Raw Model  |       0.594 |     0.491 |           180 |         0.333 |
+----+----------------+-------------+-------------+-----------+---------------+---------------+
|  1 | gpt-oss:latest |   BayMin    |       0.928 |     0.259 |           180 |         0.333 |
+----+----------------+-------------+-------------+-----------+---------------+---------------+
|  2 |  lla

In [None]:
import pandas as pd
import numpy as np

MODEL_ORDER = ['gpt-oss:latest', 'llama3.1:70b', 'qwen3:8b']
MODEL_LABEL = {
	'gpt-oss:latest': 'gpt-oss:20b',
	'llama3.1:70b': 'llama3.1:70b',
	'qwen3:8b': 'qwen3:8b',
}


def _pct(x):
	if len(x) == 0 or np.isnan(x).all():
		return None
	return round(float(np.nanmean(x) * 100.0), 1)


def print_compact_performance_table(csv_file_path: str) -> pd.DataFrame:
	"""
	Print a compact table per requirement:
	QuestionType NetworkSize Raw[gpt-oss:20b llama3.1:70b qwen3:8b] BayMin[gpt-oss:20b llama3.1:70b qwen3:8b]
	Example row: "Dependency 5 60 30 50 90 70 80"
	Returns the DataFrame used to print, for reuse/saving if needed.
	"""
	# Load
	df = pd.read_csv(csv_file_path)

	# Aggregate means per (question_set_name, network_size, model)
	agg = (
		df.groupby(['question_set_name', 'network_size', 'model'])
		.agg(raw_mean=('raw_model_score', 'mean'), baymin_mean=('baymin_score', 'mean'))
		.reset_index()
	)

	# Ensure all models appear even if missing for a question type
	qns = sorted(df['question_set_name'].unique().tolist())
	nsizes = sorted(df['network_size'].unique().tolist())
	rows = []
	for q in qns:
		for ns in nsizes:
			row = {
				'QuestionType': q.replace('_', ' ').title(),
				'NetworkSize': ns,
			}
			# Raw block
			for m in MODEL_ORDER:
				val = agg[(agg['question_set_name'] == q) & (agg['network_size'] == ns) & (agg['model'] == m)]['raw_mean']
				score = _pct(val.values) if len(val) else None
				row[f'Raw {MODEL_LABEL[m]}'] = score
			# BayMin block
			for m in MODEL_ORDER:
				val = agg[(agg['question_set_name'] == q) & (agg['network_size'] == ns) & (agg['model'] == m)]['baymin_mean']
				score = _pct(val.values) if len(val) else None
				row[f'BayMin {MODEL_LABEL[m]}'] = score
			rows.append(row)

	wide_df = pd.DataFrame(rows)

	# Desired column ordering
	cols = (
		['QuestionType', 'NetworkSize'] +
		[*(f'Raw {MODEL_LABEL[m]}' for m in MODEL_ORDER)] +
		[*(f'BayMin {MODEL_LABEL[m]}' for m in MODEL_ORDER)]
	)
	wide_df = wide_df[cols]

	# Print header with groups
	raw_labels = [MODEL_LABEL[m] for m in MODEL_ORDER]
	bay_labels = [MODEL_LABEL[m] for m in MODEL_ORDER]
	header = (
		"QuestionType NetworkSize "
		"[Raw] " + " ".join(raw_labels) + " "
		"[BayMin] " + " ".join(bay_labels)
	)
	print(header)
	for _, r in wide_df.iterrows():
		vals = [
			str(r['QuestionType']),
			str(r['NetworkSize']),
			*[("" if pd.isna(r[c]) else str(int(r[c]))) for c in [f'Raw {lab}' for lab in raw_labels]],
			*[("" if pd.isna(r[c]) else str(int(r[c]))) for c in [f'BayMin {lab}' for lab in bay_labels]],
		]
		print(" ".join(vals))

	return wide_df

# Example usage (uncomment to run in the notebook):
_ = print_compact_performance_table('benchmarking/results/latest/test_log.csv')



QuestionType NetworkSize [Raw] gpt-oss:20b llama3.1:70b qwen3:8b [BayMin] gpt-oss:20b llama3.1:70b qwen3:8b
Blocked Evidence 5 23 27 27 90 83 43
Common Cause 5 80 47 67 97 93 97
Common Effect 5 67 40 67 93 70 100
Dependency 5 73 47 77 100 63 100
Evidence Change Relationship 5 73 53 63 80 90 97
Probability 5 40 3 3 97 3 50


In [7]:
import pandas as pd
import numpy as np
from IPython.display import display


def build_performance_table(csv_file_path: str, baymin_csv_file_path: str = None) -> pd.DataFrame:
	"""
	Return a DataFrame indexed by (QuestionType, NetworkSize) with grouped column
	headers: top-level framework (Raw, BayMin) and second-level model labels
	(gpt-oss:20b, llama3.1:70b, qwen3:8b). Values are mean percentage scores.
	Includes all network sizes present for each question type.
	Concatenates main CSV with baymin CSV if provided.
	"""
	from __main__ import MODEL_ORDER, MODEL_LABEL, _pct  # use definitions from earlier cell

	# Load main CSV
	df = pd.read_csv(csv_file_path)
	
	# Load and concatenate baymin CSV if provided
	if baymin_csv_file_path:
		df_baymin = pd.read_csv(baymin_csv_file_path)
		df = pd.concat([df, df_baymin], ignore_index=True)

	# Aggregate means once
	agg = (
		df.groupby(['question_set_name', 'network_size', 'model'])
		.agg(raw_mean=('raw_model_score', 'mean'), baymin_mean=('baymin_score', 'mean'))
		.reset_index()
	)

	# Build MultiIndex columns: (Framework, ModelLabel)
	top = []
	bottom = []
	for fw in ['Raw', 'BayMin']:
		for m in MODEL_ORDER:
			top.append(fw)
			bottom.append(MODEL_LABEL[m])
	cols = pd.MultiIndex.from_arrays([top, bottom])

	rows = []
	row_index = []
	for q in sorted(agg['question_set_name'].unique().tolist()):
		sub = agg[agg['question_set_name'] == q]
		# Include all network sizes that exist in the data for this question type
		available_sizes = sorted(sub['network_size'].unique().tolist())
		for ns in available_sizes:
			row_vals = []
			# Raw values (in model order)
			for m in MODEL_ORDER:
				val = sub[(sub['network_size'] == ns) & (sub['model'] == m)]['raw_mean']
				row_vals.append(_pct(val.values) if len(val) else None)
			# BayMin values (in model order)
			for m in MODEL_ORDER:
				val = sub[(sub['network_size'] == ns) & (sub['model'] == m)]['baymin_mean']
				row_vals.append(_pct(val.values) if len(val) else None)
			rows.append(row_vals)
			row_index.append((q.replace('_', ' ').title(), ns))

	idx = pd.MultiIndex.from_tuples(row_index, names=['QuestionType', 'NetworkSize'])
	table = pd.DataFrame(rows, index=idx, columns=cols)

	# Replace NaN values and 0 values with "-" for better readability
	table = table.fillna("-")
	table = table.replace(0, "-")

	return table


def show_performance_table(csv_file_path: str, baymin_csv_file_path: str = None) -> pd.DataFrame:
	"""Build and display the grouped performance table in the notebook."""
	table = build_performance_table(csv_file_path, baymin_csv_file_path)
	display(table)
	return table

# Example usage (uncomment to run):
_ = show_performance_table('benchmarking/results/latest/test_log.csv', 'baymin_test_log.csv')



Unnamed: 0_level_0,Unnamed: 1_level_0,Raw,Raw,Raw,BayMin,BayMin,BayMin
Unnamed: 0_level_1,Unnamed: 1_level_1,gpt-oss:20b,llama3.1:70b,qwen3:8b,gpt-oss:20b,llama3.1:70b,qwen3:8b
QuestionType,NetworkSize,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Blocked Evidence,5,12,27.0,13,83,83.0,48
Blocked Evidence,10,-,-,-,87,-,50
Blocked Evidence,30,-,-,-,87,-,57
Blocked Evidence,60,-,-,-,93,-,80
Common Cause,5,40,47.0,33,93,93.0,95
Common Cause,10,-,-,-,100,-,100
Common Cause,30,-,-,-,100,-,93
Common Cause,60,-,-,-,100,-,87
Common Effect,5,33,40.0,33,93,70.0,97
Common Effect,10,-,-,-,90,-,90
