In [1]:
import pandas as pd
import numpy as np
from tabulate import tabulate

def parse_and_print_results(csv_file_path):
    """
    Parse the test log CSV and print a comprehensive performance table.
    
    Args:
        csv_file_path (str): Path to the CSV file containing test results
    """
    # Read the CSV file
    df = pd.read_csv(csv_file_path)
    
    print("=" * 80)
    print("BAYESIAN NETWORK MODEL PERFORMANCE ANALYSIS")
    print("=" * 80)
    print(f"Total tests: {len(df)}")
    print(f"Models tested: {', '.join(df['model'].unique())}")
    print(f"Test types: {', '.join(df['test_type'].unique())}")
    print(f"Question categories: {', '.join(df['question_set_name'].unique())}")
    print(f"Network size: {df['network_size'].iloc[0]} nodes")
    print("=" * 80)
    
    # Create summary statistics
    summary_data = []
    
    # Group by model and question type
    for model in df['model'].unique():
        model_data = df[df['model'] == model]
        
        # Raw model performance
        raw_scores = model_data['raw_model_score'].values
        raw_avg = np.mean(raw_scores)
        raw_std = np.std(raw_scores)
        
        # BayMin framework performance  
        baymin_scores = model_data['baymin_score'].values
        baymin_avg = np.mean(baymin_scores)
        baymin_std = np.std(baymin_scores)
        
        # Performance improvement
        improvement = baymin_avg - raw_avg
        
        summary_data.append({
            'Model': model,
            'Framework': 'Raw Model',
            'Avg Score': f"{raw_avg:.3f}",
            'Std Dev': f"{raw_std:.3f}",
            'Total Tests': len(raw_scores),
            'Improvement': f"{improvement:+.3f}"
        })
        
        summary_data.append({
            'Model': model,
            'Framework': 'BayMin',
            'Avg Score': f"{baymin_avg:.3f}",
            'Std Dev': f"{baymin_std:.3f}",
            'Total Tests': len(baymin_scores),
            'Improvement': f"{improvement:+.3f}"
        })
    
    # Print overall summary table
    print("\nOVERALL PERFORMANCE SUMMARY")
    print("-" * 80)
    summary_df = pd.DataFrame(summary_data)
    print(tabulate(summary_df, headers='keys', tablefmt='grid', stralign='center'))
    
    # Detailed breakdown by question type
    print("\n\nDETAILED BREAKDOWN BY QUESTION TYPE")
    print("=" * 80)
    
    for question_type in df['question_set_name'].unique():
        print(f"\n{question_type.upper().replace('_', ' ')} QUESTIONS")
        print("-" * 50)
        
        type_data = df[df['question_set_name'] == question_type]
        type_summary = []
        
        for model in type_data['model'].unique():
            model_type_data = type_data[type_data['model'] == model]
            
            raw_avg = np.mean(model_type_data['raw_model_score'].values)
            baymin_avg = np.mean(model_type_data['baymin_score'].values)
            improvement = baymin_avg - raw_avg
            
            type_summary.append({
                'Model': model,
                'Raw Avg': f"{raw_avg:.3f}",
                'BayMin Avg': f"{baymin_avg:.3f}",
                'Improvement': f"{improvement:+.3f}",
                'Tests': len(model_type_data)
            })
        
        type_df = pd.DataFrame(type_summary)
        print(tabulate(type_df, headers='keys', tablefmt='grid', stralign='center'))
    
    # Performance by test type (elementary vs numerical)
    print("\n\nPERFORMANCE BY TEST TYPE")
    print("=" * 80)
    
    for test_type in df['test_type'].unique():
        print(f"\n{test_type.upper().replace('_', ' ')}")
        print("-" * 30)
        
        test_data = df[df['test_type'] == test_type]
        test_summary = []
        
        for model in test_data['model'].unique():
            model_test_data = test_data[test_data['model'] == model]
            
            raw_avg = np.mean(model_test_data['raw_model_score'].values)
            baymin_avg = np.mean(model_test_data['baymin_score'].values)
            improvement = baymin_avg - raw_avg
            
            test_summary.append({
                'Model': model,
                'Raw Avg': f"{raw_avg:.3f}",
                'BayMin Avg': f"{baymin_avg:.3f}",
                'Improvement': f"{improvement:+.3f}",
                'Tests': len(model_test_data)
            })
        
        test_df = pd.DataFrame(test_summary)
        print(tabulate(test_df, headers='keys', tablefmt='grid', stralign='center'))
    
    # Runtime analysis
    print("\n\nRUNTIME ANALYSIS")
    print("=" * 80)
    
    runtime_summary = []
    for model in df['model'].unique():
        model_data = df[df['model'] == model]
        
        raw_runtime = model_data['raw_model_runtime'].values
        baymin_runtime = model_data['baymin_runtime'].values
        
        runtime_summary.append({
            'Model': model,
            'Raw Avg Runtime (s)': f"{np.mean(raw_runtime):.2f}",
            'BayMin Avg Runtime (s)': f"{np.mean(baymin_runtime):.2f}",
            'Runtime Overhead': f"{np.mean(baymin_runtime) - np.mean(raw_runtime):+.2f}s"
        })
    
    runtime_df = pd.DataFrame(runtime_summary)
    print(tabulate(runtime_df, headers='keys', tablefmt='grid', stralign='center'))
    
    print("\n" + "=" * 80)
    print("ANALYSIS COMPLETE")
    print("=" * 80)

# Test the function
csv_path = "benchmarking/results/latest/test_log.csv"
parse_and_print_results(csv_path)


BAYESIAN NETWORK MODEL PERFORMANCE ANALYSIS
Total tests: 540
Models tested: gpt-oss:latest, llama3.1:70b, qwen3:8b
Test types: elementary_test, numerical_test
Question categories: dependency, common_cause, common_effect, blocked_evidence, probability, evidence_change_relationship
Network size: 5 nodes

OVERALL PERFORMANCE SUMMARY
--------------------------------------------------------------------------------
+----+----------------+-------------+-------------+-----------+---------------+---------------+
|    |     Model      |  Framework  |   Avg Score |   Std Dev |   Total Tests |   Improvement |
|  0 | gpt-oss:latest |  Raw Model  |       0.594 |     0.491 |           180 |         0.333 |
+----+----------------+-------------+-------------+-----------+---------------+---------------+
|  1 | gpt-oss:latest |   BayMin    |       0.928 |     0.259 |           180 |         0.333 |
+----+----------------+-------------+-------------+-----------+---------------+---------------+
|  2 |  lla

In [2]:
import pandas as pd
import numpy as np

MODEL_ORDER = ['gpt-oss:latest', 'llama3.1:70b', 'qwen3:8b']
MODEL_LABEL = {
	'gpt-oss:latest': 'gpt-oss:20b',
	'llama3.1:70b': 'llama3.1:70b',
	'qwen3:8b': 'qwen3:8b',
}


def _pct(x):
	if len(x) == 0 or np.isnan(x).all():
		return None
	return round(float(np.nanmean(x) * 100.0), 1)


def print_compact_performance_table(csv_file_path: str) -> pd.DataFrame:
	"""
	Print a compact table per requirement:
	QuestionType NetworkSize Raw[gpt-oss:20b llama3.1:70b qwen3:8b] BayMin[gpt-oss:20b llama3.1:70b qwen3:8b]
	Example row: "Dependency 5 60 30 50 90 70 80"
	Returns the DataFrame used to print, for reuse/saving if needed.
	"""
	# Load
	df = pd.read_csv(csv_file_path)

	# Aggregate means per (question_set_name, network_size, model)
	agg = (
		df.groupby(['question_set_name', 'network_size', 'model'])
		.agg(raw_mean=('raw_model_score', 'mean'), baymin_mean=('baymin_score', 'mean'))
		.reset_index()
	)

	# Ensure all models appear even if missing for a question type
	qns = sorted(df['question_set_name'].unique().tolist())
	nsizes = sorted(df['network_size'].unique().tolist())
	rows = []
	for q in qns:
		for ns in nsizes:
			row = {
				'QuestionType': q.replace('_', ' ').title(),
				'NetworkSize': ns,
			}
			# Raw block
			for m in MODEL_ORDER:
				val = agg[(agg['question_set_name'] == q) & (agg['network_size'] == ns) & (agg['model'] == m)]['raw_mean']
				score = _pct(val.values) if len(val) else None
				row[f'Raw {MODEL_LABEL[m]}'] = score
			# BayMin block
			for m in MODEL_ORDER:
				val = agg[(agg['question_set_name'] == q) & (agg['network_size'] == ns) & (agg['model'] == m)]['baymin_mean']
				score = _pct(val.values) if len(val) else None
				row[f'BayMin {MODEL_LABEL[m]}'] = score
			rows.append(row)

	wide_df = pd.DataFrame(rows)

	# Desired column ordering
	cols = (
		['QuestionType', 'NetworkSize'] +
		[*(f'Raw {MODEL_LABEL[m]}' for m in MODEL_ORDER)] +
		[*(f'BayMin {MODEL_LABEL[m]}' for m in MODEL_ORDER)]
	)
	wide_df = wide_df[cols]

	# Print header with groups
	raw_labels = [MODEL_LABEL[m] for m in MODEL_ORDER]
	bay_labels = [MODEL_LABEL[m] for m in MODEL_ORDER]
	header = (
		"QuestionType NetworkSize "
		"[Raw] " + " ".join(raw_labels) + " "
		"[BayMin] " + " ".join(bay_labels)
	)
	print(header)
	for _, r in wide_df.iterrows():
		vals = [
			str(r['QuestionType']),
			str(r['NetworkSize']),
			*[("" if pd.isna(r[c]) else str(int(r[c]))) for c in [f'Raw {lab}' for lab in raw_labels]],
			*[("" if pd.isna(r[c]) else str(int(r[c]))) for c in [f'BayMin {lab}' for lab in bay_labels]],
		]
		print(" ".join(vals))

	return wide_df

# Example usage (uncomment to run in the notebook):
_ = print_compact_performance_table('benchmarking/results/latest/test_log.csv')



QuestionType NetworkSize [Raw] gpt-oss:20b llama3.1:70b qwen3:8b [BayMin] gpt-oss:20b llama3.1:70b qwen3:8b
Blocked Evidence 5 23 26 26 90 83 43
Common Cause 5 80 46 66 96 93 96
Common Effect 5 66 40 66 93 70 100
Dependency 5 73 46 76 100 63 100
Evidence Change Relationship 5 73 53 63 80 90 96
Probability 5 40 3 3 96 3 50


In [10]:
import pandas as pd
import numpy as np
from IPython.display import display


def build_performance_table(csv_file_path: str, baymin_csv_file_path: str = None) -> pd.DataFrame:
    """
    Return a DataFrame indexed by (QuestionType, NetworkSize) with grouped column
    headers: top-level framework (Raw, BayMin) and second-level model labels
    (gpt-oss:20b, llama3.1:70b, qwen3:8b). Values are mean percentage scores.
    Includes all network sizes present for each question type.
    Raw means are computed ONLY from csv_file_path.
    BayMin means are computed ONLY from baymin_csv_file_path (or csv_file_path if None).
    """

    # --- 1) Load sources separately (prevents leakage) ---
    df_raw = pd.read_csv(csv_file_path)
    df_bay = pd.read_csv(baymin_csv_file_path) if baymin_csv_file_path else df_raw.copy()

    # --- 2) Aggregate separately ---
    agg_raw = (
        df_raw.groupby(['question_set_name', 'network_size', 'model'])['raw_model_score']
              .mean()
              .rename('raw_mean')
              .reset_index()
    )

    agg_bay = (
        df_bay.groupby(['question_set_name', 'network_size', 'model'])['baymin_score']
              .mean()
              .rename('baymin_mean')
              .reset_index()
    )

    # --- 3) Outer-merge the two aggregates on keys ---
    agg = pd.merge(
        agg_raw, agg_bay,
        on=['question_set_name', 'network_size', 'model'],
        how='outer'
    )

    # --- 4) Build MultiIndex columns: (Framework, ModelLabel) ---
    top, bottom = [], []
    for fw in ['Raw', 'BayMin']:
        for m in MODEL_ORDER:
            top.append(fw)
            bottom.append(MODEL_LABEL[m])
    cols = pd.MultiIndex.from_arrays([top, bottom])

    # --- 5) Assemble rows (include all sizes seen per question type across both sources) ---
    rows, row_index = [], []
    for q in sorted(agg['question_set_name'].dropna().unique().tolist()):
        sub = agg[agg['question_set_name'] == q]
        # union of sizes that appear in either raw or baymin for this question type
        available_sizes = sorted(sub['network_size'].dropna().unique().tolist())
        for ns in available_sizes:
            row_vals = []
            # Raw block (ordered by MODEL_ORDER)
            for m in MODEL_ORDER:
                val = sub[(sub['network_size'] == ns) & (sub['model'] == m)]['raw_mean']
                row_vals.append(_pct(val.values) if len(val) and pd.notna(val.values[0]) else None)
            # BayMin block (ordered by MODEL_ORDER)
            for m in MODEL_ORDER:
                val = sub[(sub['network_size'] == ns) & (sub['model'] == m)]['baymin_mean']
                row_vals.append(_pct(val.values) if len(val) and pd.notna(val.values[0]) else None)
            rows.append(row_vals)
            row_index.append((q.replace('_', ' ').title(), ns))

    idx = pd.MultiIndex.from_tuples(row_index, names=['QuestionType', 'NetworkSize'])
    table = pd.DataFrame(rows, index=idx, columns=cols)

		# Replace NaN values and 0 values with "-" for better readability
    table = table.fillna("-")
    excluded = table.columns[table.columns.get_level_values(0) == 'BayMin']
    cols_to_change = table.columns.difference(excluded)
    table.loc[:, cols_to_change] = table.loc[:, cols_to_change].replace(0, "-")

    return table


def show_performance_table(csv_file_path: str, baymin_csv_file_path: str = None) -> pd.DataFrame:
	"""Build and display the grouped performance table in the notebook."""
	table = build_performance_table(csv_file_path, baymin_csv_file_path)
	display(table)
	return table

# Example usage (uncomment to run):
_ = show_performance_table('test_log.csv', 'baymin_test_log.csv')



Unnamed: 0_level_0,Unnamed: 1_level_0,Raw,Raw,Raw,BayMin,BayMin,BayMin
Unnamed: 0_level_1,Unnamed: 1_level_1,gpt-oss:20b,llama3.1:70b,qwen3:8b,gpt-oss:20b,llama3.1:70b,qwen3:8b
QuestionType,NetworkSize,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Blocked Evidence,5,23.3,26.7,26.7,76.7,86.7,53.3
Blocked Evidence,10,-,-,-,86.7,86.7,50.0
Blocked Evidence,30,-,-,-,86.7,96.7,56.7
Blocked Evidence,60,-,-,-,93.3,93.3,80.0
Common Cause,5,80.0,46.7,66.7,90.0,86.7,93.3
Common Cause,10,-,-,-,100.0,93.3,100.0
Common Cause,30,-,-,-,100.0,100.0,93.3
Common Cause,60,-,-,-,100.0,100.0,86.7
Common Effect,5,66.7,40.0,66.7,93.3,70.0,93.3
Common Effect,10,-,-,-,90.0,80.0,90.0


In [None]:
def table_to_latex(
    table,
    caption="Accuracy Comparison across Models and Network Sizes.",
    label="tab:performance",
    filename=None,
    escape=True,
    frameworks_order=("Raw", "BayMin"),
    starred=True,                 # use \begin{table*} ... \end{table*}
    placement="[t]",              # top placement like the sample
    size_cmd="\\small",           # \small before tabular
    end_size_cmd="\\normalsize",  # \normalsize after tabular
    add_vertical_bar=True,         # add vertical bar between Raw | BayMin
    display_require_packages=True
):
    """
    Render a MultiIndex performance DataFrame to LaTeX in the requested style.

    Expected table.columns to be a MultiIndex of (Framework, ModelName),
    and table.index to be a MultiIndex of (QuestionType, NetworkSize).
    """

    import pandas as pd

    # --- Helpers ---
    def tex_escape(s: str) -> str:
        if not escape or s is None:
            return s
        # Minimal escaping for underscores; extend if needed
        return str(s).replace("_", "\\_")

    # Sanity: ensure frameworks exist in columns level 0
    col_level0 = table.columns.get_level_values(0)
    frameworks_present = [fw for fw in frameworks_order if fw in set(col_level0)]
    # Also include any unexpected frameworks at the end to avoid dropping columns
    frameworks_present += [fw for fw in col_level0.unique() if fw not in frameworks_present]

    # Count columns per framework
    fw_cols = {fw: table.columns[col_level0 == fw] for fw in frameworks_present}
    raw_count = len(fw_cols.get("Raw", []))
    bay_count = len(fw_cols.get("BayMin", []))
    left_count = sum(len(fw_cols[fw]) for fw in frameworks_present[:frameworks_present.index("BayMin")] if fw in fw_cols)
    # Column spec: two left columns (ll), then model columns; insert a '|' before BayMin group if requested
    col_spec_parts = ["l", "l"]
    for fw in frameworks_present:
        if add_vertical_bar and fw == "BayMin":
            col_spec_parts.append("|")
        col_spec_parts.extend(["c"] * len(fw_cols[fw]))
    col_spec = "".join(col_spec_parts)

    # Build LaTeX
    lines = []
    if display_require_packages:
        lines.append("% Required packages:")
        lines.append("% \\usepackage{booktabs}")   # top/mid/bottom rule
        lines.append("% \\usepackage{multirow}")   # multirow Question Type
        lines.append("% \\usepackage{makecell}")   # optional: \\thead")
        lines.append("")

    env = "table*" if starred else "table"
    lines.append(f"\\begin{{{env}}}{placement}")
    lines.append("  \\centering")
    lines.append(f"  \\caption{{{caption}}}")
    lines.append(f"  \\label{{{label}}}")
    if size_cmd:
        lines.append(f"  {size_cmd}")

    lines.append(f"  \\begin{{tabular}}{{{col_spec}}}")
    lines.append("  \\toprule")

    # Header Row 1: empty for the two stub columns, then bold framework blocks
    first_hdr_cells = ["", ""]
    for fw in frameworks_present:
        n = len(fw_cols[fw])
        if n > 0:
            if n == 1:
                first_hdr_cells.append(f"\\textbf{{{tex_escape(fw)}}}")
            else:
                first_hdr_cells.append(f"\\multicolumn{{{n}}}{{c}}{{\\textbf{{{tex_escape(fw)}}}}}")
    lines.append("  " + " & ".join(first_hdr_cells) + " \\\\")

    # cmidrules aligning with framework blocks (columns start at 1)
    # Two stub columns are #1 and #2. Framework blocks start at 3.
    start_col = 3
    for fw in frameworks_present:
        n = len(fw_cols[fw])
        if n > 0:
            end_col = start_col + n - 1
            lines.append(f"  \\cmidrule(lr){{{start_col}-{end_col}}}")
            start_col = end_col + 1

    # Header Row 2: stub headers then bold model names in order
    second_hdr = ["\\textbf{Question Type}", "\\textbf{Net}"]
    for fw in frameworks_present:
        for _, model_name in fw_cols[fw]:
            second_hdr.append(f"\\textbf{{{tex_escape(model_name)}}}")
    lines.append("  " + " & ".join(second_hdr) + " \\\\")
    lines.append("  \\midrule")

    # Body with \multirow blocks for Question Type
    # Group by Question Type (level 0 of index)
    if not isinstance(table.index, pd.MultiIndex) or table.index.nlevels != 2:
        raise ValueError("table.index must be a MultiIndex with (QuestionType, NetworkSize).")

    for qtype, subdf in table.groupby(level=0, sort=False):
        qtype_tex = tex_escape(qtype)
        n_rows = len(subdf)

        # Iterate network sizes in index order
        first_row = True
        for (qt, net), row in subdf.iterrows():
            net_tex = tex_escape(net)

            # Left stub columns
            if first_row:
                left_stub = f"\\multirow{{{n_rows}}}{{*}}{{{qtype_tex}}} & {net_tex}"
                first_row = False
            else:
                left_stub = f" & {net_tex}"

            # Data columns in frameworks order
            data_cells = []
            for fw in frameworks_present:
                for col in fw_cols[fw]:
                    val = row[col]
                    if pd.isna(val) or val == "-" or val == "---":
                        data_cells.append("---")
                    else:
                        try:
                            x = float(val)
                            # Match sample: show 1 decimal; keep integers like 100 without .0 if exactly integer
                            # if abs(x - round(x)) < 1e-9:
                            if abs(x) == 100:
                                data_cells.append(f"{int(round(x))}")
                            else:
                                data_cells.append(f"{x:.1f}")
                        except (TypeError, ValueError):
                            data_cells.append(tex_escape(str(val)))

            lines.append("  " + " & ".join([left_stub] + data_cells) + " \\\\")
        lines.append("  \\midrule")

    # Replace final midrule with bottomrule
    if lines[-1].strip() == "\\midrule" or lines[-1].strip() == "  \\midrule":
        lines[-1] = "  \\bottomrule"
    else:
        lines.append("  \\bottomrule")

    lines.append("  \\end{tabular}")
    if end_size_cmd:
        lines.append(f"  {end_size_cmd}")
    lines.append(f"\\end{{{env}}}")

    latex_code = "\n".join(lines)

    if filename:
        with open(filename, "w") as f:
            f.write(latex_code)

    return latex_code


def generate_latex_table(csv_file_path: str, baymin_csv_file_path: str = None, 
                        filename: str = None, display_require_packages: bool = True):
    """
    Generate LaTeX table from CSV data and optionally save to file.
    
    Args:
        csv_file_path (str): Path to main CSV file
        baymin_csv_file_path (str, optional): Path to BayMin CSV file
        filename (str, optional): Output filename for LaTeX code
    
    Returns:
        str: LaTeX table code
    """
    # Build the performance table
    table = build_performance_table(csv_file_path, baymin_csv_file_path)
    
    # Convert to LaTeX
    latex_code = table_to_latex(table, filename=filename, display_require_packages=display_require_packages)
    
    return latex_code

# Generate LaTeX table and save to file
latex_code = generate_latex_table('test_log.csv', 
                                 'baymin_test_log.csv', 
                                 'performance_table.tex',
                                 display_require_packages=False)

print(latex_code)
