In [1]:
import pandas as pd
import numpy as np
from IPython.display import display

# Model identifiers matched against the `model` column, in left-to-right
# display order within each framework group.
MODEL_ORDER = [
    'gpt-oss:latest',
    'llama3.1:70b',
    'qwen3:8b',
]

# Column header shown for each model identifier.
MODEL_LABEL = {
    'gpt-oss:latest': 'gpt-oss:20b',
    'llama3.1:70b': 'llama3.1:70b',
    'qwen3:8b': 'qwen3:8b',
}

def _pct(x):
    """Return the NaN-ignoring mean of *x* as a percentage, to one decimal.

    *x* may be a scalar or any array-like of fractions. The result is None
    when *x* is None, empty, or contains nothing but NaN.
    """
    if x is None:
        return None

    values = np.asarray(x, dtype=float)
    has_data = values.size > 0 and not np.isnan(values).all()
    if not has_data:
        return None

    percentage = np.nanmean(values) * 100.0
    return round(float(percentage), 1)

def _aggregate_scores(df, score_col, prefix, keys):
    """Summarise `score_col` per `keys` group into `<prefix>_*` columns.

    Produces `<prefix>_n`, `<prefix>_mean`, `<prefix>_std` (population std),
    `<prefix>_se` and `<prefix>_ci_hw`, with the group keys as plain columns.
    """
    stats = (
        df.groupby(keys)[score_col]
        # 'count' rather than 'size': NaN scores are skipped by 'mean', so
        # they must not be counted as observations or the CI becomes too narrow.
        .agg(n='count', mean='mean', std=lambda s: s.std(ddof=0))
        .rename(columns={
            'n': f'{prefix}_n',
            'mean': f'{prefix}_mean',
            'std': f'{prefix}_std',
        })
    )
    n = stats[f'{prefix}_n']
    mean = stats[f'{prefix}_mean']
    # Standard error; kept as an alternative to the CI half-width below.
    stats[f'{prefix}_se'] = stats[f'{prefix}_std'] / np.sqrt(n)
    # 95% normal-approximation half-width for a proportion
    # (assumes scores are 0/1 so the mean lies in [0, 1] -- TODO confirm).
    stats[f'{prefix}_ci_hw'] = 1.96 * np.sqrt(mean * (1 - mean) / n)
    return stats.reset_index()


def _format_cell(match, prefix):
    """Format the `<prefix>` mean and CI half-width of `match` as one cell.

    Returns "mean ± hw" (both as percentages), just "mean" when the
    half-width is unavailable, or None when there is no mean to show.
    """
    if not len(match):
        return None
    mean = _pct(match[f'{prefix}_mean'].values)
    if mean is None:
        return None
    hw = _pct(match[f'{prefix}_ci_hw'].values)
    return f"{mean} ± {hw}" if hw is not None else f"{mean}"


def build_performance_table(csv_file_path: str, baymin_csv_file_path: str = None) -> pd.DataFrame:
    """
    Build the Raw-vs-BayMin accuracy table.

    The result is indexed by (QuestionType, NetworkSize) and has two column
    levels: framework (Raw, BayMin) and model label (from MODEL_LABEL, in
    MODEL_ORDER order). Each cell is a string "mean ± half-width", both in
    percent, or "-" when no data exists for that combination.
    All network sizes present for a question type in either source are listed.

    Args:
        csv_file_path: CSV providing `raw_model_score`; Raw cells are computed
            ONLY from this file.
        baymin_csv_file_path: CSV providing `baymin_score`; BayMin cells are
            computed ONLY from this file. Falls back to `csv_file_path` when
            None or empty.

    Both files must contain the `question_set_name`, `network_size` and
    `model` columns used for grouping.

    Returns:
        The formatted table as a DataFrame of strings.
    """
    keys = ['question_set_name', 'network_size', 'model']

    # --- 1) Load sources separately (prevents leakage) ---
    df_raw = pd.read_csv(csv_file_path)
    # Neither frame is mutated below, so the fallback can share df_raw.
    df_bay = pd.read_csv(baymin_csv_file_path) if baymin_csv_file_path else df_raw

    # --- 2) Aggregate separately, 3) outer-merge on the group keys ---
    agg = pd.merge(
        _aggregate_scores(df_raw, 'raw_model_score', 'raw', keys),
        _aggregate_scores(df_bay, 'baymin_score', 'baymin', keys),
        on=keys,
        how='outer',
    )

    # --- 4) Build MultiIndex columns: (Framework, ModelLabel) ---
    frameworks = [('Raw', 'raw'), ('BayMin', 'baymin')]
    cols = pd.MultiIndex.from_arrays([
        [fw for fw, _ in frameworks for _m in MODEL_ORDER],
        [MODEL_LABEL[m] for _ in frameworks for m in MODEL_ORDER],
    ])

    # --- 5) Assemble rows (all sizes seen per question type across both sources) ---
    rows, row_index = [], []
    for q in sorted(agg['question_set_name'].dropna().unique().tolist()):
        sub = agg[agg['question_set_name'] == q]
        for ns in sorted(sub['network_size'].dropna().unique().tolist()):
            at_size = sub[sub['network_size'] == ns]
            # Same order as `cols`: every model for Raw, then every model for BayMin.
            rows.append([
                _format_cell(at_size[at_size['model'] == m], prefix)
                for _, prefix in frameworks
                for m in MODEL_ORDER
            ])
            # str() so non-string question identifiers do not break the label.
            row_index.append((str(q).replace('_', ' ').title(), ns))

    idx = pd.MultiIndex.from_tuples(row_index, names=['QuestionType', 'NetworkSize'])
    table = pd.DataFrame(rows, index=idx, columns=cols)

    # Show missing combinations as "-" for readability (zeros are kept as-is).
    return table.fillna("-")


def show_performance_table(csv_file_path: str, baymin_csv_file_path: str = None) -> pd.DataFrame:
    """Render the grouped performance table in the notebook and hand it back.

    Both arguments are forwarded unchanged to `build_performance_table`.
    """
    performance = build_performance_table(csv_file_path, baymin_csv_file_path)
    display(performance)
    return performance

# Example usage: build the table from the raw and BayMin logs and display it.
# The returned DataFrame is bound to `_` so the cell does not echo it again.
_ = show_performance_table('test_log.csv', 'baymin_test_log.csv')



Unnamed: 0_level_0,Unnamed: 1_level_0,Raw,Raw,Raw,BayMin,BayMin,BayMin
Unnamed: 0_level_1,Unnamed: 1_level_1,gpt-oss:20b,llama3.1:70b,qwen3:8b,gpt-oss:20b,llama3.1:70b,qwen3:8b
QuestionType,NetworkSize,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Blocked Evidence,5,23.3 ± 15.1,26.7 ± 15.8,26.7 ± 15.8,76.7 ± 15.1,86.7 ± 12.2,53.3 ± 17.9
Blocked Evidence,10,26.7 ± 15.8,20.0 ± 14.3,16.7 ± 13.3,86.7 ± 12.2,86.7 ± 12.2,50.0 ± 17.9
Blocked Evidence,30,6.7 ± 8.9,3.3 ± 6.4,13.3 ± 12.2,86.7 ± 12.2,96.7 ± 6.4,56.7 ± 17.7
Blocked Evidence,60,26.7 ± 15.8,10.0 ± 10.7,20.0 ± 14.3,93.3 ± 8.9,93.3 ± 8.9,80.0 ± 14.3
Common Cause,5,80.0 ± 14.3,46.7 ± 17.9,66.7 ± 16.9,90.0 ± 10.7,86.7 ± 12.2,93.3 ± 8.9
Common Cause,10,63.3 ± 17.2,40.0 ± 17.5,46.7 ± 17.9,100.0 ± 0.0,93.3 ± 8.9,100.0 ± 0.0
Common Cause,30,30.0 ± 16.4,20.0 ± 14.3,36.7 ± 17.2,100.0 ± 0.0,100.0 ± 0.0,93.3 ± 8.9
Common Cause,60,26.7 ± 15.8,16.7 ± 13.3,33.3 ± 16.9,100.0 ± 0.0,100.0 ± 0.0,86.7 ± 12.2
Common Effect,5,66.7 ± 16.9,40.0 ± 17.5,66.7 ± 16.9,93.3 ± 8.9,70.0 ± 16.4,93.3 ± 8.9
Common Effect,10,63.3 ± 17.2,33.3 ± 16.9,36.7 ± 17.2,90.0 ± 10.7,80.0 ± 14.3,90.0 ± 10.7


In [None]:
def table_to_latex(
    table,
    caption="Accuracy Comparison across Models and Network Sizes.",
    label="tab:performance",
    filename=None,
    escape=True,
    frameworks_order=("Raw", "BayMin"),
    starred=True,                 # use \begin{table*} ... \end{table*}
    placement="[t]",              # top placement
    size_cmd="\\small",
    end_size_cmd="\\normalsize",
    add_vertical_bar=True,
    display_require_packages=True
):
    """
    Render a MultiIndex performance DataFrame as a booktabs LaTeX table.

    Cells whose value is exactly 100 ± 0 are printed bold green and cells
    whose value is exactly 0 ± 0 bold red; missing cells (NaN, "-", "---")
    are printed as "---".

    Args:
        table: DataFrame with a 2-level row index (QuestionType, NetworkSize)
            and 2-level columns (framework, model label).
        caption: Text placed in \\caption{...}.
        label: Text placed in \\label{...}.
        filename: If given, the LaTeX code is also written to this path (UTF-8).
        escape: When True, underscores in headers and cells are escaped.
        frameworks_order: Preferred left-to-right order of top-level column
            groups; groups not listed are appended in their existing order.
        starred: Use the two-column `table*` environment instead of `table`.
        placement: Float placement specifier appended to \\begin{table}.
        size_cmd: Font-size command emitted before the tabular (skipped if empty).
        end_size_cmd: Command emitted after the tabular (skipped if empty).
        add_vertical_bar: Insert a `|` in the column spec before the "BayMin" group.
        display_require_packages: Prepend a comment listing the required packages.

    Returns:
        The LaTeX source as a single string.

    Raises:
        ValueError: If `table.index` is not a 2-level MultiIndex.
    """

    import pandas as pd

    # Validate up front so no work is done on an unusable table.
    if not isinstance(table.index, pd.MultiIndex) or table.index.nlevels != 2:
        raise ValueError("table.index must be a MultiIndex with (QuestionType, NetworkSize).")

    # --- Helpers ---
    def tex_escape(s: str) -> str:
        if not escape or s is None:
            return s
        return str(s).replace("_", "\\_")

    def highlight_colour(cell: str):
        """Return the HTML colour for a 'mean ± hw' cell, or None for no highlight."""
        mean_txt, sep, hw_txt = cell.partition("±")
        if not sep:
            return None
        try:
            mean, hw = float(mean_txt), float(hw_txt)
        except ValueError:
            return None
        # Compare parsed numbers: substring checks would also match values
        # such as "100.0 ± 5.0" or "0.5 ± 10.0".
        if hw != 0.0:
            return None
        if mean == 100.0:
            return "10ac84"
        if mean == 0.0:
            return "FF5757"
        return None

    # Ensure frameworks exist in columns level 0
    col_level0 = table.columns.get_level_values(0)
    frameworks_present = [fw for fw in frameworks_order if fw in set(col_level0)]
    frameworks_present += [fw for fw in col_level0.unique() if fw not in frameworks_present]
    fw_cols = {fw: table.columns[col_level0 == fw] for fw in frameworks_present}

    # Column spec: two stub columns, then one centred column per model.
    col_spec_parts = ["l", "l"]
    for fw in frameworks_present:
        if add_vertical_bar and fw == "BayMin":
            col_spec_parts.append("|")
        col_spec_parts.extend(["c"] * len(fw_cols[fw]))
    col_spec = "".join(col_spec_parts)

    # --- Build LaTeX ---
    lines = []
    if display_require_packages:
        lines.extend([
            "% Required packages:",
            "% \\usepackage{booktabs}",
            "% \\usepackage{multirow}",
            "% \\usepackage{makecell}",
            "% \\usepackage{xcolor}",
            "",
        ])

    env = "table*" if starred else "table"
    lines.append(f"\\begin{{{env}}}{placement}")
    lines.append("  \\centering")
    lines.append(f"  \\caption{{{caption}}}")
    lines.append(f"  \\label{{{label}}}")
    if size_cmd:
        lines.append(f"  {size_cmd}")
    lines.append(f"  \\begin{{tabular}}{{{col_spec}}}")
    lines.append("  \\toprule")

    # Header row 1 (framework names) and the cmidrules beneath each group.
    first_hdr_cells = ["", ""]
    cmidrules = []
    start_col = 3  # columns 1-2 are the QuestionType / Net stubs
    for fw in frameworks_present:
        n = len(fw_cols[fw])
        if n == 0:
            continue
        if n == 1:
            first_hdr_cells.append(f"\\textbf{{{tex_escape(fw)}}}")
        else:
            first_hdr_cells.append(f"\\multicolumn{{{n}}}{{c}}{{\\textbf{{{tex_escape(fw)}}}}}")
        end_col = start_col + n - 1
        cmidrules.append(f"  \\cmidrule(lr){{{start_col}-{end_col}}}")
        start_col = end_col + 1
    lines.append("  " + " & ".join(first_hdr_cells) + " \\\\")
    lines.extend(cmidrules)

    # Header row 2 (model names)
    second_hdr = ["\\textbf{Question Type}", "\\textbf{Net}"]
    for fw in frameworks_present:
        for _, model_name in fw_cols[fw]:
            second_hdr.append(f"\\textbf{{{tex_escape(model_name)}}}")
    lines.append("  " + " & ".join(second_hdr) + " \\\\")
    lines.append("  \\midrule")

    # --- Body: one group of rows per question type ---
    group_blocks = []
    for qtype, subdf in table.groupby(level=0, sort=False):
        # Abbreviate the longest question-type name so the stub column stays narrow.
        qtype_tex = tex_escape(qtype)
        if "Evidence Change Relationship" in qtype_tex:
            qtype_tex = qtype_tex.replace("Evidence Change Relationship", "Evidence Chg. Rel.")

        n_rows = len(subdf)
        group_lines = []

        for row_no, ((_, net), row) in enumerate(subdf.iterrows()):
            net_tex = tex_escape(net)

            # Only the first row of a group carries the \multirow label.
            if row_no == 0:
                left_stub = f"\\multirow{{{n_rows}}}{{*}}{{{qtype_tex}}} & {net_tex}"
            else:
                left_stub = f" & {net_tex}"

            data_cells = []
            for fw in frameworks_present:
                for col in fw_cols[fw]:
                    val = row[col]
                    if pd.isna(val) or val in ("-", "---"):
                        data_cells.append("---")
                        continue

                    val_str = str(val).strip()
                    colour = highlight_colour(val_str)
                    if colour is None:
                        data_cells.append(tex_escape(val_str))
                    else:
                        data_cells.append(
                            "\\textbf{\\textcolor[HTML]{" + colour + "}{" + tex_escape(val_str) + "}}"
                        )

            group_lines.append("  " + " & ".join([left_stub] + data_cells) + " \\\\")
        group_blocks.append(group_lines)

    # \midrule only BETWEEN groups, so the header rule survives an empty body.
    for position, group_lines in enumerate(group_blocks):
        if position:
            lines.append("  \\midrule")
        lines.extend(group_lines)
    lines.append("  \\bottomrule")

    lines.append("  \\end{tabular}")
    if end_size_cmd:
        lines.append(f"  {end_size_cmd}")
    lines.append(f"\\end{{{env}}}")

    latex_code = "\n".join(lines)

    if filename:
        # Explicit UTF-8: cells contain "±", which must not depend on the locale.
        with open(filename, "w", encoding="utf-8") as f:
            f.write(latex_code)

    return latex_code


def generate_latex_table(csv_file_path: str, baymin_csv_file_path: str = None, 
                        filename: str = None, display_require_packages: bool = True):
    """
    Build the performance table from CSV logs and render it as LaTeX.

    Args:
        csv_file_path (str): Path to main CSV file
        baymin_csv_file_path (str, optional): Path to BayMin CSV file
        filename (str, optional): Output filename for LaTeX code; forwarded
            to table_to_latex
        display_require_packages (bool): Forwarded to table_to_latex; controls
            the leading "Required packages" comment block

    Returns:
        str: LaTeX table code
    """
    return table_to_latex(
        build_performance_table(csv_file_path, baymin_csv_file_path),
        filename=filename,
        display_require_packages=display_require_packages,
    )

# Render the LaTeX table from both logs, write it to performance_table.tex,
# and echo the generated source in the cell output.
latex_code = generate_latex_table(
    csv_file_path='test_log.csv',
    baymin_csv_file_path='baymin_test_log.csv',
    filename='performance_table.tex',
    display_require_packages=True,
)

print(latex_code)


% Required packages:
% \usepackage{booktabs}
% \usepackage{multirow}
% \usepackage{makecell}
% \usepackage{xcolor}

\begin{table*}[t]
\label{tab:performance}
  \centering
  \caption{Accuracy Comparison across Models and Network Sizes.}
  \small
  \begin{tabular}{llccc|ccc}
  \toprule
   &  & \multicolumn{3}{c}{\textbf{Raw}} & \multicolumn{3}{c}{\textbf{BayMin}} \\
  \cmidrule(lr){3-5}
  \cmidrule(lr){6-8}
  \textbf{Question Type} & \textbf{Net} & \textbf{gpt-oss:20b} & \textbf{llama3.1:70b} & \textbf{qwen3:8b} & \textbf{gpt-oss:20b} & \textbf{llama3.1:70b} & \textbf{qwen3:8b} \\
  \midrule
  \multirow{4}{*}{Blocked Evidence} & 5 & 23.3 ± 15.1 & 26.7 ± 15.8 & 26.7 ± 15.8 & 76.7 ± 15.1 & 86.7 ± 12.2 & 53.3 ± 17.9 \\
   & 10 & 26.7 ± 15.8 & 20.0 ± 14.3 & 16.7 ± 13.3 & 86.7 ± 12.2 & 86.7 ± 12.2 & 50.0 ± 17.9 \\
   & 30 & 6.7 ± 8.9 & 3.3 ± 6.4 & 13.3 ± 12.2 & 86.7 ± 12.2 & 96.7 ± 6.4 & 56.7 ± 17.7 \\
   & 60 & 26.7 ± 15.8 & 10.0 ± 10.7 & 20.0 ± 14.3 & 93.3 ± 8.9 & 93.3 ± 8.9 & 80.0 ± 14.3 \