In [None]:
import os
import json
import pandas as pd
import re
import Levenshtein
from pathlib import Path
import plotly.express as px
import warnings
from IPython.display import display
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_distances
from scipy.stats import ttest_ind
from sklearn.metrics import pairwise_distances
from sentence_transformers.util import cos_sim


In [None]:
# Root folder holding one sub-directory of JSONL result files per model.
# Defined as a Path (not a str) because the loading code calls DATA_DIR.iterdir().
DATA_DIR = Path("./data/with_features_with_embeddings")

## Data Loading

In [None]:
DATA_DIR = Path("./data/with_features_with_embeddings")

# Rows are bucketed by the marker found in the file name:
# "_1_" -> rows_1, "_3_" -> rows_3; files with neither marker are ignored.
rows_1, rows_3 = [], []

# sorted(): iterdir()/glob() yield entries in arbitrary order, and later cells
# use keep-first drop_duplicates(), so a stable file order keeps results reproducible.
for model_folder in sorted(DATA_DIR.iterdir()):
    if not model_folder.is_dir():
        continue

    for json_file in sorted(model_folder.glob("*.jsonl")):
        fname = json_file.name

        if "_1_" in fname:
            target_list = rows_1
        elif "_3_" in fname:
            target_list = rows_3
        else:
            continue

        # `context` flags files whose name contains "exp2_".
        context = "exp2_" in fname

        # prev_num is only derived for "_3_" files: 1 for "context1", 3 for "context3".
        prev_num = None
        if "_3_" in fname:
            if "context1" in fname:
                prev_num = 1
            elif "context3" in fname:
                prev_num = 3

        with json_file.open("r", encoding="utf-8") as f:
            for line in f:
                # Skip blank lines (e.g. a trailing newline) instead of reporting them as JSON errors.
                if not line.strip():
                    continue
                try:
                    row_data = json.loads(line)
                except json.JSONDecodeError as e:
                    print(f"JSON error in {json_file}: {e}")
                    continue

                # Metadata keys go first so they lead the column order; keys in the
                # JSON record with the same name overwrite them via update().
                ordered_row = {
                    "model_name": model_folder.name,
                    "context": context,
                }
                if prev_num is not None:
                    ordered_row["prev_num"] = prev_num
                ordered_row.update(row_data)
                target_list.append(ordered_row)

# Note: files with "_3_" in the name correspond to what is referred to as Experiment 2 (exp2) in the paper.
# For consistency with the paper, we assign them to exp2 here.

exp1 = pd.DataFrame(rows_1)
exp2 = pd.DataFrame(rows_3)


## Data Cleaning

In [None]:
# Pool both experiments so the statistics below cover every loaded row.
all_df = pd.concat([exp1, exp2], ignore_index=True)

# Distinct {student, question} pairs that have at least one row with is_processed == False.
unprocessed_pairs = (
    all_df.loc[all_df["is_processed"].eq(False), ["student_id", "question_name"]]
    .drop_duplicates()
)
print("unique {student, question} pair with is_processed = false:", unprocessed_pairs.shape[0])

# Every row (processed or not) that belongs to one of those pairs.
matching_rows = pd.merge(all_df, unprocessed_pairs, on=["student_id", "question_name"])
print("rows corresponding to these pairs:", matching_rows.shape[0])

print("portion of data losing:", matching_rows.shape[0] / all_df.shape[0])


In [None]:
def drop_unprocessed(df, pairs=None):
    """Return the rows of ``df`` whose {student_id, question_name} pair is not flagged.

    Args:
        df: Frame with ``student_id`` and ``question_name`` columns.
        pairs: Frame of pairs to exclude (same two columns). Defaults to the
            notebook-level ``unprocessed_pairs`` so existing calls keep working.

    Returns:
        A new frame without the excluded pairs. The merge resets the index.
    """
    keys = ["student_id", "question_name"]
    if pairs is None:
        pairs = unprocessed_pairs
    # De-duplicate so a repeated pair cannot multiply rows in the left merge.
    pairs = pairs[keys].drop_duplicates()

    merged = df.merge(pairs, on=keys, how="left", indicator=True)
    # "left_only" marks rows that found no partner among the excluded pairs.
    return merged[merged["_merge"] == "left_only"].drop(columns=["_merge"])

# Remove, from each experiment, every row whose {student_id, question_name}
# pair was flagged as unprocessed.
exp1_ = drop_unprocessed(exp1)
exp2_ = drop_unprocessed(exp2)

In [None]:
# This cleaning is specific to our dataset
# Dataset-specific cleaning: rows for the questions "Mint" and "accumulate" are excluded.
exp1_ = exp1_.loc[~exp1_["question_name"].isin(("Mint", "accumulate"))]
exp2_ = exp2_.loc[~exp2_["question_name"].isin(("Mint", "accumulate"))]

In [None]:
class DataFlattener:
    """Expand nested per-code-block columns into flat, prefixed columns.

    With ``is_flat=False`` the frame is expected to hold one ``gt_code_block`` /
    ``synthetic_code_block`` per row; with ``is_flat=True`` it holds three
    (``..._q0`` to ``..._q2``) per code type. Every method works on a copy.
    """

    _CODE_TYPES = ("gt", "synthetic")

    def __init__(self, is_flat=False):
        self.is_flat = is_flat

    def _blocks(self):
        """Return ``(block_name, output_prefix)`` pairs in processing order."""
        if self.is_flat:
            return [
                (f"{code_type}_code_block_q{q}", f"{code_type}_q{q}")
                for q in range(3)
                for code_type in self._CODE_TYPES
            ]
        return [(f"{code_type}_code_block", code_type) for code_type in self._CODE_TYPES]

    @staticmethod
    def _normalize_error_type(raw):
        """Map a raw autograder error string onto a coarse label (None for non-strings)."""
        if not isinstance(raw, str):
            return None
        # Checked in order; the first label with a matching substring wins.
        rules = (
            ("No Error", ("No Error",)),
            ("Logical Error", ("Logical Error",)),
            ("Runtime Error", ("Runtime Error", "NameError")),
            ("Compilation Error", ("Compilation Error",)),
        )
        for label, needles in rules:
            if any(needle in raw for needle in needles):
                return label
        return "Other"

    def flatten_features(self, df):
        """Replace each ``<block>_features`` dict column by ``<prefix>_<key>`` columns.

        Feature columns that are absent from ``df`` are skipped.
        """
        out = df.copy()
        for block, prefix in self._blocks():
            source = f"{block}_features"
            if source not in out.columns:
                continue
            # json_normalize returns a fresh RangeIndex; restore the frame's index
            # so the column-wise concat lines rows up.
            expanded = pd.json_normalize(out[source]).set_index(out.index)
            expanded.columns = [f"{prefix}_{sub}" for sub in expanded.columns]
            out = pd.concat([out.drop(columns=[source]), expanded], axis=1)
        return out

    def flatten_autograder(self, df):
        """Add ``<prefix>_error_type`` and ``<prefix>_test_pass_rate`` columns.

        Values are read from the ``<block>_autograder`` dict columns; cells that
        are not dicts yield None. The source columns are kept.
        """
        out = df.copy()
        normalize = self._normalize_error_type
        for block, prefix in self._blocks():
            graded = out[f"{block}_autograder"]
            out[f"{prefix}_error_type"] = graded.apply(
                lambda cell: normalize(cell.get("error_type")) if isinstance(cell, dict) else None
            )
            out[f"{prefix}_test_pass_rate"] = graded.apply(
                lambda cell: cell.get("test_pass_rate") if isinstance(cell, dict) else None
            )
        return out

    def flatten_embeddings(self, df):
        """Add one ``<block>_embedding`` column per block, read from ``embeddings``.

        Cells of ``embeddings`` that are not dicts, or lack the block key, yield None.
        """
        out = df.copy()
        for block, _prefix in self._blocks():
            out[f"{block}_embedding"] = out["embeddings"].apply(
                lambda cell, key=block: cell.get(key) if isinstance(cell, dict) else None
            )
        return out

# Both experiments use the non-flat layout (is_flat defaults to False), so a
# single flattener runs the same three steps on each frame.
flat = DataFlattener()

exp1_clean = (
    exp1_
    .pipe(flat.flatten_features)
    .pipe(flat.flatten_autograder)
    .pipe(flat.flatten_embeddings)
)

exp2_clean = (
    exp2_
    .pipe(flat.flatten_features)
    .pipe(flat.flatten_autograder)
    .pipe(flat.flatten_embeddings)
)

In [None]:
def drop_original_columns(df, is_flat=False):
    """Return ``df`` without the raw input/output/embedding and autograder columns.

    Args:
        df: Frame to trim; it is not modified.
        is_flat: When True the per-question ``*_code_block_q{0,1,2}_autograder``
            columns are targeted, otherwise the two un-suffixed autograder columns.

    Returns:
        A new frame; target columns that are absent are ignored.
    """
    if is_flat:
        autograder_cols = [
            f"{code_type}_code_block_q{q}_autograder"
            for q in range(3)
            for code_type in ("gt", "synthetic")
        ]
    else:
        autograder_cols = ["synthetic_code_block_autograder", "gt_code_block_autograder"]

    candidates = ["input", "output_synthetic", "output_gt", "embeddings", *autograder_cols]
    present = [name for name in candidates if name in df.columns]
    return df.drop(columns=present)

# Both frames are in the non-flat layout: drop the raw input/output/embedding
# columns and the two un-suffixed autograder columns where present.
exp1_clean = drop_original_columns(exp1_clean, is_flat=False)
exp2_clean = drop_original_columns(exp2_clean, is_flat=False)


In [None]:
def adjust_w292_violations(df, is_flat=False):
    """Subtract one from a PEP 8 violation count when W292 is among its messages.

    Args:
        df: Frame with ``<prefix>_pep8_violations.messages`` (list of dicts with a
            ``"msg"`` key) and ``<prefix>_pep8_violations.count`` columns.
        is_flat: When True the per-question prefixes are used, otherwise
            ``gt`` / ``synthetic``.

    Returns:
        A copy of ``df`` with adjusted counts. Prefixes whose two columns are
        not both present are skipped.
    """
    df = df.copy()

    def has_w292(messages):
        if not isinstance(messages, list):
            return False
        return any(
            isinstance(m, dict) and m.get("msg") == "W292 no newline at end of file"
            for m in messages
        )

    if is_flat:
        prefixes = []
        for q in range(3):
            for code_type in ["gt", "synthetic"]:
                # Naming produced by DataFlattener.flatten_features(is_flat=True).
                prefixes.append(f"{code_type}_q{q}")
                # Naming this function looked up before; kept so frames using it still work.
                prefixes.append(f"{code_type}_code_block_q{q}_features")
    else:
        prefixes = ["gt", "synthetic"]

    for prefix in prefixes:
        msg_col = f"{prefix}_pep8_violations.messages"
        count_col = f"{prefix}_pep8_violations.count"
        if msg_col in df.columns and count_col in df.columns:
            flagged = df[msg_col].apply(has_w292).astype(bool)
            # Keep the count where W292 is absent, otherwise use count - 1.
            df[count_col] = df[count_col].where(~flagged, df[count_col] - 1)
    return df

# Discount the "W292 no newline at end of file" message from the PEP 8
# violation counts of both frames (non-flat column layout).
exp1_clean = adjust_w292_violations(exp1_clean, is_flat=False)
exp2_clean = adjust_w292_violations(exp2_clean, is_flat=False)


In [None]:
# Column names of the cleaned Experiment 1 frame.
exp1_clean.keys()

In [None]:
# First row of the cleaned Experiment 1 frame, as a sanity check.
exp1_clean.head(1)

In [None]:
# Column names of the cleaned Experiment 2 frame.
exp2_clean.keys()

In [None]:
# First row of the cleaned Experiment 2 frame, as a sanity check.
exp2_clean.head(1)

## Results

### Summary

In [None]:
def summarize_metrics(df, exp_type="exp1", question_name=None, test_class=None, group_by_quantile=False,
                      allowed_models=None, selected_metrics=None):
    """Tabulate mean/std of selected metrics per model, plus a "GT" reference row.

    Args:
        df: Cleaned frame with ``gt_<metric>`` / ``synthetic_<metric>`` columns.
        exp_type: ``"exp1"`` or ``"exp2"``.
        question_name: If given, keep only rows of this question.
        test_class: If given, keep only rows of this test class.
        group_by_quantile: For ``exp1`` only: group synthetic rows by
            (model_name, quantile, context) and GT rows by quantile.
        allowed_models: Model names kept in the output. Defaults to the three
            main models plus ``"GT"``.
        selected_metrics: Metrics reported (each as ``_mean`` and ``_std``).
            Defaults to ``["test_pass_rate", "pep8_violations.count"]``.

    Returns:
        Frame with the id columns followed by the selected metric columns,
        values rounded to 3 decimals.

    Raises:
        ValueError: If ``exp_type`` is invalid or a selected metric is not summarised.
    """
    if exp_type not in ("exp1", "exp2"):
        raise ValueError("exp_type must be 'exp1' or 'exp2'")

    # "style_score" is not listed here: it is only added to the frames by later cells.
    metric_keys = [
        "loc", "char_count", "ast_depth", "ast_width",
        "ast_node_count", "pep8_violations.count",
        "test_pass_rate"
    ]
    if allowed_models is None:
        allowed_models = ["gpt_4_1", "qwen_2_5_coder_7b", "qwen_2_5_coder_7b_inst", "GT"]
    if selected_metrics is None:
        selected_metrics = ["test_pass_rate", "pep8_violations.count"]
    unknown = [m for m in selected_metrics if m not in metric_keys]
    if unknown:
        raise ValueError(f"selected_metrics not among the summarised metrics: {unknown}")

    gt_cols = [f"gt_{k}" for k in metric_keys]
    syn_cols = [f"synthetic_{k}" for k in metric_keys]

    if question_name:
        df = df[df["question_name"] == question_name]
    if test_class:
        df = df[df["test_class"] == test_class]

    if exp_type == "exp1" and group_by_quantile:
        group_cols = ["model_name", "quantile", "context"]
        gt_group_cols = ["quantile"]
    else:
        group_cols = ["model_name"]
        gt_group_cols = []

    # Flatten the (column, stat) MultiIndex into "<metric>_<stat>"; group keys
    # have an empty stat, hence the strip("_").
    syn_summary = df.groupby(group_cols)[syn_cols].agg(['mean', 'std']).reset_index()
    syn_summary.columns = ['_'.join(col).replace('synthetic_', '').strip('_') for col in syn_summary.columns]

    if gt_group_cols:
        gt_summary = df.groupby(gt_group_cols)[gt_cols].agg(['mean', 'std']).reset_index()
        gt_summary.columns = [
            col[0].replace("gt_", "") + ("_" + col[1] if col[1] else "")
            for col in gt_summary.columns
        ]
        gt_summary["model_name"] = "GT"
    else:
        gt_means = df[gt_cols].mean()
        gt_stds = df[gt_cols].std()
        gt_row = {f"{col.replace('gt_', '')}_mean": gt_means[col] for col in gt_cols}
        gt_row.update({f"{col.replace('gt_', '')}_std": gt_stds[col] for col in gt_cols})
        gt_row["model_name"] = "GT"
        gt_summary = pd.DataFrame([gt_row])

    # Give the GT table every column of the synthetic table (None where GT has
    # no such grouping key) and the same column order, so the two can be stacked.
    for col in syn_summary.columns:
        if col not in gt_summary.columns:
            gt_summary[col] = None
    gt_summary = gt_summary[syn_summary.columns]

    summary = pd.concat([syn_summary, gt_summary], ignore_index=True)
    summary = summary.round(3)
    summary = summary[summary["model_name"].isin(allowed_models)]

    selected_cols = []
    for metric in selected_metrics:
        selected_cols += [f"{metric}_mean", f"{metric}_std"]
    id_cols = [col for col in summary.columns if col in ["model_name", "quantile", "context"]]
    summary = summary[id_cols + selected_cols]

    return summary



In [None]:
# Experiment 1 summary for question "two_list" / test class "test1", split by quantile.
summarize_metrics(exp1_clean, exp_type="exp1", question_name="two_list", test_class="test1", group_by_quantile=True)

In [None]:
# Experiment 2 summary for question "two_list" / test class "test1", one row per model.
summarize_metrics(exp2_clean, exp_type="exp2", question_name="two_list", test_class="test1")

### Functionality 

In [None]:
# Plot colour (hex) per display name; "Student" is the label used for ground-truth data.
model_colors = {
    "Student": "#4d4d4d",
    "gpt-4.1": "#8E5491",
    "qwen-student": "#D17F27",
    "qwen-inst": "#4F9058",
    "qwen-2.5-coder-3b": "#BA35B5",
    "qwen-3-8b": "#ABDC96",
    "llama-3-8b": "#4D57B0",
}

In [None]:
warnings.filterwarnings("ignore", category=FutureWarning)


def prepare_error_df(df, model_name_col="model_name"):
    """Stack ground-truth and synthetic error types into one long frame.

    GT rows are taken once per (student_id, question_name, quantile), labelled
    with ``model_source == "GT"`` and emitted twice, once with ``context=True``
    and once with ``context=False``. Synthetic rows keep their own context and
    take ``model_source`` from ``model_name_col``. Both parts get an
    ``error_type`` column.
    """
    # NOTE(review): the de-duplication key ignores test_class; this presumes a
    # (student, question, quantile) triple has a single GT outcome — confirm.
    gt_once = df.drop_duplicates(subset=["student_id", "question_name", "quantile"]).copy()
    gt_once = gt_once.assign(model_source="GT", error_type=gt_once["gt_error_type"])

    gt_rows = pd.concat(
        [gt_once.assign(context=flag) for flag in (True, False)],
        ignore_index=True,
    )

    syn_rows = df.copy()
    syn_rows = syn_rows.assign(
        model_source=syn_rows[model_name_col],
        error_type=syn_rows["synthetic_error_type"],
    )

    return pd.concat([gt_rows, syn_rows], ignore_index=True)


def plot_error_distribution(df, title_prefix, question_name=None, include_models=None):
    """Show grouped bar charts of error-type proportions per model.

    For every test class in ``df`` and for each context value (False, then
    True), one figure is shown with a facet per quantile bin (start / middle /
    last). Bars carry an error bar of ``sqrt(p * (1 - p) / n)``, where ``n`` is
    the row count of the (context, quantile, model) cell.

    Args:
        df: Output of ``prepare_error_df`` (needs ``context``, ``quantile``,
            ``model_source``, ``error_type``, ``test_class``, ``question_name``).
        title_prefix: Text used at the start of the printed captions.
        question_name: If given, keep only rows of this question.
        include_models: Raw model names to keep; "GT" is always kept.
    """
    from itertools import product  # local import: itertools is not imported at the top of the notebook

    expected_errors = ["No Error", "Logical Error", "Runtime Error"]
    contexts = [True, False]
    quantiles = ["submission_q0", "submission_q1", "submission_q2"]
    group_cols = ["context", "quantile", "model_source", "error_type"]
    cell_cols = ["context", "quantile", "model_source"]
    display_names = {
        "qwen_2_5_coder_7b": "qwen-student",
        "qwen_2_5_coder_7b_inst": "qwen-inst",
        "gpt_4_1": "gpt-4.1",
        "qwen_2_5_coder_3b": "qwen-2.5-coder-3b",
        "qwen_3_8b": "qwen-3-8b",
        "llama_3_8b": "llama-3-8b",
        "GT": "Student",
    }
    bin_rename = {
        "submission_q0": "start",
        "submission_q1": "middle",
        "submission_q2": "last",
    }

    df = df[df["context"].notna()].copy()

    if question_name:
        df = df[df["question_name"] == question_name]

    if include_models is not None:
        df = df[df["model_source"].isin(list(include_models) + ["GT"])]

    # Error types outside expected_errors become NaN and are not counted.
    df["error_type"] = pd.Categorical(df["error_type"], categories=expected_errors)

    for test_class in sorted(df["test_class"].dropna().unique()):
        subset = df[df["test_class"] == test_class].copy()

        count_df = (
            subset.groupby(group_cols, observed=False)
            .size()
            .reset_index(name="count")
        )

        # Left-merge onto the full grid so combinations with no rows show up as 0.
        model_sources = subset["model_source"].unique()
        full_index = pd.DataFrame(
            product(contexts, quantiles, model_sources, expected_errors),
            columns=group_cols,
        )
        count_df = full_index.merge(count_df, how="left").fillna({"count": 0})
        count_df["count"] = count_df["count"].astype(int)

        # Rows in each (context, quantile, model) cell; an empty cell gives NaN proportions.
        cell_total = count_df.groupby(cell_cols)["count"].transform("sum")
        count_df["proportion"] = count_df["count"] / cell_total
        count_df["std"] = np.sqrt(
            count_df["proportion"] * (1 - count_df["proportion"]) / cell_total
        )

        count_df["model_source"] = count_df["model_source"].replace(display_names)
        count_df["quantile"] = count_df["quantile"].replace(bin_rename)
        count_df["quantile"] = pd.Categorical(
            count_df["quantile"], categories=["start", "middle", "last"], ordered=True
        )

        q_title = f"{title_prefix} — {test_class}" + (f" — {question_name}" if question_name else "")
        print(q_title)

        for context_value in [False, True]:
            sub_df = count_df[count_df["context"] == context_value].copy()
            context_str = "context=True" if context_value else "context=False"

            fig = px.bar(
                sub_df,
                x="error_type",
                template="plotly_white",
                y="proportion",
                color="model_source",
                error_y="std",
                color_discrete_map=model_colors,
                barmode="group",
                facet_col="quantile",
                category_orders={
                    "error_type": expected_errors,
                    "quantile": ["start", "middle", "last"]
                },
                labels={"proportion": "Proportion", "error_type": "Error Type", "quantile": "Bin"},
            )

            # Facet titles read "Bin=start" etc.; keep only the bin name.
            fig.for_each_annotation(
                lambda a: a.update(text=a.text.replace("quantile=", "").replace("Bin=", ""))
            )
            fig.for_each_annotation(lambda a: a.update(font=dict(size=22)))
            fig.update_traces(error_y=dict(thickness=1, width=5, color="black"))

            fig.update_layout(
                height=500,
                width=1000,
                font=dict(size=18, color="black"),
                legend_title_text=None,
                legend=dict(
                    font=dict(size=22),
                    orientation="h",
                    yanchor="bottom",
                    y=-0.7,
                    xanchor="center",
                    x=0.5,
                    bordercolor="grey",
                    borderwidth=1,
                )
            )
            fig.update_xaxes(
                tickfont=dict(size=16, color='black'),
                showgrid=False,
                gridcolor='lightgray',
                linecolor='black',
                linewidth=1.0
            )
            fig.update_yaxes(
                tickfont=dict(size=16, color='black'),
                showgrid=True,
                gridcolor='lightgray',
                linecolor='black',
                linewidth=1.0
            )

            print(f"{q_title} — {context_str}")
            # fig.write_image(f"./plots/err-dist/app/{question_name}_{test_class}_{context_str}_err_dist.pdf")
            fig.show()



# Build the combined GT + synthetic error table for Experiment 1 and plot the
# error-type distribution for question "two_list", restricted to three models.
exp1_errors = prepare_error_df(exp1_clean)
plot_error_distribution(
    exp1_errors,
    "exp1",
    question_name="two_list",
    include_models=["qwen_2_5_coder_7b", "qwen_2_5_coder_7b_inst", "gpt_4_1"]
    # Alternative model set:
    # include_models=["qwen_2_5_coder_3b", "qwen_3_8b", "llama_3_8b"]
)

In [None]:
def add_normalized_block_num(df, block_col="block_num"):
    """Add ``global_max_block`` and ``norm_block_num`` columns to a copy of ``df``.

    A stream is one (student_id, question_name, model_name, prev_num,
    test_class, context) combination. ``global_max_block`` is the rounded
    median, per (question_name, test_class), of each stream's largest
    ``block_col``. ``norm_block_num`` maps the rows of a stream, in
    ``block_col`` order, onto the integer steps 1..global_max_block.

    Args:
        df: Frame holding the stream key columns and ``block_col``.
        block_col: Name of the per-row step column.

    Returns:
        The merged, sorted copy. Rows with a missing stream key get NaN in
        ``norm_block_num``.
    """
    df = df.copy()
    stream_keys = ["student_id", "question_name", "model_name", "prev_num", "test_class", "context"]

    stream_max = df.groupby(stream_keys)[block_col].max().reset_index()

    median_map = (
        stream_max.groupby(["question_name", "test_class"])[block_col]
        .median()
        .round()
        .astype(int)
        .reset_index()
        .rename(columns={block_col: "global_max_block"})
    )

    df = df.merge(median_map, on=["question_name", "test_class"], how="left")

    df = df.sort_values(by=["student_id", "question_name", "model_name", "prev_num", "context", block_col])

    def normalize(group):
        T = int(group["global_max_block"].iloc[0])
        n = len(group)
        if n == 1:
            return pd.Series([1] * n, index=group.index)
        if n == T:
            return pd.Series(range(1, T + 1), index=group.index)
        if n < T:
            # Fewer rows than target steps: spread them evenly over 1..T.
            norm_idx = np.linspace(1, T, n).round().astype(int)
            return pd.Series(norm_idx, index=group.index)
        # More rows than target steps: cut the rows into T consecutive bins.
        bin_edges = np.linspace(0, n, T + 1).astype(int)
        norm_idx = np.zeros(n, dtype=int)
        for i in range(T):
            norm_idx[bin_edges[i]:bin_edges[i + 1]] = i + 1
        return pd.Series(norm_idx, index=group.index)

    # Grouping uses the same stream definition as stream_max (including
    # test_class, which global_max_block depends on). The per-group results are
    # concatenated by hand: groupby.apply returns a wide DataFrame rather than
    # a Series when there is only one group, which broke the column assignment.
    pieces = [normalize(group) for _, group in df.groupby(stream_keys)]
    if pieces:
        df["norm_block_num"] = pd.concat(pieces)
    else:
        df["norm_block_num"] = pd.Series(index=df.index, dtype="float64")

    return df

# Attach global_max_block and norm_block_num to the Experiment 2 frame.
exp2_clean = add_normalized_block_num(exp2_clean)

In [None]:
def prepare_passrate_df(df, x_col, is_exp2=False):
    """Stack student (GT) and synthetic test pass rates into one long frame.

    GT rows are taken once per (student_id, question_name, x_col), labelled
    ``model_name == "Student"`` and emitted for ``context=False`` and
    ``context=True``. Synthetic rows keep their model and context. Both parts
    get a ``test_pass_rate`` column.

    Args:
        df: Cleaned frame with ``gt_test_pass_rate`` / ``synthetic_test_pass_rate``.
        x_col: Column used as the progress axis. ``"quantile"`` (non-exp2) is
            renamed to start/middle/last and made an ordered categorical; any
            other column is cast to int.
        is_exp2: When True, ``x_col`` is cast to int and, if ``prev_num``
            exists, a string copy is stored in ``prev``.

    Returns:
        The combined frame with model names mapped to their display names.
    """
    df = df[df["context"].notna()].copy()

    if is_exp2 and "prev_num" in df.columns:
        df["prev"] = df["prev_num"].astype(str)

    gt_once = df.drop_duplicates(subset=["student_id", "question_name", x_col]).copy()
    gt_once["model_name"] = "Student"
    gt_once["test_pass_rate"] = gt_once["gt_test_pass_rate"]
    # The student curve is the same reference in both context settings.
    gt_df = pd.concat(
        [gt_once.assign(context=False), gt_once.assign(context=True)],
        ignore_index=True,
    )

    syn_df = df.copy()
    syn_df["test_pass_rate"] = syn_df["synthetic_test_pass_rate"]

    combined = pd.concat([gt_df, syn_df], ignore_index=True)

    if not is_exp2 and x_col == "quantile":
        quantile_map = {
            "submission_q0": "start",
            "submission_q1": "middle",
            "submission_q2": "last"
        }
        combined[x_col] = combined[x_col].map(quantile_map)
        combined[x_col] = pd.Categorical(
            combined[x_col],
            categories=["start", "middle", "last"],
            ordered=True
        )
    else:
        combined[x_col] = combined[x_col].astype(int)

    # Display names match the keys of the notebook's model_colors map; the last
    # three were missing here, which left those models without a mapped colour.
    combined["model_name"] = combined["model_name"].replace({
        "qwen_2_5_coder_7b": "qwen-student",
        "qwen_2_5_coder_7b_inst": "qwen-inst",
        "gpt_4_1": "gpt-4.1",
        "qwen_2_5_coder_3b": "qwen-2.5-coder-3b",
        "qwen_3_8b": "qwen-3-8b",
        "llama_3_8b": "llama-3-8b",
    })

    return combined


def plot_avg_passrate_progress_multi(df_list, x_col, is_exp2=False, question_name=None, include_models=None):
    """Show line charts of the mean test pass rate along ``x_col`` per model.

    For every test class and each context value (False, then True) one figure
    is shown, with 95% error bars (1.96 * std / sqrt(count)). For exp2 the
    figure is faceted by ``prev_num``.

    Args:
        df_list: Iterable of ``(frame, label)`` pairs; the label is not used.
        x_col: Progress column passed on to ``prepare_passrate_df``.
        is_exp2: Selects the exp2 handling (integer steps, ``prev_num`` facets).
        question_name: If given, keep only rows of this question.
        include_models: Display names to keep; "Student" is always kept.
    """
    combined_all = []
    for df, _label in df_list:
        df_prepped = prepare_passrate_df(df, x_col, is_exp2).copy()
        if is_exp2 and "prev_num" in df.columns:
            df_prepped["prev_num"] = df_prepped["prev"]
        else:
            df_prepped["prev_num"] = "All"
        combined_all.append(df_prepped)

    combined_df = pd.concat(combined_all, ignore_index=True)

    if question_name:
        combined_df = combined_df[combined_df["question_name"] == question_name]

    if include_models is not None:
        combined_df = combined_df[
            (combined_df["model_name"] == "Student") |
            (combined_df["model_name"].isin(include_models))
        ]

    # The x axis is numeric unless the quantile bins (an ordered categorical) are plotted.
    numeric_x = is_exp2 or x_col != "quantile"

    for test_class in sorted(combined_df["test_class"].dropna().unique()):
        print(f"== {question_name} — {test_class} — Avg Pass Rate Progress ==")
        df_tc = combined_df[combined_df["test_class"] == test_class].copy()

        if is_exp2:
            # bin_size == 1 leaves the steps unchanged; raise it to merge neighbouring steps.
            bin_size = 1
            df_tc["binned_step"] = (df_tc[x_col] // bin_size) * bin_size
            x_group = "binned_step"
        else:
            x_group = x_col

        grouped = (
            df_tc.groupby([x_group, "model_name", "context", "prev_num"])["test_pass_rate"]
            .agg(["mean", "std", "count"])
            .reset_index()
        )
        grouped["sem"] = grouped["std"] / grouped["count"]**0.5
        grouped["ci95"] = 1.96 * grouped["sem"]

        for context_value in [False, True]:
            df_context = grouped[grouped["context"] == context_value].copy()
            context_str = "context=True" if context_value else "context=False"

            fig = px.line(
                df_context,
                line_shape="spline",
                x=x_group,
                template="plotly_white",
                y="mean",
                error_y="ci95",
                color="model_name",
                color_discrete_map=model_colors,
                category_orders={"model_name": ["Student", "qwen-student", "qwen-inst", "gpt-4.1"]},
                facet_col="prev_num" if is_exp2 else None,
                markers=True,
                title=None,
                labels={"mean": "Avg. Test Pass Rate", x_group: "Submission Step"},
            )

            for trace in fig.data:
                trace.line.width = 2.5

            fig.update_layout(
                height=400,
                width=1000,
                font=dict(size=18, color='black'),
                legend_title_text=None,
                legend=dict(
                    font=dict(size=22),
                    orientation="h",
                    yanchor="bottom",
                    y=-0.7,
                    xanchor="center",
                    x=0.5,
                    bordercolor="grey",
                    borderwidth=1,
                ),
            )
            fig.update_yaxes(matches=None)
            if numeric_x:
                # Fixed numeric ticks only suit the step axis; on the categorical
                # quantile axis they would replace the start/middle/last labels.
                fig.update_xaxes(tickmode="array", tickvals=[2, 4, 6, 8, 10, 12, 14, 16, 18, 20])
            fig.update_yaxes(tickmode="array", tickvals=[0.2, 0.4, 0.6, 0.8, 1.0])
            fig.update_xaxes(
                tickfont=dict(size=16, color='black'),
                showgrid=True,
                gridcolor='lightgray',
                linecolor='black',
                linewidth=1.0
            )
            fig.update_yaxes(
                tickfont=dict(size=16, color='black'),
                showgrid=True,
                gridcolor='lightgray',
                linecolor='black',
                linewidth=1.0
            )

            print(f"{question_name} — {test_class} — {context_str}")
            # fig.write_image(f"plots/pass-rate/app/{question_name}_{test_class}_{context_str}_passrate.pdf")
            fig.show()

# Pass-rate progress for Experiment 2, question "two_list": the prev_num == 1
# and prev_num == 3 subsets are prepared separately and plotted against the
# normalised block number.
plot_avg_passrate_progress_multi(
    [
        (exp2_clean[exp2_clean["prev_num"] == 1], "exp2 prev=1"),
        (exp2_clean[exp2_clean["prev_num"] == 3], "exp2 prev=3")
    ],
    x_col="norm_block_num",
    is_exp2=True,
    question_name="two_list",
    include_models=["qwen-student", "qwen-inst", "gpt-4.1"]
)


### Style

In [None]:
# Size/shape features that are combined into a single "style score".
style_keys = [
    "loc", "char_count",
    "ast_depth", "ast_width", "ast_node_count"
]

gt_cols = [f"gt_{k}" for k in style_keys]
syn_cols = [f"synthetic_{k}" for k in style_keys]

# Keep only rows where every GT and synthetic style feature is present.
# NOTE: dropna() can make these frames shorter than exp1_clean / exp2_clean.
exp1_style_df = exp1_clean[gt_cols + syn_cols].dropna()
exp2_style_df = exp2_clean[gt_cols + syn_cols].dropna()

# Experiment 1 rows first, then Experiment 2 rows, on a fresh 0..n-1 index.
combined_df = pd.concat([exp1_style_df, exp2_style_df], ignore_index=True)

# Stack GT and synthetic values under shared (prefix-free) column names so a
# single scaler and PCA are fitted on both: all GT rows first, then all
# synthetic rows.
combined_style_values = pd.concat([
    combined_df[gt_cols].rename(columns=lambda x: x.replace("gt_", "")),
    combined_df[syn_cols].rename(columns=lambda x: x.replace("synthetic_", ""))
], ignore_index=True)

scaler = StandardScaler()
scaled_data = scaler.fit_transform(combined_style_values)

# The style score is the projection onto the first principal component.
pca = PCA(n_components=1)
style_scores = pca.fit_transform(scaled_data).flatten()

# First half of the scores belongs to the GT rows, second half to the synthetic rows.
combined_df["gt_style_score"] = style_scores[:len(combined_df)]
combined_df["synthetic_style_score"] = style_scores[len(combined_df):]


In [None]:
# Loading of each style feature on the first principal component, largest first.
pc1_weights = pd.Series(pca.components_[0], index=combined_style_values.columns)
print(pc1_weights.sort_values(ascending=False))


In [None]:
exp1_clean = exp1_clean.copy()
exp2_clean = exp2_clean.copy()

# combined_df stacks the NaN-free style rows of exp1 and then of exp2, so the
# split point is len(exp1_style_df), not len(exp1_clean): the two differ
# whenever dropna() removed rows. Scores are written back through the original
# row index so each one lands on the row it was computed from; rows that were
# dropped for missing features get NaN.
n_exp1_style = len(exp1_style_df)
exp1_style_scores = (
    combined_df[["gt_style_score", "synthetic_style_score"]]
    .iloc[:n_exp1_style]
    .set_index(exp1_style_df.index)
)
exp2_style_scores = (
    combined_df[["gt_style_score", "synthetic_style_score"]]
    .iloc[n_exp1_style:]
    .set_index(exp2_style_df.index)
)

exp1_clean["gt_style_score"] = exp1_style_scores["gt_style_score"]
exp1_clean["synthetic_style_score"] = exp1_style_scores["synthetic_style_score"]

exp2_clean["gt_style_score"] = exp2_style_scores["gt_style_score"]
exp2_clean["synthetic_style_score"] = exp2_style_scores["synthetic_style_score"]


In [None]:
def prepare_style_df(df, x_col, is_exp2=False):
    """Stack student (GT) and synthetic style scores into one long frame.

    GT rows are taken once per (student_id, question_name, x_col), labelled
    ``model_name == "Student"`` and emitted for ``context=False`` and
    ``context=True``. Synthetic rows keep their model and context. Both parts
    get a ``style_score`` column.

    Args:
        df: Frame with ``gt_style_score`` / ``synthetic_style_score`` columns.
        x_col: Column used as the progress axis. ``"quantile"`` (non-exp2) is
            renamed to start/middle/last and made an ordered categorical; any
            other column is cast to int.
        is_exp2: When True, ``x_col`` is cast to int and, if ``prev_num``
            exists, a string copy is stored in ``prev``.

    Returns:
        The combined frame with model names mapped to their display names.
    """
    df = df[df["context"].notna()].copy()

    if is_exp2 and "prev_num" in df.columns:
        df["prev"] = df["prev_num"].astype(str)

    gt_once = df.drop_duplicates(subset=["student_id", "question_name", x_col]).copy()
    gt_once["model_name"] = "Student"
    gt_once["style_score"] = gt_once["gt_style_score"]
    # The student curve is the same reference in both context settings.
    gt_df = pd.concat(
        [gt_once.assign(context=False), gt_once.assign(context=True)],
        ignore_index=True,
    )

    syn_df = df.copy()
    syn_df["style_score"] = syn_df["synthetic_style_score"]

    combined = pd.concat([gt_df, syn_df], ignore_index=True)

    if not is_exp2 and x_col == "quantile":
        quantile_map = {
            "submission_q0": "start",
            "submission_q1": "middle",
            "submission_q2": "last"
        }
        combined[x_col] = combined[x_col].map(quantile_map)
        combined[x_col] = pd.Categorical(
            combined[x_col],
            categories=["start", "middle", "last"],
            ordered=True
        )
    else:
        combined[x_col] = combined[x_col].astype(int)

    # Display names match the keys of the notebook's model_colors map; the last
    # three were missing here, which left those models without a mapped colour.
    combined["model_name"] = combined["model_name"].replace({
        "qwen_2_5_coder_7b": "qwen-student",
        "qwen_2_5_coder_7b_inst": "qwen-inst",
        "gpt_4_1": "gpt-4.1",
        "qwen_2_5_coder_3b": "qwen-2.5-coder-3b",
        "qwen_3_8b": "qwen-3-8b",
        "llama_3_8b": "llama-3-8b",
    })

    return combined


def plot_avg_style_progress_multi(df_list, x_col, is_exp2=False, question_name=None, include_models=None):
    """Plot the mean style score per step, one figure per test class and context flag.

    Every frame in ``df_list`` is passed through ``prepare_style_df``, the results
    are stacked, optionally filtered, and then aggregated per
    (x value, model_name, context, prev_num). Each figure shows the group mean
    with a 1.96 * SEM error bar.

    Args:
        df_list: iterable of ``(DataFrame, label)`` pairs. The label is accepted
            but not used by this function.
        x_col: column used for the x axis (forwarded to ``prepare_style_df``).
        is_exp2: when true, the figures are faceted by ``prev_num`` and the x
            values are grouped through the ``binned_step`` column.
        question_name: if truthy, only rows for this question are plotted.
        include_models: if not None, keep only these display names (after the
            renaming done in ``prepare_style_df``) plus the ``"Student"`` rows.

    Returns:
        None. Figures are shown with ``fig.show()`` and headings are printed.

    Notes:
        Relies on ``model_colors`` being defined elsewhere in the notebook.
    """
    combined_all = []
    for df, _label in df_list:
        df_prepped = prepare_style_df(df, x_col, is_exp2)
        # prepare_style_df stores the string form of prev_num in "prev" under the
        # same condition; frames without it are put in a single "All" facet.
        if is_exp2 and "prev_num" in df.columns:
            df_prepped["prev_num"] = df_prepped["prev"]
        else:
            df_prepped["prev_num"] = "All"
        combined_all.append(df_prepped)

    combined_df = pd.concat(combined_all, ignore_index=True)

    if question_name:
        combined_df = combined_df[combined_df["question_name"] == question_name]

    if include_models is not None:
        combined_df = combined_df[
            (combined_df["model_name"] == "Student")
            | (combined_df["model_name"].isin(include_models))
        ]

    for test_class in sorted(combined_df["test_class"].dropna().unique()):
        # The heading previously read "Avg Pass Rate Progress", a leftover from
        # the pass-rate plot; this function reports style scores.
        print(f"== {question_name} — {test_class} — Avg Style Score Progress ==")
        df_tc = combined_df[combined_df["test_class"] == test_class].copy()

        if is_exp2:
            # With bin_size == 1 the binning keeps every integer step as is.
            bin_size = 1
            df_tc["binned_step"] = (df_tc[x_col] // bin_size) * bin_size
            x_group = "binned_step"
        else:
            x_group = x_col

        grouped = (
            df_tc.groupby([x_group, "model_name", "context", "prev_num"])["style_score"]
            .agg(["mean", "std", "count"])
            .reset_index()
        )
        grouped["sem"] = grouped["std"] / grouped["count"]**0.5
        grouped["ci95"] = 1.96 * grouped["sem"]

        for context_value in [False, True]:
            df_context = grouped[grouped["context"] == context_value].copy()
            context_str = "context=True" if context_value else "context=False"

            fig = px.line(
                df_context,
                line_shape="spline",
                x=x_group,
                template="plotly_white",
                y="mean",
                error_y="ci95",
                color="model_name",
                color_discrete_map=model_colors,
                category_orders={"model_name": ["Student", "qwen-student", "qwen-inst", "gpt-4.1"]},
                facet_col="prev_num" if is_exp2 else None,
                markers=True,
                title=None,
                labels={"mean": "Avg. Style Score", x_group: "Submission Step"},
            )

            for trace in fig.data:
                trace.line.width = 2.5

            fig.update_layout(
                height=400,
                width=1000,
                font=dict(size=18, color='black'),
                legend_title_text=None,
                legend=dict(
                    font=dict(size=22),
                    orientation="h",
                    yanchor="bottom",
                    y=-0.7,
                    xanchor="center",
                    x=0.5,
                    bordercolor="grey",
                    borderwidth=1,
                ),
            )
            # Let each facet pick its own y range.
            fig.update_yaxes(matches=None)
            fig.update_xaxes(tickmode="array", tickvals=list(range(2, 21, 2)))
            fig.update_xaxes(
                tickfont=dict(size=16, color='black'),
                showgrid=True,
                gridcolor='lightgray',
                linecolor='black',
                linewidth=1.0
            )
            fig.update_yaxes(
                tickfont=dict(size=16, color='black'),
                showgrid=True,
                gridcolor='lightgray',
                linecolor='black',
                linewidth=1.0
            )

            print(f"{question_name} — {test_class} — {context_str}")
            # fig.write_image(f"plots/style/app/{question_name}_{test_class}_{context_str}_style.pdf")
            fig.show()


# plot_avg_passrate_progress_multi(
#     [(exp1_clean, "exp1")],
#     x_col="quantile",
#     is_exp2=False,
#     question_name="count_coins",
#     include_models=["qwen-student", "qwen-inst", "gpt-4.1"]
# )

# Style-score curves for the "two_list" question on the exp2 data, passing the
# prev_num == 1 and prev_num == 3 subsets as separate frames.
plot_avg_style_progress_multi(
    [
        (exp2_clean[exp2_clean["prev_num"] == 1], "exp2 prev=1"),
        (exp2_clean[exp2_clean["prev_num"] == 3], "exp2 prev=3")
    ],
    x_col="norm_block_num",
    is_exp2=True,
    question_name="two_list",
    include_models=["qwen-student", "qwen-inst", "gpt-4.1"]
)


In [None]:
# Edit distance between consecutive code blocks of the same trajectory.
# A trajectory is identified by group_cols; rows are ordered by block_num inside it.
group_cols = ['model_name', 'context', 'prev_num', 'test_class', 'student_id', 'question_name']
df = exp2_clean.sort_values(group_cols + ['block_num']).copy()

# shift(1) within each group pairs every block with the one before it; the first
# block of a group has no predecessor and gets a missing value.
df['gt_code_prev'] = df.groupby(group_cols)['gt_code_block'].shift(1)
df['synthetic_code_prev'] = df.groupby(group_cols)['synthetic_code_block'].shift(1)

# Levenshtein distance to the previous block; rows without a previous block get 0.
df['gt_code_diff'] = df.apply(lambda row: Levenshtein.distance(str(row['gt_code_prev']), str(row['gt_code_block']))
                              if pd.notnull(row['gt_code_prev']) else 0, axis=1)


df['synthetic_code_diff'] = df.apply(lambda row: Levenshtein.distance(str(row['synthetic_code_prev']), str(row['synthetic_code_block']))
                                     if pd.notnull(row['synthetic_code_prev']) else 0, axis=1)


# exp2_clean is rebound to the sorted frame that carries the new columns.
exp2_clean = df


In [None]:
def prepare_diff_df(df, x_col, is_exp2=False):
    """Build a long frame of code edit distances holding both student and model rows.

    Rows whose ``context`` is missing are discarded first. Student rows are the
    input de-duplicated on (student_id, question_name, x_col); they are labelled
    ``"Student"``, take ``gt_code_diff`` as their ``code_diff`` and appear once
    per context flag (False, then True). Model rows are every remaining input
    row with ``synthetic_code_diff`` as their ``code_diff``.

    Args:
        df: input frame; must contain ``context``, ``student_id``,
            ``question_name``, ``x_col``, ``model_name``, ``gt_code_diff`` and
            ``synthetic_code_diff``.
        x_col: name of the column later used as the x axis.
        is_exp2: when true, ``x_col`` is cast to int and, if ``prev_num`` exists,
            a string copy of it is stored in a ``prev`` column.

    Returns:
        A new DataFrame (student rows first, then model rows) with a fresh index.
        ``x_col`` is an ordered categorical (start < middle < last) when it is
        ``"quantile"`` and ``is_exp2`` is false, and an int column otherwise.
        Three raw model names are replaced by their display names.
    """
    rows = df.loc[df["context"].notna()].copy()

    if is_exp2 and "prev_num" in rows.columns:
        rows["prev"] = rows["prev_num"].astype(str)

    # One student row per (student, question, x value), whichever model produced it.
    student_rows = rows.drop_duplicates(subset=["student_id", "question_name", x_col]).copy()
    student_rows["model_name"] = "Student"
    student_rows["code_diff"] = student_rows["gt_code_diff"]

    # Student rows under context=False, then context=True, then the model rows.
    frames = [student_rows.assign(context=flag) for flag in (False, True)]
    frames.append(rows.assign(code_diff=rows["synthetic_code_diff"]))
    combined = pd.concat(frames, ignore_index=True)

    use_stage_labels = (not is_exp2) and x_col == "quantile"
    if use_stage_labels:
        stage_of = {
            "submission_q0": "start",
            "submission_q1": "middle",
            "submission_q2": "last"
        }
        stages = combined[x_col].map(stage_of)
        combined[x_col] = pd.Categorical(
            stages,
            categories=["start", "middle", "last"],
            ordered=True
        )
    else:
        combined[x_col] = combined[x_col].astype(int)

    combined["model_name"] = combined["model_name"].replace({
        "qwen_2_5_coder_7b": "qwen-student",
        "qwen_2_5_coder_7b_inst": "qwen-inst",
        "gpt_4_1": "gpt-4.1",
    })

    return combined


def plot_avg_diff_progress_multi(df_list, x_col, is_exp2=False, question_name=None, include_models=None):
    """Plot the mean code edit distance per step, one figure per test class and context flag.

    Every frame in ``df_list`` is passed through ``prepare_diff_df``, the results
    are stacked, optionally filtered, and then aggregated per
    (x value, model_name, context, prev_num). Each figure shows the group mean
    with a 1.96 * SEM error bar; the first ten aggregated rows are also printed.

    Args:
        df_list: iterable of ``(DataFrame, label)`` pairs. The label is accepted
            but not used by this function.
        x_col: column used for the x axis (forwarded to ``prepare_diff_df``).
        is_exp2: when true, the figures are faceted by ``prev_num`` and the x
            values are grouped through the ``binned_step`` column.
        question_name: if truthy, only rows for this question are plotted.
        include_models: if not None, keep only these display names (after the
            renaming done in ``prepare_diff_df``) plus the ``"Student"`` rows.

    Returns:
        None. Figures are shown with ``fig.show()`` and headings are printed.

    Notes:
        Relies on ``model_colors`` being defined elsewhere in the notebook.
    """
    combined_all = []
    for df, _label in df_list:
        df_prepped = prepare_diff_df(df, x_col, is_exp2)
        # prepare_diff_df stores the string form of prev_num in "prev" under the
        # same condition; frames without it are put in a single "All" facet.
        if is_exp2 and "prev_num" in df.columns:
            df_prepped["prev_num"] = df_prepped["prev"]
        else:
            df_prepped["prev_num"] = "All"
        combined_all.append(df_prepped)

    combined_df = pd.concat(combined_all, ignore_index=True)

    if question_name:
        combined_df = combined_df[combined_df["question_name"] == question_name]

    if include_models is not None:
        combined_df = combined_df[
            (combined_df["model_name"] == "Student")
            | (combined_df["model_name"].isin(include_models))
        ]

    for test_class in sorted(combined_df["test_class"].dropna().unique()):
        # The heading previously read "Avg Pass Rate Progress", a leftover from
        # the pass-rate plot; this function reports code edit distances.
        print(f"== {question_name} — {test_class} — Avg Code Edit Distance Progress ==")
        df_tc = combined_df[combined_df["test_class"] == test_class].copy()

        if is_exp2:
            # With bin_size == 1 the binning keeps every integer step as is.
            bin_size = 1
            df_tc["binned_step"] = (df_tc[x_col] // bin_size) * bin_size
            x_group = "binned_step"
        else:
            x_group = x_col

        grouped = (
            df_tc.groupby([x_group, "model_name", "context", "prev_num"])["code_diff"]
            .agg(["mean", "std", "count"])
            .reset_index()
        )
        grouped["sem"] = grouped["std"] / grouped["count"]**0.5
        grouped["ci95"] = 1.96 * grouped["sem"]
        print(grouped[[x_group, "model_name", "mean", "ci95", "count"]].head(10))

        for context_value in [False, True]:
            df_context = grouped[grouped["context"] == context_value].copy()
            context_str = "context=True" if context_value else "context=False"

            fig = px.line(
                df_context,
                line_shape="spline",
                x=x_group,
                template="plotly_white",
                y="mean",
                error_y="ci95",
                color="model_name",
                color_discrete_map=model_colors,
                category_orders={"model_name": ["Student", "qwen-student", "qwen-inst", "gpt-4.1"]},
                facet_col="prev_num" if is_exp2 else None,
                markers=True,
                title=None,
                labels={"mean": "Avg. Code Edit Distance", x_group: "Submission Step"},
            )

            for trace in fig.data:
                trace.line.width = 2.5

            fig.update_layout(
                height=400,
                width=1000,
                font=dict(size=18, color='black'),
                legend_title_text=None,
                legend=dict(
                    font=dict(size=22),
                    orientation="h",
                    yanchor="bottom",
                    y=-0.7,
                    xanchor="center",
                    x=0.5,
                    bordercolor="grey",
                    borderwidth=1,
                ),
            )
            # Let each facet pick its own y range.
            fig.update_yaxes(matches=None)
            fig.update_xaxes(tickmode="array", tickvals=list(range(2, 21, 2)))
            fig.update_xaxes(
                tickfont=dict(size=16, color='black'),
                showgrid=True,
                gridcolor='lightgray',
                linecolor='black',
                linewidth=1.0
            )
            fig.update_yaxes(
                tickfont=dict(size=16, color='black'),
                showgrid=True,
                gridcolor='lightgray',
                linecolor='black',
                linewidth=1.0
            )

            print(f"{question_name} — {test_class} — {context_str}")
            # fig.write_image(f"plots/diff/app/{question_name}_{test_class}_{context_str}_diff.pdf")
            fig.show()




# Edit-distance curves for the "two_list" question on the exp2 data, passing the
# prev_num == 1 and prev_num == 3 subsets as separate frames.
plot_avg_diff_progress_multi(
    [
        (exp2_clean[exp2_clean["prev_num"] == 1], "exp2 prev=1"),
        (exp2_clean[exp2_clean["prev_num"] == 3], "exp2 prev=3")
    ],
    x_col="norm_block_num",
    is_exp2=True,
    question_name="two_list",
    include_models=["qwen-student", "qwen-inst", "gpt-4.1"]
)


### Embeddings

In [None]:
# Plotly marker symbol per trace name. The embedding scatter plot looks names up
# with symbol_map.get(name, "circle"), so unlisted names fall back to a circle.
# Both the raw model identifiers and their display names are listed.
symbol_map = {
    "Student": "square",
    "gpt_4_1": "cross",
    "qwen_2_5_coder_7b_inst": "cross",
    "qwen_2_5_coder_3b": "circle",
    "qwen_2_5_coder_7b": "circle",
    "qwen_3_8b": "circle",
    "llama_3_8b": "circle",
    "qwen-student": "circle",
    "qwen-inst": "cross",
    "gpt-4.1": "cross"
}

In [None]:
def plot_tsne_distribution_per_testclass(
    df,
    title_prefix,
    question_name=None,
    include_models=None,
    perplexity=30,
    n_iter=500,
    reducer="tsne"
):
    """Scatter-plot 2-D projections of student and model code embeddings.

    One figure is produced per test class. Within a test class, every
    (quantile, context) subset is projected independently with the chosen
    reducer, Gaussian jitter (sigma 0.5) is added to both coordinates, and the
    subsets are laid out as a grid: columns are the quantiles (first / middle /
    last), rows are the context flags.

    Args:
        df: frame with ``model_name``, ``question_name``, ``test_class``,
            ``quantile``, ``context``, ``student_id``,
            ``synthetic_code_block_embedding`` and ``gt_code_block_embedding``.
        title_prefix: text printed in front of the heading of each figure.
        question_name: if truthy, only rows for this question are used.
        include_models: if not None, only rows whose raw ``model_name`` is in
            this collection are used.
        perplexity: t-SNE perplexity; capped at ``n_points - 1`` per subset.
        n_iter: number of t-SNE optimisation iterations.
        reducer: ``"tsne"``, ``"umap"`` or ``"pca"``.

    Returns:
        None. Figures are shown with ``fig.show()``; per-group point counts and
        a heading are printed.

    Raises:
        ValueError: if ``reducer`` is not one of the supported names (raised
            when the first subset with at least two valid embeddings is reached).

    Notes:
        Relies on ``model_colors`` and ``symbol_map`` being defined elsewhere in
        the notebook. The jitter uses the global NumPy random state, so point
        positions differ between runs unless that state is seeded.
    """
    import inspect  # stdlib; used only to probe the installed TSNE signature

    df = df.copy()

    if include_models is not None:
        df = df[df["model_name"].isin(include_models)]

    if question_name:
        df = df[df["question_name"] == question_name]

    for test_class in sorted(df["test_class"].dropna().unique()):
        all_tsne_rows = []

        for quantile in sorted(df["quantile"].dropna().unique()):
            for context_val in [False, True]:
                df_subset = df[
                    (df["test_class"] == test_class) &
                    (df["quantile"] == quantile) &
                    (df["context"] == context_val)
                ].copy()
                if df_subset.empty:
                    # Nothing to project for this panel.
                    continue

                # Model points: one per row, using the synthetic embedding.
                df_syn = df_subset[df_subset["model_name"].notna()].copy()
                df_syn["embedding"] = df_syn["synthetic_code_block_embedding"]
                df_syn["source"] = df_syn["model_name"]
                df_syn["quantile"] = quantile
                df_syn["test_class"] = test_class

                # Student points: one per student/question in this panel, using
                # the ground-truth embedding.
                df_gt = df_subset.drop_duplicates(
                    subset=["student_id", "question_name", "quantile", "context", "test_class"]
                ).copy()
                df_gt["embedding"] = df_gt["gt_code_block_embedding"]
                df_gt["source"] = "GT"
                df_gt["quantile"] = quantile
                df_gt["test_class"] = test_class

                combined = pd.concat([df_syn, df_gt], ignore_index=True)
                combined["source"] = combined["source"].replace({"GT": "Student"})

                has_embedding = combined["embedding"].apply(
                    lambda x: isinstance(x, (list, np.ndarray)) and len(x) > 0
                )
                # Copy so the coordinate columns below are written to an
                # independent frame rather than to a slice of `combined`.
                valid_combined = combined[has_embedding].copy()
                if len(valid_combined) < 2:
                    continue

                embeddings = np.array(valid_combined["embedding"].tolist())
                if reducer == "tsne":
                    # scikit-learn renamed TSNE's `n_iter` to `max_iter`; use
                    # whichever keyword the installed version accepts.
                    tsne_params = inspect.signature(TSNE).parameters
                    iter_kw = "max_iter" if "max_iter" in tsne_params else "n_iter"
                    coords = TSNE(
                        n_components=2,
                        perplexity=min(perplexity, len(valid_combined) - 1),
                        random_state=42,
                        **{iter_kw: n_iter}
                    ).fit_transform(embeddings)
                elif reducer == "umap":
                    # NOTE(review): UMAP is not among the imports visible at the
                    # top of the notebook — confirm it is imported before using
                    # reducer="umap".
                    coords = UMAP(
                        n_components=2,
                        random_state=42,
                        min_dist=0.7,
                        n_neighbors=5,
                        spread=2.0,
                        metric='cosine',
                        init='random',
                        set_op_mix_ratio=0.1
                    ).fit_transform(embeddings)
                elif reducer == "pca":
                    coords = PCA(n_components=2).fit_transform(embeddings)
                else:
                    raise ValueError(f"Unknown reducer: {reducer}")

                valid_combined["PC 1"] = coords[:, 0]
                valid_combined["PC 2"] = coords[:, 1]

                # Jitter separates points that would otherwise overlap exactly.
                jitter_strength = 0.5
                valid_combined["PC 1"] += np.random.normal(0, jitter_strength, size=len(valid_combined))
                valid_combined["PC 2"] += np.random.normal(0, jitter_strength, size=len(valid_combined))

                all_tsne_rows.append(valid_combined)

        if not all_tsne_rows:
            continue

        final_df = pd.concat(all_tsne_rows, ignore_index=True)

        quantile_map = {
            "submission_q0": "first",
            "submission_q1": "middle",
            "submission_q2": "last"
        }
        final_df["quantile"] = final_df["quantile"].map(quantile_map)
        final_df["quantile"] = pd.Categorical(final_df["quantile"], categories=["first", "middle", "last"], ordered=True)

        final_df = final_df.sort_values("quantile")

        # observed=False spells out the behaviour the categorical key had
        # implicitly: unobserved quantile categories are counted too.
        counts = (
            final_df.groupby(["quantile", "context", "source", "test_class"], observed=False)
            .size()
            .reset_index(name="count")
        )
        print(f"\nData point counts for test_class = {test_class}:\n", counts)

        # Put the student rows first in the frame handed to plotly.
        final_df["source_order"] = final_df["source"].apply(lambda s: 0 if s == "Student" else 1)
        final_df = final_df.sort_values("source_order").drop(columns="source_order")

        final_df["context_label"] = final_df["context"].map({False: "context=False", True: "context=True"})
        final_df["context_label"] = pd.Categorical(final_df["context_label"], categories=["context=False", "context=True"], ordered=True)

        final_df["source"] = final_df["source"].replace({
            "qwen_2_5_coder_7b": "qwen-student",
            "qwen_2_5_coder_7b_inst": "qwen-inst",
            "gpt_4_1": "gpt-4.1",
            "Student": "Student"
        })

        fig = px.scatter(
            final_df,
            template="plotly_white",
            x="PC 1",
            y="PC 2",
            color="source",
            symbol="source",
            facet_row="context_label",
            facet_col="quantile",
            category_orders={
                "quantile": ["first", "middle", "last"],
                "context_label": ["context=False", "context=True"],
                "source": ["Student", "qwen-student", "qwen-inst", "gpt-4.1"]
            },
            hover_name="student_id",
            color_discrete_map=model_colors,
            title=None
        )

        fig.update_layout(height=650, width=1000)
        fig.update_xaxes(showgrid=False, zeroline=False)
        fig.update_yaxes(showgrid=False, zeroline=False)

        fig.update_traces(marker=dict(line=dict(width=0.5, color="white")))
        fig.for_each_trace(lambda t: t.update(marker_symbol=symbol_map.get(t.name, "circle")))
        fig.for_each_trace(lambda t: t.update(marker=dict(size=5 if t.name == "Student" else 6.5)))

        # Strip the "column=" prefixes plotly puts in the facet titles.
        fig.for_each_annotation(lambda a: a.update(text=a.text.replace("quantile=", "").replace("context_label=", "")))
        fig.for_each_annotation(lambda a: a.update(font=dict(size=22)))

        fig.update_xaxes(
            showline=True,
            linecolor='black',
            linewidth=1,
            tickfont=dict(size=16, color='black')
        )
        fig.update_yaxes(
            showline=True,
            linecolor='black',
            linewidth=1,
            tickfont=dict(size=16, color='black')
        )

        fig.update_xaxes(showticklabels=False)
        fig.update_yaxes(showticklabels=False)

        # Final layout. This merges several successive update_layout calls into
        # the values that were effective after the last of them.
        fig.update_layout(
            margin=dict(l=10, r=10, t=30, b=10),
            plot_bgcolor='white',
            paper_bgcolor='white',
            font=dict(size=17, color="black"),
            legend_title_text=None,
            legend=dict(
                itemsizing='constant',
                font=dict(size=24),
                orientation="h",
                yanchor="bottom",
                y=-0.25,
                xanchor="center",
                x=0.5,
                bordercolor="grey",
                borderwidth=1,
            )
        )

        # NOTE(review): nothing in this function appears to put "context=False"
        # or "Bin 1" into an axis object, so this loop looks like a no-op that
        # was kept from an earlier version — confirm before removing.
        for axis in fig.layout:
            if axis.startswith("xaxis") and "context=False" in str(fig.layout[axis]):
                fig.layout[axis].update(showticklabels=True)
            if axis.startswith("yaxis") and "Bin 1" in str(fig.layout[axis]):
                fig.layout[axis].update(showticklabels=True)
                fig.update_xaxes(title_text="PC 1")
                fig.update_yaxes(title_text="PC 2")

        print(f"{title_prefix} — {question_name} — {test_class} — Code Embeddings")
        # fig.write_image(f"plots/embd/app/{question_name}_{test_class}_embd.pdf")
        fig.show()



# Embedding scatter for the "two_list" question on the exp1 data, projected with
# PCA. include_models uses the raw model names found in the model_name column.
plot_tsne_distribution_per_testclass(
    exp1_clean,
    title_prefix="exp1",
    question_name="two_list",
    include_models=["qwen_2_5_coder_7b", "qwen_2_5_coder_7b_inst", "gpt_4_1"],
    reducer="pca",
)



In [None]:
def compute_embedding_metrics(df, question_name=None, include_models=None, k_distance=3, k_coverage=3):
    """Nearest-neighbour distance and coverage between student and model embeddings.

    For every test class and every (context, quantile, model) combination that
    has rows, the ground-truth embeddings of those rows are compared with the
    synthetic embeddings of the same rows using cosine distance:

    * ``avg_knn_dist``: mean of the ``k_distance`` smallest distances from each
      student embedding to the model embeddings.
    * ``knn_coverage``: share of student embeddings that are among the
      ``k_coverage`` nearest student embeddings of at least one model embedding.

    Args:
        df: frame with ``question_name``, ``model_name``, ``context``,
            ``quantile``, ``test_class``, ``gt_code_block_embedding`` and
            ``synthetic_code_block_embedding``.
        question_name: if truthy, only rows for this question are used.
        include_models: if truthy, only rows whose ``model_name`` is listed.
        k_distance: number of nearest model embeddings averaged per student.
        k_coverage: number of nearest student embeddings taken per model sample.

    Returns:
        dict mapping each test class to a DataFrame with the columns question,
        test_class, model, quantile, context, avg_knn_dist, knn_coverage, sorted
        by model, quantile and context. A test class without any usable
        combination maps to an empty frame with those columns.
    """
    result_columns = [
        "question", "test_class", "model", "quantile",
        "context", "avg_knn_dist", "knn_coverage",
    ]
    results_by_test_class = {}

    if question_name:
        df = df[df["question_name"] == question_name]
    if include_models:
        df = df[df["model_name"].isin(include_models)]

    contexts = df["context"].dropna().unique()
    quantiles = df["quantile"].dropna().unique()
    test_classes = df["test_class"].dropna().unique()
    model_names = df["model_name"].dropna().unique()

    for test_class in test_classes:
        df_tc = df[df["test_class"] == test_class]
        question = df_tc["question_name"].iloc[0]
        results = []

        for context in contexts:
            for quantile in quantiles:
                for model in model_names:
                    # Both embedding columns are read from the same rows, so the
                    # filter is evaluated once.
                    cell = df_tc[
                        (df_tc["model_name"] == model) &
                        (df_tc["context"] == context) &
                        (df_tc["quantile"] == quantile)
                    ]
                    if cell.empty:
                        continue

                    student_emb = np.vstack(cell["gt_code_block_embedding"].tolist())
                    model_emb = np.vstack(cell["synthetic_code_block_embedding"].tolist())

                    # Rows index student embeddings, columns index model embeddings.
                    dist_student_model = 1.0 - cos_sim(student_emb, model_emb).numpy()
                    # The model-to-student matrix is the transpose of the same distances.
                    dist_model_student = dist_student_model.T

                    # NN distance
                    nn_dist = np.sort(dist_student_model, axis=1)[:, :k_distance].mean()

                    # NN coverage
                    nn_idx = np.argsort(dist_model_student, axis=1)[:, :k_coverage]
                    covered = np.unique(nn_idx).size
                    coverage = covered / student_emb.shape[0]

                    results.append({
                        "question": question,
                        "test_class": test_class,
                        "model": model,
                        "quantile": quantile,
                        "context": context,
                        "avg_knn_dist": round(nn_dist, 4),
                        "knn_coverage": round(coverage, 4)
                    })

        # Explicit columns keep sort_values from raising KeyError when no
        # combination produced a row for this test class.
        results_by_test_class[test_class] = pd.DataFrame(
            results, columns=result_columns
        ).sort_values(["model", "quantile", "context"])

    return results_by_test_class


# Embedding metrics for the "two_list" question on the exp1 data, one table per
# test class. include_models uses the raw model names.
tables = compute_embedding_metrics(
    exp1_clean,
    question_name="two_list",
    include_models=["qwen_2_5_coder_7b", "qwen_2_5_coder_7b_inst", "gpt_4_1", "qwen_3_8b", "llama_3_8b", "qwen_2_5_coder_3b"],
    # include_models=["qwen_3_8b", "llama_3_8b", "qwen_2_5_coder_3b"],

    k_distance=3,
    k_coverage=10
)
# Note: this loop rebinds the notebook-level name `df` to each metrics table.
for test_class, df in tables.items():
    print(f"\n=== Test Class: {test_class} ===")
    display(df)

### Pairwise Loss

In [None]:
# Feature name stems used for the pairwise errors: for each key,
# compute_style_and_embedding_errors reads the columns "gt_<key>" and
# "synthetic_<key>" and writes "<key>_abs_error".
keys_loss = [
    "loc", "char_count", "ast_depth", "ast_width",
    "ast_node_count", 
    "pep8_violations.count", "style_score"
]



In [None]:
def compute_style_and_embedding_errors(df):
    """Add per-row error columns comparing ground-truth and synthetic values.

    Columns added:

    * ``<key>_abs_error`` for every key in the module-level ``keys_loss``:
      ``|gt_<key> - synthetic_<key>|``.
    * ``test_pass_rate_abs_error``: absolute difference of the two pass rates.
    * ``style_score_abs_error``: added when both style-score columns exist,
      even if ``"style_score"`` is not listed in ``keys_loss``.
    * ``embedding_cosine_distance``: cosine distance between the two code
      embeddings of the row, or NaN when either embedding is missing or empty.
    * ``error_type_match``: 1 when ``gt_error_type`` equals
      ``synthetic_error_type``, else 0.

    Args:
        df: frame holding the ``gt_*`` / ``synthetic_*`` columns listed above.

    Returns:
        A copy of ``df`` with the error columns added. The input frame is left
        untouched, so calling the function again on the original is safe.
    """
    df = df.copy()

    for key in keys_loss:
        df[f"{key}_abs_error"] = (
            df[f"gt_{key}"] - df[f"synthetic_{key}"]
        ).abs()

    df["test_pass_rate_abs_error"] = (
        df["gt_test_pass_rate"] - df["synthetic_test_pass_rate"]
    ).abs()

    # Already produced by the loop when "style_score" is one of keys_loss.
    if (
        "style_score_abs_error" not in df.columns
        and "gt_style_score" in df.columns
        and "synthetic_style_score" in df.columns
    ):
        df["style_score_abs_error"] = (
            df["gt_style_score"] - df["synthetic_style_score"]
        ).abs()

    def is_embedding(value):
        # Same validity rule as the embedding scatter plot: a non-empty list or array.
        return isinstance(value, (list, np.ndarray)) and len(value) > 0

    def pair_distance(gt_emb, syn_emb):
        if is_embedding(gt_emb) and is_embedding(syn_emb):
            return cosine_distances([gt_emb], [syn_emb])[0][0]
        return np.nan

    # Built from the two columns directly so an empty frame still yields a
    # float column instead of failing inside a row-wise apply.
    df["embedding_cosine_distance"] = pd.Series(
        [
            pair_distance(gt_emb, syn_emb)
            for gt_emb, syn_emb in zip(
                df["gt_code_block_embedding"], df["synthetic_code_block_embedding"]
            )
        ],
        index=df.index,
        dtype=float,
    )

    df["error_type_match"] = (
        df["gt_error_type"] == df["synthetic_error_type"]
    ).astype(int)

    return df

# Rebind both experiment frames to the versions carrying the error columns.
exp1_clean = compute_style_and_embedding_errors(exp1_clean)
exp2_clean = compute_style_and_embedding_errors(exp2_clean)


In [None]:
def summarize_errors(
    df,
    question_name=None,
    test_class=None,
    include_prev_num=False
):
    """Average the PEP 8 violation and style-score errors per model, quantile and context.

    Args:
        df: frame with ``model_name``, ``quantile``, ``context`` and the columns
            ``pep8_violations.count_abs_error`` and ``style_score_abs_error``.
        question_name: if truthy, only rows for this question are summarised.
        test_class: if truthy, only rows for this test class are summarised.
        include_prev_num: when true, ``prev_num`` is added to the grouping keys
            so each previous-submission setting gets its own row. If the frame
            has no ``prev_num`` column a warning is issued and the flag is
            ignored.

    Returns:
        DataFrame with one row per group, restricted to the models gpt_4_1,
        qwen_2_5_coder_7b, qwen_2_5_coder_7b_inst and GT, holding the grouping
        keys followed by the two mean absolute errors.
    """
    filtered = df
    if question_name:
        filtered = filtered[filtered["question_name"] == question_name]
    if test_class:
        filtered = filtered[filtered["test_class"] == test_class]

    group_keys = ["model_name", "quantile", "context"]
    if include_prev_num:
        if "prev_num" in filtered.columns:
            group_keys.append("prev_num")
        else:
            import warnings  # stdlib; local so the block does not depend on cell order
            warnings.warn(
                "include_prev_num=True but the frame has no 'prev_num' column; "
                "grouping without it.",
                stacklevel=2,
            )

    # Only these two error columns are reported, so only they are aggregated.
    selected_cols = ["pep8_violations.count_abs_error", "style_score_abs_error"]

    summary = (
        filtered[group_keys + selected_cols]
        .groupby(group_keys)
        .mean(numeric_only=True)
        .reset_index()
    )

    allowed_models = ["gpt_4_1", "qwen_2_5_coder_7b", "qwen_2_5_coder_7b_inst", "GT"]
    # allowed_models=["qwen_3_8b", "llama_3_8b", "qwen_2_5_coder_3b",  "GT"]

    summary = summary[summary["model_name"].isin(allowed_models)]
    summary = summary[group_keys + selected_cols]

    return summary



# Error summary for question "two_list", test class "test1" on the exp1 data;
# as the last expression of the cell, the returned frame is displayed.
summarize_errors(exp1_clean, question_name="two_list", test_class="test1")
# summarize_errors(exp2_clean, question_name="count_coins", test_class="test3", include_prev_num=True)
