In [19]:
import pandas as pd
import matplotlib.pyplot as plt
from dotenv import load_dotenv
import os
import re
import numpy as np

load_dotenv()

True

In [20]:
# df_pypi_decompiled_clustered = pd.read_csv(f"{os.getenv('PROJECT_ROOT_DIR')}/dataset/decompiled_syntax_errors_clustered.csv")

In [21]:
# Maps model name -> {run key -> CSV path}. Every path is rooted at the
# PROJECT_ROOT_DIR environment variable. "gemini-flash" has only a base result;
# every other model has a base result (config_0) plus config_1 and config_2.
PATH_OF_DECOMPILATION_RESULTS = {
    "gemini-flash": {
        "base_result": f"{os.getenv('PROJECT_ROOT_DIR')}/dataset/decompiled_comparison_results_gemini_flash.csv",
    },
    **{
        model_name: {
            run_key: f"{os.getenv('PROJECT_ROOT_DIR')}/dataset/decompiled_comparison_results_{model_name}_config_{config_idx}.csv"
            for config_idx, run_key in enumerate(
                ("base_result", "config_1_results", "config_2_results")
            )
        }
        for model_name in (
            "qwen-7b",
            "qwen-32b",
            "deepseek-r1",
            "granite",
            "mistral",
            "phi-4",
        )
    },
}

In [22]:
# CSV holding the cluster assignments, and the name of the column in it that
# carries the cluster label (joined to the results on "file_hash" further down).
CLUSTER_CSV_PATH = "{}/dataset/decompiled_syntax_errors_clustered.csv".format(
    os.getenv("PROJECT_ROOT_DIR")
)
CLUSTER_COL = "cluster"

In [23]:
# Output files: the merged per-row table with metrics, and the per-cluster summary.
OUT_MERGED_CSV = "{}/dataset/decomp_results_with_clusters_and_metrics.csv".format(
    os.getenv("PROJECT_ROOT_DIR")
)
OUT_CLUSTER_STATS_CSV = "{}/dataset/cluster_summary_stats.csv".format(
    os.getenv("PROJECT_ROOT_DIR")
)



In [24]:
def infer_config(run_key: str) -> str:
    """
    Map a run key onto a canonical config label.

      "base_result"       -> "config_0"
      "config_1_results"  -> "config_1"   (first "config_<digits>" found in the key)
      anything else       -> returned unchanged
    """
    # The base run carries no number in its key; it is treated as config 0.
    if run_key == "base_result":
        return "config_0"

    found = re.search(r"config_(\d+)", run_key)
    if found is None:
        # No recognisable config number: fall back to the raw key.
        return run_key
    return "config_" + found.group(1)

In [25]:
def load_all_results(path_map: dict, strict: bool = False) -> pd.DataFrame:
    """
    Read every CSV referenced in ``path_map`` and stack them into one frame.

    ``path_map`` is a nested mapping ``{model: {run_key: csv_path}}``. Each
    loaded frame is tagged with four extra columns before stacking:
      model, config, run_key, source_csv

    A path that does not exist is reported with a "[skip]" message and
    ignored, unless ``strict`` is true, in which case FileNotFoundError is
    raised. If nothing could be loaded, an empty DataFrame is returned.
    """
    frames = []
    for model_name, run_paths in path_map.items():
        for run_key, csv_path in run_paths.items():
            config_label = infer_config(run_key)

            if os.path.exists(csv_path):
                frame = pd.read_csv(csv_path)
                # Tag each row with where it came from.
                frame["model"] = model_name
                frame["config"] = config_label
                frame["run_key"] = run_key
                frame["source_csv"] = csv_path
                frames.append(frame)
                continue

            # File is absent: fail hard in strict mode, otherwise report and move on.
            msg = f"Missing file: {csv_path}"
            if strict:
                raise FileNotFoundError(msg)
            print(f"[skip] {msg}")

    if frames:
        return pd.concat(frames, ignore_index=True)
    return pd.DataFrame()

In [26]:
# NOTE(review): this cell declares load_all_results a second time; the name is
# also defined in the preceding cell, and this later definition is the one in effect.
def load_all_results(path_map: dict, strict: bool = False) -> pd.DataFrame:
    """
    Load all result CSVs listed in ``path_map`` ({model: {run_key: csv_path}})
    and concatenate them.

    Columns added to every loaded frame:
      model, config, run_key, source_csv

    Missing files raise FileNotFoundError when ``strict`` is true; otherwise
    they are printed with a "[skip]" prefix and left out. Returns an empty
    DataFrame when no file was loaded.
    """
    # Flatten the nested mapping lazily into (model, run_key, csv_path) triples.
    all_runs = (
        (model, run_key, csv_path)
        for model, model_runs in path_map.items()
        for run_key, csv_path in model_runs.items()
    )

    loaded = []
    for model, run_key, csv_path in all_runs:
        config = infer_config(run_key)

        if not os.path.exists(csv_path):
            message = f"Missing file: {csv_path}"
            if not strict:
                print(f"[skip] {message}")
                continue
            raise FileNotFoundError(message)

        tagged = pd.read_csv(csv_path)
        for column, value in (
            ("model", model),
            ("config", config),
            ("run_key", run_key),
            ("source_csv", csv_path),
        ):
            tagged[column] = value
        loaded.append(tagged)

    return pd.concat(loaded, ignore_index=True) if loaded else pd.DataFrame()

In [27]:
def compute_percent_changes(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a new frame with two percent-change columns added.

    Added columns (both rounded to 2 decimals):
      syntactic_drift_pct
          (d_lookup_vs_repaired - d_lookup_vs_decompiled)
          / d_lookup_vs_decompiled * 100
      cosine_distance_pct_change
          (d_lookup_vs_repaired_cosine_distance - d_lookup_vs_decompiled_cosine_distance)
          / d_lookup_vs_decompiled_cosine_distance * 100

    The four input columns are coerced to numeric (unparseable values become
    NaN) and rows with NaN in any of them are dropped. Where a denominator is
    0 the corresponding percentage is NaN rather than +/-inf.

    The input frame is left untouched; the original row index labels are kept
    on the rows that survive.

    Raises:
        KeyError: if any of the four required columns is absent from ``df``.
    """
    required = [
        "d_lookup_vs_decompiled",
        "d_lookup_vs_repaired",
        "d_lookup_vs_decompiled_cosine_distance",
        "d_lookup_vs_repaired_cosine_distance",
    ]

    # Fail early with a message that names every missing column.
    absent = [col for col in required if col not in df.columns]
    if absent:
        raise KeyError(f"compute_percent_changes: missing required columns: {absent}")

    # assign() builds a new frame, so the caller's DataFrame is never modified.
    numeric = df.assign(
        **{col: pd.to_numeric(df[col], errors="coerce") for col in required}
    )

    # Drop rows with NaNs in required inputs; copy so the column writes below
    # operate on an independent frame.
    result = numeric.dropna(subset=required).copy()

    # Avoid divide-by-zero: a zero baseline yields NaN instead of inf.
    denom_d = result["d_lookup_vs_decompiled"].replace(0, np.nan)
    denom_dist = result["d_lookup_vs_decompiled_cosine_distance"].replace(0, np.nan)

    result["syntactic_drift_pct"] = (
        (result["d_lookup_vs_repaired"] - result["d_lookup_vs_decompiled"]) / denom_d
    ) * 100

    result["cosine_distance_pct_change"] = (
        (
            result["d_lookup_vs_repaired_cosine_distance"]
            - result["d_lookup_vs_decompiled_cosine_distance"]
        )
        / denom_dist
    ) * 100

    pct_cols = ["syntactic_drift_pct", "cosine_distance_pct_change"]
    result[pct_cols] = result[pct_cols].round(2)
    return result

In [28]:
def summarize_by_cluster(df: pd.DataFrame, cluster_col: str) -> pd.DataFrame:
    """
    Aggregate the percent-change metrics per (model, config, cluster).

    For each group the result carries:
      n                                  - count of non-null file_hash values
      drift_mean / drift_median / drift_std  - over syntactic_drift_pct
      dist_mean / dist_median / dist_std     - over cosine_distance_pct_change

    Rows are ordered by model, then config, then descending n.
    """
    group_keys = ["model", "config", cluster_col]

    # Output column name -> (source column, aggregation function).
    named_aggs = {
        "n": ("file_hash", "count"),
        "drift_mean": ("syntactic_drift_pct", "mean"),
        "drift_median": ("syntactic_drift_pct", "median"),
        "drift_std": ("syntactic_drift_pct", "std"),
        "dist_mean": ("cosine_distance_pct_change", "mean"),
        "dist_median": ("cosine_distance_pct_change", "median"),
        "dist_std": ("cosine_distance_pct_change", "std"),
    }

    per_group = df.groupby(group_keys, as_index=False).agg(**named_aggs)
    return per_group.sort_values(
        ["model", "config", "n"], ascending=[True, True, False]
    )

In [29]:
# --- Load results from all models/configs (missing files are skipped) ---
df_results = load_all_results(PATH_OF_DECOMPILATION_RESULTS, strict=False)
if df_results.empty:
    raise RuntimeError("No results CSVs were loaded. Check your paths.")

# --- Load clusters; unlike the per-model results, this file is mandatory ---
if not os.path.exists(CLUSTER_CSV_PATH):
    raise FileNotFoundError(f"Cluster CSV not found: {CLUSTER_CSV_PATH}")

df_clusters = pd.read_csv(CLUSTER_CSV_PATH)

needed_cluster_cols = {"file_hash", CLUSTER_COL}
missing = needed_cluster_cols.difference(df_clusters.columns)
if missing:
    raise ValueError(f"Cluster CSV is missing columns: {sorted(missing)}")

# Keep one cluster label per file_hash (the first one, if duplicates exist)
df_clusters = df_clusters[["file_hash", CLUSTER_COL]].drop_duplicates(subset="file_hash")

# --- Attach the cluster label to every results row ---
# Left join: results rows without a cluster label are kept.
# many_to_one: many results rows per file_hash is OK, but each file_hash must
# appear at most once on the cluster side.
df_merged = df_results.merge(
    df_clusters,
    on="file_hash",
    how="left",
    validate="many_to_one",
)

# If you want to drop rows with no cluster label, uncomment:
# df_merged = df_merged.dropna(subset=[CLUSTER_COL]).copy()

# Discard rows lacking any of the four distance inputs
# (both the plain and the cosine-distance columns).
df_merged = df_merged.dropna(
    subset=[
        "d_lookup_vs_decompiled",
        "d_lookup_vs_repaired",
        "d_lookup_vs_decompiled_cosine_distance",
        "d_lookup_vs_repaired_cosine_distance",
    ],
)

# --- Compute percent-change metrics ---
df_merged = compute_percent_changes(df_merged)

# --- Cluster-based summary: round to 2 decimals, then drop the group-size column ---
cluster_stats = (
    summarize_by_cluster(df_merged, CLUSTER_COL)
    .round(2)
    .drop(columns=["n"], errors="ignore")
)

# --- Save outputs ---
df_merged.to_csv(OUT_MERGED_CSV, index=False)
cluster_stats.to_csv(OUT_CLUSTER_STATS_CSV, index=False)

print(f"[ok] merged+metrics saved: {OUT_MERGED_CSV}")
print(f"[ok] cluster summary saved: {OUT_CLUSTER_STATS_CSV}")
print(cluster_stats)

[ok] merged+metrics saved: /home/diogenes/pylingual_colaboration/pylingual_download/code/dataset/decomp_results_with_clusters_and_metrics.csv
[ok] cluster summary saved: /home/diogenes/pylingual_colaboration/pylingual_download/code/dataset/cluster_summary_stats.csv
           model    config    cluster  drift_mean  drift_median  drift_std  \
2    deepseek-r1  config_0  cluster_4       38.34          0.00     126.23   
0    deepseek-r1  config_0  cluster_1        5.48          0.00      34.60   
1    deepseek-r1  config_0  cluster_3       15.95         15.95      27.59   
3    deepseek-r1  config_1  cluster_4      250.97        250.97        NaN   
5   gemini-flash  config_0  cluster_1       77.29          0.00     666.72   
..           ...       ...        ...         ...           ...        ...   
67       qwen-7b  config_2  cluster_4       32.56          0.70      72.01   
66       qwen-7b  config_2  cluster_3      -39.59        -24.22      44.65   
68       qwen-7b  config_2  clus