This Jupyter notebook contains all the code for visualizing the LLM evaluation results.

In [None]:
import json
import os
from math import comb
from typing import List
import pandas as pd
import matplotlib.pyplot as plt


def load_json_data(file_path):
    """
    Load and parse a JSON file

    Parameters:
    -----------
    file_path:  str
        Path to the JSON file to read

    Returns:
    --------
    The decoded top-level JSON value (whatever `json.load` yields for the
    file, e.g. a list or a dict)
    """
    # JSON text is UTF-8; do not depend on the platform's default text
    # encoding, which is not UTF-8 on every system.
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)


def _render_eval_table(df, title, png_path):
    """Draw `df` as a styled matplotlib table and save it to `png_path`."""
    n_rows, n_cols = df.shape
    # First column (question text) is wide, all remaining columns are narrow
    column_widths = [0.4] + [0.1] * (n_cols - 1)

    fig, ax = plt.subplots(figsize=(14, min(20, n_rows * 0.5 + 2)))
    try:
        ax.axis("tight")
        ax.axis("off")

        table = ax.table(
            cellText=df.values, colLabels=df.columns, cellLoc="center", loc="center"
        )
        table.auto_set_font_size(False)
        table.set_fontsize(9)
        table.scale(1.2, 1.5)

        # Header row (table row 0)
        for col in range(n_cols):
            header_cell = table[(0, col)]
            header_cell.set_facecolor("#4CAF50")
            header_cell.set_text_props(weight="bold", color="white")

        # Shade every second data row (table rows 2, 4, ...)
        for row in range(2, n_rows + 1, 2):
            for col in range(n_cols):
                table[(row, col)].set_facecolor("#f0f0f0")

        cells = table.get_celld()
        for row in range(n_rows + 1):
            for col, width in enumerate(column_widths):
                cells[(row, col)].set_width(width)

        ax.set_title(title, fontsize=16, fontweight="bold", pad=6)
        fig.tight_layout()
        fig.savefig(png_path, dpi=300, bbox_inches="tight")
    finally:
        # Close this specific figure even if saving fails, so figures do not
        # pile up in memory across files.
        plt.close(fig)


def create_individual_tables(
    file_paths: List[str], output_dir: str, file_labels: List[str] = None
):
    """
    Create a separate table (PNG image + CSV file) for each JSON file

    Each JSON file holds one record or a list of records. Every record is
    read through the keys "question", "difficulty" and "evals", where
    "evals" provides "n", "compile", "func-corr" and "synth".

    Parameters:
    -----------
    file_paths:     List[str]
        List of paths to JSON files
    output_dir:    str
        Output directory path (created if it doesn't exist)
    file_labels:    List[str]
        Optional list of labels for each file; defaults to "File 1",
        "File 2", ... Labels are paired with files by position and are used
        (spaces replaced by "_") to build the output file names

    Returns:
    --------
    List[pd.DataFrame]
        One DataFrame per processed file, in processing order
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Use custom labels or default to File 1, File 2, etc.
    if file_labels is None:
        file_labels = [f"File {i + 1}" for i in range(len(file_paths))]
    elif len(file_labels) != len(file_paths):
        # zip() below stops at the shorter list; report it instead of
        # silently dropping files or labels.
        paired = min(len(file_paths), len(file_labels))
        print(
            f"Warning: {len(file_paths)} file(s) but {len(file_labels)} label(s); "
            f"only the first {paired} will be processed"
        )

    # The evaluation columns carry the same names as the keys under "evals"
    eval_keys = ["n", "compile", "func-corr", "synth"]
    columns = ["Question", "Difficulty"] + eval_keys
    max_question_len = 60

    all_dataframes = []

    # Process each file separately
    for path, label in zip(file_paths, file_labels):
        data = load_json_data(path)
        # Ensure data is in list format
        if not isinstance(data, list):
            data = [data]

        table_data = []
        for item in data:
            # Truncate long questions so they fit the table cell
            question = item["question"]
            if len(question) > max_question_len:
                question = question[: max_question_len - 3] + "..."

            difficulty = item["difficulty"]
            evals = item["evals"]
            table_data.append(
                [question, difficulty] + [evals[key] for key in eval_keys]
            )

        df = pd.DataFrame(table_data, columns=columns)
        all_dataframes.append(df)

        # Build both output names from one stem rather than rewriting ".png"
        # inside the finished path, which would also touch any ".png" that
        # appears in the directory or the label.
        file_stem = os.path.join(output_dir, f"{label.replace(' ', '_')}_table")
        output_filepath = file_stem + ".png"
        csv_file = file_stem + ".csv"

        if df.empty:
            # A table without data rows cannot be drawn; still write the CSV
            print(f"No records in {path}; skipping table image for {label}")
        else:
            _render_eval_table(df, f"{label} - Evaluation Results", output_filepath)
            print(f"Table for {label} saved as {output_filepath}")

        # Save as CSV
        df.to_csv(csv_file, index=False)
        print(f"CSV for {label} saved as {csv_file}")

    return all_dataframes


def pass_at_k(n, c, k):
    """
    Compute the Pass@k probability for a single problem

    Evaluates 1 - C(n - c, k) / C(n, k): the probability that at least one
    of k completions drawn without replacement from the n generated
    completions is one of the c correct ones.

    Parameters:
    -----------
    n:  int
        Number of completions generated
    c:  int
        Number of those completions that are correct
    k:  int
        Number of samples to check

    Returns:
    --------
    float or None
        The Pass@k value; None when n or c is missing (NaN / None)
    """
    if pd.isna(n) or pd.isna(c):
        return None
    n = int(n)
    c = int(c)
    # With no correct completion nothing can pass. Without this guard the
    # shortcut below reports 1.0 whenever k > n (including n == 0).
    if c <= 0:
        return 0.0
    # Fewer than k incorrect completions: every k-sample contains a correct one
    if n - c < k:
        return 1.0
    return 1 - comb(n - c, k) / comb(n, k)


def _mean_pass_at_k(items, metric, k):
    """Average Pass@k of `metric` over `items`, skipping records without a score."""
    scores = [
        pass_at_k(item["evals"]["n"], item["evals"][metric], k) for item in items
    ]
    # pass_at_k yields None for missing counts; those cannot be summed
    scores = [score for score in scores if score is not None]
    if not scores:
        # Empty file or no usable record: there is no average to report
        return float("nan")
    return sum(scores) / len(scores)


def create_comparison_summary(
    file_paths: List[str],
    output_path: str,
    file_labels: List[str] = None,
    k: int = 5,
):
    """
    Create a summary table showing aggregated Pass@k metrics for each file

    Parameters:
    -----------
    file_paths:     List[str]
        List of paths to JSON files
    output_path:    str
        Output directory; the image is written there as
        "summary_pass_<k>.png" (the directory is created if missing)
    file_labels:    List[str]
        Optional list of labels for each file; a file without a matching
        label is shown as "File <position>"
    k:              int
        Number of samples to check

    Returns:
    --------
    pd.DataFrame
        One row per file: label, number of questions and the average Pass@k
        of the "compile", "func-corr" and "synth" metrics (formatted to four
        decimals)
    """
    # Use custom labels or default
    if file_labels is None:
        file_labels = [f"File {i + 1}" for i in range(len(file_paths))]

    metrics = ["compile", "func-corr", "synth"]

    # Calculate summary statistics, one row per file
    summary_data = []
    for idx, path in enumerate(file_paths):
        data = load_json_data(path)
        if not isinstance(data, list):
            data = [data]

        label = file_labels[idx] if idx < len(file_labels) else f"File {idx + 1}"
        averages = [_mean_pass_at_k(data, metric, k) for metric in metrics]
        summary_data.append(
            [label, len(data)] + [f"{average:.4f}" for average in averages]
        )

    # Create DataFrame
    columns = ["File", "Total Questions"] + [
        f"Avg pass@{k}-{metric}" for metric in metrics
    ]
    df_summary = pd.DataFrame(summary_data, columns=columns)

    if df_summary.empty:
        # A table without data rows cannot be drawn
        print("No files to summarise")
        return df_summary

    # Create figure
    fig, ax = plt.subplots(figsize=(12, len(df_summary) * 0.8 + 2))
    try:
        ax.axis("tight")
        ax.axis("off")

        # Create table
        table = ax.table(
            cellText=df_summary.values,
            colLabels=df_summary.columns,
            cellLoc="center",
            loc="center",
        )

        # Style the table
        table.auto_set_font_size(False)
        table.set_fontsize(10)
        table.scale(1.2, 1.8)

        # Color header row
        for col in range(len(columns)):
            header_cell = table[(0, col)]
            header_cell.set_facecolor("#2196F3")
            header_cell.set_text_props(weight="bold", color="white")

        # Color alternating rows (table rows 2, 4, ...)
        for row in range(2, len(df_summary) + 1, 2):
            for col in range(len(columns)):
                table[(row, col)].set_facecolor("#f0f0f0")

        ax.set_title(
            f"Summary Statistics Across Files for Pass@{k}",
            fontsize=16,
            fontweight="bold",
            pad=2,
        )
        fig.tight_layout()

        # The target directory may not exist yet when this runs on its own
        os.makedirs(output_path, exist_ok=True)
        fig.savefig(
            os.path.join(output_path, f"summary_pass_{k}.png"),
            dpi=300,
            bbox_inches="tight",
        )
        plt.show()
    finally:
        # Release the figure once it has been saved and displayed
        plt.close(fig)

    return df_summary


def create_difficulty_lvl_summary(output_path: str, model_labels: List[str] = None, k: int = 5):
    """
    Create a table of the average Pass@k per difficulty level for each LLM

    For every model label the file "<label with spaces as _>_table.csv" is
    read from `output_path`; it must provide the columns "Difficulty", "n",
    "compile", "func-corr" and "synth". The resulting table is saved next to
    it as "<label with spaces as _>_diff_summary.png".

    Parameters:
    -----------
    output_path:     str
        Directory that holds the per-model CSV files and receives the images
    model_labels:    List[str]
        Model labels containing model name - used in reading CSV file(s);
        nothing is done when omitted or empty
    k:              int
        Number of samples to check

    Returns:
    --------
    None
    """
    metrics = ["compile", "func-corr", "synth"]
    # Only these levels are reported, in this order
    difficulty_order = ["basic", "medium", "hard"]

    for model_label in model_labels or []:
        file_stem = model_label.replace(" ", "_")
        csv_filepath = os.path.join(output_path, f"{file_stem}_table.csv")

        print(csv_filepath)

        if not os.path.exists(csv_filepath):
            print("CSV Data of Pass@k metric doesn't exist. Try running again")
            continue

        df = pd.read_csv(csv_filepath)

        # Compute true Pass@k per row for each metric. The columns are named
        # with the full metric name from the start, so "func-corr" is not
        # shortened to "corr" in the table header. dtype=float64 turns the
        # None that pass_at_k returns for missing counts into NaN, which
        # mean() then skips.
        scores = pd.DataFrame(
            {
                f"Avg Pass@{k} ({metric})": pd.Series(
                    [pass_at_k(n, c, k) for n, c in zip(df["n"], df[metric])],
                    index=df.index,
                    dtype="float64",
                )
                for metric in metrics
            }
        )

        # Group by difficulty and average probabilities
        summary_df = (
            scores.groupby(df["Difficulty"], sort=False)
            .mean()
            .round(4)
            .reindex(difficulty_order)
        )

        # reindex() always yields one row per difficulty level, so "no data"
        # shows up as rows made only of NaN rather than as an empty frame.
        if summary_df.dropna(how="all").empty:
            print(f"No data to display for {model_label}")
            continue

        print(f"\nCreating table for model: {model_label}")
        print(summary_df)

        # Plotting: difficulty name (title-cased) followed by the averages
        col_labels = ["Difficulty"] + list(summary_df.columns)
        cell_text = [
            [str(level).title()] + list(row)
            for level, row in zip(summary_df.index, summary_df.values)
        ]

        fig, ax = plt.subplots(figsize=(12, len(summary_df) * 0.8 + 2))
        try:
            ax.axis("off")
            table = ax.table(cellText=cell_text, colLabels=col_labels, loc="center")
            table.auto_set_font_size(False)
            table.set_fontsize(10)
            table.scale(0.8, 1.8)

            for col in range(len(col_labels)):
                header_cell = table[(0, col)]
                header_cell.set_facecolor("#2196F3")
                header_cell.set_text_props(weight="bold", color="white")

            # Emphasise the difficulty name in the first column
            for row in range(1, len(summary_df) + 1):
                table[(row, 0)].set_text_props(weight="bold", ha="center", va="center")

            ax.set_title(
                f"Difficulty-Level Pass@{k} Summary for {model_label}",
                fontsize=14,
                fontweight="bold",
                pad=12,
            )

            fig.tight_layout()
            fig.savefig(
                os.path.join(output_path, f"{file_stem}_diff_summary.png"), dpi=300
            )
        finally:
            # Close this specific figure so figures do not pile up per model
            plt.close(fig)


def find_json_files(directory: str):
    """
    Find all JSON files directly inside a directory (no recursion)

    Parameters:
    -----------
    directory:  str
        Directory (path) to scan

    Returns:
    --------
    List[str]
        Sorted paths (directory joined with the file name) of every regular
        file whose name ends with ".json"
    """
    candidates = (
        os.path.join(directory, name)
        for name in os.listdir(directory)
        if name.endswith(".json")
    )
    # Leave out directories that merely carry a ".json" name; they cannot be
    # opened as JSON files.
    return sorted(path for path in candidates if os.path.isfile(path))


# Example usage
if __name__ == "__main__":
    # Evaluation results are read from a directory one level up
    eval_dataset_path = "dataset/evals"
    target = os.path.join("..", eval_dataset_path)

    file_paths = find_json_files(target)
    # Display names, paired by position with the sorted list of JSON files
    labels = [
        "Claude Opus4",
        "CodeLlama 7B Ins (no-sm)",
        "Deepseek Coder 7B Ins v1.5",
        "OpenAI GPT4.1",
        "Qwen Coder2.5 7B Instruct",
    ]

    # Number of samples to check
    k = 10

    # All three steps read from / write to the same per-k directory
    output_dir = os.path.join("..", eval_dataset_path, f"comparisons_pass_{k}")

    # 1) per-model tables (PNG + CSV), 2) cross-model summary,
    # 3) per-difficulty summary built from the CSVs of step 1
    create_individual_tables(file_paths, output_dir, labels)
    create_comparison_summary(file_paths, output_dir, labels, k=k)
    create_difficulty_lvl_summary(output_dir, labels, k=k)
    print("Script ready for multiple JSON file comparison!")

../dataset/evals/comparisons_pass_10/Claude_Opus4_table.csv

Creating table for model: Claude Opus4
            Avg Pass@10 (compile)  Avg Pass@10 (corr)  Avg Pass@10 (synth)
Difficulty                                                                
basic                       0.600               1.000                0.600
medium                      0.875               0.625                0.875
hard                        1.000               0.400                1.000
../dataset/evals/comparisons_pass_10/CodeLlama_7B_Ins_(no-sm)_table.csv

Creating table for model: CodeLlama 7B Ins (no-sm)
            Avg Pass@10 (compile)  Avg Pass@10 (corr)  Avg Pass@10 (synth)
Difficulty                                                                
basic                        1.00                0.80                  1.0
medium                       0.75                0.25                  1.0
hard                         1.00                0.20                  1.0
../dataset/evals/compariso