# 🙌 Benchmark

In this notebook we benchmark the MCQ performance of our fine-tuned Phi-3 models against the base Phi-3 model and other baselines. We follow the methodology outlined in `EVAL.md` and load the results from the folder `model/results`.

## Setup

---

Let's install some necessary dependencies and set global variables.

In [None]:
# Enable R magic
%load_ext rpy2.ipython

In [None]:
import autorootcwd

In [None]:
%reload_ext autoreload
%autoreload 2

In [None]:
# Modules
import os
import json
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from typing import Dict

In [None]:
%%R
# R Modules
library(ggplot2)

In [None]:
# Styling options: seaborn "whitegrid" style and "colorblind" palette as defaults for the Python figures below
sns.set_style("whitegrid")
sns.set_palette("colorblind")

In [None]:
# Tasks to keep from a results file: maps the task key used in the `results`
# entry of `results.json` to a (benchmark group, display name) pair.
# The insertion order of this dict defines the row order produced by `load_results`.
INDEX_SELECTOR = {
    "openbookqa": ("OBQA", "OBQA"),
    "mmlu_abstract_algebra": ("MMLU", "Abstract Algebra"),
    "mmlu_anatomy": ("MMLU", "Anatomy"), 
    "mmlu_astronomy": ("MMLU", "Astronomy"),
    "mmlu_college_biology": ("MMLU", "College Biology"),
    "mmlu_college_chemistry": ("MMLU", "College Chemistry"),
    "mmlu_college_computer_science": ("MMLU", "College Computer Science"),
    "mmlu_college_mathematics": ("MMLU", "College Mathematics"),
    "mmlu_college_physics": ("MMLU", "College Physics"),
    "mmlu_computer_security": ("MMLU", "Computer Security"),
    "mmlu_conceptual_physics": ("MMLU", "Conceptual Physics"),
    "mmlu_electrical_engineering": ("MMLU", "Electrical Engineering"),
    "mmlu_elementary_mathematics": ("MMLU", "Elementary Mathematics"),
    "mmlu_high_school_biology": ("MMLU", "High School Biology"),
    "mmlu_high_school_chemistry": ("MMLU", "High School Chemistry"),
    "mmlu_high_school_computer_science": ("MMLU", "High School Computer Science"),
    "mmlu_high_school_mathematics": ("MMLU", "High School Mathematics"),
    "mmlu_high_school_physics": ("MMLU", "High School Physics"),
    "mmlu_high_school_statistics": ("MMLU", "High School Statistics"),
    "mmlu_machine_learning": ("MMLU", "Machine Learning"),
    "gpqa_main_zeroshot": ("GPQA", "GPQA Main (Zero-Shot)"),
    "gpqa_extended_zeroshot": ("GPQA", "GPQA Extended (Zero-Shot)"),
    "gpqa_diamond_zeroshot": ("GPQA", "GPQA Diamond (Zero-Shot)"),
    "arc_challenge": ("ARC", "ARC Challenge"),
    "arc_easy": ("ARC", "ARC Easy"),
    "sciq": ("SciQ", "SciQ"),
} 

# Metrics to keep from a results file: maps the raw metric key to the column
# name used throughout this notebook.
COLUMN_SELECTOR = {
    "acc,none": "Accuracy",
    "acc_stderr,none": "SE",
}

In [None]:
# Helpers
def load_results(model: str) -> pd.DataFrame:
    """Load the benchmark scores of one model as a (Group, Task)-indexed table.

    Reads ``results/<model>/results.json`` relative to the current working
    directory, keeps only the tasks listed in ``INDEX_SELECTOR`` and the
    metrics listed in ``COLUMN_SELECTOR``, and renames both to their display
    names.

    Args:
        model: Name of the sub-directory of ``results/`` holding the model's output.

    Returns:
        DataFrame indexed by (``Group``, ``Task``) with the columns ``Accuracy``
        and ``SE``; rows follow the order of ``INDEX_SELECTOR``.

    Raises:
        FileNotFoundError: If the results file does not exist.
        KeyError: If the file has no ``results`` entry, or a selected task or
            metric is missing from it.
    """
    path = os.path.join("results", model, "results.json")
    with open(path, "r") as f:
        # Named `raw` rather than `eval` so the builtin is not shadowed
        raw = json.load(f)

    # One row per task, one column per metric
    results = pd.DataFrame(raw["results"]).transpose()

    # Raw task key -> display name, and display name -> benchmark group
    task_names = {key: task for key, (_, task) in INDEX_SELECTOR.items()}
    task_groups = {task: group for group, task in INDEX_SELECTOR.values()}

    # Rename, then select the wanted tasks/metrics in selector order
    results = (
        results.rename(index=task_names, columns=COLUMN_SELECTOR)
        .loc[list(task_names.values()), list(COLUMN_SELECTOR.values())]
        .rename_axis("Task")
        .reset_index()
    )

    # Look the group up per task instead of relying on positional alignment
    results["Group"] = results["Task"].map(task_groups)

    # Create multi-index
    return results.set_index(["Group", "Task"])

def load_epfl_samples(model: str) -> pd.DataFrame:
    """Load the per-question EPFL MCQ samples of one model.

    Reads ``results/<model>/samples_epfl-mcq.json`` (one JSON object per line,
    relative to the current working directory) and flattens each sample into a
    single row.

    Args:
        model: Name of the sub-directory of ``results/`` holding the model's output.

    Returns:
        DataFrame with the columns ``Subject``, ``Question``, ``A``-``D`` (the four
        answer choices), ``Target`` (first element of the sample's ``target``),
        ``Prediction`` (letter of the first response flagged ``"True"``, or
        ``None`` when no response is flagged) and ``Correct`` (the sample's ``acc``).

    Raises:
        FileNotFoundError: If the samples file does not exist.
    """
    # Load samples (JSON lines); blank lines are skipped
    path = os.path.join("results", model, "samples_epfl-mcq.json")
    with open(path, "r") as f:
        records = [json.loads(line) for line in f if line.strip()]

    # Create dataframe
    samples = pd.DataFrame(records)

    # Flatten the question document into plain columns
    samples["Subject"] = samples["doc"].apply(lambda doc: doc["subject"])
    samples["Question"] = samples["doc"].apply(lambda doc: doc["question"])
    for position, letter in enumerate("ABCD"):
        samples[letter] = samples["doc"].apply(lambda doc, i=position: doc["choices"][i])
    samples["Target"] = samples["target"].apply(lambda target: target[0])

    # Second entry of each response's first element is a "True"/"False" string flag;
    # the first flagged choice is taken as the prediction.
    predictions = []
    for flags in samples["resps"].apply(lambda resps: [resp[0][1] for resp in resps]):
        try:
            predictions.append(chr(65 + flags.index("True")))
        except ValueError:
            # No choice was flagged: keep the prediction empty rather than failing
            predictions.append(None)
    samples["Prediction"] = predictions

    samples["Correct"] = samples["acc"]

    # Select only these columns
    return samples[["Subject", "Question", "A", "B", "C", "D", "Target", "Prediction", "Correct"]]

## Baselines

### Phi-3

---

* Model Path: `microsoft/Phi-3-mini-4k-instruct`
* Results Path: `model/results/phi3`

In [None]:
# Load the Phi-3 baseline scores from results/phi3/results.json
phi3 = load_results("phi3")

### LLama3-8B Instruct

---

* Model Path: `meta-llama/Meta-Llama-3-8B-Instruct`
* Results Path: `model/results/llama3`

In [None]:
# Load the Llama 3 baseline scores from results/llama3/results.json
llama3 = load_results("llama3")

### OpenELM-3B Instruct

---

* Model Path: `apple/OpenELM-3B-Instruct`
* Model Name: `OpenELM-3B Instruct`
* Results Path: `model/results/openelm`

In [None]:
# Load the OpenELM baseline scores from results/openelm/results.json
openelm = load_results("openelm")

## Fine-Tuned Models

### DPO Phi3

---

* Model Path: `cs552-mlp/phi3-dpo`
* Results Path: `model/results/phi3-dpo`

In [None]:
# Load the DPO fine-tune scores from results/phi3-dpo/results.json
phi3_dpo = load_results("phi3-dpo")

### Phi-3 SciQ

---

* Model Path: `cs552-mlp/phi3-sciq`
* Results Path: `model/results/phi3-sciq3`

In [None]:
# Load the SciQ fine-tune scores from results/phi3-sciq3/results.json
phi3_sciq = load_results("phi3-sciq3")

### Phi-3 Arc

---

* Model Path: `cs552-mlp/phi3-arc`
* Results Path: `model/results/phi3-arc3`

In [None]:
# Load the ARC fine-tune scores from results/phi3-arc3/results.json
phi3_arc = load_results("phi3-arc3")

### Phi-3 OpenBookQA

---

* Model Path: `cs552-mlp/phi3-openbookqa`
* Results Path: `model/results/phi3-openbookqa3`

In [None]:
# Load the OpenBookQA fine-tune scores from results/phi3-openbookqa3/results.json
phi3_openbookqa = load_results("phi3-openbookqa3")

### Phi-3 MCQ

---

Trained on all MCQ datasets (OpenBookQA, ARC, SciQ)

* Model Path: `cs552-mlp/phi3-mcq`
* Results Path: `model/results/phi3-mcq3`

In [None]:
# Load the combined-MCQ fine-tune scores from results/phi3-mcq3/results.json
phi3_mcq = load_results("phi3-mcq3")

## Quantised Models

### Phi-3 Arc GPTQ 8b

---

* Model Path: `cs552-mlp/phi3-lora-arc-gptq-8b`
* Results Path: `model/results/phi3-arc3-gptq-8b`

In [None]:
# Load the 8-bit GPTQ scores from results/phi3-arc3-gptq-8b/results.json
phi3_arc_gptq_8b = load_results("phi3-arc3-gptq-8b")

### Phi-3 Arc GPTQ 4b

---

* Model Path: `cs552-mlp/phi3-lora-arc-gptq-4b`
* Results Path: `model/results/phi3-arc3-gptq-4b`

In [None]:
# Load the 4-bit GPTQ scores from results/phi3-arc3-gptq-4b/results.json
phi3_arc_gptq_4b = load_results("phi3-arc3-gptq-4b")

### Phi-3 Arc GPTQ 3b

---

* Model Path: `cs552-mlp/phi3-lora-arc-gptq-3b`
* Results Path: `model/results/phi3-arc3-gptq-3b`

In [None]:
# Load the 3-bit GPTQ scores from results/phi3-arc3-gptq-3b/results.json
phi3_arc_gptq_3b = load_results("phi3-arc3-gptq-3b")

### Phi-3 Arc GPTQ 2b

---

* Model Path: `cs552-mlp/phi3-lora-arc-gptq-2b`
* Results Path: `model/results/phi3-arc3-gptq-2b`

In [None]:
# Load the 2-bit GPTQ scores from results/phi3-arc3-gptq-2b/results.json
phi3_arc_gptq_2b = load_results("phi3-arc3-gptq-2b")

## Analysis

---

We combine the benchmark results from all baseline and fine-tuned models and analyze the performance of each model.

### Quantitative Results

In [None]:
# Registry of every evaluated model:
# display name -> (score table, "Baseline"/"Finetuned", "Unquantised"/"Quantised")
models = {
    "OpenELM": (openelm, "Baseline", "Unquantised"),
    "LLama": (llama3, "Baseline", "Unquantised"),
    "Phi-3": (phi3, "Baseline", "Unquantised"),
    "Phi-3-DPO": (phi3_dpo, "Finetuned", "Unquantised"),
    "Phi-3-SciQ": (phi3_sciq, "Finetuned", "Unquantised"),
    "Phi-3-OBQA": (phi3_openbookqa, "Finetuned", "Unquantised"),
    "Phi-3-Arc": (phi3_arc, "Finetuned", "Unquantised"),
    "Phi-3-MCQ": (phi3_mcq, "Finetuned", "Unquantised"),
    "GPTQ-8b": (phi3_arc_gptq_8b, "Finetuned", "Quantised"),
    "GPTQ-4b": (phi3_arc_gptq_4b, "Finetuned", "Quantised"),
    "GPTQ-3b": (phi3_arc_gptq_3b, "Finetuned", "Quantised"),
    "GPTQ-2b": (phi3_arc_gptq_2b, "Finetuned", "Quantised"),
}

# Split the registry into three parallel lists (same order as `models`)
model_df, baseline, quantised = (list(column) for column in zip(*models.values()))

# Stack all score tables; each table's (name, family, quantisation) key becomes
# three leading index levels, which reset_index turns into ordinary columns.
combined = (
    pd.concat(model_df, keys=list(zip(models, baseline, quantised)), axis=0)
    .reset_index()
    .rename(columns={"level_0": "Model", "level_1": "Baseline", "level_2": "Quantised"})
)

# Express accuracy and standard error in percent
for metric in ("Accuracy", "SE"):
    combined[metric] = (combined[metric] * 100).astype(float)

combined

In [None]:
# To LaTeX
def format_for_latex(scores: pd.DataFrame, caption: str, label: str, drop_col = ["Baseline", "Quantised"], agg: bool = True) -> "tuple[pd.DataFrame, str]":
    """Pivot scores into a models-as-columns table and render it as LaTeX.

    Args:
        scores: Long-format scores with the columns ``Model``, ``Group``,
            ``Baseline``, ``Quantised``, ``Accuracy`` and ``SE`` (plus ``Task``
            when ``agg`` is False).
        caption: Caption of the LaTeX table.
        label: Label of the LaTeX table.
        drop_col: Columns removed before pivoting. The default list is only
            read, never mutated.
        agg: If True, average ``Accuracy`` and ``SE`` over the tasks of each
            (model, group); if False, keep one row per task.

    Returns:
        A pair ``(table, latex)``: the pivoted DataFrame whose cells are
        ``"<accuracy> ± <se>"`` strings (one column per model), and the LaTeX
        source of that table with ``\\centering`` inserted after ``\\begin{table}``.
    """
    latex_df = scores.copy()

    # Aggregate tasks
    if agg:
        latex_df = latex_df.groupby(["Model", "Group", "Baseline", "Quantised"])\
            .agg({"Accuracy": "mean", "SE": "mean"}).reset_index()

    # Combine accuracy and std. error into one display string per row
    latex_df["Acc. ± SE"] = [
        f"{accuracy:.1f} ± {se:.1f}"
        for accuracy, se in zip(latex_df["Accuracy"], latex_df["SE"])
    ]
    latex_df = latex_df.drop(columns=["Accuracy", "SE"])

    # Drop user-specified columns
    latex_df = latex_df.drop(columns=drop_col)

    # Move the models into the columns
    index_cols = ["Model", "Group"] + ([] if agg else ["Task"])
    latex_df = latex_df.set_index(index_cols).unstack("Model")

    # Remove the "Acc. ± SE" level so that only the model names remain
    latex_df.columns = latex_df.columns.droplevel(0)

    # Unname index and columns. `names` (not `name`) is assigned so that this
    # also clears the (Group, Task) MultiIndex produced when agg=False.
    latex_df.index.names = [None] * latex_df.index.nlevels
    latex_df.columns.name = None

    # Convert to latex
    latex = latex_df.to_latex(caption=caption, label=label, position="h")

    # Post-process
    def add_centering(latex_code):
        """Insert ``\\centering`` right after the first ``\\begin{table}`` line."""
        lines = latex_code.split('\n')
        for i, line in enumerate(lines):
            if line.strip().startswith(r'\begin{table}'):
                lines.insert(i + 1, r'\centering')
                break
        return '\n' + '\n'.join(lines)

    latex = add_centering(latex)

    return latex_df, latex

In [None]:
def write_latex(latex: str, path: str) -> None:
    """Write LaTeX source to ``path``, replacing any existing file.

    The file is written as UTF-8 regardless of the platform default, because
    the generated tables contain the non-ASCII character "±".

    Args:
        latex: LaTeX source to write.
        path: Destination file; its parent directory must already exist.
    """
    with open(path, "w", encoding="utf-8") as f:
        f.write(latex)

In [None]:
def format_for_plot(scores: pd.DataFrame) -> pd.DataFrame:
    """Average accuracy and standard error per model and benchmark group.

    Args:
        scores: Long-format scores with the columns ``Model``, ``Group``,
            ``Baseline``, ``Quantised``, ``Accuracy`` and ``SE``.

    Returns:
        One row per (Model, Group, Baseline, Quantised) combination holding the
        mean ``Accuracy`` and mean ``SE`` over that combination's rows.
    """
    group_keys = ["Model", "Group", "Baseline", "Quantised"]
    averaged_metrics = dict.fromkeys(["Accuracy", "SE"], "mean")

    # as_index=False keeps the group keys as regular columns
    return scores.copy().groupby(group_keys, as_index=False).agg(averaged_metrics)

In [None]:
%%R

# Small-multiples benchmark plot: one panel per benchmark group, one bar per model.
#
# df:    data frame with the columns Model, Group, Accuracy and SE
# path:  file the figure is saved to (5 x 4 in, 300 dpi)
# title: plot title
# order: character vector of model names giving the factor levels of Model
#
# Saves the figure to `path` and returns the ggplot object.
plot.benchmark.multiples <- function(df, path, title, order) {
    df$Model <- factor(df$Model, levels=order)

    p <- ggplot(df, aes(x=Group, y=Accuracy, fill=Model)) +
        geom_bar(width=.8, stat="identity", position = position_dodge(width = .9, preserve = "single"), linewidth=0.25, linetype="solid", color="black") +
        # One panel per group with its own x axis; the strip titles are hidden in theme() below
        facet_wrap(~Group, scales="free_x", ncol=3) +
        labs(
            title=title,
            x=NULL,
            y=NULL
        ) +
        # Error bars span Accuracy +/- SE
        geom_errorbar(aes(ymin=Accuracy-SE, ymax=Accuracy+SE), width=.2, position=position_dodge(.9)) +
        theme_minimal() +
        theme(legend.position="bottom", strip.text.x = element_blank(), panel.grid.minor = element_blank()) +
        scale_fill_brewer(palette = "Blues")

    # Save plot
    ggsave(path, plot=p, width=5, height=4, units="in", dpi=300)
    p
}

In [None]:
%%R

# Grouped bar chart of accuracy per benchmark group, one bar per model.
#
# df:    data frame with the columns Model, Group, Accuracy and SE
# path:  file the figure is saved to (10 x 3 in, 300 dpi)
# title: plot title
# order: character vector of model names giving the factor levels of Model
#
# Saves the figure to `path` and returns the ggplot object.
plot.benchmark <- function(df, path, title, order) {
    df$Model <- factor(df$Model, levels=order)

    p <- ggplot(df, aes(x=Group, y=Accuracy, fill=Model)) +
        geom_bar(width=.8, stat="identity", position = position_dodge(width = .9, preserve = "single"), linewidth=0.25, linetype="solid", color="black") +
        labs(
            title=title,
            x="Task",
            y="Accuracy (%)"
        ) +
        # Error bars span Accuracy +/- SE
        geom_errorbar(aes(ymin=Accuracy-SE, ymax=Accuracy+SE), width=.2, position=position_dodge(.9)) +
        theme_minimal() +
        scale_fill_brewer(palette = "Blues") +
        # Fixed left-to-right order of the benchmark groups. The limits must match
        # the Group labels exactly ("ARC", not "Arc"), otherwise that group is dropped.
        scale_x_discrete(limits=c("SciQ", "ARC", "MMLU", "OBQA", "GPQA"))

    # Save plot
    ggsave(path, plot=p, width=10, height=3, units="in", dpi=300)
    p
}

#### Baselines

Here we are just going to compare the baselines against each other.

In [None]:
# Baseline scores: rows of `combined` marked "Baseline" and not marked "Quantised"
baseline_scores = combined[(combined["Baseline"] == "Baseline") & (combined["Quantised"] != "Quantised")]

In [None]:
# LaTeX table: pivot the baseline scores (models as columns) and render them
baseline_latex_df, baseline_latex = format_for_latex(baseline_scores,
    caption="\\textbf{Baseline Results.} Accuracy and Standard Error (SE) for baseline models.",
    label="tab:baseline-benchmark")

# Write the LaTeX source into the report's tables folder
path = "../report/tables/baseline-benchmark.tex"
write_latex(baseline_latex, path)

# Show the pivoted table as the cell output
baseline_latex_df

In [None]:
# Plot data: mean accuracy and SE per model and benchmark group (passed to the R cell below)
baseline_plot_df = format_for_plot(baseline_scores)

In [None]:
%%R -i baseline_plot_df -w 5 -h 4 -u in -r 100

# Draw and save the baseline comparison; `order` fixes the factor levels of Model
plot.benchmark.multiples(baseline_plot_df, 
    path="../report/figures/baseline-benchmark.png",
    title="Baseline Benchmark",
    order=c("OpenELM", "LLama", "Phi-3")
)

#### Fine-Tuning

Here, we are showing the performance of the fine-tuned models against the Phi-3 baseline.

In [None]:
# Fine-tuned scores: the Phi-3 baseline as reference, followed by every
# unquantised fine-tuned model. The baseline is registered under the key
# "Phi-3" in `models`, so that exact spelling has to be used in the filter.
finetuned_scores = pd.concat([
    combined[combined["Model"] == "Phi-3"],
    combined[(combined["Baseline"] == "Finetuned") & (combined["Quantised"] != "Quantised")]
], axis=0)

In [None]:
# LaTeX table: pivot the fine-tuning scores (models as columns) and render them
finetuned_latex_df, finetuned_latex = format_for_latex(finetuned_scores,
    caption="\\textbf{Finetune Results.} Accuracy and Standard Error (SE) for fine-tuned models and Phi-3 baseline.",
    label="tab:finetune-benchmark"
)

# Write the LaTeX source into the report's tables folder
path = "../report/tables/finetuned-benchmark.tex"
write_latex(finetuned_latex, path)

# Show the pivoted table as the cell output
finetuned_latex_df

In [None]:
# Plot data: mean accuracy and SE per model and benchmark group (passed to the R cell below)
finetuned_plot_df = format_for_plot(finetuned_scores)

In [None]:
%%R -i finetuned_plot_df -u in -w 10 -h 3 -r 100

# Draw and save the fine-tuning comparison; `order` fixes the factor levels of Model
plot.benchmark(finetuned_plot_df,
    path="../report/figures/finetuned-benchmark.png",
    title="Finetuned Benchmark",
    order=c("Phi-3", "Phi-3-DPO", "Phi-3-SciQ", "Phi-3-OBQA", "Phi-3-Arc", "Phi-3-MCQ")
)

#### Quantisation

Here we are showing the performance of the quantised models against the unquantised model.

In [None]:
# Quantised scores: the unquantised "Phi-3-Arc" model as reference, followed by
# every row of `combined` marked "Quantised"
quantised_scores = pd.concat([
    combined[combined["Model"] == "Phi-3-Arc"],
    combined[combined["Quantised"] == "Quantised"]
], axis=0)

In [None]:
# LaTeX table: pivot the quantisation scores (models as columns) and render them
quantised_latex_df, quantised_latex = format_for_latex(quantised_scores,
    caption="\\textbf{Quantisation Results.} Accuracy and Standard Error (SE) for quantised models and its baseline.",
    label="tab:quantised-benchmark",
)

# Write the LaTeX source into the report's tables folder
path = "../report/tables/quantised-benchmark.tex"
write_latex(quantised_latex, path)

# Show the pivoted table as the cell output
quantised_latex_df

In [None]:
# Plot data: mean accuracy and SE per model and benchmark group (passed to the R cell below)
quantised_plot_df = format_for_plot(quantised_scores)

In [None]:
%%R -i quantised_plot_df -u in -w 10 -h 3 -r 100

# Draw and save the quantisation comparison; `order` fixes the factor levels of Model
plot.benchmark(quantised_plot_df,
    path="../report/figures/quantised-benchmark.png",
    title="Quantised Benchmark",
    order=c("Phi-3-Arc", "GPTQ-8b", "GPTQ-4b", "GPTQ-3b", "GPTQ-2b")
)

### Qualitative Analysis

We want to understand the difference in model behaviour for two pairs of models:

1. Phi-3 and Phi-3 ARC (Baseline vs. Fine-Tuned)
2. Phi-3 ARC vs. Phi3-ARC GPTQ 4b (Fine-Tuned vs. Quantised)

In particular, we will investigate the following:

* Analyse the performance per subject (from MMLU)
* Analyse the answer distribution

In [None]:
def load_mmlu_samples(model: str) -> pd.DataFrame:
    """Load the per-question MMLU samples of one model.

    Reads every file in ``results/<model>`` (relative to the current working
    directory) whose name contains ``"mmlu"``; each file holds one JSON object
    per line. Files are read in sorted name order so the row order does not
    depend on the file system.

    Args:
        model: Name of the sub-directory of ``results/`` holding the model's output.

    Returns:
        DataFrame with one row per question and the columns ``group``
        (College / High School / Elementary / Unknown, derived from the subject
        name), ``subject`` (title-cased subject), ``question``, ``choices``,
        ``target`` (the document's ``answer``), ``answer`` (position of the
        highest log-probability), ``correct`` (``acc`` as bool) and ``logprobs``.

    Raises:
        FileNotFoundError: If the directory is missing or contains no MMLU file.
    """
    # Load all MMLU results
    path = os.path.join("results", model)
    filenames = sorted(name for name in os.listdir(path) if "mmlu" in name)
    if not filenames:
        raise FileNotFoundError(f"No MMLU sample files found in '{path}'")

    all_samples = []
    for filename in filenames:
        with open(os.path.join(path, filename), "r") as f:
            # Blank lines are skipped
            all_samples.extend(json.loads(line) for line in f if line.strip())

    # Process to data frame
    samples = pd.DataFrame(all_samples)

    def get_group(subject: str) -> str:
        """Map a title-cased subject name to its education level."""
        if "College" in subject:
            return "College"
        elif "High School" in subject:
            return "High School"
        elif "Elementary" in subject:
            return "Elementary"
        else:
            return "Unknown"

    def format_subject(raw_subject: str) -> str:
        """Turn e.g. ``college_physics`` into ``College Physics``."""
        # word[:1] (rather than word[0]) tolerates empty parts such as "a__b"
        return " ".join(word[:1].upper() + word[1:] for word in raw_subject.split("_"))

    # Define relevant columns
    samples["subject"] = samples.doc.apply(lambda doc: format_subject(doc["subject"]))
    samples["group"] = samples.subject.apply(get_group)
    samples["question"] = samples.doc.apply(lambda doc: doc["question"])
    samples["choices"] = samples.doc.apply(lambda doc: doc["choices"])
    samples["target"] = samples.doc.apply(lambda doc: doc["answer"])
    # First entry of each response's first element is the log-probability of that choice
    samples["logprobs"] = samples.resps.apply(lambda resps: [float(resp[0][0]) for resp in resps])
    samples["answer"] = samples.logprobs.apply(lambda logprobs: logprobs.index(max(logprobs)))
    samples["correct"] = samples.acc.astype(bool)

    # Select only these columns
    return samples[["group", "subject", "question", "choices", "target", "answer", "correct", "logprobs"]]

In [None]:
def get_scores_per_group(df, model):
    """Accuracy and its standard error per ``group``, best group first.

    Args:
        df: Samples with a ``group`` column and a ``correct`` column.
        model: Label stored in the ``Model`` index level of the result.

    Returns:
        DataFrame indexed by (``Model``, ``Group``) with the columns ``Accuracy``
        (mean of ``correct``) and ``SE`` (standard error of the mean), sorted by
        descending accuracy.
    """
    summary = df.groupby("group")["correct"].agg(["mean", "sem"]).reset_index()
    summary = summary.sort_values("mean", ascending=False)
    summary.columns = ["Group", "Accuracy", "SE"]
    summary.insert(0, "Model", model)

    return summary.set_index(["Model", "Group"])

In [None]:
def get_scores_per_subject(df, model):
    """Accuracy and its standard error per ``subject``, best subject first.

    Args:
        df: Samples with a ``subject`` column and a ``correct`` column.
        model: Label stored in the ``Model`` index level of the result.

    Returns:
        DataFrame indexed by (``Model``, ``Subject``) with the columns
        ``Accuracy`` (mean of ``correct``) and ``SE`` (standard error of the
        mean), sorted by descending accuracy.
    """
    summary = df.groupby("subject")["correct"].agg(["mean", "sem"]).reset_index()
    summary = summary.sort_values("mean", ascending=False)
    summary.columns = ["Subject", "Accuracy", "SE"]
    summary.insert(0, "Model", model)

    return summary.set_index(["Model", "Subject"])

In [None]:
def get_confusion_matrix(a):
    """Count how often each target option was answered with each option.

    Args:
        a: Samples with the columns ``target`` and ``answer``.
            NOTE(review): both are expected to hold 0-based option positions
            (0..3), as produced by ``load_mmlu_samples`` - confirm for other inputs.

    Returns:
        4x4 DataFrame of counts with rows ``Target`` and columns ``Prediction``,
        both labelled ``A``-``D``. Options that never occur as target or as
        prediction get a row/column of zeros, so the shape is always 4x4.
    """
    options = ["A", "B", "C", "D"]
    positions = list(range(len(options)))

    counts = a.groupby(["target", "answer"]).size().unstack(fill_value=0)
    # Force the full option grid; without this a never-predicted option makes the
    # relabelling below fail with a length mismatch.
    conf = counts.reindex(index=positions, columns=positions, fill_value=0)

    # Rename index and columns according to options
    conf.columns, conf.index = options, options
    conf.index.name, conf.columns.name = "Target", "Prediction"

    return conf

#### Phi-3 vs. Phi-3 ARC

In [None]:
# Per-question MMLU samples of the three models compared in this section:
# the Phi-3 baseline, the ARC fine-tune and its 4-bit GPTQ quantisation
phi3_mmlu = load_mmlu_samples("phi3")
phi3_arc_mmlu = load_mmlu_samples("phi3-arc3")
phi3_arc_4b_mmlu = load_mmlu_samples("phi3-arc3-gptq-4b")

In [None]:
# Macro average = mean of the per-subject accuracies; micro average = accuracy over all questions pooled
print(f"Phi3")
print(f"Macro Avg.: {phi3_mmlu.groupby('subject').correct.mean().mean() * 100:.2f}%")
print(f"Micro Avg.: {phi3_mmlu.correct.mean() * 100:.2f}%")

In [None]:
# Macro average = mean of the per-subject accuracies; micro average = accuracy over all questions pooled
print(f"Phi3-ARC")
print(f"Macro Avg.: {phi3_arc_mmlu.groupby('subject').correct.mean().mean() * 100:.2f}%")
print(f"Micro Avg.: {phi3_arc_mmlu.correct.mean() * 100:.2f}%")

In [None]:
# Macro average = mean of the per-subject accuracies; micro average = accuracy over all questions pooled
print(f"Phi3-ARC 4b")
print(f"Macro Avg.: {phi3_arc_4b_mmlu.groupby('subject').correct.mean().mean() * 100:.2f}%")
print(f"Micro Avg.: {phi3_arc_4b_mmlu.correct.mean() * 100:.2f}%")

In [None]:
# Accuracy and SE per education-level group, stacked for the three models
scores_per_group = pd.concat([
    get_scores_per_group(phi3_mmlu, "Phi-3"),
    get_scores_per_group(phi3_arc_mmlu, "Phi-3-ARC"),
    get_scores_per_group(phi3_arc_4b_mmlu, "Phi-3-ARC-4b")
], axis=0)
scores_per_group

In [None]:
# Accuracy and SE per MMLU subject, stacked for the three models
scores_per_subject = pd.concat([
    get_scores_per_subject(phi3_mmlu, "Phi-3"),
    get_scores_per_subject(phi3_arc_mmlu, "Phi-3-ARC"),
    get_scores_per_subject(phi3_arc_4b_mmlu, "Phi-3-ARC-4b")
], axis=0)
# Only preview the first rows
scores_per_subject.head(3)

In [None]:
# Plot scores per subject
plot_df = scores_per_subject.reset_index()
# Accuracy as a percentage for the axis
plot_df["Accuracy"] = plot_df["Accuracy"] * 100
# Restrict the plot to subjects that exist at both high-school and college level
keep_subjects = ["High School Biology", "College Biology", "High School Mathematics", "College Mathematics", "High School Physics", "College Physics"]
plot_df = plot_df[plot_df["Subject"].isin(keep_subjects)]

# Horizontal grouped bars: one group per subject, one bar per model
fig, ax = plt.subplots(1, 1, figsize=(4, 4))
sns.barplot(x="Accuracy", y="Subject", hue="Model", data=plot_df, palette="Blues", ax=ax)
# Increase font sizes
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.ylabel("")
plt.xlabel("Accuracy (%)", fontsize=14)

# Save the figure into the report's figures folder
fig.savefig("../report/figures/mmlu-per-subject.png", dpi=300, bbox_inches="tight")

In [None]:
# Three side-by-side heatmaps of target vs. predicted option on MMLU
fig, ax = plt.subplots(1, 3, figsize=(3 * 2.5, 2.1))
fig.tight_layout(h_pad=5)

phi3_mmlu_conf = get_confusion_matrix(phi3_mmlu)
phi3_arc_mmlu_conf = get_confusion_matrix(phi3_arc_mmlu)
phi3_arc_4b_mmlu_conf = get_confusion_matrix(phi3_arc_4b_mmlu)
# Left: absolute counts of the Phi-3 baseline.
# Middle/right: change in counts relative to that baseline for the ARC fine-tune and its 4-bit quantisation.
sns.heatmap(phi3_mmlu_conf, annot=True, cmap="Blues", fmt="d", ax=ax[0])
sns.heatmap(phi3_arc_mmlu_conf - phi3_mmlu_conf, annot=True, cmap="coolwarm", fmt="d", ax=ax[1])
sns.heatmap(phi3_arc_4b_mmlu_conf - phi3_mmlu_conf, annot=True, cmap="coolwarm", fmt="d", ax=ax[2])
ax[0].set_title("Phi3", fontsize=16)
ax[1].set_title("Phi3-ARC vs. Phi3", fontsize=16);
ax[2].set_title("Phi3-ARC-4b vs. Phi3", fontsize=16);
# Remove the "Target"/"Prediction" axis labels from every panel
for a in ax:
    a.set_xlabel("")
    a.set_ylabel("")


# Save the figure into the report's figures folder
fig.savefig("../report/figures/mmlu-confusion.png", dpi=300, bbox_inches="tight")