In [None]:
%load_ext autoreload
%autoreload 2

from sklearn.metrics import cohen_kappa_score, mean_absolute_error
from data.loader import load_data
import glob
import pandas as pd
from pathlib import Path
import re
import numpy as np
import json
import random

In [None]:
# Per-judge regular expressions for pulling the free-text rationale ("reasoning")
# and the final verdict ("score") out of a raw judge response. For every
# "reasoning" pattern, group(1) holds the rationale text.
patterns = {
    "Selene-1-Mini-Llama-3.1-8B": {
        # Rationale runs from "**Reasoning:**" up to "**Result:**" or the end of the text.
        "reasoning": re.compile(r"\*\*Reasoning:\*\*(.*?)(\*\*Result:\*\*|$)", re.DOTALL),
        # Verdict is an integer or one of yes/no/a/b, matched case-insensitively.
        "score": re.compile(r"\*\*Result:\*\*\s+(\d+|yes|no|a|b)", re.IGNORECASE),
    },
    "Flow-Judge-v0.1": {
        # Rationale is the <feedback> body. It ends at </feedback>; if the closing
        # tag is missing, it ends at the first <score> instead (the lookahead only
        # lets <score> terminate the body when no </feedback> follows).
        # The trailing bare "(<score>)" alternative is kept so that a response with
        # no <feedback> block still matches, with group(1) set to None.
        "reasoning": re.compile(
            r"<feedback>(.*?)(</feedback>|<score>(?!.*</feedback>))|(<score>)",
            re.IGNORECASE | re.DOTALL,
        ),
        # Verdict is the integer inside <score>...</score>, newlines allowed around it.
        "score": re.compile(r"<score>\n*(\d+)\n*</score>", re.IGNORECASE),
    },
}

In [None]:
def percent_within_one(rater1, rater2, gap=1):
    """
    Compute the percentage of paired ratings whose absolute difference is ≤ ``gap``.

    Pairs in which either rating is missing (``None`` or ``np.nan``) are
    dropped before the percentage is computed.

    Parameters:
        rater1 (list or array): Ratings from annotator 1.
        rater2 (list or array): Ratings from annotator 2 (same shape as rater1).
        gap (int or float): Largest absolute difference that still counts as
            agreement. Defaults to 1.

    Returns:
        float: Percentage (0-100) of valid pairs within ±gap, or ``nan`` when
        there is no valid pair.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    # Casting to float turns None into nan, so np.isnan works on object-like
    # input instead of raising TypeError.
    r1 = np.asarray(rater1, dtype=float)
    r2 = np.asarray(rater2, dtype=float)
    if r1.shape != r2.shape:
        raise ValueError(
            f"rater1 and rater2 must have the same shape, got {r1.shape} and {r2.shape}"
        )

    # Keep only the pairs where both raters supplied a value.
    valid = ~(np.isnan(r1) | np.isnan(r2))
    if not valid.any():
        return float("nan")

    within_gap = np.abs(r1[valid] - r2[valid]) <= gap
    return float(np.mean(within_gap) * 100)

## Interview

In [None]:
metrics = glob.glob(f"./metrics/*/*/gemma-3-27b-it/interview*")

In [None]:
# Collect one record per interview metrics file. The metric name and the judge
# name are read from fixed positions in the "/"-separated path.
judge, role, response, metric = [], [], [], []
for m in metrics:
    path_parts = m.split("/")
    metric_name = path_parts[2]
    judge.append(path_parts[3])
    metric.append(metric_name)
    role.append(Path(m).name)
    # The CSV column holding the raw judge responses is named after the metric.
    response.append(pd.read_csv(m)[metric_name].tolist())

In [None]:
df = pd.DataFrame({"judge": judge, "role": role, "response": response, "metric": metric})

In [None]:
df["reasoning"] = df.apply(lambda x: [patterns[x["judge"]]["reasoning"].search(y).group(1) for y in x["response"]],axis=1)

In [None]:
def _interview_scores(row):
    """Return the integer score parsed from each response in `row`."""
    score_re = patterns[row["judge"]]["score"]
    parsed = []
    for text in row["response"]:
        found = score_re.search(text)
        if found is not None:
            parsed.append(int(found.group(1)))
        else:
            # No judge-formatted verdict: fall back to the first digit in the response.
            parsed.append(int(re.search(r'\d', text).group(0)))
    return parsed


df["score"] = df.apply(_interview_scores, axis=1)

In [None]:
judges = list(np.unique(df.judge))
roles = list(np.unique(df.role))
metrics = list(np.unique(df.metric))

In [None]:
df = df.set_index(["judge", "metric", "role"])

In [None]:
# Flatten both judges' per-(metric, role) score lists into aligned flat lists,
# remembering which metric, role and within-file position each score came from.
judge1_scores, judge2_scores = [], []
all_metrics, all_roles, idxs = [], [], []
first_judge, second_judge = judges[0], judges[1]
for metric in metrics:
    for role in roles:
        judge1 = df.loc[first_judge, metric, role].score
        judge2 = df.loc[second_judge, metric, role].score
        n_items = len(judge1)
        judge1_scores.extend(judge1)
        judge2_scores.extend(judge2)
        all_metrics.extend([metric] * n_items)
        all_roles.extend([role] * n_items)
        idxs.extend(range(n_items))

# Quadratic-weighted agreement between the two judges over all pooled scores.
kappa = cohen_kappa_score(judge1_scores, judge2_scores, weights="quadratic")
print(f"Cohens kappa: {kappa}")
    

In [None]:
ratings = pd.DataFrame({"metric": all_metrics, "role": all_roles, "idx": idxs, judges[0]: judge1_scores, judges[1]: judge2_scores})

In [None]:
interviews = glob.glob(f"./generations/gemma-3-27b-it/inter*")

In [None]:
# Load every interview generation file, keyed by its file name.
dialogues = {}
for interview in interviews:
    role = Path(interview).name
    # The context manager closes the file as soon as it has been parsed.
    with open(interview, "r") as fh:
        dialogues[role] = json.load(fh)

In [None]:
sampled_ratings = pd.read_csv("data/sampled_ratings.csv")

In [None]:
sampled = pd.merge(ratings, sampled_ratings, on=["role", "idx", "metric"], how="right")

In [None]:
cohen_kappa_score(sampled["Flow-Judge-v0.1"], sampled["Selene-1-Mini-Llama-3.1-8B"], weights="quadratic", labels=[1,2,3,4,5])

In [None]:
cohen_kappa_score(sampled["Flow-Judge-v0.1"], sampled["Rating"], weights="quadratic", labels=[1,2,3,4,5])

In [None]:
cohen_kappa_score(sampled["Selene-1-Mini-Llama-3.1-8B"], sampled["Rating"], weights="quadratic", labels=[1,2,3,4,5])

In [None]:
(sampled["Flow-Judge-v0.1"]== sampled["Selene-1-Mini-Llama-3.1-8B"]).mean()

In [None]:
(sampled["Flow-Judge-v0.1"]== sampled["Rating"]).mean()

In [None]:
(sampled["Rating"]== sampled["Selene-1-Mini-Llama-3.1-8B"]).mean()

In [None]:
mean_absolute_error(sampled["Flow-Judge-v0.1"], sampled["Selene-1-Mini-Llama-3.1-8B"])

In [None]:
mean_absolute_error(sampled["Flow-Judge-v0.1"], sampled["Rating"])

In [None]:
mean_absolute_error(sampled["Rating"], sampled["Selene-1-Mini-Llama-3.1-8B"])

In [None]:
mean_absolute_error(ratings["Flow-Judge-v0.1"], ratings["Selene-1-Mini-Llama-3.1-8B"])

In [None]:
(ratings["Flow-Judge-v0.1"]== ratings["Selene-1-Mini-Llama-3.1-8B"]).mean()

In [None]:
percent_within_one(sampled["Flow-Judge-v0.1"], sampled["Rating"])

In [None]:
percent_within_one(sampled["Flow-Judge-v0.1"], sampled["Selene-1-Mini-Llama-3.1-8B"])

In [None]:
percent_within_one(sampled["Rating"], sampled["Selene-1-Mini-Llama-3.1-8B"])

In [None]:
ratings.to_csv("results/interview_ratings.csv", index=False)

## bfi

In [None]:
metrics = glob.glob(f"./metrics/bfi/*/*/*")

In [None]:
# Read every BFI metrics file. The judge and model names are the two directory
# components directly above the file.
role, response, model, judge = [], [], [], []
for m in metrics:
    segments = m.split("/")
    role.append(Path(m).name)
    judge.append(segments[-3])
    model.append(segments[-2])
    response.append(pd.read_csv(m)["bfi"].tolist())

In [None]:
df = pd.DataFrame({"model": model, "judge": judge, "role": role, "response": response})

In [None]:
df["reasoning"] = df.apply(lambda x: [patterns[x["judge"]]["reasoning"].search(y).group(1) for y in x["response"]],axis=1)

In [None]:
df["score"] = df.apply(lambda x: [int(patterns[x["judge"]]["score"].search(y).group(1)) for y in x["response"]],axis=1)

In [None]:
np.unique(df.judge)

In [None]:
# Flatten the per-file score lists into one row per rated item, skipping the
# rows whose judge name contains "Selene".
all_scores, all_roles, all_models, idxs = [], [], [], []
for _, row in df.iterrows():
    if "Selene" not in row.judge:
        scores = row.score
        role = row.role
        model = row.model
        n_items = len(scores)
        all_scores.extend(scores)
        all_roles.extend([role] * n_items)
        all_models.extend([model] * n_items)
        idxs.extend(range(n_items))
    

In [None]:
ratings = pd.DataFrame({"model": all_models, "role": all_roles, "idx": idxs, "rating": all_scores})

In [None]:
# Draw a fixed-seed sample of 50 rated items.
# NOTE(review): with a fixed random_state the chosen rows depend on the row order
# of `ratings`, which follows the order of the glob results it was built from —
# confirm that order is stable on the machine where this is re-run.
sampled = ratings.sample(50, random_state=42)

In [None]:
sampled.tail(n=10)

In [None]:
generations = glob.glob(f"./generations/*/bfi*")

In [None]:
# Load every BFI generation file, keyed by (model directory, file name).
all_generations = {}
for g in generations:
    role = Path(g).name
    model = g.split("/")[-2]
    # The context manager closes the file as soon as it has been parsed.
    with open(g, "r") as fh:
        all_generations[(model, role)] = json.load(fh)

In [None]:
_, inputs = load_data("bfi")

In [None]:
sampled["input"] = sampled.apply(lambda x: inputs[x["idx"]], axis=1)

In [None]:
sampled["output"] = sampled.apply(lambda x: all_generations[(x["model"], x["role"].replace("json.csv", "json"))][x["idx"]], axis=1)

In [None]:
sampled

In [None]:
sampled = sampled.rename(columns=lambda x: x if x != "rating" else "Flow-Judge-v0.1")

In [None]:
# For each sampled item, look up the Selene judge's score for the same
# (model, role) file at the same position.
indexed_df = df.set_index(["model", "judge", "role"])
selene_ratings = [
    indexed_df.loc[rec.model, "Selene-1-Mini-Llama-3.1-8B", rec.role].score[rec.idx]
    for _, rec in sampled.iterrows()
]

In [None]:
sampled["Selene-1-Mini-Llama-3.1-8B"] = selene_ratings

In [None]:
sampled_ratings = pd.read_csv("data/sampled_bfi_ratings.csv")

In [None]:
# Attach the human ratings. `.values` drops the CSV's index, so the assignment is
# purely positional.
# NOTE(review): assumes the CSV rows are in the same order as `sampled` — confirm.
sampled["human"] = sampled_ratings["Rating"].values

In [None]:
sampled[np.abs(sampled["human"] - sampled["Selene-1-Mini-Llama-3.1-8B"])>1]

In [None]:
cohen_kappa_score(sampled["Flow-Judge-v0.1"], sampled["Selene-1-Mini-Llama-3.1-8B"], weights="quadratic", labels=[1,2,3,4,5])

In [None]:
cohen_kappa_score(sampled["Flow-Judge-v0.1"], sampled["human"], weights="quadratic", labels=[1,2,3,4,5])

In [None]:
cohen_kappa_score(sampled["Selene-1-Mini-Llama-3.1-8B"], sampled["human"], weights="quadratic", labels=[1,2,3,4,5])

In [None]:
(sampled["Flow-Judge-v0.1"]== sampled["Selene-1-Mini-Llama-3.1-8B"]).mean()

In [None]:
(sampled["Flow-Judge-v0.1"]== sampled["human"]).mean()

In [None]:
(sampled["human"]== sampled["Selene-1-Mini-Llama-3.1-8B"]).mean()

In [None]:
mean_absolute_error(sampled["Flow-Judge-v0.1"], sampled["Selene-1-Mini-Llama-3.1-8B"])

In [None]:
mean_absolute_error(sampled["Flow-Judge-v0.1"], sampled["human"])

In [None]:
mean_absolute_error(sampled["human"], sampled["Selene-1-Mini-Llama-3.1-8B"])

In [None]:
ratings = ratings.rename(columns=lambda x: x if x != "rating" else "Flow-Judge-v0.1")

In [None]:
# Same lookup as for the sample, now for every row of `ratings`: fetch the
# Selene judge's score at the matching (model, role, position).
indexed_df = df.set_index(["model", "judge", "role"])
selene_judge = "Selene-1-Mini-Llama-3.1-8B"
selene_ratings = []
for _, rec in ratings.iterrows():
    file_scores = indexed_df.loc[rec.model, selene_judge, rec.role].score
    selene_ratings.append(file_scores[rec.idx])

In [None]:
ratings["Selene-1-Mini-Llama-3.1-8B"] = selene_ratings

In [None]:
mean_absolute_error(ratings["Flow-Judge-v0.1"], ratings["Selene-1-Mini-Llama-3.1-8B"])

In [None]:
(ratings["Flow-Judge-v0.1"]== ratings["Selene-1-Mini-Llama-3.1-8B"]).mean()

In [None]:
def percent_within_one(rater1, rater2, gap=1):
    """
    Compute the percentage of paired ratings whose absolute difference is ≤ ``gap``.

    Pairs in which either rating is missing (``None`` or ``np.nan``) are
    dropped before the percentage is computed.

    Parameters:
        rater1 (list or array): Ratings from annotator 1.
        rater2 (list or array): Ratings from annotator 2 (same shape as rater1).
        gap (int or float): Largest absolute difference that still counts as
            agreement. Defaults to 1.

    Returns:
        float: Percentage (0-100) of valid pairs within ±gap, or ``nan`` when
        there is no valid pair.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    # Casting to float turns None into nan, so np.isnan works on object-like
    # input instead of raising TypeError.
    r1 = np.asarray(rater1, dtype=float)
    r2 = np.asarray(rater2, dtype=float)
    if r1.shape != r2.shape:
        raise ValueError(
            f"rater1 and rater2 must have the same shape, got {r1.shape} and {r2.shape}"
        )

    # Keep only the pairs where both raters supplied a value.
    valid = ~(np.isnan(r1) | np.isnan(r2))
    if not valid.any():
        return float("nan")

    within_gap = np.abs(r1[valid] - r2[valid]) <= gap
    return float(np.mean(within_gap) * 100)

In [None]:
percent_within_one(sampled["Flow-Judge-v0.1"], sampled["human"])

In [None]:
percent_within_one(sampled["Flow-Judge-v0.1"], sampled["Selene-1-Mini-Llama-3.1-8B"])

In [None]:
percent_within_one(sampled["human"], sampled["Selene-1-Mini-Llama-3.1-8B"])

In [None]:
percent_within_one(ratings["Flow-Judge-v0.1"], ratings["Selene-1-Mini-Llama-3.1-8B"])

In [None]:
ratings

In [None]:
ratings.to_csv("results/bfi_ratings.csv", index=False)

## Instruction role specific

In [None]:
metrics = glob.glob(f"./metrics/instruction_role_specific/*/gemma*/instruction*")

In [None]:
metrics = [m for m in metrics if "reference" not in m and "empty" not in m]

In [None]:
# Read each metrics file once into a (file name, judge, model, responses) tuple,
# then split the tuples into the four parallel lists used below.
loaded = [
    (
        Path(m).name,
        m.split("/")[-3],
        m.split("/")[-2],
        pd.read_csv(m)["instruction_role_specific"].tolist(),
    )
    for m in metrics
]
role = [entry[0] for entry in loaded]
response = [entry[3] for entry in loaded]
model = [entry[2] for entry in loaded]
judge = [entry[1] for entry in loaded]

In [None]:
df = pd.DataFrame({"model": model, "judge": judge, "role": role, "response": response})

In [None]:
df["reasoning"] = df.apply(lambda x: [patterns[x["judge"]]["reasoning"].search(y).group(1) for y in x["response"]],axis=1)

In [None]:
df["score"] = df.apply(lambda x: [patterns[x["judge"]]["score"].search(y).group(1) if patterns[x["judge"]]["score"].search(y) is not None else print(y) for y in x["response"]],axis=1)

In [None]:
# Flatten the per-file lists into one entry per rated item, keeping the file's
# role and model alongside each score, its reasoning and its position in the file.
all_scores, all_roles, all_models = [], [], []
idxs, reasonings = [], []
for _, row in df.iterrows():
    n_items = len(row.score)
    all_scores += row.score
    all_roles += [row.role] * n_items
    all_models += [row.model] * n_items
    reasonings += row.reasoning
    idxs += range(n_items)
    

In [None]:
ratings = pd.DataFrame({"model": all_models, "role": all_roles, "idx": idxs, "rating": all_scores, "reasoning": reasonings})

In [None]:
ratings.rating.value_counts()

In [None]:
ratings[ratings.idx%2==0].rating.value_counts()

In [None]:
ratings[ratings.idx%2!=0].rating.value_counts()

In [None]:
sampled = ratings.sample(50, random_state=42)

In [None]:
sampled.tail(n=10)

In [None]:
generations = glob.glob(f"./generations/gemma*/instruction*")

In [None]:
# Load each matching generation file, keyed by (model directory, file name).
all_generations = {}
for g in generations:
    role = Path(g).name
    model = g.split("/")[-2]
    # Open inside a `with` block so the handle is released right after parsing.
    with open(g, "r") as fh:
        all_generations[(model, role)] = json.load(fh)

In [None]:
data, inputs = load_data("instruction_role_specific")

In [None]:
sampled["clean_role"] =  sampled.role.apply(lambda x: re.search(rf"(?<=instruction_role_specific_)[^.]*", x).group(0).rsplit("_",maxsplit=1)[0].replace("_", " "))

In [None]:
sampled["input"] = sampled.apply(lambda x: inputs[x["clean_role"]][x["idx"]], axis=1)

In [None]:
responses =sampled.apply(lambda x: all_generations[(x["model"], x["role"].replace("json.csv", "json"))][x["idx"]], axis=1).tolist()

In [None]:
references = sampled.apply(lambda x: data[x["clean_role"]]["answer"][x["idx"]], axis=1).tolist()

In [None]:
# Decide which text is shown as response A and which as B: the reference is A
# for even `idx`, and the two are swapped for odd `idx`.
response_a, response_b = [], []
for position, reference_text, generated_text in zip(sampled["idx"], references, responses):
    if position % 2 != 0:
        first, second = generated_text, reference_text
    else:
        first, second = reference_text, generated_text
    response_a.append(first)
    response_b.append(second)
        

In [None]:
sampled["responseA"] = response_a
sampled["responseB"] = response_b

In [None]:
sampled_ratings = pd.read_csv("data/sampled_instruction_role_specific_ratings.csv")

In [None]:
sampled["human"] = sampled_ratings["Rating"].values

In [None]:
cohen_kappa_score(sampled["rating"], sampled["human"], labels=["A", "B"])

In [None]:
(sampled["human"]== sampled["rating"]).mean()

In [None]:
sampled[sampled.idx%2==0].human.value_counts()

In [None]:
sampled[sampled.idx%2!=0].human.value_counts()

In [None]:
sampled[sampled["rating"] != sampled["human"]]

In [None]:
for r in sampled[sampled["rating"] != sampled["human"]].reasoning:
    print(r)
    print("===============")

## instruction_general

In [None]:
metrics = glob.glob(f"./metrics/instruction_general/*/gemma*/instruction*")
metrics = [m for m in metrics if "reference" not in m and "empty" not in m]

In [None]:
# Read every instruction_general metrics file. The judge and model names are the
# two directory components directly above the file.
role, response, model, judge = [], [], [], []
for m in metrics:
    judge_name, model_name = m.split("/")[-3:-1]
    role.append(Path(m).name)
    judge.append(judge_name)
    model.append(model_name)
    response.append(pd.read_csv(m)["instruction_general"].tolist())

In [None]:
df = pd.DataFrame({"model": model, "judge": judge, "role": role, "response": response})

In [None]:
df["reasoning"] = df.apply(lambda x: [patterns[x["judge"]]["reasoning"].search(y).group(1) if patterns[x["judge"]]["reasoning"].search(y) is not None else print(y) for y in x["response"]],axis=1)

In [None]:
# Judge responses with no parseable verdict fall back to a random "A"/"B".
# Drawing from a dedicated fixed-seed generator keeps that fallback, and every
# statistic computed from it, identical across kernel restarts.
_fallback_rng = random.Random(42)


def _general_scores(row):
    """Return one verdict per response in `row`, guessing when none can be parsed."""
    score_re = patterns[row["judge"]]["score"]
    verdicts = []
    for text in row["response"]:
        found = score_re.search(text)
        if found is not None:
            verdicts.append(found.group(1))
        else:
            verdicts.append(_fallback_rng.choice(["A", "B"]))
    return verdicts


df["score"] = df.apply(_general_scores, axis=1)

In [None]:
# Expand each file-level row into one entry per rated item: its score, its
# reasoning, the owning file's role and model, and its position within the file.
all_scores, all_roles, all_models = [], [], []
idxs, reasonings = [], []
for _, row in df.iterrows():
    file_scores = row.score
    count = len(file_scores)
    all_scores.extend(file_scores)
    all_roles.extend([row.role for _unused in range(count)])
    all_models.extend([row.model for _unused in range(count)])
    reasonings.extend(row.reasoning)
    idxs.extend(range(count))
    

In [None]:
ratings = pd.DataFrame({"model": all_models, "role": all_roles, "idx": idxs, "rating": all_scores, "reasoning": reasonings})

In [None]:
ratings.rating.value_counts()

In [None]:
ratings[ratings.idx%2==0].rating.value_counts()

In [None]:
ratings[ratings.idx%2!=0].rating.value_counts()

In [None]:
sampled = ratings.sample(50, random_state=42)

In [None]:
sampled.head(n=20)

In [None]:
generations = glob.glob(f"./generations/gemma*/instruction_general*")

In [None]:
# Load each instruction_general generation file, keyed by (model directory, file name).
all_generations = {}
for g in generations:
    role = Path(g).name
    model = g.split("/")[-2]
    # `with` guarantees the file is closed once its JSON has been read.
    with open(g, "r") as fh:
        all_generations[(model, role)] = json.load(fh)

In [None]:
data, inputs = load_data("instruction_general")

In [None]:
sampled["clean_role"] =  sampled.role.apply(lambda x: re.search(rf"(?<=instruction_general_)[^.]*", x).group(0).rsplit("_",maxsplit=1)[0].replace("_", " "))

In [None]:
sampled["input"] = sampled.apply(lambda x: inputs[x["clean_role"]][x["idx"]], axis=1)

In [None]:
responses =sampled.apply(lambda x: all_generations[(x["model"], x["role"].replace("json.csv", "json"))][x["idx"]], axis=1).tolist()

In [None]:
references = sampled.apply(lambda x: data[x["clean_role"]]["generated"][x["idx"]][0], axis=1).tolist()

In [None]:
# Build the A/B presentation order: (reference, generated) for even `idx`,
# reversed for odd `idx`.
response_a, response_b = [], []
for position, item_idx in enumerate(sampled["idx"].tolist()):
    pair = (references[position], responses[position])
    if item_idx % 2 != 0:
        pair = pair[::-1]
    response_a.append(pair[0])
    response_b.append(pair[1])
        

In [None]:
sampled["responseA"] = response_a
sampled["responseB"] = response_b

In [None]:
sampled_ratings = pd.read_csv("data/sampled_instruction_general_ratings.csv")

In [None]:
sampled["human"] = sampled_ratings["Rating"].values

In [None]:
cohen_kappa_score(sampled["rating"], sampled["human"], weights="quadratic", labels=["A", "B"])

In [None]:
(sampled["human"]== sampled["rating"]).mean()

In [None]:
sampled[sampled.idx%2==0].human.value_counts()

In [None]:
sampled[sampled.idx%2!=0].human.value_counts()

In [None]:
sampled[sampled["rating"] != sampled["human"]]

In [None]:
for r in sampled[sampled["rating"] != sampled["human"]].reasoning:
    print(r)
    print("===============")

## xstest

In [None]:
metrics = glob.glob(f"./metrics/xstest/*/gemma*/xstest*")
metrics = [m for m in metrics if "reference" not in m and "empty" not in m]

In [None]:
# Read every xstest metrics file. Splitting from the right yields the judge
# directory, the model directory and the file name as the last three components.
role, response, model, judge = [], [], [], []
for m in metrics:
    tail = m.rsplit("/", 3)
    role.append(Path(m).name)
    judge.append(tail[-3])
    model.append(tail[-2])
    response.append(pd.read_csv(m)["xstest"].tolist())

In [None]:
df = pd.DataFrame({"model": model, "judge": judge, "role": role, "response": response})

In [None]:
df["reasoning"] = df.apply(lambda x: [patterns[x["judge"]]["reasoning"].search(y).group(1) for y in x["response"]],axis=1)

In [None]:
df["score"] = df.apply(lambda x: [patterns[x["judge"]]["score"].search(y).group(1) if patterns[x["judge"]]["score"].search(y) is not None else print(y) for y in x["response"]],axis=1)

In [None]:
# Turn the per-file lists into flat, aligned lists with one entry per rated item.
all_scores, all_roles, all_models = [], [], []
idxs, reasonings = [], []
for _, row in df.iterrows():
    item_count = len(row.score)
    positions = list(range(item_count))
    all_scores.extend(row.score)
    all_roles.extend([row.role] * item_count)
    all_models.extend([row.model] * item_count)
    reasonings.extend(row.reasoning)
    idxs.extend(positions)
    

In [None]:
ratings = pd.DataFrame({"model": all_models, "role": all_roles, "idx": idxs, "rating": all_scores, "reasoning": reasonings})

In [None]:
sampled = ratings.sample(50, random_state=42)

In [None]:
sampled.head(n=10)

In [None]:
generations = glob.glob(f"./generations/gemma*/xstest*")

In [None]:
# Load each xstest generation file, keyed by (model directory, file name).
all_generations = {}
for g in generations:
    role = Path(g).name
    model = g.split("/")[-2]
    # Read through a context manager so no file handle is left open.
    with open(g, "r") as fh:
        all_generations[(model, role)] = json.load(fh)

In [None]:
_, inputs = load_data("xstest")

In [None]:
sampled["input"] = sampled.apply(lambda x: inputs[x["idx"]], axis=1)

In [None]:
sampled["output"] = sampled.apply(lambda x: all_generations[(x["model"], x["role"].replace("json.csv", "json"))][x["idx"]], axis=1)

In [None]:
sampled_ratings = pd.read_csv("data/sampled_xstest_ratings.csv")

In [None]:
sampled_ratings = sampled_ratings.rename(columns=lambda x: x if x != "Rating" else "human")

In [None]:
sampled["human"] = sampled_ratings["human"].values

In [None]:
for r in sampled[sampled["rating"] != sampled["human"]].reasoning:
    print(r)

In [None]:
cohen_kappa_score(sampled["rating"], sampled["human"], labels=["No", "Yes"])

In [None]:
(sampled["human"]== sampled["rating"]).mean()

In [None]:
ratings.to_csv("results/xstest_ratings.csv", index=False)