In [None]:
%load_ext autoreload
%autoreload 2

from sklearn.metrics import cohen_kappa_score, mean_absolute_error
from transformers import AutoTokenizer
from data.loader import load_data
import glob
import pandas as pd
from pathlib import Path
import re
import numpy as np
import json
import random
from utils.seeds import initialize_seeds

In [None]:
# Regexes for pairwise (A/B) judgements, keyed by judge model name.
# Each entry has a "reasoning" pattern (group 1 = explanation text) and a
# "score" pattern (group 1 = the verdict).
patterns_ab = {
    "Selene-1-Mini-Llama-3.1-8B": {
        "reasoning": re.compile(r"\*\*Reasoning:\*\*(.*?)(\*\*Result:\*\*|$)", re.DOTALL),
        "score": re.compile(r"\*\*Result:\*\*\s+(a|b)", re.IGNORECASE),
    },
    "Flow-Judge-v0.1": {
        # The feedback ends at either </feedback> or the opening <score> tag, so
        # both terminators are grouped together. (Previously the "|" split the
        # whole pattern, so an unclosed <feedback> gave group(1) == None.)
        # The trailing bare <score> alternative is kept so that responses
        # without any <feedback> still match, with group(1) == None as before.
        "reasoning": re.compile(
            r"<feedback>(.*?)(</feedback>|<score>)|(<score>)", re.IGNORECASE | re.DOTALL
        ),
        "score": re.compile(r"<score>\n*(\d+)\n*</score>", re.IGNORECASE),
    },
}

In [None]:
# Regexes for numeric judgements, keyed by judge model name.
# Each entry has a "reasoning" pattern (group 1 = explanation text) and a
# "score" pattern (group 1 = the integer score as text).
patterns = {
    "Selene-1-Mini-Llama-3.1-8B": {
        "reasoning": re.compile(r"\*\*Reasoning:\*\*(.*?)(\*\*Result:\*\*|$)", re.DOTALL),
        "score": re.compile(r"\*\*Result:\*\*\s+(\d+)", re.IGNORECASE),
    },
    "Flow-Judge-v0.1": {
        # The feedback ends at either </feedback> or the opening <score> tag, so
        # both terminators are grouped together. (Previously the "|" split the
        # whole pattern, so an unclosed <feedback> gave group(1) == None.)
        # The trailing bare <score> alternative is kept so that responses
        # without any <feedback> still match, with group(1) == None as before.
        "reasoning": re.compile(
            r"<feedback>(.*?)(</feedback>|<score>)|(<score>)", re.IGNORECASE | re.DOTALL
        ),
        "score": re.compile(r"<score>\n*(\d+)\n*</score>", re.IGNORECASE),
    },
}

In [None]:
def percent_within_one(rater1, rater2, gap=1):
    """
    Compute the percentage of cases where the absolute difference
    between two raters' scores is at most ``gap``.

    Pairs in which either rating is missing (``None`` or ``np.nan``) are
    dropped before the percentage is computed.

    Parameters:
        rater1 (list or array): Ratings from annotator 1.
        rater2 (list or array): Ratings from annotator 2.
        gap (int or float): Largest absolute difference that still counts
            as agreement. Defaults to 1.

    Returns:
        float: Percentage (0-100) of valid pairs within ±gap, or ``nan``
        when there is no valid pair.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    # dtype=float turns None into nan and keeps np.isnan from failing on
    # object arrays.
    rater1 = np.asarray(rater1, dtype=float)
    rater2 = np.asarray(rater2, dtype=float)
    if rater1.shape != rater2.shape:
        raise ValueError(
            f"rater1 and rater2 must have the same shape, got {rater1.shape} and {rater2.shape}"
        )

    # Keep only the positions where both raters gave a score.
    mask = ~np.isnan(rater1) & ~np.isnan(rater2)
    if not mask.any():
        # No valid pair: return nan explicitly instead of np.mean([]) plus a warning.
        return np.nan

    within_gap = np.abs(rater1[mask] - rater2[mask]) <= gap
    return np.mean(within_gap) * 100

## Instructions

In [None]:
# glob returns matches in arbitrary order; sort so the frame built from these
# paths has the same row order on every run and machine.
metrics = sorted(glob.glob("./metrics/*/*/*/instructions*"))

In [None]:
# Collect one record per judged CSV. Paths follow
# ./metrics/<metric>/<judge>/<model>/<file>, so the components are read from
# the END of the path: this does not depend on the leading "./" or on the
# platform's path separator.
judge = []
role = []
response = []
metric = []
model = []
for m in metrics:
    metric_name, judge_name, model_name, file_name = Path(m).parts[-4:]
    judge.append(judge_name)
    metric.append(metric_name)
    model.append(model_name)
    role.append(file_name)
    # The column holding the judge's raw responses is named after the metric.
    response.append(pd.read_csv(m)[metric_name].tolist())

In [None]:
# One row per judged file; "response" holds the list of raw judge outputs.
df = pd.DataFrame(dict(judge=judge, role=role, response=response, metric=metric, model=model))

In [None]:
# For every raw response, keep group 1 of the judge-specific "reasoning" regex.
df["reasoning"] = df.apply(
    lambda row: [
        patterns[row["judge"]]["reasoning"].search(text).group(1)
        for text in row["response"]
    ],
    axis=1,
)

In [None]:
def _extract_numeric_score(judge_name, text):
    """Return the integer score found in one judge response.

    The judge-specific "score" regex from ``patterns`` is tried first; if it
    does not match, a free-text "score of N" mention (single digit) is used.

    Parameters:
        judge_name (str): Key into ``patterns``.
        text (str): Raw judge response.

    Returns:
        int: The extracted score.

    Raises:
        ValueError: If neither pattern matches the response.
    """
    # Match objects are always truthy, so `or` falls through only on None.
    match = patterns[judge_name]["score"].search(text) or re.search(r"score of (\d)", text)
    if match is None:
        raise ValueError(f"No score found in {judge_name} response: {text[:200]!r}")
    return int(match.group(1))


df["score"] = df.apply(
    lambda row: [_extract_numeric_score(row["judge"], text) for text in row["response"]],
    axis=1,
)

In [None]:
# Display the parsed frame to eyeball the extracted reasoning and score lists.
df

In [None]:
# Sorted unique values of each identifying column.
# Note: `metrics` is rebound here from the list of file paths to metric names.
judges, roles, metrics, models = (
    list(np.unique(df[column])) for column in ("judge", "role", "metric", "model")
)

In [None]:
# Index by (metric, model, judge, role) so a single file's row can be looked up with .loc.
df = df.set_index(keys=["metric", "model", "judge", "role"])

In [None]:
# Flatten the first judge's per-file score lists into parallel columns,
# one entry per scored item, for every metric/role/model combination.
judge1_scores, all_metrics, all_roles, all_models, idxs = [], [], [], [], []
for metric in metrics:
    for role in roles:
        for model in models:
            judge1 = df.loc[(metric, model, judges[0], role)]["score"]
            n_items = len(judge1)
            judge1_scores += judge1
            all_models += [model] * n_items
            all_metrics += [metric] * n_items
            all_roles += [role] * n_items
            # Position of each item inside its file.
            idxs += range(n_items)

In [None]:
# Long-format table; the score column is named after the first judge.
ratings = pd.DataFrame(
    {
        "model": all_models,
        "metric": all_metrics,
        "role": all_roles,
        "idx": idxs,
        judges[0]: judge1_scores,
    }
)

In [None]:
# Display the long-format ratings table before exporting it.
ratings

In [None]:
# to_csv does not create missing parent directories, so make sure "results" exists.
Path("results").mkdir(parents=True, exist_ok=True)
ratings.to_csv("results/instructions_ratings.csv", index=False)

## Interview

In [None]:
# glob returns matches in arbitrary order; sort so the frame built from these
# paths has the same row order on every run and machine.
metrics = sorted(glob.glob("./metrics/*/*/*/interview*"))

In [None]:
# Collect one record per judged CSV, keeping only files produced by a
# Selene judge. Paths follow ./metrics/<metric>/<judge>/<model>/<file>, so the
# components are read from the END of the path: this does not depend on the
# leading "./" or on the platform's path separator.
judge = []
role = []
response = []
metric = []
model = []
for m in metrics:
    metric_name, judge_name, model_name, file_name = Path(m).parts[-4:]
    if "Selene" not in judge_name:
        continue
    judge.append(judge_name)
    metric.append(metric_name)
    model.append(model_name)
    role.append(file_name)
    # The column holding the judge's raw responses is named after the metric.
    response.append(pd.read_csv(m)[metric_name].tolist())

In [None]:
# One row per judged file; "response" holds the list of raw judge outputs.
df = pd.DataFrame(dict(judge=judge, role=role, response=response, metric=metric, model=model))

In [None]:
# For every raw response, keep group 1 of the judge-specific "reasoning" regex.
df["reasoning"] = df.apply(
    lambda row: [
        patterns[row["judge"]]["reasoning"].search(text).group(1)
        for text in row["response"]
    ],
    axis=1,
)

In [None]:
def _extract_numeric_score(judge_name, text):
    """Return the integer score found in one judge response.

    The judge-specific "score" regex from ``patterns`` is tried first; if it
    does not match, a free-text "score of N" mention (single digit) is used.

    Parameters:
        judge_name (str): Key into ``patterns``.
        text (str): Raw judge response.

    Returns:
        int: The extracted score.

    Raises:
        ValueError: If neither pattern matches the response.
    """
    # Match objects are always truthy, so `or` falls through only on None.
    match = patterns[judge_name]["score"].search(text) or re.search(r"score of (\d)", text)
    if match is None:
        raise ValueError(f"No score found in {judge_name} response: {text[:200]!r}")
    return int(match.group(1))


df["score"] = df.apply(
    lambda row: [_extract_numeric_score(row["judge"], text) for text in row["response"]],
    axis=1,
)

In [None]:
# Display the parsed frame to eyeball the extracted reasoning and score lists.
df

In [None]:
# Sorted unique values of each identifying column.
# Note: `metrics` is rebound here from the list of file paths to metric names.
judges, roles, metrics, models = (
    list(np.unique(df[column])) for column in ("judge", "role", "metric", "model")
)

In [None]:
# Index by (metric, model, judge, role) so a single file's row can be looked up with .loc.
df = df.set_index(keys=["metric", "model", "judge", "role"])

In [None]:
# Flatten the first judge's per-file score lists into parallel columns,
# one entry per scored item, for every metric/role/model combination.
judge1_scores, all_metrics, all_roles, all_models, idxs = [], [], [], [], []
for metric in metrics:
    for role in roles:
        for model in models:
            judge1 = df.loc[(metric, model, judges[0], role)]["score"]
            n_items = len(judge1)
            judge1_scores += judge1
            all_models += [model] * n_items
            all_metrics += [metric] * n_items
            all_roles += [role] * n_items
            # Position of each item inside its file.
            idxs += range(n_items)

In [None]:
# Long-format table; the score column is named after the first judge.
ratings = pd.DataFrame(
    {
        "model": all_models,
        "metric": all_metrics,
        "role": all_roles,
        "idx": idxs,
        judges[0]: judge1_scores,
    }
)

In [None]:
# Display the long-format ratings table before exporting it.
ratings

In [None]:
# to_csv does not create missing parent directories, so make sure "results" exists.
Path("results").mkdir(parents=True, exist_ok=True)
ratings.to_csv("results/interview_ratings.csv", index=False)

## bfi

In [None]:
# glob returns matches in arbitrary order; sort so the rows of the exported
# CSV come out in the same order on every run and machine.
metrics = sorted(glob.glob("./metrics/bfi/*/*/*"))

In [None]:
# Collect one record per judged CSV; judge and model are the two directory
# names directly above the file.
role, response, model, judge = [], [], [], []
for m in metrics:
    segments = m.split("/")
    role.append(Path(m).name)
    judge.append(segments[-3])
    model.append(segments[-2])
    response.append(pd.read_csv(m)["bfi"].tolist())

In [None]:
# One row per judged file; "response" holds the list of raw judge outputs.
df = pd.DataFrame(dict(model=model, judge=judge, role=role, response=response))

In [None]:
# For every raw response, keep group 1 of the judge-specific "reasoning" regex.
df["reasoning"] = df.apply(
    lambda row: [
        patterns[row["judge"]]["reasoning"].search(text).group(1)
        for text in row["response"]
    ],
    axis=1,
)

In [None]:
# Parse the integer score out of every raw response with the judge-specific "score" regex.
df["score"] = df.apply(
    lambda row: [
        int(patterns[row["judge"]]["score"].search(text).group(1))
        for text in row["response"]
    ],
    axis=1,
)

In [None]:
# Show which judges are present in the loaded files.
np.unique(df.judge)

In [None]:
# Flatten the per-file score lists of Selene-judged rows into parallel columns.
all_scores, all_roles, all_models, idxs = [], [], [], []
for _, row in df.iterrows():
    if "Selene" in row.judge:
        scores, role, model = row.score, row.role, row.model
        n_items = len(scores)
        all_scores += scores
        all_roles += [role] * n_items
        all_models += [model] * n_items
        # Position of each item inside its file.
        idxs += range(n_items)
    

In [None]:
# Long-format table: one row per scored item.
ratings = pd.DataFrame(
    {
        "model": all_models,
        "role": all_roles,
        "idx": idxs,
        "rating": all_scores,
    }
)

In [None]:
# to_csv does not create missing parent directories, so make sure "results" exists.
Path("results").mkdir(parents=True, exist_ok=True)
ratings.to_csv("results/bfi_ratings.csv", index=False)

## Instruction role specific

In [None]:
# glob returns matches in arbitrary order; sort so the rows of the exported
# CSV come out in the same order on every run and machine.
metrics = sorted(glob.glob("./metrics/instruction_role_specific/*/*/*"))

In [None]:
# Collect one record per judged CSV; judge and model are the two directory
# names directly above the file.
role, response, model, judge = [], [], [], []
for m in metrics:
    segments = m.split("/")
    role.append(Path(m).name)
    judge.append(segments[-3])
    model.append(segments[-2])
    response.append(pd.read_csv(m)["instruction_role_specific"].tolist())

In [None]:
# One row per judged file; "response" holds the list of raw judge outputs.
df = pd.DataFrame(dict(model=model, judge=judge, role=role, response=response))

In [None]:
# For every raw response, keep group 1 of the judge-specific A/B "reasoning" regex.
df["reasoning"] = df.apply(
    lambda row: [
        patterns_ab[row["judge"]]["reasoning"].search(text).group(1)
        for text in row["response"]
    ],
    axis=1,
)

In [None]:
# Seed the RNGs through the project helper before the random A/B fallback used in the next cell.
# NOTE(review): which generators initialize_seeds() seeds is not visible here — confirm it covers np.random.
initialize_seeds()

In [None]:
# Extract the A/B verdict of every response. The score regex is
# case-insensitive, so a matched letter is upper-cased to agree with the
# "A"/"B" labels produced by the fallback (otherwise "a" and "A" would be
# counted as different ratings). Responses without a verdict get a random
# choice, drawn lazily and in order so the seeded random stream is unchanged.
# Each response is searched only once.
df["score"] = df.apply(
    lambda row: [
        found.group(1).upper() if found is not None else np.random.choice(["A", "B"])
        for found in (
            patterns_ab[row["judge"]]["score"].search(text) for text in row["response"]
        )
    ],
    axis=1,
)

In [None]:
# Flatten each file's per-item lists into parallel columns.
# `reasonings` is gathered alongside the ratings.
all_scores, all_roles, all_models, idxs, reasonings = [], [], [], [], []
for _, row in df.iterrows():
    scores, role, model = row.score, row.role, row.model
    n_items = len(scores)
    all_scores += scores
    all_roles += [role] * n_items
    all_models += [model] * n_items
    reasonings += row.reasoning
    # Position of each item inside its file.
    idxs += range(n_items)
    

In [None]:
# Long-format table: one row per judged item.
ratings = pd.DataFrame(
    {
        "model": all_models,
        "role": all_roles,
        "idx": idxs,
        "rating": all_scores,
    }
)

In [None]:
# Overall distribution of the ratings.
ratings.rating.value_counts()

In [None]:
# Rating distribution restricted to items at even positions within their file.
# NOTE(review): presumably a position-bias check (answer order alternating by index) — confirm.
ratings[ratings.idx%2==0].rating.value_counts()

In [None]:
# Rating distribution restricted to items at odd positions within their file.
ratings[ratings.idx%2!=0].rating.value_counts()

In [None]:
# to_csv does not create missing parent directories, so make sure "results" exists.
Path("results").mkdir(parents=True, exist_ok=True)
ratings.to_csv("results/instruction_role_specific_ratings.csv", index=False)

## instruction_general

In [None]:
# glob returns matches in arbitrary order; sort so the rows of the exported
# CSV come out in the same order on every run and machine.
metrics = sorted(glob.glob("./metrics/instruction_general/*/*/*"))

In [None]:
# Collect one record per judged CSV; judge and model are the two directory
# names directly above the file.
role, response, model, judge = [], [], [], []
for m in metrics:
    segments = m.split("/")
    role.append(Path(m).name)
    judge.append(segments[-3])
    model.append(segments[-2])
    response.append(pd.read_csv(m)["instruction_general"].tolist())

In [None]:
# One row per judged file; "response" holds the list of raw judge outputs.
df = pd.DataFrame(dict(model=model, judge=judge, role=role, response=response))

In [None]:
# Seed the RNGs through the project helper before the random A/B fallback used further down.
# NOTE(review): which generators initialize_seeds() seeds is not visible here — confirm it covers the stdlib `random` module.
initialize_seeds()

In [None]:
# Keep group 1 of the A/B "reasoning" regex for every response. A response
# with no match is printed for inspection and stored as None (print's return value).
df["reasoning"] = df.apply(
    lambda row: [
        found.group(1) if found is not None else print(text)
        for text, found in (
            (text, patterns_ab[row["judge"]]["reasoning"].search(text))
            for text in row["response"]
        )
    ],
    axis=1,
)

In [None]:
# Extract the A/B verdict of every response. The score regex is
# case-insensitive, so a matched letter is upper-cased to agree with the
# "A"/"B" labels produced by the fallback (otherwise "a" and "A" would be
# counted as different ratings). Responses without a verdict get a random
# choice, drawn lazily and in order so the seeded random stream is unchanged.
# Each response is searched only once.
df["score"] = df.apply(
    lambda row: [
        found.group(1).upper() if found is not None else random.choice(["A", "B"])
        for found in (
            patterns_ab[row["judge"]]["score"].search(text) for text in row["response"]
        )
    ],
    axis=1,
)

In [None]:
# Flatten each file's per-item lists into parallel columns.
# `reasonings` is gathered alongside the ratings.
all_scores, all_roles, all_models, idxs, reasonings = [], [], [], [], []
for _, row in df.iterrows():
    scores, role, model = row.score, row.role, row.model
    n_items = len(scores)
    all_scores += scores
    all_roles += [role] * n_items
    all_models += [model] * n_items
    reasonings += row.reasoning
    # Position of each item inside its file.
    idxs += range(n_items)
    

In [None]:
# Long-format table: one row per judged item.
ratings = pd.DataFrame(
    {
        "model": all_models,
        "role": all_roles,
        "idx": idxs,
        "rating": all_scores,
    }
)

In [None]:
# Overall distribution of the ratings.
ratings.rating.value_counts()

In [None]:
# Rating distribution restricted to items at even positions within their file.
# NOTE(review): presumably a position-bias check (answer order alternating by index) — confirm.
ratings[ratings.idx%2==0].rating.value_counts()

In [None]:
# Rating distribution restricted to items at odd positions within their file.
ratings[ratings.idx%2!=0].rating.value_counts()

In [None]:
# to_csv does not create missing parent directories, so make sure "results" exists.
Path("results").mkdir(parents=True, exist_ok=True)
ratings.to_csv("results/instruction_general_ratings.csv", index=False)

## xstest

In [None]:
# Regexes for the yes/no xstest judgements, keyed by judge model name.
# NOTE: this rebinds `patterns`, replacing the numeric-score patterns defined
# near the top of the notebook; re-run that earlier cell before re-running
# any of the numeric-score sections.
patterns = {
    "Selene-1-Mini-Llama-3.1-8B": {
        "reasoning": re.compile(r"\*\*Reasoning:\*\*(.*?)(\*\*Result:\*\*|$)", re.DOTALL),
        "score": re.compile(r"\*\*Result:\*\*\s+(yes|no)", re.IGNORECASE),
    },
    "Flow-Judge-v0.1": {
        # The feedback ends at either </feedback> or the opening <score> tag, so
        # both terminators are grouped together. (Previously the "|" split the
        # whole pattern, so an unclosed <feedback> gave group(1) == None.)
        # The trailing bare <score> alternative is kept so that responses
        # without any <feedback> still match, with group(1) == None as before.
        "reasoning": re.compile(
            r"<feedback>(.*?)(</feedback>|<score>)|(<score>)", re.IGNORECASE | re.DOTALL
        ),
        "score": re.compile(r"<score>\n*(\d+)\n*</score>", re.IGNORECASE),
    },
}

In [None]:
# glob returns matches in arbitrary order; sort so the rows of the exported
# CSV come out in the same order on every run and machine.
metrics = sorted(glob.glob("./metrics/xstest/*/*/*"))

In [None]:
# Collect one record per judged CSV; judge and model are the two directory
# names directly above the file.
role, response, model, judge = [], [], [], []
for m in metrics:
    segments = m.split("/")
    role.append(Path(m).name)
    judge.append(segments[-3])
    model.append(segments[-2])
    response.append(pd.read_csv(m)["xstest"].tolist())

In [None]:
# One row per judged file; "response" holds the list of raw judge outputs.
df = pd.DataFrame(dict(model=model, judge=judge, role=role, response=response))

In [None]:
# Keep group 1 of the "reasoning" regex for every response. The same
# `patterns` table is now used both to test for a match and to extract it
# (previously the test used `patterns` while the extraction used
# `patterns_ab`), and each response is searched only once. A response with no
# match is printed for inspection and stored as None (print's return value).
df["reasoning"] = df.apply(
    lambda row: [
        found.group(1) if found is not None else print(text)
        for text, found in (
            (text, patterns[row["judge"]]["reasoning"].search(text))
            for text in row["response"]
        )
    ],
    axis=1,
)

In [None]:
# Keep group 1 of the "score" regex for every response. A response with no
# match is printed for inspection and stored as None (print's return value).
df["score"] = df.apply(
    lambda row: [
        found.group(1) if found is not None else print(text)
        for text, found in (
            (text, patterns[row["judge"]]["score"].search(text))
            for text in row["response"]
        )
    ],
    axis=1,
)

In [None]:
# Flatten each file's per-item lists into parallel columns.
# `reasonings` is gathered alongside the ratings.
all_scores, all_roles, all_models, idxs, reasonings = [], [], [], [], []
for _, row in df.iterrows():
    scores, role, model = row.score, row.role, row.model
    n_items = len(scores)
    all_scores += scores
    all_roles += [role] * n_items
    all_models += [model] * n_items
    reasonings += row.reasoning
    # Position of each item inside its file.
    idxs += range(n_items)
    

In [None]:
# Long-format table: one row per judged item.
ratings = pd.DataFrame(
    {
        "model": all_models,
        "role": all_roles,
        "idx": idxs,
        "rating": all_scores,
    }
)

In [None]:
# to_csv does not create missing parent directories, so make sure "results" exists.
Path("results").mkdir(parents=True, exist_ok=True)
ratings.to_csv("results/xstest_ratings.csv", index=False)