# In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from sklearn.metrics import cohen_kappa_score
from tqdm import tqdm
import pandas as pd
import torch
import re


# Evaluation areas this script knows about; each one has its own checkpoint,
# test CSV and keyword set below.
areas = ["MENTALCHAT", "DEPTHQA"]
# Area evaluated by this run (change the index to switch).
area = areas[1]
# Checkpoint name and test-set path are both derived from the selected area.
MODEL_NAME = f"mghiasvand1/custom_eval_{area}"
DATA_PATH = f"data/synthetic/test_data_{area}.csv"
# Per-area marker phrases used to split prompts into the out-of-domain ("ood")
# and in-domain ("id") subsets. They are searched for in the lower-cased prompt
# text, so every phrase must itself be lower-case.
KEYWORDS = {
    "MENTALCHAT": {
        "ood": [
            ":\nemotional supportiveness",
            # Was "practicality of advicey": the stray trailing "y" kept this
            # phrase from matching prompts that contain "practicality of advice".
            ":\npracticality of advice",
        ],
        "id": [
            ":\nclarity and comprehensibility",
            ":\nbalance between validation and encouragement",
        ],
    },
    "DEPTHQA": {
        "ood": [
            ":\nstep-by-step explanation",
            ":\nterminology accuracy",
        ],
        "id": [
            ":\nuse of definitions",
            ":\nclarity",
        ],
    },
}

# Options forwarded to from_pretrained: float16 weights, permission to run the
# checkpoint's own modelling code, and device placement via device_map="auto".
_model_load_options = {
    "torch_dtype": torch.float16,
    "trust_remote_code": True,
    "device_map": "auto",
}
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **_model_load_options)

# The tokenizer is loaded from the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


# Verdict sentence searched for in the model's answer; the captured digits are
# the score. Compiled once because the function is called for every prompt.
_SCORE_PATTERN = re.compile(r"The final score is (\d+) out of 5")


def _parse_score(response_text):
    """Return the integer from the verdict sentence, or None if it is absent."""
    match = _SCORE_PATTERN.search(response_text)
    return int(match.group(1)) if match else None


def get_model_response(prompt):
    """Generate the model's answer to *prompt* and extract its score.

    The prompt is sent as a single user turn through the tokenizer's chat
    template and decoded greedily (``do_sample=False``).

    Args:
        prompt: Text placed in the user message.

    Returns:
        The integer found in "The final score is N out of 5", or ``None``
        when the generated text does not contain that sentence.
    """
    messages = [{"role": "user", "content": prompt}]
    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    )

    # Send the inputs to wherever the model actually lives instead of assuming
    # "cuda"; the model is loaded with device_map="auto".
    # `temperature` is intentionally not passed: it is unused when
    # do_sample=False and only triggers a generation-config warning.
    output = model.generate(
        input_ids.to(model.device),
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=2048,
        do_sample=False,
    )

    # Decode only the newly generated tokens. Splitting the full decoded
    # sequence on "[|assistant|]" raises IndexError when that marker is not in
    # the decoded text and picks the wrong segment when the prompt contains it.
    generated_ids = output[0][input_ids.shape[-1]:]
    response_text = tokenizer.decode(generated_ids).split("[|endofturn|]")[0].strip()

    return _parse_score(response_text)

# Read the test set and cast its "output" column to the integer gold score.
df = pd.read_csv(DATA_PATH)
df["gold_score"] = df["output"].astype(int)


def _to_int_or_none(value):
    """Convert *value* with int(); return None when that conversion fails."""
    try:
        return int(value)
    except (ValueError, TypeError):
        return None


# One model call per prompt, with a progress bar over the whole "input" column.
llm_scores = [
    _to_int_or_none(get_model_response(prompt)) for prompt in tqdm(df["input"])
]

# Rows without a score are discarded before the column is cast to int.
df["llm_score"] = llm_scores
df = df.dropna(subset=["llm_score"])
df = df.astype({"llm_score": "int"})

def _quadratic_kappa(frame):
    """Return the quadratic weighted kappa of gold vs. LLM scores in *frame*.

    Returns None for an empty frame so that a subset with no rows (or a run in
    which no response could be parsed) is reported instead of being passed to
    cohen_kappa_score.
    """
    if frame.empty:
        return None
    return cohen_kappa_score(
        frame["gold_score"], frame["llm_score"], weights="quadratic")


# Lower-case the prompts once; both keyword searches reuse the result.
_inputs_lower = df["input"].str.lower()

# The keywords are literal phrases, so escape them before joining them into a
# single alternation for str.contains (which treats its pattern as a regex).
pattern_ood = "|".join(re.escape(keyword) for keyword in KEYWORDS[area]["ood"])
df1 = df[_inputs_lower.str.contains(pattern_ood, na=False)]
qwk_1 = _quadratic_kappa(df1)

pattern_id = "|".join(re.escape(keyword) for keyword in KEYWORDS[area]["id"])
df2 = df[_inputs_lower.str.contains(pattern_id, na=False)]
qwk_2 = _quadratic_kappa(df2)

# The overall score gets the same empty-frame guard as the two subsets.
qwk_3 = _quadratic_kappa(df)

print("Out-of-Domain QWK:", qwk_1)
print("In-Domain QWK:", qwk_2)
print("Overall QWK:", qwk_3)