In [1]:
import os, pickle
import pandas as pd
from sklearn.metrics import f1_score
from lpaaj.data import TextDataset
from lpaaj.constants import LORA_RESULTS_PATH, FULL_RESULTS_PATH

In [2]:
# Evaluation aspects (prompt keys) scored for each dataset.
# A dataset without an entry here is looked up with the single default key "prompt".
prompt_keys = {
    'newsroom': [
        'coherence',
        'fluency',
        'informativeness',
        'relevance',
    ],
    'summeval': [
        'coherence',
        'consistency',
        'fluency',
        'relevance',
    ],
    'hanna': [
        'coherence',
        'complexity',
        'empathy',
        'engagement',
        'relevance',
        'surprise',
    ],
    'rocstories': [
        'consistency',
    ],
}

# Label field override per dataset.
# A dataset without an entry here uses the prompt key itself as the label key.
label_keys = {
    'mctaco': 'correct',
    'caters': 'first',
    'rocstories': 'correct',
}

In [3]:
# === LABELS ===
# Load the pairwise-comparison labels for every (dataset, prompt key) combination,
# keyed as "<dataset>-<prompt key>".
datasets = ["newsroom", "summeval", "hanna", "rocstories", "mctaco", "caters"]
labels = {}
for dataset in datasets:
    # Datasets with no configured aspects use the single default key "prompt".
    for pk in prompt_keys.get(dataset, ["prompt"]):
        compare_data = TextDataset(
            task="compare",
            dataset=dataset,
            prompt_key=pk,
            # Fall back to the prompt key when the dataset has no label-key override.
            label_key=label_keys.get(dataset, pk),
        )
        labels[f"{dataset}-{pk}"] = compare_data.labels

loading data from /workspace/LP-as-a-Judge/data/newsroom/newsroom_prompts_compare.jsonl
loading labels from /workspace/LP-as-a-Judge/data/newsroom/newsroom_pairwise_comparisons.jsonl
loading data from /workspace/LP-as-a-Judge/data/newsroom/newsroom_prompts_compare.jsonl
loading labels from /workspace/LP-as-a-Judge/data/newsroom/newsroom_pairwise_comparisons.jsonl
loading data from /workspace/LP-as-a-Judge/data/newsroom/newsroom_prompts_compare.jsonl
loading labels from /workspace/LP-as-a-Judge/data/newsroom/newsroom_pairwise_comparisons.jsonl
loading data from /workspace/LP-as-a-Judge/data/newsroom/newsroom_prompts_compare.jsonl
loading labels from /workspace/LP-as-a-Judge/data/newsroom/newsroom_pairwise_comparisons.jsonl
loading data from /workspace/LP-as-a-Judge/data/summeval/summeval_prompts_compare.jsonl
loading labels from /workspace/LP-as-a-Judge/data/summeval/summeval_pairwise_comparisons.jsonl
loading data from /workspace/LP-as-a-Judge/data/summeval/summeval_prompts_compare.jso

In [4]:
# Score every fine-tuned model: one row per (method, dataset, model) holding the
# micro-F1 averaged over that dataset's prompt keys.
# Rows are collected in a list and the DataFrame is built once at the end instead
# of enlarging it row by row with `.loc`.
records = []
for method in ["lora", "full"]:
    base_path = LORA_RESULTS_PATH if method == "lora" else FULL_RESULTS_PATH
    for dataset in datasets:
        pks = prompt_keys.get(dataset, ["prompt"])
        results_dir = os.path.join(base_path, dataset)
        # os.listdir returns entries in arbitrary order; sort so the row order
        # is reproducible across runs and machines.
        models = sorted(os.listdir(results_dir))
        for model in models:
            scores = []
            for pk in pks:
                filepath = os.path.join(results_dir, model, f"{pk}.pkl")
                # NOTE: pickle.load can execute arbitrary code; only read result
                # files produced by trusted runs.
                with open(filepath, "rb") as f:
                    predictions = pickle.load(f)
                current_labels = labels[f"{dataset}-{pk}"]
                # scikit-learn expects (y_true, y_pred): ground-truth labels first.
                score = f1_score(current_labels, predictions, average="micro")
                scores.append(score)
            records.append(
                {
                    "method": method,
                    "dataset": dataset,
                    "model": model,
                    "f1": sum(scores) / len(scores),
                }
            )

results = pd.DataFrame(records, columns=["method", "dataset", "model", "f1"])

In [6]:
text_quality_datasets = ["newsroom", "summeval", "hanna"]
common_sense_datasets = ["mctaco", "caters", "rocstories"]


def _mean_f1_for(dataset_names, category):
    """Average F1 per (method, model) over `dataset_names`, tagged with `category`.

    Reads the module-level `results` frame and returns a new frame with the
    columns method, model, f1 and category.
    """
    in_category = results['dataset'].isin(dataset_names)
    averaged = results[in_category].groupby(['method', 'model'])['f1'].mean().reset_index()
    averaged['category'] = category
    return averaged


# One averaged frame per dataset category.
text_quality_results = _mean_f1_for(text_quality_datasets, 'text_quality')
common_sense_results = _mean_f1_for(common_sense_datasets, 'common_sense')

# Stack both categories: text-quality rows first, then common-sense rows.
category_results = pd.concat([text_quality_results, common_sense_results], ignore_index=True)
print("Results averaged by dataset category:")
category_results

Results averaged by dataset category:


Unnamed: 0,method,model,f1,category
0,full,gemma-2-2b-it,0.556953,text_quality
1,full,gemma-2-9b-it,0.597837,text_quality
2,full,llama-3.1-8b-it,0.557425,text_quality
3,full,mistral-nemo-12b-it,0.586087,text_quality
4,full,qwen-2.5-0.5b-it,0.464594,text_quality
5,full,qwen-2.5-1.5b-it,0.476574,text_quality
6,full,qwen-2.5-14b-it,0.576812,text_quality
7,full,qwen-2.5-3b-it,0.553805,text_quality
8,full,qwen-2.5-7b-it,0.552733,text_quality
9,lora,gemma-2-27b-it,0.582767,text_quality


In [7]:
# Persist the per-category averages as JSON Lines (one record per line).
output_path = "/workspace/PPairS_results/finetuning_results.jsonl"
# Create the output directory if it is missing so the write does not fail on a fresh machine.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
category_results.to_json(output_path, orient="records", lines=True)