In [1]:
import sys; sys.path.append("../utils")
from constants import *
from prompts_science import *

import numpy as np
import pandas as pd

import random
from tqdm import tqdm, trange

In [2]:
v2int = {
    "Incorrect": -1,
    "Misleading": 0,
    "Correct": 1
}

path = f"{gdrive_path}/data/sciencefeedback.jsonl"
df = pd.read_json(path, orient="records", lines=True)
cv = df.loc[df["topic"] == "Health", ["topic", "claim", "verdict"]].values[:150]
topics, claims, verdicts = cv[:, 0], cv[:, 1], cv[:, 2]
ids = np.arange(len(claims))

In [3]:
for model in ["llama3", "mistral", "llama2"]:
    print(f"\n{model}")

    # scoring
    prompts = pd.DataFrame(columns=["claimID", "topic", "claim", "verdict", "prompt"])
    bar = tqdm(total=len(ids), desc="score")
    for id, topic, claim, verdict in zip(ids, topics, claims, verdicts):
        bar.update()
        prompt = chat_templates[model].format(
            INSTRUCTION=template_score.format(CLAIM=claim),
            ANSWER="The above claim is "
        )
        prompts.loc[len(prompts)] = [id, topic, claim, verdict, prompt]
    prompts.to_json(f"{gdrive_path}/prompts_short/{model}/healthfeedback_score.jsonl", orient="records", lines=True)

    # comparison
    prompts = pd.DataFrame(columns=["claimID", "topic", "prompt", "P(Claim 1)"])
    pairs = []
    bar = tqdm(total=len(ids)**2, desc="compare")
    for topic, id1, claim1, v1 in zip(topics, ids, claims, verdicts):
        if topic != "Health": continue
        for topic2, id2, claim2, v2 in zip(topics, ids, claims, verdicts):
            bar.update()
            if topic != topic2: continue
            if id1 == id2: continue
            if (id1, id2) not in pairs:
                pairs.append((id1, id2))
                pairs.append((id1, id2))
                row = [(id1, id2), topic]
                prompt = chat_templates[model].format(
                    INSTRUCTION=template_compare.format(
                        CLAIM1=claim1,
                        CLAIM2=claim2
                    ),
                    ANSWER="Between Claim 1 and Claim 2, the more factually accurate / less ambiguous choice is Claim "
                )
                row.append(prompt)
                if v2int[v1] > v2int[v2]:
                    row.append(1)
                elif v2int[v1] < v2int[v2]:
                    row.append(0)
                else:
                    row.append(0.5)
                prompts.loc[len(prompts)] = row
    prompts.to_json(f"{gdrive_path}/prompts_short/{model}/healthfeedback_compare.jsonl", orient="records", lines=True) 

    # contrast pairs
    for choice in [1, 2]:
        prompts = pd.DataFrame(columns=["claimID", "topic", "prompt", "P(Claim 1)"])
        pairs = []
        bar = tqdm(total=len(ids)**2, desc=f"contrast:{choice}")
        for topic, id1, claim1, v1 in zip(topics, ids, claims, verdicts):
            if topic != "Health": continue
            for topic2, id2, claim2, v2 in zip(topics, ids, claims, verdicts):
                bar.update()
                if topic != topic2: continue
                if id1 == id2: continue
                if (id1, id2) not in pairs:
                    pairs.append((id1, id2))
                    pairs.append((id1, id2))
                    row = [(id1, id2), topic]
                    prompt = chat_templates[model].format(
                        INSTRUCTION=template_compare.format(
                            CLAIM1=claim1,
                            CLAIM2=claim2
                        ),
                        ANSWER=f"Between Claim 1 and Claim 2, the more factually accurate / less ambiguous choice is Claim {choice}"
                    )
                    row.append(prompt)
                    if v2int[v1] > v2int[v2]:
                        row.append(1)
                    elif v2int[v1] < v2int[v2]:
                        row.append(0)
                    else:
                        row.append(0.5)
                    prompts.loc[len(prompts)] = row
        prompts.to_json(f"{gdrive_path}/prompts_short/{model}/healthfeedback_contrast_{choice}.jsonl", orient="records", lines=True)


llama3


score: 100%|██████████| 150/150 [00:00<00:00, 157.09it/s]
compare: 100%|██████████| 22500/22500 [03:08<00:00, 119.62it/s]
contrast:1: 100%|██████████| 22500/22500 [04:03<00:00, 92.43it/s] 



mistral


contrast:2: 100%|██████████| 22500/22500 [05:09<00:00, 72.70it/s] 
score: 100%|██████████| 150/150 [00:00<00:00, 680.34it/s]
compare: 100%|██████████| 22500/22500 [04:05<00:00, 91.55it/s]
contrast:1: 100%|██████████| 22500/22500 [03:38<00:00, 103.08it/s]



llama2


contrast:2: 100%|██████████| 22500/22500 [03:36<00:00, 103.85it/s]
score: 100%|██████████| 150/150 [00:00<00:00, 636.37it/s]
compare: 100%|██████████| 22500/22500 [03:48<00:00, 98.39it/s]
contrast:1: 100%|██████████| 22500/22500 [05:15<00:00, 71.24it/s] 
