In [None]:
from datasets import load_dataset
import pandas as pd

# Load the MedQuad dataset through Hugging Face `datasets`; the cells below read its 'train' split.
d = load_dataset('Amirkid/MedQuad-dataset')

In [None]:
# Peek at one raw record (position 3) of the train split.
d['train'][3]

In [None]:
# Records alternate question, answer, question, answer, ...: even positions
# hold questions and odd positions hold the matching answers.
# Read the 'text' column once instead of decoding one full row per
# d['train'][i] lookup inside a Python loop.
texts = list(d['train']['text'])

summary_prefix = 'Summary : '

q = texts[0::2]
a = [
    # Cut off only the leading marker; str.replace would also delete any
    # later 'Summary : ' occurrence inside the answer text.
    answer[len(summary_prefix):] if answer.startswith(summary_prefix) else answer
    for answer in texts[1::2]
]

In [None]:
# Assemble the MedQuad records in the shared instruction/input/output/dataset
# layout; the scalar values are broadcast to every row.
medquad_columns = {
    'instruction': q,
    'input': '',
    'output': a,
    'dataset': 'MedQuad',
}
df = pd.DataFrame(medquad_columns)

In [None]:
# Keep an independent copy under a dataset-specific name: `df` is reassigned in the SciQ section below.
df_medquad = df.copy()

### SciQ

In [None]:
import numpy as np

from cajajejo.commands.api.utils import MAP

def fmt_output_mc(x):
    """Return the answer text of a multiple-choice row, i.e. its 'correct' field."""
    correct_choice = x['correct']
    return correct_choice

def fmt_q(x):
    """Render one row as a solved prompt (question plus its answer).

    Rows whose 'method' is 'multiple_choice_joint' get one "<MAP[i]>. <option>"
    line per ';'-separated entry of 'references', followed by the stripped
    'output_prefix', a space and the 'correct' value. Every other row is
    rendered as prefix + task, a newline, then output_prefix + references.
    The result is stripped of surrounding whitespace in both cases.
    """
    if x['method'] != 'multiple_choice_joint':
        free_form = f"{x['prefix']}{x['task']}\n{x['output_prefix']}{x['references']}"
        return free_form.strip()

    option_lines = []
    for position, option in enumerate(x['references'].split(';')):
        option_lines.append(f"{MAP[position]}. {option.strip()}")
    listed_options = '\n'.join(option_lines)
    answer_cue = x['output_prefix'].strip()
    solved = f"{x['prefix']}{x['task']}\n{listed_options}\n{answer_cue} {fmt_output_mc(x)}"
    return solved.strip()

def n_shot_examples(df, random_state: int, n_examples: int = 2, n_shot_postfix = "\n\n"):
    """Turn a frame of single questions into N-shot prompt / answer pairs.

    The frame is shuffled, every row is rendered with ``fmt_q``, and the rows
    are cut into ``len(df) // (n_examples + 1)`` consecutive chunks with
    ``np.array_split``. Each chunk becomes one prompt: the 'instruction' of the
    first shuffled row, a newline, then the chunk's rendered rows joined with
    ``n_shot_postfix``. The answer of the chunk's last row is cut off the end
    of that text and returned separately as the target.

    Args:
        df: Rows to convert. Needs the columns read by ``fmt_q`` plus
            'instruction', 'method', 'correct' and 'references'.
        random_state: Seed handed to ``DataFrame.sample`` for the shuffle.
        n_examples: Number of solved examples placed before the open question.
            Chunks may hold more than this when the row count is not a multiple
            of ``n_examples + 1``, because ``np.array_split`` spreads the
            remainder over the chunks.
        n_shot_postfix: Separator placed between the rendered rows of a chunk.

    Returns:
        DataFrame with the columns 'question' (prompt without the final
        answer), 'answer' (the final answer) and 'method' (method of the
        chunk's last row).

    Raises:
        ValueError: If ``n_examples`` is negative or ``df`` has fewer than
            ``n_examples + 1`` rows.
    """
    if n_examples < 0:
        raise ValueError("n_examples must be >= 0")
    n_examples_req_per_sample = n_examples + 1
    d_n = df.shape[0]
    ns_s = d_n // n_examples_req_per_sample
    # The previous check (`d_n - ns_s < 0`) could never be true; too little
    # data shows up as zero chunks, which np.array_split cannot handle.
    if ns_s < 1:
        raise ValueError("Not enough samples")
    print(f"Total input samples: {d_n}")
    print(f"Examples required per N-shot sample: {n_examples_req_per_sample}")
    print(f"N-shot samples: {ns_s}")
    dshuf = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    dshuf["instruction_fmt"] = dshuf.apply(fmt_q, axis=1)
    # A single instruction (taken from the first shuffled row) heads every prompt.
    instruction = dshuf["instruction"].iloc[0]
    samples_ns = np.array_split(dshuf, ns_s)
    n_shot_questions = []
    n_shot_answers = []
    n_shot_methods = []
    for s in samples_ns:
        if s.shape[0] < n_examples_req_per_sample:
            continue
        # The last row of the chunk is the question the model has to answer.
        sl = s.iloc[-1]
        len_answer = len(sl['correct']) if sl['method'] == 'multiple_choice_joint' else len(sl['references'])
        comb = (instruction + "\n" + f"{n_shot_postfix}".join(s["instruction_fmt"])).strip()
        # Split at an explicit position: `comb[-len_answer:]` would return the
        # whole prompt as the "answer" when len_answer is 0.
        cut = len(comb) - len_answer
        n_shot_questions.append(comb[:cut].strip())
        n_shot_answers.append(comb[cut:].strip())
        # Record the method per prompt instead of reading the loop variable
        # after the loop (last chunk only, and undefined if nothing was kept).
        n_shot_methods.append(sl['method'])
    dfr = pd.DataFrame(
        {
            "question": n_shot_questions,
            "answer": n_shot_answers,
            "method": n_shot_methods,
        }
    )
    return dfr


def fmt_mc(x):
    """Shuffle the four answer options of a row and locate the correct one.

    Reads 'correct_answer' and 'distractor1'..'distractor3' from ``x``,
    shuffles them in place with NumPy's global RNG, and returns a tuple of
    (MAP entry for the position of the correct answer, options joined by ';').
    If several options equal the correct answer, the last matching position
    is reported.
    """
    option_columns = ["correct_answer", "distractor1", "distractor2", "distractor3"]
    answers = x.loc[option_columns].values.tolist()
    # Remember the right answer before the order is randomised.
    correct, _d1, _d2, _d3 = answers

    np.random.shuffle(answers)

    for position, candidate in enumerate(answers):
        if candidate == correct:
            correct_ident = MAP[position]

    return correct_ident, ";".join(answers)

In [None]:
import sh

# Download URL of the SciQ zip archive.
link = 'https://ai2-public-datasets.s3.amazonaws.com/sciq/SciQ.zip'

In [None]:
# Fetch the archive with the system `curl` and store it as SciQ.zip in the working directory.
# NOTE(review): this runs on every execution, and curl is called without --fail, so an HTTP
# error body would presumably be saved as SciQ.zip — confirm, and consider guarding the download.
sh.curl(link, '-o', 'SciQ.zip')

In [None]:
# Extract the archive into the working directory. `-o` overwrites existing
# files without asking, so re-running the notebook does not stop at unzip's
# overwrite prompt once the files are already there.
sh.unzip('-o', 'SciQ.zip')

In [None]:
import pathlib as plb

# Locate the folder created by the unzip step. Sorting makes the choice
# deterministic if several entries match, and a missing folder is reported
# explicitly instead of as a bare IndexError.
sciq_dirs = sorted(plb.Path('.').glob('SciQ dataset*'))
if not sciq_dirs:
    raise FileNotFoundError(
        "No 'SciQ dataset*' folder in the working directory; download and unzip SciQ.zip first."
    )
unz_data = sciq_dirs[0].resolve()

In [None]:
import pandas as pd

# Read the three SciQ JSON files and tag each row with the file it came from.
splits = []

for split in ("train.json", "valid.json", "test.json"):
    df = pd.read_json(unz_data / split, lines=False).assign(split=split)
    splits.append(df)

In [None]:
# Stack the three files into one frame; the index is not reset, so row labels repeat across files.
df = pd.concat(splits, axis=0)
df

In [None]:
# Seed NumPy's global RNG so the option shuffling inside fmt_mc is repeatable.
np.random.seed(78463)

# One (letter of the correct option, ';'-joined shuffled options) tuple per question.
mcq = df.apply(fmt_mc, axis=1)

In [None]:
# Split the (letter, options) pairs produced by fmt_mc into two parallel lists.
correct = [letter for letter, _ in mcq]
ref = [options for _, options in mcq]

In [None]:
# Prompt scaffolding that fmt_q reads from every SciQ row.
prefix = "Question: "
output_prefix = "Answer:"
# Selects the multiple-choice branch of fmt_q and n_shot_examples.
method = "multiple_choice_joint"

In [None]:
# Re-shape SciQ into the columns read by fmt_q / n_shot_examples.
# Scalar values are broadcast to every row.
df = pd.DataFrame({
    'task': df['question'].tolist(),
    'instruction': '',  # empty: n_shot_examples prepends this text to every prompt
    'prefix': prefix,
    'output_prefix': output_prefix,
    'method': method,
    'references': ref,  # ';'-joined shuffled options from fmt_mc
    'correct': correct  # letter of the correct option from fmt_mc
})

In [None]:
# Shuffle the rows reproducibly and renumber them 0..n-1, which the label-based slices below rely on.
df = df.sample(frac=1, random_state=21853).reset_index(drop=True)

In [None]:
# Rows labelled 0..5000 (label-based .loc slicing includes the end label) become single-question prompts.
# NOTE(review): n_examples=0 produces prompts without any solved example, i.e. zero-shot,
# despite the `df_one_shot` name — confirm which was intended.
df_one_shot = n_shot_examples(df.loc[:5000], random_state=24122, n_examples=0)

In [None]:
# The remaining rows (label 5001 onward) are grouped into prompts with five solved examples
# ahead of the open question.
df_five_shot = n_shot_examples(df.loc[5001:], random_state=98864, n_examples=5)

In [None]:
# Stack both prompt sets (row labels are kept as-is, so they repeat) and tag the source dataset.
df = pd.concat([df_one_shot, df_five_shot], axis=0)

df["dataset"] = "SciQ"

In [None]:
# Bring the SciQ prompts into the shared instruction/input/output layout:
# rename the prompt columns, add the fixed multiple-choice directive as
# 'input', and drop the helper 'method' column.
df = (
    df
    .rename(columns={"question": "instruction", "answer": "output"})
    .assign(input="Choose the best option out of the choices given, and return the letter corresponding to the option you choose.")
    .drop(columns=["method"])
)

In [None]:
# Keep the finished SciQ frame under a dataset-specific name for the final concat.
df_sciq = df.copy()

### Kaggle challenge

Data downloaded from the Kaggle competition page: https://www.kaggle.com/competitions/kaggle-llm-science-exam/data?select=test.csv

In [None]:
# NOTE(review): absolute path on the author's machine — the notebook only runs elsewhere after
# editing it. The export cell at the end of this notebook uses a relative '../../data/...' path;
# consider doing the same here.
train = '/Users/user/Documents/CodeProjects/neurips-llm-efficiency-challenge-2023/neurips-llm-efficiency-challenge/data/kaggle/train.csv'

In [None]:
# Load the Kaggle training questions; fmt below reads the columns 'prompt', 'A'..'E' and 'answer'.
df_train = pd.read_csv(train)

In [None]:
# Inspect the raw Kaggle frame.
df_train

In [None]:
def fmt(x):
    """Render one Kaggle row as a multiple-choice prompt record.

    Args:
        x: Mapping or Series with the question under 'prompt', the five option
            texts under 'A'..'E', and the correct letter under 'answer'.

    Returns:
        dict with 'instruction' (question, one "<letter>. <text>" line per
        option, and a closing "Answer:" line), an empty 'input', and 'output'
        set to the correct letter.
    """
    # Build the option lines in a loop: the hand-written list lacked a comma
    # after the D entry, so Python concatenated the D and E literals and both
    # options ended up on one line ("D. ...E. ...").
    answers = [f"{letter}. {x[letter]}" for letter in "ABCDE"]
    answers_str = "\n".join(answers)
    p = "\n".join([f"Question: {x['prompt']}", answers_str, "Answer:"]).strip()
    return {
        'instruction': p,
        'input': "",
        'output': x["answer"]
    }

In [None]:
# fmt returns one dict per row; expanding the list of dicts yields instruction/input/output columns.
df_kag = pd.DataFrame(df_train.apply(fmt, axis=1).tolist())

In [None]:
# Tag the source and replace the empty 'input' written by fmt with the shared multiple-choice directive.
df_kag["dataset"] = "kaggle"
df_kag["input"] = "Choose the best option out of the choices given, and return the letter corresponding to the option you choose."

### Stereoset

In [None]:
import datasets
import numpy as np

# Load both StereoSet configurations used below.
intra = datasets.load_dataset("stereoset", 'intrasentence')
inter = datasets.load_dataset("stereoset", 'intersentence')

In [None]:
# Only the 'validation' split of each configuration is used; convert both to pandas.
df_intra = intra['validation'].to_pandas()
df_inter = inter['validation'].to_pandas()

In [None]:
# Spot check on the first intersentence record: find the position of the sentence
# whose gold_label is 0, then show that sentence.
p = np.where(df_inter.iloc[0].sentences['gold_label'] == 0)[0][0]

df_inter.iloc[0].sentences['sentence'][p]

In [None]:
import numpy as np

# Meaning of the gold_label ids; get_label is called below with 0 and 1.
# NOTE(review): LABEL_MAP itself is not referenced in the code shown in this notebook.
LABEL_MAP = {0: 'anti-stereotype', 1: 'stereotype', 2: 'unrelated'}

# Building blocks of the rewrite prompt, read by fmt_prompt at call time.
instruction = "Rewrite the passage to remove the stereotype."
instruction_postfix = "\n\n"
passage_prefix = "Passage: "
passage_postfix = "\n\n"
rewritten_prefix = "Answer: "

def get_label(x, label_idx: int):
    """Return the first entry of ``x['sentence']`` whose gold label equals ``label_idx``.

    ``x`` provides 'gold_label' and 'sentence' sequences of equal length; an
    IndexError is raised when no label matches.
    """
    label_matches = x['gold_label'] == label_idx
    matching_positions = np.where(label_matches)[0]
    first_match = matching_positions[0]
    return x['sentence'][first_match]
    

# Pull the sentence labelled 0 (anti-stereotype) and the one labelled 1 (stereotype) out of each record.
df_inter['anti_stereotype'] = df_inter['sentences'].apply(get_label, label_idx=0)
df_inter['stereotype'] = df_inter['sentences'].apply(get_label, label_idx=1)

In [None]:
# Prompt passage = context + stereotype sentence; target output = context + anti-stereotype sentence.
# Trailing full stops are stripped from the context first so exactly one ". " joins the two parts.
df_inter['passage'] = df_inter['context'].str.rstrip(".") + ". " + df_inter['stereotype']
df_inter['output'] = df_inter['context'].str.rstrip(".") + ". " + df_inter['anti_stereotype']

# Spot check of the first target.
df_inter.iloc[0].output

In [None]:
def fmt_prompt(x):
    """Build the rewrite prompt for one row from its 'passage'.

    Concatenates the module-level ``instruction``, ``instruction_postfix``,
    ``passage_prefix``, the row's passage, ``passage_postfix`` and
    ``rewritten_prefix`` (all looked up at call time) and strips surrounding
    whitespace from the result.
    """
    prompt = f"{instruction}{instruction_postfix}{passage_prefix}{x['passage']}{passage_postfix}{rewritten_prefix}"
    return prompt.strip()

# Render the rewrite prompt for every intersentence row.
df_inter['instruction'] = df_inter.apply(fmt_prompt, axis=1)

In [None]:
# print (rather than a bare expression) so the prompt's line breaks are shown as laid out.
print(df_inter.iloc[0].instruction)

In [None]:
# Add the empty 'input' and the source tag, then keep only the four shared columns.
df_inter = (
    df_inter
    .assign(input='', dataset='stereoset_intersentence')
    [['instruction', 'input', 'output', 'dataset']]
)

In [None]:
# NOTE: this rebinds the module-level `instruction` that the intersentence fmt_prompt above reads
# at call time; re-running that earlier cell after this one would pick up this text instead.
instruction = 'Fill in the blank with the appropriate answer.'

def get_label(x, label_idx: int):
    """Return the first entry of ``x['sentence']`` whose gold label equals ``label_idx``.

    Same logic as the ``get_label`` defined in the intersentence section;
    redefined here so this cell stands on its own. Raises IndexError when no
    label matches.
    """
    label_matches = x['gold_label'] == label_idx
    matching_positions = np.where(label_matches)[0]
    first_match = matching_positions[0]
    return x['sentence'][first_match]

# Pull the sentence labelled 0 (anti-stereotype) and the one labelled 1 (stereotype) out of each record.
df_intra['anti_stereotype'] = df_intra['sentences'].apply(get_label, label_idx=0)
df_intra['stereotype'] = df_intra['sentences'].apply(get_label, label_idx=1)

In [None]:
# Seed NumPy's global RNG so the A/B ordering chosen inside fmt below is repeatable.
np.random.seed(7316463)

def fmt(x):
    """Render one intrasentence row as a two-option prompt that ends with the answer letter.

    The stereotype and anti-stereotype sentences are shuffled with NumPy's
    global RNG and listed as options A and B below the row's 'context'. The
    returned text starts with the module-level ``instruction`` (looked up at
    call time) and ends with "Answer: " followed by the letter of the
    anti-stereotype option.

    Note: this reuses the name ``fmt`` from the Kaggle section and replaces
    that function once this cell has run.
    """
    anti = x["anti_stereotype"]
    options = [x["stereotype"], anti]
    np.random.shuffle(options)
    option_block = f"A. {options[0]}\nB. {options[1]}"
    # Position of the anti-stereotype sentence after shuffling (first match).
    anti_position = np.where(np.array(options) == anti)[0][0]
    if anti_position == 0:
        answer_letter = 'A'
    else:
        answer_letter = 'B'
    header = f"{instruction}\n\nPassage: {x['context']}"
    return f"{header}\n{option_block}\n\nAnswer: {answer_letter}"

# fmt returns the prompt with the answer letter as its very last character;
# split that letter off as the target and keep the rest as the prompt.
# NOTE(review): the prompt keeps the space after "Answer:", whereas the other
# datasets in this notebook end their prompts with a stripped "Answer:" —
# confirm whether the trailing space is intended.
labelled_prompts = df_intra.apply(fmt, axis=1)
df_intra['instruction'] = labelled_prompts.str[:-1]
df_intra['input'] = "Choose the best option out of the choices given, and return the letter corresponding to the option you choose."
df_intra['output'] = labelled_prompts.str[-1]

In [None]:
# Keep only the shared prompt columns; 'dataset' is added in a later cell.
df_intra = df_intra.loc[:, ['instruction', 'input', 'output']]

In [None]:
# Inspect the intrasentence prompts.
df_intra

In [None]:
# Tag the source so the rows stay identifiable after the final concat.
df_intra["dataset"] = "stereoset_intrasentence"

### Crows-pairs

In [None]:
import pandas as pd

# Read the CrowS-Pairs prompts and sentence pairs straight from the project's GitHub repository.
prompt = pd.read_csv('https://raw.githubusercontent.com/nyu-mll/crows-pairs/master/data/prompts.csv').loc[:, ['prompt', 'source']]
resp = pd.read_csv('https://raw.githubusercontent.com/nyu-mll/crows-pairs/master/data/crows_pairs_anonymized.csv')
# Column assignment aligns on the index, so this presumes row i of prompts.csv belongs to
# row i of the pairs file — TODO confirm.
resp["prompt"] = prompt["prompt"]
# Drop the pairs whose bias_type is "religion".
resp = resp.loc[lambda df: df['bias_type'] != "religion"]
resp

In [None]:
resp = resp.loc[:, ["prompt", "sent_less", "sent_more"]]
# The passage to rewrite is prompt + sent_more, the target is prompt + sent_less.
# NOTE(review): presumably sent_more is the more stereotypical sentence of each pair — confirm
# against the CrowS-Pairs documentation.
resp["passage"] = resp["prompt"] + " " + resp["sent_more"]
resp["response"] = resp["prompt"] + " " + resp["sent_less"]

def fmt_prompt(x):
    """Build the rewrite prompt for one row from its 'passage'.

    The prompt is the fixed instruction, a blank line, "Passage: <passage>",
    a blank line and the answer cue, with surrounding whitespace stripped (so
    it ends with "Answer:"). This redefines the ``fmt_prompt`` of the
    StereoSet section with the texts hard-coded.
    """
    sections = (
        "Rewrite the passage to remove the stereotype.",
        f"Passage: {x['passage']}",
        "Answer: ",
    )
    return "\n\n".join(sections).strip()

# Render the prompts, add the empty 'input', expose the target as 'output',
# tag the source and keep only the four shared columns.
resp = (
    resp
    .assign(instruction=resp.apply(fmt_prompt, axis=1), input="")
    .rename(columns={"response": "output"})
    .assign(dataset="crows_pairs")
    .loc[:, ["instruction", "input", "output", "dataset"]]
)

resp

In [None]:
# Stack all six prepared frames (columns are matched by name) and renumber the rows.
df_other = pd.concat([df_medquad, df_sciq, df_kag, df_inter, df_intra, resp], axis=0).reset_index(drop=True)

In [None]:
# Inspect the combined frame.
df_other

In [None]:
# Write the combined data as JSON Lines (one record object per line), relative to the notebook's location.
df_other.to_json('../../data/processed/otherdata.jsonl', lines=True, orient='records')