In [None]:
import os
import pathlib as plb

import pandas as pd

In [None]:
# Directory that holds the raw HELM ``*.jsonl`` exports.
# Set the HELM_DATA_DIR environment variable to point the notebook at another
# location; when it is unset the original hard-coded path is used, so existing
# runs behave exactly as before.
helm_dir = plb.Path(
    os.environ.get(
        "HELM_DATA_DIR",
        "/Users/user/Documents/CodeProjects/neurips-llm-efficiency-challenge-2023/neurips-llm-efficiency-challenge/data/raw/helm_data",
    )
)

In [None]:
# Collect every JSON-lines file found directly inside the HELM data directory.
# NOTE(review): glob() returns paths in whatever order the filesystem yields
# them, and later cells pick frames out of `dfl` by position -- confirm the
# order on the machine that runs this notebook.
f = list(helm_dir.glob("*.jsonl"))

In [None]:
# Load every HELM export, make sure the optional columns exist, and keep a
# reproducible random subsample of each file.
MAX_ROWS_DEFAULT = 20_000
MAX_ROWS_SUMMARIZATION = 3000  # smaller cap applied to the "summarization_cdnn" file
SAMPLE_SEED = 786949692

dfl = []

for fl in f:
    print(fl)
    df = pd.read_json(fl, lines=True)
    # Not every export carries these columns; add them empty so that every
    # frame in `dfl` exposes both.
    for optional_col in ('correct', 'subject'):
        if optional_col not in df.columns:
            df[optional_col] = None
    # The part of the file name before the first dot identifies the export.
    if fl.name.split(".")[0] == "summarization_cdnn":
        nmax = MAX_ROWS_SUMMARIZATION
    else:
        nmax = MAX_ROWS_DEFAULT
    # Never ask for more rows than the file has.
    n = min(df.shape[0], nmax)
    df = df.sample(n=n, random_state=SAMPLE_SEED)
    dfl.append(df)

In [None]:
# Display the third loaded frame (index 2 of `dfl`).
dfl[2]

In [None]:
import numpy as np

from cajajejo.commands.api.utils import MAP

def fmt_output_mc(x):
    """Return the ``MAP`` entry selected by the row's ``correct`` value.

    ``x`` is any mapping-like row (for example a pandas Series) that has a
    ``"correct"`` key.
    """
    correct_key = x["correct"]
    answer_label = MAP[correct_key]
    return answer_label

def fmt_q(x):
    """Format one row as a solved prompt (question followed by its answer).

    For ``multiple_choice_joint`` rows the ``;``-separated ``references`` are
    listed one per line, each labelled with ``MAP[position]``, and the prompt
    ends with the stripped output prefix, a space and the label returned by
    ``fmt_output_mc``. For every other method the prompt ends with the output
    prefix immediately followed by ``references``. The result is stripped of
    leading and trailing whitespace in both cases.
    """
    if x['method'] != 'multiple_choice_joint':
        # Free-text row: the reference text itself is the answer.
        free_text = f"{x['prefix']}{x['task']}\n{x['output_prefix']}{x['references']}"
        return free_text.strip()

    # Multiple-choice row: one labelled line per option.
    option_lines = []
    for position, option in enumerate(x['references'].split(';')):
        option_lines.append(f"{MAP[position]}. {option.strip()}")
    option_block = '\n'.join(option_lines)
    labelled = f"{x['prefix']}{x['task']}\n{option_block}\n{x['output_prefix'].strip()} {fmt_output_mc(x)}"
    return labelled.strip()

def _n_shot_answer_text(row):
    """Return the gold-answer text that ``fmt_q`` places at the end of ``row``'s prompt."""
    if row['method'] == 'multiple_choice_joint':
        return str(MAP[row['correct']])
    return str(row['references'])


def n_shot_examples(df, random_state: int, n_examples: int = 2, n_shot_postfix = "\n\n"):
    """Build N-shot prompts (question/answer pairs) from a frame of HELM rows.

    The rows are shuffled, formatted with ``fmt_q`` and grouped into
    consecutive chunks of exactly ``n_examples + 1`` rows. Inside a chunk the
    first ``n_examples`` rows appear fully solved; the gold answer of the last
    row is cut off the end of the prompt and returned separately. The
    ``instruction`` of the first shuffled row is used as the header of every
    prompt.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to draw from. Needs the columns read by ``fmt_q`` plus
        ``instruction``, ``method``, ``references`` and, for
        ``multiple_choice_joint`` rows, ``correct``.
    random_state : int
        Seed for the shuffle.
    n_examples : int, default 2
        Number of solved examples placed before the open question.
    n_shot_postfix : str, default "\\n\\n"
        Separator inserted between the formatted rows of one prompt.

    Returns
    -------
    pandas.DataFrame
        One row per prompt with the columns ``question``, ``answer`` and
        ``method`` (the method of the prompt's last row).
        ``len(df) // (n_examples + 1)`` prompts are produced; leftover rows
        that cannot fill a whole chunk are not used.

    Raises
    ------
    ValueError
        If ``df`` has fewer than ``n_examples + 1`` rows.
    """
    n_examples_req_per_sample = n_examples + 1
    d_n = df.shape[0]
    ns_s = d_n // n_examples_req_per_sample
    # The previous guard (`d_n - ns_s < 0`) could never fire; the real failure
    # mode is that not even one full chunk can be formed.
    if ns_s < 1:
        raise ValueError("Not enough samples")
    print(f"Total input samples: {d_n}")
    print(f"Examples required per N-shot sample: {n_examples_req_per_sample}")
    print(f"N-shot samples: {ns_s}")
    dshuf = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    dshuf["instruction_fmt"] = dshuf.apply(fmt_q, axis=1)
    instruction = dshuf["instruction"].iloc[0]
    separator = f"{n_shot_postfix}"
    records = []
    # Fixed-size positional chunks: np.array_split spreads the remainder over
    # the chunks, which made some prompts carry more than `n_examples` solved
    # examples.
    for start in range(0, ns_s * n_examples_req_per_sample, n_examples_req_per_sample):
        s = dshuf.iloc[start:start + n_examples_req_per_sample]
        sl = s.iloc[-1]
        comb = (instruction + "\n" + separator.join(s["instruction_fmt"])).strip()
        # `fmt_q` strips its result, so the prompt ends with the answer minus
        # any trailing whitespace; measure the answer the same way.
        answer_tail = _n_shot_answer_text(sl).rstrip()
        # Explicit cut index: a negative slice would return the whole prompt
        # when the answer is empty (`comb[-0:]`).
        cut = len(comb) - min(len(answer_tail), len(comb))
        records.append(
            {
                "question": comb[:cut].strip(),
                "answer": comb[cut:].strip(),
                "method": sl['method'],
            }
        )
    dfr = pd.DataFrame(records, columns=["question", "answer", "method"])
    return dfr

In [None]:
# Shuffle the fifth loaded frame (index 4 of `dfl`) with a fixed seed and put a
# newline in front of every row's output prefix.
cnn_dm = (
    dfl[4]
    .copy()
    .sample(frac=1, random_state=1232312)
    .reset_index(drop=True)
    .assign(output_prefix=lambda frame: "\n" + frame["output_prefix"])
)

In [None]:
# Row and column counts of the shuffled frame.
cnn_dm.shape

In [None]:
# Carve the shuffled frame into three disjoint row ranges, one per prompt style.
# `.loc` slices include BOTH endpoints, so each range has to start one label
# after the previous one ends; with shared boundaries rows 750 and 1500 were
# used in two splits each.
zero_shot = n_shot_examples(cnn_dm.loc[:750], random_state=8623, n_examples=0).assign(dataset="cnn_dm")
three_shot = n_shot_examples(cnn_dm.loc[751:1500], random_state=78362, n_examples=3).assign(dataset="cnn_dm")
five_shot = n_shot_examples(cnn_dm.loc[1501:], random_state=32166, n_examples=5).assign(dataset="cnn_dm")

In [None]:
# Stack the three prompt tables under a fresh 0..n-1 index.
# NOTE: this rebinds `cnn_dm` from the shuffled source rows to the prompt table.
cnn_dm = pd.concat([zero_shot, three_shot, five_shot], ignore_index=True)

### BBQ

In [None]:
# Shuffle the second loaded frame (index 1 of `dfl`) with a fixed seed.
bbq = (
    dfl[1]
    .copy()
    .sample(frac=1, random_state=1232312)
    .reset_index(drop=True)
)

In [None]:
# Three disjoint row ranges, one per prompt style. `.loc` slices include BOTH
# endpoints, so each range starts one label after the previous one ends; with
# shared boundaries rows 2000 and 6000 were used in two splits each.
# Rows after label 16000 are not used.
zero_shot = n_shot_examples(bbq.loc[:2000], random_state=8623, n_examples=0).assign(dataset="bbq")
three_shot = n_shot_examples(bbq.loc[2001:6000], random_state=78362, n_examples=3).assign(dataset="bbq")
five_shot = n_shot_examples(bbq.loc[6001:16000], random_state=32166, n_examples=5).assign(dataset="bbq")

In [None]:
# Stack the three prompt tables under a fresh 0..n-1 index.
# NOTE: this rebinds `bbq` from the shuffled source rows to the prompt table.
bbq = pd.concat([zero_shot, three_shot, five_shot], ignore_index=True)

In [None]:
# Display the combined BBQ prompt table.
bbq

### TruthfulQA

In [None]:
# Shuffle the third loaded frame (index 2 of `dfl`) with a fixed seed.
tqa = (
    dfl[2]
    .copy()
    .sample(frac=1, random_state=1232312)
    .reset_index(drop=True)
)

# Show the shuffled frame.
tqa

In [None]:
# The first 14 * 6 rows feed the five-shot prompts (n_examples=5 means 6 rows
# per prompt); every row after them becomes a zero-shot prompt.
n_five_shot_rows = 14 * 6
zero_shot = n_shot_examples(
    tqa.iloc[n_five_shot_rows:], random_state=8623, n_examples=0
).assign(dataset="truthful_qa")
five_shot = n_shot_examples(
    tqa.iloc[:n_five_shot_rows], random_state=32166, n_examples=5
).assign(dataset="truthful_qa")

In [None]:
# Print the first five-shot prompt in full.
print(five_shot["question"].iloc[0])

In [None]:
# Stack both prompt tables under a fresh 0..n-1 index.
# NOTE: this rebinds `tqa` from the shuffled source rows to the prompt table.
tqa = pd.concat([zero_shot, five_shot], ignore_index=True)

### GSM

In [None]:
# Shuffle the sixth loaded frame (index 5 of `dfl`) with a fixed seed.
gsm = (
    dfl[5]
    .copy()
    .sample(frac=1, random_state=1232312)
    .reset_index(drop=True)
)

# Show the shuffled frame.
gsm

In [None]:
# Three disjoint positional ranges: rows 0-1000, 1001-3500 and 3501 onwards.
zero_shot = n_shot_examples(
    gsm.iloc[:1001], random_state=8623, n_examples=0
).assign(dataset="gsm")
three_shot = n_shot_examples(
    gsm.iloc[1001:3501], random_state=32166, n_examples=3
).assign(dataset="gsm")
five_shot = n_shot_examples(
    gsm.iloc[3501:], random_state=89643, n_examples=5
).assign(dataset="gsm")

In [None]:
# Print the first five-shot prompt in full.
print(five_shot["question"].iloc[0])

In [None]:
# Stack the three prompt tables under a fresh 0..n-1 index.
# NOTE: this rebinds `gsm` from the shuffled source rows to the prompt table.
gsm = pd.concat([zero_shot, three_shot, five_shot], ignore_index=True)

### MMLU

In [None]:
# Work on a re-indexed copy of the fourth loaded frame (index 3 of `dfl`).
mmlu = dfl[3].copy().reset_index(drop=True)

# Distinct values of the `subject` column, in order of first appearance.
subjects = mmlu["subject"].unique().tolist()

In [None]:
# Build one prompt table per subject and collect them in `all_mmlu`.
all_mmlu = []

for subject in subjects:
    subject_mask = mmlu["subject"] == subject
    subject_rows = (
        mmlu[subject_mask]
        .sample(frac=1, random_state=1232312)
        .reset_index(drop=True)
    )
    # The first three shuffled rows form the single two-shot prompt; every
    # remaining row becomes a zero-shot prompt.
    zero_shot = n_shot_examples(
        subject_rows.iloc[3:], random_state=8623, n_examples=0
    ).assign(dataset="mmlu")
    two_shot = n_shot_examples(
        subject_rows.iloc[:3], random_state=32166, n_examples=2
    ).assign(dataset="mmlu")
    all_mmlu.append(pd.concat([zero_shot, two_shot], ignore_index=True))

In [None]:
# Stack the per-subject tables under a fresh 0..n-1 index.
# NOTE: this rebinds `all_mmlu` from a list of frames to a single frame.
all_mmlu = pd.concat(all_mmlu, ignore_index=True)

In [None]:
# Print the first MMLU prompt in full.
print(all_mmlu["question"].iloc[0])

### BigBench

In [None]:
# Work on a re-indexed copy of the first loaded frame (index 0 of `dfl`).
bigb = dfl[0].copy().reset_index(drop=True)

# Distinct values of the `task_name` column, in order of first appearance.
task_names = bigb["task_name"].unique().tolist()

In [None]:
def _big_bench_samples(frame, frac_zero_shot):
    """Build the prompt table for the rows of one BigBench task or subtask.

    Frames with fewer than 30 rows become zero-shot prompts only. Larger
    frames are split by label: rows up to and including
    ``int(len(frame) * frac_zero_shot)`` become zero-shot prompts and all
    remaining rows are grouped into three-shot prompts. ``frame`` is expected
    to carry a fresh 0..n-1 index.
    """
    n_rows = frame.shape[0]
    if n_rows < 30:
        return n_shot_examples(frame, random_state=8623, n_examples=0).assign(dataset="big_bench")
    last_zero_shot_label = int(n_rows * frac_zero_shot)
    # `.loc` slices include both endpoints, so the n-shot part starts one label
    # after the zero-shot part ends.
    zero_shot = n_shot_examples(
        frame.loc[:last_zero_shot_label], random_state=43423, n_examples=0
    ).assign(dataset="big_bench")
    n_shot = n_shot_examples(
        frame.loc[last_zero_shot_label + 1:], random_state=32166, n_examples=3
    ).assign(dataset="big_bench")
    return pd.concat([zero_shot, n_shot]).reset_index(drop=True)


samples_bb = []

frac_zs = .3
# Share of rows left for the n-shot prompts. The n-shot slice used to START at
# this fraction, which silently dropped the rows between 30% and 70%.
# NOTE(review): assumes the n-shot part was meant to cover every row after the
# zero-shot part -- confirm.
frac_ns = 1 - frac_zs

for taskn in task_names:
    bigbs = bigb.loc[bigb.task_name == taskn].reset_index(drop=True)
    subtasks = bigbs.subtask_name.unique().tolist()
    if len(subtasks) > 1:
        # Handle each subtask on its own rows.
        frames = [
            bigbs.loc[bigbs.subtask_name == subtask].reset_index(drop=True)
            for subtask in subtasks
        ]
    else:
        frames = [bigbs]
    # Append inside the loop: previously only the table of the LAST subtask of
    # a multi-subtask task reached `samples_bb`.
    for frame in frames:
        samples_bb.append(_big_bench_samples(frame, frac_zs))

In [None]:
# Stack all BigBench prompt tables under a fresh 0..n-1 index.
# NOTE: this rebinds `bigb` from the source rows to the prompt table.
bigb = pd.concat(samples_bb, ignore_index=True)

In [None]:
# Display the combined BigBench prompt table.
bigb

In [None]:
# Print the prompt stored under index label 5000 in full.
print(bigb.loc[5000, "question"])

In [None]:
# Combine the prompt tables of all six datasets under a fresh 0..n-1 index.
ddf = pd.concat([cnn_dm, bbq, tqa, gsm, all_mmlu, bigb], ignore_index=True)

In [None]:
# Every row whose method is not 'generation' gets a fixed hint in `input`;
# generation rows get an empty string.
mc_hint = "Choose the best option out of the choices given, and return the letter corresponding to the option you choose."
ddf['input'] = ddf['method'].map(lambda method: '' if method == 'generation' else mc_hint)

# Rename question -> instruction and answer -> output.
ddf = ddf.rename(columns={'question': 'instruction', 'answer': 'output'})

In [None]:
# Write the final table as JSON lines (one record per line).
output_path = "../../data/processed/helm_training_data.jsonl"
ddf.to_json(output_path, orient="records", lines=True)