In [2]:
import json
import pandas as pd
import pickle

In [3]:
def isascii(s):
    """Return True when every character of ``s`` is ASCII (U+0000-U+007F).

    An empty string counts as ASCII. Code points are compared directly rather
    than encoding the string, so a string holding a lone surrogate (which
    ``json`` can produce from an escape such as ``"\\ud800"``) is reported as
    non-ASCII instead of raising ``UnicodeEncodeError``.
    """
    return all(ord(ch) < 0x80 for ch in s)

In [10]:
# Build the TriviaQA dev table: one row per question, keeping only the answer
# aliases for which isascii() returns True.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default text encoding.
with open("raw/triviaqa-unfiltered/unfiltered-web-dev.json", "r", encoding="utf-8") as fp:
    raw = json.load(fp)
list_preprocessed = []
for example in raw["Data"]:
    # A question whose aliases are all filtered out keeps an empty reference list.
    references = [alias for alias in example["Answer"]["Aliases"] if isascii(alias)]
    list_preprocessed.append({
        "qid": f"triviaqa-{example['QuestionId']}",
        "question": example["Question"],
        "reference_answers": references,
    })
df = pd.DataFrame(list_preprocessed)
df.to_csv("preprocessed/triviaqa-dev.csv", index=False)

In [9]:
# Build the SciQ validation table; qids are the example's position in the file.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default text encoding.
with open("raw/SciQ-dataset/valid.json", "r", encoding="utf-8") as fp:
    raw = json.load(fp)
list_preprocessed = []
for i, example in enumerate(raw):
    list_preprocessed.append({
        "qid": f"sciq-valid-{i}",
        "question": example["question"],
        # The value of "correct_answer" is stored as-is (not wrapped in a list).
        "reference_answers": example["correct_answer"],
    })
df = pd.DataFrame(list_preprocessed)
df.to_csv("preprocessed/sciq-valid.csv", index=False)

In [18]:
# Build the TruthfulQA table; qids use the row label from the raw CSV.
raw = pd.read_csv("raw/TruthfulQA.csv")
list_preprocessed = []
for i, example in raw.iterrows():
    # A bare split(";") keeps the whitespace around each separator
    # (e.g. "a; b" -> ["a", " b"]), so strip every piece and drop empty ones.
    answers = [piece.strip() for piece in example["Correct Answers"].split(";")]
    list_preprocessed.append({
        # NOTE(review): "truthfalqa" looks like a misspelling of "truthfulqa";
        # kept unchanged because existing outputs may be keyed on these ids.
        "qid": f"truthfalqa-{i}",
        "question": example["Question"],
        "reference_answers": [answer for answer in answers if answer],
    })
df = pd.DataFrame(list_preprocessed)
df.to_csv("preprocessed/truthfulqa.csv", index=False)

In [4]:
def _escape_braces(text):
    """Double every curly brace in ``text`` (``{`` -> ``{{`` and ``}`` -> ``}}``)."""
    # NOTE(review): presumably so literal braces survive a later str.format()
    # call on these fields -- confirm against the consumer of this CSV.
    return text.replace("{", "{{").replace("}", "}}")


# Build the MATH test table from a JSON Lines file (one JSON object per line).
results = []
# Decode explicitly as UTF-8 instead of relying on the platform default.
with open("raw/MATH/test-prm800k-500.jsonl", "r", encoding="utf-8") as fp:
    for json_str in fp:
        # Skip blank lines, which json.loads() would reject.
        if json_str.strip():
            results.append(json.loads(json_str))
list_preprocessed = []
for example in results:
    list_preprocessed.append({
        "qid": f"MATH-{example['unique_id']}",
        "question": _escape_braces(example["problem"]),
        "reference_answers": _escape_braces(example["answer"]),
        "reference_process": _escape_braces(example["solution"]),
        "subject": example["subject"],
    })
df = pd.DataFrame(list_preprocessed)
df.to_csv("preprocessed/math-test-prm800k.csv", index=False)

In [14]:
# Pair Chinese WikiLingua sections with their English counterparts and write
# up to `count` rows asking for an English summary of the Chinese document.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# run this on files from a trusted source.
with open('raw/WikiLingua/english.pkl', 'rb') as en_file:
    english_docs = pickle.load(en_file)  # 57945 urls
with open('raw/WikiLingua/chinese.pkl', 'rb') as zh_file:
    chinese_docs = pickle.load(zh_file)  # 6541 urls

# 1000 pairs of (eng_doc, eng_summary, chn_doc, chn_summary), try chn_doc -> eng_summary for now.
# `count` is the number of rows still to collect and doubles as the qid
# suffix, so qids run downwards from 1000.
count = 1000
list_preprocessed = []
for zh_sections in chinese_docs.values():
    for section in zh_sections.values():
        zh_text = section["document"]
        zh_summary = section["summary"]
        en_section_name = section.get("english_section_name", None)
        if not en_section_name:
            # No English section name recorded for this section: skip it.
            continue
        en_url = section["english_url"]
        en_section = english_docs[en_url][en_section_name]
        en_text = en_section["document"]
        en_summary = en_section["summary"]
        row = {
            "qid": f"WikiLingua-{count}",
            "question": f"Please summarize the following sentences using English:\n{zh_text}",
            "reference_answers": en_summary,
            "chn_summary": zh_summary,
            "eng_doc": en_text,
            "source_url": en_url,
        }
        list_preprocessed.append(row)
        count -= 1
        if count == 0:
            break
    else:
        # Inner loop finished without hitting the quota: move to the next url.
        continue
    # Inner loop was broken out of because the quota is met: stop entirely.
    break
df = pd.DataFrame(list_preprocessed)
df.to_csv("preprocessed/WikiLingua-1000-chn-eng.csv", index=False)



In [13]:
# Peek at the first (url, sections) entry without copying every item of the
# dict into a temporary list.
next(iter(chinese_docs.items()))

('https://zh.wikihow.com/%E8%AE%A1%E7%AE%97%E6%8A%95%E8%B5%84%E7%BB%84%E5%90%88%E7%9A%84%E5%B9%B4%E5%8C%96%E6%94%B6%E7%9B%8A%E7%8E%87',
 {'计算年化收益率': {'summary': '计算年化收益率。 计算半年收益率。 计算年化当量。',
   'document': '算出总收益率后（如上），将结果代入这个方程：年化收益率=(1+ 收益率)1/N-1。这个方程的结果就是整个投资期内每年的收益率。  这里的指数（即括号外的小数字）中，“1”代表计算的单位，也就是1年。如果你想更精确，可以用“365”计算每日收益。 “N”代表计算的期限。因此，如果计算的是7年里的收益情况，“N”应为7。 例如，在7年的时间里，你的投资组合价值从1000元增加到2500元。 首先计算总收益率：（2500-1000）/1000 = 1.50（收益率为150%）。 然后计算年化收益率：（1 + 1.50)1/7-1 = 0.1399 = 13.99%。就是这样！ 就用平常的运算顺序：先做括号内的运算，然后计算指数，最后做减法。 现在假设你想知道这7年中每半年的收益是多少（每年有两个半年收益，每隔六个月一个）。公式不变，只需调整计算的期数即可。最终的结果就是半年收益率。   本例中7年里每年有两个周期，因此有14个半年周期。 首先计算总收益率：（2500-1000)/1000 = 1.50（收益率为150%）。 然后计算年化收益率：（1+1.50）1/14-1 = 6.76%。 只要简单地将这个结果乘以2就能转化成年化收益率：6.76% x 2 = 13.52%。 你还可以计算更短期限内收益率的年化当量。例如，你只有6个月的收益，想知道年化当量。公式仍然不变。  假设在6个月的时间里，投资组合的价值从1000元增加到1050元。 先计算总收益率：（1050-1000）/1,000=0.05（6个月内的收益率为5%）。 如果想知道年化当量是多少（假设收益率和复合收益率不变），只需按如下计算：（1+0.05）1/0.50-1=10.25%。 只要按照以上公式计算，无论期限长短都能将其收益转换成年化收益率。',
   'english_section_name

In [3]:
# Build the GSM8K test table from a JSON Lines file (one JSON object per line).
results = []
# Decode explicitly as UTF-8 instead of relying on the platform default.
with open("raw/gsm8k/test.jsonl", "r", encoding="utf-8") as fp:
    for json_str in fp:
        # Skip blank lines, which json.loads() would reject.
        if json_str.strip():
            results.append(json.loads(json_str))
list_preprocessed = []
for i, example in enumerate(results):
    # The text before the "\n####" marker is stored as the worked process and
    # the text after it as the reference answer; split once and reuse the parts.
    answer_parts = example["answer"].split("\n####")
    list_preprocessed.append({
        "qid": f"gsm8k-{i}",
        "question": example["question"],
        "reference_answers": answer_parts[1].strip(),
        "reference_process": answer_parts[0].strip(),
    })
df = pd.DataFrame(list_preprocessed)
df.to_csv("preprocessed/gsm8k-test.csv", index=False)

In [5]:
# Build the TheoremQA test table; qids reuse each example's own "id" field.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default text encoding.
with open("raw/TheoremQA/theoremqa_test.json", "r", encoding="utf-8") as fp:
    raw = json.load(fp)
list_preprocessed = []
for example in raw:
    list_preprocessed.append({
        "qid": f"theoremqa-{example['id']}",
        "question": example["Question"],
        # Answers are stringified so the column holds a single type.
        "reference_answers": str(example["Answer"]),
        "subfield": example["subfield"],
    })
df = pd.DataFrame(list_preprocessed)
df.to_csv("preprocessed/theoremqa-test.csv", index=False)

In [4]:
# Build the AmbigQA table from examples that carry at least one
# "multipleQAs" annotation; all other examples are skipped.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default text encoding.
with open("raw/AmbigQA/dev_light.json", "r", encoding="utf-8") as fp:
    raw = json.load(fp)
list_preprocessed = []
for example in raw:
    reference_answers = []
    interpretations = []
    for annotation in example["annotations"]:
        if annotation["type"] == "multipleQAs":
            pairs = annotation["qaPairs"]
            # Each disambiguated question is kept alongside its answer, in order.
            interpretations.extend(pair["question"] for pair in pairs)
            reference_answers.extend(pair["answer"] for pair in pairs)
    if not interpretations or not reference_answers:
        continue
    list_preprocessed.append({
        "qid": f"ambigqa-{example['id']}",
        "question": example["question"],
        "reference_answers": reference_answers,
        "interpretations": interpretations,
    })
df = pd.DataFrame(list_preprocessed)
df.to_csv("preprocessed/ambigqa.csv", index=False)

In [10]:
# Build the GPQA-diamond table; the three incorrect answers are packed into a
# single ";"-joined column and qids use the row label from the raw CSV.
raw = pd.read_csv("raw/gpqa/gpqa_diamond.csv")
wrong_answer_columns = ["Incorrect Answer 1", "Incorrect Answer 2", "Incorrect Answer 3"]
list_preprocessed = [
    {
        "qid": f"gpqa-diamond-{row_label}",
        "question": row["Question"],
        "reference_answers": row["Correct Answer"],
        "incorrect_answers": ";".join(row[column] for column in wrong_answer_columns),
        "explanation": row["Explanation"],
    }
    for row_label, row in raw.iterrows()
]
df = pd.DataFrame(list_preprocessed)
df.to_csv("preprocessed/gpqa_diamond.csv", index=False)

In [5]:
# Build the date-understanding table; qids are the example's position in the file.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default text encoding.
with open("raw/DateUnd/date_understanding.json", "r", encoding="utf-8") as fp:
    examples = json.load(fp)["examples"]
list_preprocessed = []
for i, example in enumerate(examples):
    raw_question = example["input"]
    correct_choice = example["target"]
    # Reset for every example: without this, an example with no matching line
    # would silently reuse the previous example's answer (or raise NameError
    # on the very first example). Unmatched examples get None instead.
    correct_ans = None
    for line in raw_question.split("\n"):
        # Keep the full text of the line that begins with the target label;
        # if several lines match, the last one wins.
        if line.startswith(correct_choice):
            correct_ans = line

    list_preprocessed.append({
        "qid": f"dateUnd-{i}",
        "question": raw_question + "\nChoose one option (surrounded in round brackets)",
        # The target with its round brackets removed.
        "reference_answers": correct_choice.replace("(", "").replace(")", ""),
        "full_answers": correct_ans,
    })
df = pd.DataFrame(list_preprocessed)
df.to_csv("preprocessed/dateUnd.csv", index=False)

In [7]:
# Build the professional-law table: each question is followed by its four
# lettered options, and the correct-answer column is the reference.
column_names = ["Question", "A", "B", "C", "D", "Correct Answer"]
raw = pd.read_csv("raw/Prf-Law/professional_law_test.csv", names=column_names)
list_preprocessed = []
for row_label, row in raw.iterrows():
    option_block = "\n".join(f"{letter}.{row[letter]}" for letter in "ABCD")
    # NOTE(review): the separator before "Options" is a space here, whereas the
    # Biz-Ethics cell uses a newline -- confirm the difference is intended.
    prompt = row["Question"] + " Options (choose one):\n" + option_block
    list_preprocessed.append({
        "qid": f"prfLaw-{row_label}",
        "question": prompt,
        "reference_answers": row["Correct Answer"],
    })
df = pd.DataFrame(list_preprocessed)
df.to_csv("preprocessed/prfLaw.csv", index=False)

In [4]:
# Build the business-ethics table: each question is followed, on new lines, by
# its four lettered options; the correct-answer column is the reference.
column_names = ["Question", "A", "B", "C", "D", "Correct Answer"]
raw = pd.read_csv("raw/Biz-Ethics/business_ethics_test.csv", names=column_names)
list_preprocessed = []
for row_label, row in raw.iterrows():
    option_lines = [f"{letter}.{row[letter]}" for letter in "ABCD"]
    prompt_parts = [row["Question"] + "\nOptions (choose one):"] + option_lines
    list_preprocessed.append({
        "qid": f"Biz-Ethics-{row_label}",
        "question": "\n".join(prompt_parts),
        "reference_answers": row["Correct Answer"],
    })
df = pd.DataFrame(list_preprocessed)
df.to_csv("preprocessed/Biz-Ethics.csv", index=False)