In [None]:
import os
import re

def extract_Q_A_text(text):
    """Extract questions and final answers from a few-shot CoT prompt.

    The text is split into ``Q: ... A: ...`` pairs; a pair ends right before
    the next blank-line-separated ``Q:`` or at the end of the text. Pairs
    whose ``A:`` section does not contain "So the answer is" are skipped.

    Args:
        text: Full prompt text to scan.

    Returns:
        A tuple ``(questions, answers)`` of two equally long lists. Each
        question is the stripped ``Q:`` section (the "Q:" marker is kept).
        Each answer is the text following "So the answer is" and the single
        character after it; a trailing period, if any, is kept. Both lists
        are empty when nothing matches.
    """
    questions = []
    answers = []

    # First, match each Q: part with its corresponding A: part.
    q_a_pairs = re.findall(r'(Q:.*?)(A:.*?)(?=\n\nQ:|\Z)', text, re.DOTALL)

    for q, a in q_a_pairs:
        # Take everything from "So the answer is" to the end of the A: part.
        answer = re.search(r'So the answer is.*', a, re.DOTALL)
        if answer:
            questions.append(q.strip())
            # The slice drops the 16-character phrase plus the one character
            # that follows it (normally the separating space).
            answers.append(answer.group().strip()[len("So the answer is."):])

    # Return only after every pair was processed; returning from inside the
    # loop would yield just the first pair (or None when nothing matched).
    return questions, answers

# Build plain question/answer few-shot files from the CoT prompt files: for
# every file found under source_directory, one file with the same name is
# written to target_directory, holding "[Question]" / "[Answer]" pairs.
# NOTE: both directory literals use backslash separators (Windows-style).
source_directory = r"BIG-Bench-Hard-main\cot-prompts"
target_directory = r"BBH_preprocess\QA-pure_example_from_3_cot"

# exist_ok=True replaces the separate exists() check and its
# check-then-create race.
os.makedirs(target_directory, exist_ok=True)

for root, dirs, files in os.walk(source_directory):
    for file in files:
        file_path = os.path.join(root, file)
        with open(file_path, 'r', encoding='utf-8') as f:
            prompt = f.read()
        questions, answers = extract_Q_A_text(prompt)
        print(questions)

        target_file_path = os.path.join(target_directory, file)
        shots = []
        for q, a in zip(questions, answers):
            # Drop one trailing period. endswith() also copes with an empty
            # answer, where indexing a[-1] would raise IndexError.
            a_ = a[:-1] if a.endswith('.') else a
            shots.append(f'[Question]\n{q}\n\n[Answer]\n{a_}\n\n')
        # Join once instead of growing a string inside the loop, then trim
        # the trailing blank lines left by the last pair.
        QA_shot = ''.join(shots).strip()

        with open(target_file_path, 'w', encoding='utf-8') as f:
            f.write(QA_shot)


In [None]:
# create BBHI_QA-pure-example-from-3-cot
from until import load_rules, apply_rule

# Build the "<task>/<task>.3-example.txt" files: question/answer pairs taken
# from each CoT prompt file, with the "Q: " marker removed and, when a rule is
# registered for the task, the question rewritten by apply_rule().
# NOTE: both directory literals use backslash separators (Windows-style).
source_directory = r"BIG-Bench-Hard-main\cot-prompts"
# target_directory = r"BBH_preprocess\BBHI_QA-pure-example-from-3-cot"
target_directory = r"BBH_preprocess\BBHI"
rules_file = "BBH_preprocess/remove_prefix_suffix.txt"

rules = load_rules(rules_file)

# exist_ok=True replaces the separate exists() check and its
# check-then-create race.
os.makedirs(target_directory, exist_ok=True)

for root, dirs, files in os.walk(source_directory):
    for file in files:
        file_path = os.path.join(root, file)
        with open(file_path, 'r', encoding='utf-8') as f:
            prompt = f.read()
        questions, answers = extract_Q_A_text(prompt)

        # Rules are looked up under "<task>.json", while the prompt files are
        # named "<task>.txt".
        rule_task_name = file.replace('.txt', '.json')
        rule = rules.get(rule_task_name, None)

        shots = []
        for q, a in zip(questions, answers):
            # Drop one trailing period. endswith() also copes with an empty
            # answer, where indexing a[-1] would raise IndexError.
            a_ = a[:-1] if a.endswith('.') else a
            q_ = q[3:]  # remove "Q: "
            if rule:
                # presumably strips a task-specific prefix/suffix (going by
                # the rules file name) - confirm against apply_rule
                q_ = apply_rule(q_, rule)
            shots.append(f'[Question]\n{q_}\n\n[Answer]\n{a_}\n\n')
        # Join once instead of growing a string inside the loop, then trim
        # the trailing blank lines left by the last pair.
        QA_shot = ''.join(shots).strip()

        # Each task gets its own sub-directory named after the prompt file.
        base_file_name = os.path.splitext(file)[0]
        file_subdir = os.path.join(target_directory, base_file_name)
        os.makedirs(file_subdir, exist_ok=True)

        new_file_path = os.path.join(file_subdir, f'{base_file_name}.3-example.txt')
        print(new_file_path)
        with open(new_file_path, 'w', encoding='utf-8') as f:
            f.write(QA_shot)


In [None]:
# create BBH_preprocess\BBHI_human-cot-prompt
import os
import re

def extract_Q_cot_A_text(text):
    """Split a prompt into parallel lists of ``Q:`` and ``A:`` sections.

    A pair starts at a ``Q:`` marker, switches to the answer at the first
    following ``A:`` and ends right before the next blank-line-separated
    ``Q:`` or at the end of the text. Unlike a final-answer extraction, the
    whole ``A:`` section is kept.

    Args:
        text: Full prompt text to scan.

    Returns:
        A tuple ``(questions, answers)`` of two equally long lists of
        whitespace-stripped sections; the "Q:" / "A:" markers are kept.
    """
    pair_pattern = r'(Q:.*?)(A:.*?)(?=\n\nQ:|\Z)'
    pairs = re.findall(pair_pattern, text, re.DOTALL)

    # Unzip the (question, answer) tuples, trimming surrounding whitespace.
    questions = [question.strip() for question, _ in pairs]
    answers = [answer.strip() for _, answer in pairs]
    return questions, answers


# Build the "<task>/<task>.3-human-cot-example.txt" files: every Q:/A: pair of
# a prompt file is kept with its complete A: section, the "Q: " marker is cut
# off and, when a rule exists for the task, apply_rule() rewrites the question.
source_directory = r"BIG-Bench-Hard-main\cot-prompts"
# target_directory = r"BBH_preprocess\BBHI_human-cot-prompt"
target_directory = r"BBH_preprocess\BBHI"
rules_file = "BBH_preprocess/remove_prefix_suffix.txt"

rules = load_rules(rules_file)

if not os.path.exists(target_directory):
    os.makedirs(target_directory)

for root, dirs, files in os.walk(source_directory):
    for file in files:
        file_path = os.path.join(root, file)
        with open(file_path, 'r', encoding='utf-8') as f:
            prompt = f.read()
            questions, answers = extract_Q_cot_A_text(prompt)

        # Rules are looked up under "<task>.json" for a "<task>.txt" prompt.
        rule_task_name = file.replace('.txt', '.json')
        rule = rules.get(rule_task_name, None)

        shot_parts = []
        for question, answer in zip(questions, answers):
            # Cut a single trailing period off the answer, if present.
            trimmed_answer = answer[:-1] if answer[-1] == '.' else answer
            bare_question = question[3:]  # remove "Q: "
            if rule:
                bare_question = apply_rule(bare_question, rule)
            shot_parts.append(
                f'[Question]\n{bare_question}\n\n[Answer]\n{trimmed_answer}\n\n'
            )
        QA_shot = ''.join(shot_parts).strip()

        # One sub-directory per task, named after the prompt file.
        base_file_name, _extension = os.path.splitext(file)
        file_subdir = os.path.join(target_directory, base_file_name)
        if not os.path.exists(file_subdir):
            os.makedirs(file_subdir)

        new_file_path = os.path.join(file_subdir, f'{base_file_name}.3-human-cot-example.txt')
        print(new_file_path)
        with open(new_file_path, 'w', encoding='utf-8') as f:
            f.write(QA_shot)


In [None]:
# create human_task_instruction
import os
import re

# Directory that is walked for the input prompt files
# (backslash-separated, i.e. a Windows-style path literal).
source_directory = r"BIG-Bench-Hard-main\cot-prompts"
# Root directory under which one sub-directory per prompt file is created.
target_directory = r"BBH_preprocess\BBHI"



def extract_human_task_instruction(text):
    """Return the text between the first '-----' marker and the next 'Q:'.

    Args:
        text: Full content of a prompt file.

    Returns:
        The matched span with every '-----' removed and surrounding
        whitespace stripped, or an empty string when no '-----' followed by
        a 'Q:' is found.
    """
    found = re.search(r'-----.*?(?=Q:)', text, re.DOTALL)
    if not found:
        return ''

    # The match still starts with the separator itself; drop it and trim.
    without_separator = found.group().replace('-----', '')
    return without_separator.strip()


# For every prompt file, store its task instruction as
# "<task>/<task>.ori_instruction.txt" below target_directory.
for root, dirs, files in os.walk(source_directory):
    for file in files:
        file_path = os.path.join(root, file)
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
            # Extract Human Task Instruction
            human_task_instruction = extract_human_task_instruction(content)

        # One sub-directory per task, named after the prompt file.
        base_file_name, _extension = os.path.splitext(file)
        file_subdir = os.path.join(target_directory, base_file_name)
        if not os.path.exists(file_subdir):
            os.makedirs(file_subdir)

        new_file_path = os.path.join(
            file_subdir, f'{base_file_name}.ori_instruction.txt'
        )
        with open(new_file_path, 'w', encoding='utf-8') as f:
            f.write(human_task_instruction)