# Subsetting & merging instruction finetuning datasets

| **Dataset** 	| **Source** 	| **Generation** 	|
|---	|---	|---	|
| openai/prm800K 	| https://github.com/openai/prm800k 	| H 	|
| databricks/databricks-dolly-15k 	| https://huggingface.co/datasets/databricks/databricks-dolly-15k 	| H 	|
| timdettmers/openassistant-guanaco 	| https://huggingface.co/datasets/timdettmers/openassistant-guanaco 	| H 	|
| metaeval/reclor 	| https://whyu.me/reclor/; https://openreview.net/pdf?id=HJgJtT4tvB 	| H 	|
| mandyyyyii/scibench 	| https://github.com/mandyyyyii/scibench; https://huggingface.co/datasets/xw27/scibench 	| H 	|
| metaeval/ScienceQA_text_only 	| https://huggingface.co/datasets/metaeval/ScienceQA_text_only 	| H 	|
| wenhu/TheoremQA 	| https://github.com/wenhuchen/TheoremQA 	| H 	|
| TigerResearch/tigerbot-kaggle-leetcodesolutions-en-2k 	| https://huggingface.co/datasets/TigerResearch/tigerbot-kaggle-leetcodesolutions-en-2k 	| H 	|
| hendrycks/MATH 	| https://github.com/hendrycks/math 	| H 	|
| duckai/arb 	| https://github.com/TheDuckAI/arb 	| H 	|

In [None]:
!pip install datasets

In [None]:
import pathlib as plb
import json
import random
import subprocess
import os

from getpass import getpass
import sh
import pandas as pd
from datasets import load_dataset

In [None]:
# Ask for a Hugging Face personal access token and pass it to
# `sh.huggingface_cli.login`. Any failure is printed instead of raised, and
# the token variable is blanked afterwards in every case.
try:
    pat = getpass('Huggingface PAT: ')
    sh.huggingface_cli.login('--token', pat)
except Exception as login_error:
    print(login_error)
finally:
    pat = ""

In [None]:
# Locations of the three pipeline stages, relative to the working directory.
RAW_PATH = "../../data/raw"
INTERMEDIATE_PATH = "../../data/intermediate"
PROCESSED_PATH = "../../data/processed"

raw_path, intermediate_path, processed_path = (
    plb.Path(location)
    for location in (RAW_PATH, INTERMEDIATE_PATH, PROCESSED_PATH)
)

# Make sure every stage directory (and its parents) exists before use.
for stage_dir in (raw_path, intermediate_path, processed_path):
    stage_dir.mkdir(parents=True, exist_ok=True)


### Openai/prm800K

NB: this dataset is merged with the MATH dataset below (see "Update MATH").

In [None]:
# Change these if you also want the problems where the final solution wasn't found
ONLY_SOLVED = True
ONLY_FOUND_ANSWER = True

# Script to convert the openai prm800K dataset into input output pairs

def download_files():
    """Download the four PRM800K JSONL files into ``raw_path`` if missing.

    Files that already exist are skipped. ``curl`` is tried first; ``wget``
    is used as a fallback when ``curl`` is not installed or exits with a
    non-zero status.

    Returns:
        list[str]: Local file paths, in the same order as the source URLs.

    Raises:
        RuntimeError: If neither downloader could fetch one of the files.
    """
    urls = ["https://github.com/openai/prm800k/raw/main/prm800k/data/phase1_test.jsonl",
            "https://github.com/openai/prm800k/raw/main/prm800k/data/phase1_train.jsonl",
            "https://github.com/openai/prm800k/raw/main/prm800k/data/phase2_test.jsonl",
            "https://github.com/openai/prm800k/raw/main/prm800k/data/phase2_train.jsonl"]

    raw_files_prm = raw_path / "openai_prm800k_raw"
    raw_files_prm.mkdir(parents=True, exist_ok=True)

    output_file_names = [str(raw_files_prm / url.split('/')[-1]) for url in urls]
    for url, output_file in zip(urls, output_file_names):
        if os.path.isfile(output_file):
            print("File", output_file, "already exists, skipping download")
            continue

        # --fail makes curl exit non-zero on HTTP errors instead of saving
        # the error page under the target file name.
        commands = (
            ["curl", "-L", "--fail", url, "-o", output_file],
            ["wget", url, "-O", output_file],
        )
        for command in commands:
            try:
                # check=True is required for a failed download to raise at all.
                subprocess.run(command, check=True)
                break
            except (OSError, subprocess.CalledProcessError) as err:
                print(command[0], "failed for", url, ":", err)
                # Remove any partial file so a later run does not mistake it
                # for a finished download.
                if os.path.isfile(output_file):
                    os.remove(output_file)
        else:
            raise RuntimeError(f"Could not download {url}")

    return output_file_names


def convert_format(data, ONLY_SOLVED):
    """Turn one PRM800K record into an input/output pair.

    For every step, one completion rated 1 is picked at random (preferring
    completions that contain the answer marker), followed by the human
    completion when it is rated 1. The chosen texts are joined with spaces.

    Args:
        data: Parsed PRM800K record with ``question`` and ``label`` entries.
        ONLY_SOLVED: When truthy, records whose ``finish_reason`` is not
            ``"solution"`` are rejected.

    Returns:
        ``None`` for a rejected record, otherwise a dict with the keys
        ``Input``, ``Output``, ``Answer`` and ``Found_Answer`` (the text after
        the last answer marker, or ``None`` when no marker is present).
    """
    if ONLY_SOLVED and data["label"]["finish_reason"] != "solution":
        return None

    question = data["question"]
    problem_text = question["problem"]
    ground_truth = question["ground_truth_answer"]

    answer_marker = "# Answer\n\n"
    pieces = []
    for step in data["label"]["steps"]:
        candidates = step.get("completions")
        if candidates is not None:
            positives = [cand for cand in candidates if cand["rating"] == 1]
            with_answer = [cand for cand in positives if answer_marker in cand["text"]]
            # Prefer completions that state an answer; fall back to any positive one.
            pool = with_answer or positives
            if pool:
                chosen_text = random.choice(pool)["text"]
                if chosen_text:
                    pieces.append(chosen_text)

        human = step.get("human_completion")
        if human and human["rating"] == 1:
            pieces.append(human["text"])

    full_output = " ".join(pieces)

    # Everything after the last marker is treated as the answer that was found.
    _, marker_hit, tail = full_output.rpartition(answer_marker)
    found_answer = tail.strip() if marker_hit else None

    return {
        "Input": problem_text,
        "Output": full_output,
        "Answer": ground_truth,
        "Found_Answer": found_answer,
    }


def main():
    """Convert the raw PRM800K files into one JSONL file of input/output pairs.

    Downloads the raw files if needed, converts every line with
    ``convert_format`` and writes the records whose extracted answer equals
    the ground-truth answer to ``openai_prm800k_formatted.jsonl`` inside
    ``intermediate_path``. Line counts are printed for each input file and
    for the output file.
    """
    output_file = str(intermediate_path / "openai_prm800k_formatted.jsonl")
    input_files = download_files()

    written = 0
    with open(output_file, 'w') as out_f:
        for input_file_name in input_files:
            line_count = 0
            with open(input_file_name, 'r') as input_file:
                for line in input_file:
                    line_count += 1
                    converted_data = convert_format(json.loads(line), ONLY_SOLVED)
                    if converted_data is None:
                        continue

                    found_answer = converted_data["Found_Answer"]
                    if ONLY_FOUND_ANSWER and found_answer is None:
                        continue
                    # NOTE(review): with ONLY_FOUND_ANSWER disabled, a record
                    # without a found answer still only passes this check when
                    # its ground-truth answer is None as well.
                    if found_answer == converted_data["Answer"]:
                        out_f.write(json.dumps(converted_data) + '\n')
                        written += 1
            # print the number of lines in the input file, including the file name
            print("Number of lines in", input_file_name, ":", line_count)
    # Every write emits exactly one line, so the counter equals the number of
    # lines in the output file without reopening it.
    print("Number of lines in", output_file, ":", written)

main()


In [None]:
df = pd.read_json(str(intermediate_path / "openai_prm800k_formatted.jsonl"), lines=True)
df.head()

### Databricks/dolly-15k

In [None]:
ds_dolly = load_dataset("databricks/databricks-dolly-15k")

In [None]:
df_dolly = ds_dolly["train"].to_pandas().drop(columns=["category"]).rename(columns={"context": "input", "response": "output"})

In [None]:
df_dolly.to_json(processed_path / "databricks_dolly15k.jsonl",
           orient="records",
           lines=True)

In [None]:
df_dolly

### Openassistant-guanaco

In [None]:
!pip install py3langid

In [None]:
from py3langid.langid import LanguageIdentifier, MODEL_FILE 

In [None]:
# Loading the data
guanaco = load_dataset('timdettmers/openassistant-guanaco',split='train')
guanaco_test = load_dataset('timdettmers/openassistant-guanaco',split='test')

# Put the guanaco data in a  dataframe from the guanaco and guanaco_test variables
df = pd.DataFrame(guanaco)
df_test = pd.DataFrame(guanaco_test)
df = pd.concat([df, df_test])
df.head()

def split_text(text):
    """Split a guanaco conversation into its first prompt and everything after.

    The text is cut at the first ``### Assistant:`` tag. Human tags are removed
    from the prompt; in the remainder, human and assistant tags are rewritten
    to ``### Instruction:`` / ``### Response:`` headers.

    Returns:
        pd.Series: ``[instruction, output]``, both stripped of outer whitespace.
    """
    human_tag = "### Human:"
    assistant_tag = "### Assistant:"

    prompt_part, reply_part = text.split(assistant_tag, 1)
    instruction = prompt_part.replace(human_tag, "").strip()

    reply_part = reply_part.replace(human_tag, "### Instruction:\n")
    reply_part = reply_part.replace(assistant_tag, "### Response:\n")
    return pd.Series([instruction, reply_part.strip()])

# Apply the function to the filtered dataframe
df[['instruction', 'output']] = df['text'].apply(split_text)

df.head()

In [None]:
def detect_language_with_langid(df):
    """Add the detected language and its probability to one record.

    Despite the parameter name, ``df`` is a single row: the function is used
    with ``DataFrame.apply(..., axis=1)``. The conversation tags are removed
    from ``df["text"]`` before classification.

    Args:
        df: Row with a ``text`` entry; it is modified in place.

    Returns:
        The same row with ``lang`` and ``lang_prob`` set.
    """
    # This function runs once per row, so the identifier is built from the
    # pickled model on first use and cached on the function object instead
    # of being re-created for every call.
    identifier = getattr(detect_language_with_langid, "_identifier", None)
    if identifier is None:
        identifier = LanguageIdentifier.from_pickled_model(MODEL_FILE, norm_probs=True)
        detect_language_with_langid._identifier = identifier

    cleaned_text = df["text"].replace("### Human:", "").replace("### Assistant:", "").strip()
    lang, prob = identifier.classify(cleaned_text)
    df["lang"] = lang
    df["lang_prob"] = prob
    return df

In [None]:
df = df.apply(detect_language_with_langid, axis=1)

In [None]:
df_filtered = df.loc[lambda df: df["lang_prob"] > .5].loc[lambda df: df["lang"] == "en"]

In [None]:
df_filtered.shape

In [None]:
# Turn the filtered rows into records whose keys follow the
# instruction / input / output order; the input is always empty here.
data = df_filtered[['instruction', 'output']].to_dict('records')

data_reordered = []
for record in data:
    record['input'] = ''
    data_reordered.append(
        {key: record[key] for key in ('instruction', 'input', 'output')}
    )

In [None]:
with (processed_path / "guanaco.jsonl").open('w') as f:
    for record in data_reordered:
        f.write(json.dumps(record) + '\n')

### duckai/arb

In [None]:
import json
import requests
import pandas as pd

def fetch_data_from_url(url, timeout=30):
    """Fetch ``url`` and return its decoded JSON body, or ``None`` on failure.

    Args:
        url: Address to request with an ``accept: application/json`` header.
        timeout: Seconds to wait for the server. Without a timeout the
            request can block indefinitely.

    Returns:
        The parsed JSON payload, or ``None`` when the request fails, returns
        an error status, or the body is not valid JSON. The error is printed.
    """
    headers = {'accept': 'application/json'}

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raises an HTTPError if the HTTP request returned an unsuccessful status code
        return response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures that are not reported as
        # a RequestException.
        print(f"Error: {e}")
        return None

# MATH

url = 'https://arb.duckai.org/api/lib/math'
data = fetch_data_from_url(url)

if data is not None:
    # Extract the ids
    ids = [item['_id'] for item in data]

    formatted_data = []

    # For each id, make a request to the API and format the returned data.
    # A failing request or malformed entry is reported and skipped so that
    # one bad id does not abort the whole download.
    for id_ in ids:
        try:
            response = requests.get(f"https://arb.duckai.org/api/lib/math/{id_}", timeout=30)
            response.raise_for_status()
            response_data = response.json()

            # Format the data
            formatted_entry = {
                "instruction": response_data["Problem_Statement"],
                "input": "",
                "output": response_data["Solution"]
            }
            formatted_data.append(formatted_entry)
        except (requests.RequestException, ValueError, KeyError) as e:
            print(f"Error with id: {id_} ({e})")

    # Create a DataFrame from the formatted data
    df = pd.DataFrame(formatted_data)

    # Save the DataFrame to a new JSON file
    df.to_json(processed_path / "duckai_arb_formatted_math_data.jsonl", orient="records", lines=True)
else:
    print("No data was returned from the API for Math")


#MCAT READING

url = 'https://arb.duckai.org/api/lib/mcatReading'
data = fetch_data_from_url(url)

if data is None:
    print("No data was returned from the API for MCAT Reading")
else:
    formatted_data = []

    for item in data:
        instruction = item['Problem Statement']
        options = item['Answer Candidates']
        # Keep only the text after the first period of the solution
        # (presumably this drops a leading answer label -- TODO confirm).
        output = item['Solution'].split('.', 1)[-1].lstrip()

        # Append the options to the question, lettered A, B, C, ...
        instruction += "".join(
            f"\n{chr(ord('A') + offset)}. {option}"
            for offset, option in enumerate(options)
        )

        formatted_data.append({
            "instruction": instruction,
            "input": "Choose A, B, C or D as your solution.",
            "output": output
        })

    # Write the collected entries as JSON lines
    df = pd.DataFrame(formatted_data)
    df.to_json(processed_path / "duckai_arb_formatted_mcat_data.jsonl", orient="records", lines=True)

# LAW

# Load the law problems from the API
url = 'https://arb.duckai.org/api/lib/law'
data = fetch_data_from_url(url)

if data is not None:
    # Create an empty list to hold the formatted data
    formatted_data = []

    # Loop through each item in the data
    for item in data:
        # Extract the question, the answer candidates, and the final answer
        instruction = item['Problem Statement']
        options = item['Answer Candidates']
        output = item['Final Answer']

        # Append the options to the instruction, lettered A, B, C, ...
        for i, option in enumerate(options, start=65):  # ASCII value of 'A' is 65
            instruction += f"\n{chr(i)}. {option}"

        # The fixed "input" tells the model how to answer the question
        formatted_entry = {
            "instruction": instruction,
            "input": "Choose A, B, C or D as your solution.",
            "output": output
        }

        # Add the formatted entry to the list
        formatted_data.append(formatted_entry)

    # Create a DataFrame from the formatted data
    df = pd.DataFrame(formatted_data)

    # Save the DataFrame to a new JSON file
    df.to_json(processed_path / "duckai_arb_formatted_law_data.jsonl", orient="records", lines=True)
else:
    # This message previously named MCAT Reading (copy-paste error).
    print("No data was returned from the API for Law")


#MCAT SCIENCE

url = 'https://arb.duckai.org/api/lib/mcatscience/val'
data = fetch_data_from_url(url)

if data is None:
    print("No data was returned from the API for MCAT Science")
else:
    first_letter = ord('A')
    formatted_data = []

    for item in data:
        instruction = item['Problem Statement']
        options = item['Answer Candidates']
        solution_text = item['Solution']

        # Keep only what follows the first period of the solution
        # (presumably a leading answer label is dropped -- TODO confirm).
        output = solution_text.split('.', 1)[-1].lstrip()

        # One extra line per answer candidate: "A. ...", "B. ...", and so on.
        option_lines = [
            f"\n{chr(first_letter + position)}. {option}"
            for position, option in enumerate(options)
        ]
        instruction = instruction + "".join(option_lines)

        formatted_data.append({
            "instruction": instruction,
            "input": "Choose A, B, C or D as your solution.",
            "output": output
        })

    # Store the entries as JSON lines
    df = pd.DataFrame(formatted_data)
    df.to_json(processed_path / "duckai_arb_formatted_mcat_science_data.jsonl", orient="records", lines=True)

# PHYSICS

url = 'https://arb.duckai.org/api/lib/physics/val'
data = fetch_data_from_url(url)


if data is not None:
    # Extract the ids
    ids = [item['_id'] for item in data]

    formatted_data = []

    # For each id, make a request to the API and format the returned data
    for id_ in ids:
        try:
            response = requests.get(f"https://arb.duckai.org/api/lib/physics/val/{id_}", timeout=30)
            response.raise_for_status()
            response_data = response.json()
            # Format the data
            formatted_entry = {
                "instruction": response_data["Problem_Statement"],
                "input": "",
                "output": response_data["Solution"]
            }
            formatted_data.append(formatted_entry)

        # Catch only the failures expected here (network, invalid JSON,
        # missing field); a bare except would also swallow KeyboardInterrupt
        # and make the loop impossible to stop.
        except (requests.RequestException, ValueError, KeyError) as e:
            print(f"Error with id: {id_} ({e})")


    # Create a DataFrame from the formatted data
    df = pd.DataFrame(formatted_data)

    # Save the DataFrame to a new JSON file
    df.to_json(processed_path / "duckai_arb_formatted_physics_data.jsonl", orient="records", lines=True)
else:
    print("No data was returned from the API for Physics.")

### Metaeval/reclor

In [None]:
data = load_dataset('metaeval/reclor', split='train')
# Convert one ReClor entry into the instruction / input / output format
def format_question(data_entry):
    """Format a multiple-choice entry as an instruction/input/output record.

    The instruction is the context and question followed by one line per
    answer, lettered from ``A``; the output is the letter at index ``label``.

    Args:
        data_entry: Mapping with ``context``, ``question``, ``answers`` and
            ``label`` entries.

    Returns:
        dict: Record with ``instruction``, ``input`` and ``output`` keys.
    """
    context, question, answers, label = (
        data_entry[key] for key in ('context', 'question', 'answers', 'label')
    )
    first_letter = 65  # ASCII code of 'A'

    option_lines = "".join(
        "\n" + chr(first_letter + position) + ": " + answer
        for position, answer in enumerate(answers)
    )

    return {
        "instruction": context + " " + question + option_lines,
        "input": "Choose A, B, C or D as your solution.",
        "output": chr(first_letter + label),
    }

reclor_data = [format_question(entry) for entry in data]

In [None]:
df = pd.DataFrame(reclor_data)

df

In [None]:
df.to_json(processed_path / "reclor.jsonl", orient="records", lines=True)

### Mandyyyyii/scibench

In [None]:
import os
import json
import subprocess
import os
# The directory where the json files are stored
# dir_path = 'original'

import subprocess


# Define the repository URL
repo_url = "https://github.com/mandyyyyii/scibench.git"
repo_name = "scibench"
target_subdir = "dataset/original"

# Resolve the dataset directory first so it is defined whether or not a
# fresh clone is needed (e.g. when this cell is re-run).
target_path = os.path.join(os.getcwd(), repo_name, target_subdir)

if not os.path.isdir(target_path):
    # Clone the repository
    result_clone = subprocess.run(["git", "clone", repo_url])

    # Stop here on failure instead of continuing without any data
    if result_clone.returncode != 0:
        raise RuntimeError("An error occurred while cloning the repository.")
    print("Repository cloned successfully!")

print("Path to the target directory:", target_path)

json_files = [f for f in os.listdir(target_path) if f.endswith('.json')]


new_data = []

# Iterate over all the files
for json_file in json_files:
    file_path = os.path.join(target_path, json_file)

    # Open each json file
    with open(file_path, 'r') as f:
        # Load the data
        file_data = json.load(f)

    # Transform the data; fall back to the numeric answer when an entry has
    # no (or an empty) written solution
    for d in file_data:
        output = d.get('solution') or d.get('answer_number')
        transformed_data = {
            "instruction": d.get('problem_text'),
            "input": '',
            "output": output
        }
        new_data.append(transformed_data)

In [None]:
df = pd.DataFrame(new_data)

In [None]:
df.to_json(processed_path / "scibench.jsonl", orient="records", lines=True)

### ScienceQA_text_only

In [None]:
from datasets import load_dataset

import pandas as pd
dataset = load_dataset('metaeval/ScienceQA_text_only') 

# Load the train split into a DataFrame
df = pd.DataFrame(dataset['train'])
# Print the distinct three-word openings of the questions
unique_first_three_words = df['question'].apply(lambda x: ' '.join(x.split()[:3])).unique()
print(unique_first_three_words)

# Print every skill in sorted order, followed by the number of distinct skills
skills = df['skill'].unique().tolist()
for skill in sorted(skills):
    print(skill)
print(len(df['skill'].unique()))

# Rows whose skill is in this list are dropped.
# NOTE(review): isin() needs exact matches; 'comples' looks like a typo for
# 'complex' -- confirm these strings against the skill names printed above.
skills_to_remove = ['Choose customary units of distance','Choose customary units of mass','Choose customary units of volume','Is it a complete sentence or a run-on','Is the sentence simple compound, comples, or compound-complex','Use guide words']
print('Length before removing skills: ', len(df))
df = df[~df['skill'].isin(skills_to_remove)]
print('Length after removing skills: ', len(df))


# Remove the task, grade, subject, topic, category, and skill columns
df_dropped = df.drop(['task', 'grade', 'subject', 'topic', 'category', 'skill'], axis=1)
df_dropped.head(10)

# Count how many examples have a non-empty solution field
# (not displayed: the expression is not the last statement of the cell)
print('Length of updated df:',len(df_dropped))
df_dropped['solution'].apply(lambda x: x != '').value_counts()

# Keep only the examples with a non-empty solution field

df_filtered = df_dropped[df_dropped['solution'].apply(lambda x: x != '')]
print('Keeping only those questions which have a long-form solution:',len(df_filtered))
# Reset the row numbers of df_filtered
df_filtered = df_filtered.reset_index(drop=True)

# Create a new df with the columns we want to keep. Copy explicitly so the
# column assignments below modify an independent frame rather than a slice
# of df_filtered.
df_reformatted = df_filtered[['question', 'choices', 'solution', 'lecture', 'answer']].copy()

# Add a column which contains the correct answer based on the answer index for the list
df_reformatted['correct_answer'] = df_reformatted.apply(lambda x: x['choices'][x['answer']], axis=1)

# Reformat the choices column to be one "A: choice1" style line per choice
df_reformatted['choices'] = df_reformatted['choices'].apply(lambda x: '\n'.join([f'{chr(65+i)}: {choice}' for i, choice in enumerate(x)]))

# Combine the question and choices columns into one column
df_reformatted['question'] = df_reformatted['question'] + '\n' + df_reformatted['choices']

# Rename question to instruction, lecture to input, and solution to output
df_reformatted = df_reformatted.rename(columns={'question': 'instruction', 'lecture': 'input', 'solution': 'output'})

# reorder the columns to instruction, input, output, correct_answer, answer
df_reformatted = df_reformatted[['instruction', 'input', 'output', 'correct_answer','answer']]

print('Length of df_reformatted:', len(df_reformatted))

# Remove the examples which have duplicate instructions
df_reformatted = df_reformatted.drop_duplicates(subset=['instruction'])
print('Length of df_reformatted after removing duplicates:', len(df_reformatted))

# Get the lengths of the outputs in words
output_lengths = df_reformatted['output'].apply(lambda x: len(x.split()))

print('There are ',len(output_lengths[output_lengths > 40]), 'examples with output length > 40')

# Print the number of unique inputs
print('There are', len(df_reformatted['input'].unique()), 'unique inputs\n')

# For every input, print the number of examples with that input.
# The loop variable is deliberately not called `input`, which would replace
# the builtin of that name for the rest of the notebook session.
for lecture in df_reformatted['input'].unique():
    print('There are', len(df_reformatted[df_reformatted['input'] == lecture]), 'examples with input:', lecture[:60])
    print('\n')
# Phrases marking questions that refer to accompanying descriptive text
need_description = ['What information supports','Based on','Read the following','Use the evidence','Look at the','According to a']

# Used to blank the input of examples that do not need it
def check_description(instruction):
    """Return True when ``instruction`` contains any ``need_description`` phrase."""
    return any(phrase in instruction for phrase in need_description)

# Flag the instructions that refer to a description
df_reformatted['contains_description'] = df_reformatted['instruction'].map(check_description)

# Blank the input wherever the instruction does not refer to a description
df_reformatted.loc[~df_reformatted['contains_description'], 'input'] = ''

# Word counts of the inputs and outputs
input_lengths = df_reformatted['input'].map(lambda text: len(text.split()))
output_lengths = df_reformatted['output'].map(lambda text: len(text.split()))

# Keep only the examples whose input has at most 400 words
df_reformatted = df_reformatted.loc[input_lengths <= 400].reset_index(drop=True)

# save the information from the df_reformatted to a json file, with the format: {"instruction": "instruction text", "input": "input text", "output": "output text"}

In [None]:
df_reformatted

In [None]:
df_reformatted.drop(['correct_answer','answer','contains_description'], axis=1).to_json(processed_path / 'scienceqa.jsonl', orient='records', lines=True)

### TheoremQA

In [None]:
import pandas as pd
import json
from datasets import load_dataset
# Name of the csv file inside the dataset repository, and where to save the result
file_path = 'test.csv'
output_file = processed_path / 'theoremqa.jsonl'
dataset = load_dataset("wenhu/TheoremQA", data_files="test.csv")

# The csv is exposed as the 'train' split
df = pd.DataFrame(dataset['train'])

# One record per row: the question plus its theorem definition becomes the
# instruction, and the answer becomes the output
new_data = [
    {
        "instruction": "{}\nRelevant Theorem: {}".format(row['Question'], row['theorem_def']),
        "input": '',
        "output": row['Answer'],
    }
    for _, row in df.iterrows()
]

In [None]:
df = pd.DataFrame(new_data)

In [None]:
df

In [None]:
df.to_json(output_file, orient="records", lines=True)

### Tigerbot

In [None]:
dataset = load_dataset("TigerResearch/tigerbot-kaggle-leetcodesolutions-en-2k", split="train") 

# make df from dataset
df = dataset.to_pandas()

# swap the values of instruction and input
df['instruction'], df['input'] = df['input'], df['instruction']

# After the swap, "input" holds the original instruction text, so this keeps
# the rows whose original instruction mentions 'python'. Copy the result so
# the assignment below writes to an independent frame instead of a filtered
# slice (avoids SettingWithCopyWarning).
df = df.loc[lambda df: df["input"].str.contains('python')].copy()

# make input empty
df['input'] = ''

In [None]:
df

In [None]:
df.to_json(processed_path / 'tigerbot.jsonl', orient='records', lines=True)

### Update MATH

In [None]:
#! /usr/bin/env python3
import json
import glob
import re
import argparse
import os
import subprocess
# Script to update the MATH dataset with enhanced solutions from the PRM dataset
# See convert_prm.py for creating the required PRM file

def download_files():
    """Download and unpack the MATH dataset into ``./MATH`` unless it exists.

    Every failure (missing tool, failed download, failed extraction, failed
    cleanup) is reported on stdout and ends the function early; nothing is
    raised for a failed step.

    Returns:
        None
    """
    url = 'https://people.eecs.berkeley.edu/~hendrycks/MATH.tar'
    tar_file = 'MATH.tar'
    output_dir = 'MATH'

    # Check if the directory already exists
    if os.path.isdir(output_dir):
        print("Directory", output_dir, "already exists, skipping download")
        return

    # Download the tar file. --fail makes curl exit non-zero on HTTP errors
    # instead of saving the error page as MATH.tar; OSError covers a missing
    # curl executable.
    try:
        subprocess.check_output(["curl", "-L", "--fail", url, "-o", tar_file])
    except (OSError, subprocess.CalledProcessError) as e:
        print("Curl failed with error:", e)
        return

    # Try to unpack
    try:
        print("Unzipping", tar_file, "...")
        subprocess.check_output(["tar", "-xf", tar_file])
    except (OSError, subprocess.CalledProcessError) as e:
        print("Tar failed with error:", e)
        return

    # Remove the tar file
    try:
        os.remove(tar_file)
    except OSError as e:
        print("Remove tar file failed with error:", e)
        return

    
def _strip_boxed(text):
    """Replace each LaTeX boxed wrapper in ``text`` by its brace-balanced content."""
    marker = '\\boxed{'
    pieces = []
    cursor = 0
    while True:
        start = text.find(marker, cursor)
        if start == -1:
            pieces.append(text[cursor:])
            break

        # Walk forward until the brace opened by the marker is closed again.
        depth = 1
        scan = start + len(marker)
        while scan < len(text) and depth:
            if text[scan] == '{':
                depth += 1
            elif text[scan] == '}':
                depth -= 1
            scan += 1

        if depth:
            # Unbalanced braces: leave the rest of the text untouched.
            pieces.append(text[cursor:])
            break

        pieces.append(text[cursor:start])
        pieces.append(text[start + len(marker):scan - 1])
        cursor = scan
    return ''.join(pieces)


def main():
    """Merge the PRM800K step-by-step solutions into the MATH dataset.

    Downloads MATH if needed, loads every problem below ``MATH/train`` and
    ``MATH/test``, replaces the solution of each problem that also appears in
    the formatted PRM800K file with the PRM solution (answer marker removed),
    strips LaTeX boxed wrappers from all outputs, and finally deletes the
    downloaded and temporary files.

    Returns:
        list[dict]: Records with ``instruction``, ``input`` and ``output``.
    """
    download_files()
    # read in the formatted PRM800K records (same location as the former
    # hard-coded '../../data/intermediate' path)
    with open(intermediate_path / 'openai_prm800k_formatted.jsonl', 'r') as f:
        jsonl_data = [json.loads(line) for line in f]

    def process_directory(directory):
        """Load all problem files below ``directory`` and dump them to one JSON file."""
        # find all .json files in all subdirectories; sorted so the record
        # order does not depend on the file system
        filepaths = sorted(glob.glob(directory + '/**/*.json', recursive=True))

        combined_data = []
        for filepath in filepaths:
            with open(filepath, 'r') as f:
                data = json.load(f)

            # Map 'problem' to 'instruction' and 'solution' to 'output'
            combined_data.append({
                'instruction': data['problem'],
                'input': "",
                'output': data['solution'],
            })

        # save the combined data to a .json file
        dir_name = directory.split('/')[-1]
        with open(f'MATH_{dir_name}_data.json', 'w') as f:
            json.dump(combined_data, f)

        return combined_data

    # create single json files for train and test data
    train_data = process_directory('MATH/train')
    test_data = process_directory('MATH/test')

    # add train data and test data together to get all data
    train_data = train_data + test_data

    # Map every problem statement to the position of its first occurrence,
    # which is what repeated list.index() calls would return, without
    # rescanning the list for every PRM record.
    first_index = {}
    for position, record in enumerate(train_data):
        first_index.setdefault(record['instruction'], position)

    # replace with the enhanced solutions
    answer_pattern = re.compile(r'# Answer\n\n.*')
    for data in jsonl_data:
        index = first_index.get(data['Input'])
        if index is None:
            continue
        train_data[index] = {
            'instruction': data['Input'],
            'input': "",
            'output': answer_pattern.sub('', data['Output']),
        }

    # For every problem, remove the box around a boxed answer but keep the
    # answer itself. Braces are matched explicitly: a greedy regex would run
    # to the last '}' on the line and corrupt any LaTeX after the box.
    for data in train_data:
        data['output'] = _strip_boxed(data['output'])

    # remove the temporary files and the math folder
    os.remove('MATH_test_data.json')
    os.remove('MATH_train_data.json')
    subprocess.check_output(["rm", "-rf", "MATH"])

    return train_data

data = main()

In [None]:
df = pd.DataFrame(data)

In [None]:
df.to_json(processed_path / 'MATH_train_enhanced_no_boxed.jsonl', orient='records', lines=True)

### LIMA

In [None]:
dataset = load_dataset("GAIR/lima")

In [None]:
df = dataset["train"].to_pandas()

In [None]:
# Same formatting as guanaco

def process_lima_record(x):
    """Convert one LIMA conversation into an instruction/input/output record.

    The first turn becomes the instruction and the second the start of the
    output. Later turns are appended to the output, alternately prefixed with
    an ``### Instruction:`` and a ``### Response:`` header.

    Args:
        x: Row with a ``conversations`` sequence of at least two turns.

    Returns:
        dict: Record with ``instruction``, ``input`` (empty) and ``output``.
    """
    turns = x["conversations"]
    instruction = turns[0]
    replies = turns[1:]

    response_out = replies[0]
    for position, turn in enumerate(replies[1:], start=1):
        # Odd positions are human turns, even positions are bot turns.
        header = "### Instruction:\n " if position % 2 == 1 else "### Response:\n "
        response_out += header + turn

    return {
        "instruction": instruction,
        "input": "",
        "output": response_out
    }

In [None]:
out = df.apply(process_lima_record, axis=1)

In [None]:
out.to_json(processed_path / "lima.jsonl", orient="records", lines=True)

## Flan

https://github.com/google-research/FLAN/tree/main/flan/v2

> NB #1: These scripts download and process dozens of GBs of data, which is usually not feasible in a single run. We recommend starting with submixtures like cot_submix, flan2021_submix, dialog_submix, t0_submix and niv2_submix, as shown in flan/v2/run_example.py. If you plan to use Seqio/T5X for training then we recommend caching the datasets, following these instructions. If not, you can use the above script to collect the data as raw text/json.

In [None]:
from datasets import load_dataset

flan_datasets = [
    "conceptofmind/cot_submix_original",
    "conceptofmind/flan2021_submix_original",
    "conceptofmind/dialog_submix_original",
    "conceptofmind/t0_submix_original",
    # "conceptofmind/niv2_submix_original",
]

def download_flan_submix(flan_dataset):
    """Load the train split of a FLAN submix dataset as a DataFrame.

    Only the ``inputs`` and ``targets`` columns are kept, and an empty
    ``input`` column is added.
    """
    train_frame = load_dataset(flan_dataset)["train"].to_pandas()
    selected = train_frame[['inputs', 'targets']]

    # NOTE(review): the "target" key matches no selected column (the column is
    # "targets"), so only "inputs" is renamed here. Also confirm whether
    # "context" rather than "instruction" is the intended name.
    renamed = selected.rename(columns={"inputs": "context", "target": "output"})

    df_flan = renamed.assign(input="")
    return df_flan

In [None]:
df = download_flan_submix("conceptofmind/cot_submix_original")

In [None]:
df.head()

In [None]:
print(df.iloc[0].targets)

In [None]:
# `df_flan` and `flan_dataset` only exist inside download_flan_submix, so the
# frame loaded into `df` is saved here, named after the dataset it came from.
flan_dataset = "conceptofmind/cot_submix_original"
df.to_json(processed_path / f"{flan_dataset.replace('/','-')}.jsonl",
        orient="records",
        lines=True)