# Create train-finetuned-davinci

In [6]:
import os
import json
import random

# MODIFIABLE VARIABLES
random.seed(42) # Modify seed
data_dir = "data/MATH" # Modify input data directory
modified_data_dir = "train-finetuned-davinci/MATH" # Modify output directory
levels = ["Level 1", "Level 2", "Level 3"] # Difficulty levels to sample from
files_per_level = 20 # Maximum number of problems drawn per level and subject

# Build a level-stratified subset of the MATH data: for every subject
# subdirectory, draw up to `files_per_level` problems from each entry of
# `levels` (at most 60 per subject with the defaults above) and copy them,
# re-serialized, into the same relative location under `modified_data_dir`.

# Loop over the train OR test OR both directories
for data_type in ["train"]:
    source_root = os.path.join(data_dir, data_type)

    # Loop over the subdirectories in the data directory
    for sub_dir in os.listdir(source_root):
        source_dir = os.path.join(source_root, sub_dir)

        # Skip any entries that are not directories
        if not os.path.isdir(source_dir):
            continue

        # Create the corresponding subdirectory in the modified data directory
        target_dir = os.path.join(modified_data_dir, data_type, sub_dir)
        os.makedirs(target_dir, exist_ok=True)

        # Get a list of all the JSON files in the subdirectory
        json_files = [f for f in os.listdir(source_dir) if f.endswith(".json")]

        # NOTE: the result of this draw is intentionally discarded. Earlier
        # versions of this cell drew 30 files here before the per-level
        # sampling; the call is kept only so the random generator advances
        # exactly as before and the configured seed keeps yielding the same
        # selection.
        random.sample(json_files, k=min(30, len(json_files)))

        # Read every file exactly once (closing each handle) and group the
        # file names by difficulty level. Files at other levels are ignored.
        files_by_level = {level: [] for level in levels}
        for file_name in json_files:
            with open(os.path.join(source_dir, file_name), "r") as file:
                level = json.load(file)["level"]
            if level in files_by_level:
                files_by_level[level].append(file_name)

        # Randomly select up to `files_per_level` files from each level
        selected_files = []
        for level in levels:
            files_at_level = files_by_level[level]
            selected_files.extend(random.sample(files_at_level, k=min(files_per_level, len(files_at_level))))

        # Copy the selected problems into the output directory
        for file_name in selected_files:
            with open(os.path.join(source_dir, file_name), "r") as file:
                data = json.load(file)

            with open(os.path.join(target_dir, file_name), "w") as file:
                json.dump(data, file)

# Create train-wolfram-finetuned-davinci

Below is the same sampling script as above, this time writing to the directory used for fine-tuning the Wolfram-augmented davinci model. Afterwards, we manually label each selected problem by **filling in its Wolfram query.**

In [2]:
import os
import json
import random

# MODIFIABLE VARIABLES
random.seed(42) # Modify seed
data_dir = "data/MATH" # Modify input data directory
modified_data_dir = "train-wolfram-finetuned-davinci/MATH" # Modify output directory
levels = ["Level 1", "Level 2", "Level 3"] # Difficulty levels to sample from
files_per_level = 20 # Maximum number of problems drawn per level and subject

# Build a level-stratified subset of the MATH data: for every subject
# subdirectory, draw up to `files_per_level` problems from each entry of
# `levels` (at most 60 per subject with the defaults above) and copy them,
# re-serialized, into the same relative location under `modified_data_dir`.

# Loop over the train OR test OR both directories
for data_type in ["train"]:
    source_root = os.path.join(data_dir, data_type)

    # Loop over the subdirectories in the data directory
    for sub_dir in os.listdir(source_root):
        source_dir = os.path.join(source_root, sub_dir)

        # Skip any entries that are not directories
        if not os.path.isdir(source_dir):
            continue

        # Create the corresponding subdirectory in the modified data directory
        target_dir = os.path.join(modified_data_dir, data_type, sub_dir)
        os.makedirs(target_dir, exist_ok=True)

        # Get a list of all the JSON files in the subdirectory
        json_files = [f for f in os.listdir(source_dir) if f.endswith(".json")]

        # NOTE: the result of this draw is intentionally discarded. Earlier
        # versions of this cell drew 30 files here before the per-level
        # sampling; the call is kept only so the random generator advances
        # exactly as before and the configured seed keeps yielding the same
        # selection.
        random.sample(json_files, k=min(30, len(json_files)))

        # Read every file exactly once (closing each handle) and group the
        # file names by difficulty level. Files at other levels are ignored.
        files_by_level = {level: [] for level in levels}
        for file_name in json_files:
            with open(os.path.join(source_dir, file_name), "r") as file:
                level = json.load(file)["level"]
            if level in files_by_level:
                files_by_level[level].append(file_name)

        # Randomly select up to `files_per_level` files from each level
        selected_files = []
        for level in levels:
            files_at_level = files_by_level[level]
            selected_files.extend(random.sample(files_at_level, k=min(files_per_level, len(files_at_level))))

        # Copy the selected problems into the output directory
        for file_name in selected_files:
            with open(os.path.join(source_dir, file_name), "r") as file:
                data = json.load(file)

            with open(os.path.join(target_dir, file_name), "w") as file:
                json.dump(data, file)

Add the fields "wolframquery" and "wolframoutput" (initially empty strings) to every selected problem file.

In [None]:
import os
import json
# Directory holding the sampled problems that are going to be labelled
data_dir = "train-wolfram-finetuned-davinci/MATH/train"

for sub_dir in os.listdir(data_dir):
    print(sub_dir)
    # Skip any entries that are not directories
    if not os.path.isdir(os.path.join(data_dir, sub_dir)):
        continue

    # Get a list of all the JSON files in the subdirectory
    json_files = [f for f in os.listdir(os.path.join(data_dir, sub_dir)) if f.endswith(".json")]

    for filename in json_files:
        file_path = os.path.join(data_dir, sub_dir, filename)
        with open(file_path, 'r+') as f:
            data = json.load(f)
            # setdefault only adds the field when it is missing, so running
            # this cell again does not wipe labels that were already filled in.
            data.setdefault('wolframquery', '')
            data.setdefault('wolframoutput', '')
            # Rewrite the file in place: rewind, dump, then cut off any
            # leftover bytes from the previous (possibly longer) content.
            f.seek(0)
            json.dump(data, f, indent=4)
            f.truncate()
                

Run each problem's Wolfram query and store the result in its "wolframoutput" field.

In [8]:
import requests
import os
import json

# Read the Wolfram|Alpha app id from the local keys.json file.
with open('keys.json') as key_file:
    wolfram_key = json.load(key_file)['wolfram-key']

def wolfram(input_query: str, timeout: float = 30):
    """Send a query to the Wolfram|Alpha v2 query API and return its answer text.

    Args:
        input_query: The query to submit. It is passed through ``str`` first,
            so a ``None`` query is sent as the literal text ``"None"``.
        timeout: Seconds to wait for the HTTP request before ``requests``
            raises, so a stalled connection cannot hang the notebook.

    Returns:
        The ``plaintext`` of the first subpod of the second pod in the
        response, or a fixed error message when the response does not have
        that shape.

    Raises:
        Whatever ``requests.get`` or ``response.json()`` raise (connection
        problems, timeouts, a non-JSON body); those are not converted into
        the error message.
    """
    api_url = "https://api.wolframalpha.com/v2/query"
    params = {
        "input": str(input_query), # handles None bs
        "appid": wolfram_key,
        "format": "plaintext",
        "output": "json"
    }
    response = requests.get(api_url, params=params, timeout=timeout)
    response_json = response.json()
    print(response_json)
    try:
        # Pod 0 is the input interpretation in the responses seen so far;
        # the answer is read from pod 1.
        result = response_json["queryresult"]["pods"][1]["subpods"][0]["plaintext"]
    except (KeyError, IndexError, TypeError):
        # Only a response of unexpected shape is treated as "no answer";
        # unrelated errors (e.g. KeyboardInterrupt) are no longer swallowed.
        print("Wolfram Alpha could not find an answer and encountered an error.")
        return "Wolfram Alpha could not find an answer and encountered an error."
    print(result)
    return result

# Root of the labelled data set whose queries are going to be executed
data_dir = "train-wolfram-finetuned-davinci/MATH"

# For every problem file, send its 'wolframquery' to Wolfram|Alpha via
# wolfram() and store the returned text in 'wolframoutput'.

# Loop over the directories | possible: ["train", "test"]:
for data_type in ["train"]:
    for sub_dir in os.listdir(os.path.join(data_dir, data_type)):
        print(sub_dir)
        # Skip any entries that are not directories
        if not os.path.isdir(os.path.join(data_dir, data_type, sub_dir)):
            continue
        # Get a list of all the JSON files in the subdirectory
        json_files = [f for f in os.listdir(os.path.join(data_dir, data_type, sub_dir)) if f.endswith(".json")]
        for filename in json_files:
            file_path = os.path.join(data_dir, data_type, sub_dir, filename)

            # Read and close the file before it is rewritten, instead of
            # opening the same path a second time while it is still open.
            with open(file_path, 'r') as f:
                data = json.load(f)

            wolframquery = data['wolframquery']
            print(wolframquery)
            wolframoutput = wolfram(wolframquery)
            print(wolframoutput)
            data['wolframoutput'] = wolframoutput

            with open(file_path, "w") as file:
                json.dump(data, file)

counting_and_probability
solve 1= 5/12 + 1/3 + x for x
{'queryresult': {'success': True, 'error': False, 'numpods': 4, 'datatypes': 'Solve', 'timedout': '', 'timedoutpods': '', 'timing': 0.751, 'parsetiming': 0.435, 'parsetimedout': False, 'recalculate': '', 'id': 'MSP41981h18i91f1g21i8bh00000db88i2i1da0gi9e', 'host': 'https://www6b3.wolframalpha.com', 'server': '6', 'related': 'https://www6b3.wolframalpha.com/api/v1/relatedQueries.jsp?id=MSPa41991h18i91f1g21i8bh00002ba8b8af1h393he17167914177805443390', 'version': '2.6', 'inputstring': 'solve 1= 5/12 + 1/3 + x for x', 'pods': [{'title': 'Input interpretation', 'scanner': 'Identity', 'id': 'Input', 'position': 100, 'error': False, 'numsubpods': 1, 'subpods': [{'title': '', 'plaintext': 'solve 1 = 5/12 + 1/3 + x for x'}], 'expressiontypes': {'name': 'Default'}}, {'title': 'Result', 'scanner': 'Solve', 'id': 'Result', 'position': 200, 'error': False, 'numsubpods': 1, 'primary': True, 'subpods': [{'title': '', 'plaintext': 'x = 1/4'}], 'ex