# In [1]:
import os
import json
import random

# MODIFIABLE VARIABLES
random.seed(42) # Modify seed
data_dir = "data/MATH" # Modify input data directory
modified_data_dir = "test/MATH" # Modify output directory
LEVELS = ("Level 1", "Level 2", "Level 3") # Modify levels to sample from
PROBLEMS_PER_LEVEL = 10 # Modify number of problems drawn per level

# This code snippet creates a test set for the MATH dataset: for every
# category (subdirectory) it picks up to 10 problems from each of "Level 1",
# "Level 2" and "Level 3", i.e. up to 30 problems per category. With 7
# categories that is a testing size of 210 problems.


def _load_json(path):
    """Return the parsed contents of the JSON file at *path*."""
    with open(path, "r", encoding="utf-8") as file:
        return json.load(file)


def _files_by_level(category_dir, levels):
    """Group the JSON files in *category_dir* by their "level" field.

    Returns a dict mapping every entry of *levels* to the list of file names
    (sorted by name) whose "level" value equals it. Files at any other level
    are ignored. Raises KeyError if a JSON file has no "level" key.
    """
    by_level = {level: [] for level in levels}
    # os.listdir() returns names in arbitrary order; sorting is what makes the
    # seeded random.sample() below pick the same files on every machine.
    for file_name in sorted(os.listdir(category_dir)):
        if not file_name.endswith(".json"):
            continue
        level = _load_json(os.path.join(category_dir, file_name))["level"]
        if level in by_level:
            by_level[level].append(file_name)
    return by_level


def _sample_category(source_dir, output_dir, levels, per_level):
    """Copy up to *per_level* random problems per level into *output_dir*.

    Returns the list of selected file names, grouped in the order of *levels*.
    """
    # Create the corresponding subdirectory in the modified data directory
    os.makedirs(output_dir, exist_ok=True)

    # Each file is parsed once here instead of once per level.
    by_level = _files_by_level(source_dir, levels)

    selected_files = []
    for level in levels:
        candidates = by_level[level]
        # A level with fewer than per_level problems contributes all of them.
        selected_files.extend(
            random.sample(candidates, k=min(per_level, len(candidates)))
        )

    # Re-serialise each selected problem into the output directory
    for file_name in selected_files:
        data = _load_json(os.path.join(source_dir, file_name))
        with open(os.path.join(output_dir, file_name), "w", encoding="utf-8") as file:
            json.dump(data, file)

    return selected_files


def build_test_subset(source_root, output_root, data_types=("test",),
                      levels=LEVELS, per_level=PROBLEMS_PER_LEVEL):
    """Write a level-balanced random subset of a MATH-style dataset.

    The expected layout is ``source_root/<data_type>/<category>/*.json`` where
    every JSON file holds an object with a "level" key. The same layout is
    created under *output_root*.

    Args:
        source_root: Directory containing the data-type directories.
        output_root: Directory the subset is written to (created if missing).
        data_types: Names of the data-type directories to process, e.g.
            ``["test"]``, ``["train"]`` or both.
        levels: Values of the "level" field to sample from.
        per_level: Maximum number of problems drawn per level and category.

    Returns:
        A dict mapping ``(data_type, category)`` to the list of file names
        that were selected for it.

    Raises:
        FileNotFoundError: If ``source_root/<data_type>`` does not exist.
        KeyError: If a JSON file has no "level" key.

    Selection uses the global ``random`` state, so call ``random.seed()``
    beforehand for a reproducible subset.
    """
    selected = {}
    for data_type in data_types:
        split_dir = os.path.join(source_root, data_type)
        # Sorted so categories consume the random stream in a fixed order.
        for sub_dir in sorted(os.listdir(split_dir)):
            category_dir = os.path.join(split_dir, sub_dir)

            # Skip any files that are not directories
            if not os.path.isdir(category_dir):
                continue

            selected[(data_type, sub_dir)] = _sample_category(
                category_dir,
                os.path.join(output_root, data_type, sub_dir),
                levels,
                per_level,
            )
    return selected


if __name__ == "__main__":
    # Loop over the train OR test OR both directories
    build_test_subset(data_dir, modified_data_dir, data_types=["test"])