In [7]:
import json, os
from typing import Optional, Dict, Any, List
import utils as util

from collections import Counter
import matplotlib.pyplot as plt

# Directory that the processing cells below join with INPUT_FILENAME and
# OUTPUT_FILENAME, i.e. inputs are read from and outputs are written to it.
DATA_FOLDER = "cleaned_data"

### Process Data (Single Move)

In [8]:
# Source JSONL file (one JSON object per line), resolved relative to DATA_FOLDER.
INPUT_FILENAME = "scienceworld_singlemove_10_1_25_50000.jsonl"
# Destination JSONL file for the converted chat records, written to DATA_FOLDER.
OUTPUT_FILENAME = "scienceworld_singlemove_50k.jsonl"

def parse_entry(
    entry,
    game_name="Scienceworld",
    game_goal="accomplish various science-related experiments",
):
    """Convert one raw trajectory record into a multi-turn chat record.

    ``entry['data']`` must be a non-empty sequence of ``[role, content]``
    pairs: a leading ``"Full Observation"`` item followed by strictly
    alternating ``"User"`` / ``"Environment"`` items. Every ``User`` item
    becomes an ``assistant`` turn and every ``Environment`` item becomes the
    ``user`` turn that follows it. A trailing ``User`` item without an
    ``Environment`` reply is allowed.

    Args:
        entry: Mapping with a ``'data'`` key as described above.
        game_name: Game title interpolated into the opening prompt.
        game_goal: Phrase completing "a text-based game that requires you
            to ..." in the opening prompt, e.g. ``game_name="Cooking World"``
            with ``game_goal="assemble a recipe and eat a meal"``.

    Returns:
        ``{"chat": turns}`` where ``turns`` is a list of ``[role, text]``
        pairs: the system turn (whose text is the literal string
        ``"generic_sysprompt.txt"``), the opening user prompt containing the
        full observation, then the converted turns.

    Raises:
        ValueError: If ``data`` is empty, does not start with a
            ``"Full Observation"`` item, or the remaining roles do not
            alternate User, Environment, User, ...
    """
    data = entry['data']

    # Fail with a descriptive error instead of an opaque IndexError on data[0].
    if not data:
        raise ValueError("Entry has no data; expected a leading 'Full Observation' item")

    # Check first element is "Full Observation"
    if data[0][0] != "Full Observation":
        raise ValueError(f"First element must be 'Full Observation', got: {data[0][0]}")

    full_observation = data[0][1]

    # Everything after the Full Observation is the User/Environment exchange.
    user_env_pairs = data[1:]

    # Validate pattern: User, Environment, User, Environment, ...
    for i, (role, _content) in enumerate(user_env_pairs):
        expected_role = "User" if i % 2 == 0 else "Environment"
        if role != expected_role:
            # i + 1 is the item's position in `data` (index 0 is the observation).
            raise ValueError(f"Expected {expected_role} at index {i+1}, got: {role}")

    # With the default arguments this is byte-identical to the original
    # hard-coded Scienceworld prompt.
    opening_prompt = (
        f"You are playing {game_name}, a text-based game that requires you to {game_goal}. "
        "This follows typical text-based adventure game formats -- you should attempt to output "
        "the best action to play directly given the current state of the game. "
        "I will provide you with the output from the environment. "
        f"The current state is the following:\n{full_observation}\nWhat is your first action?"
    )

    chat = [
        ["system", "generic_sysprompt.txt"],
        ["user", opening_prompt],
    ]

    # The alternation was validated above, so even positions are player
    # actions (assistant turns) and odd positions are environment replies
    # (user turns).
    chat.extend(
        ["assistant" if i % 2 == 0 else "user", content]
        for i, (_role, content) in enumerate(user_env_pairs)
    )

    return {"chat": chat}

# Process file: convert each non-blank JSONL line with parse_entry and write the
# resulting chat record as one JSON line. Lines that fail to decode or validate
# are reported and skipped rather than aborting the whole run.
input_path = os.path.join(DATA_FOLDER, INPUT_FILENAME)
output_path = os.path.join(DATA_FOLDER, OUTPUT_FILENAME)

processed = 0
errors = 0

# Explicit UTF-8 so reading/writing does not depend on the platform's locale
# default and matches the encoding used by the aggregation cell.
with open(input_path, 'r', encoding='utf-8') as infile, \
        open(output_path, 'w', encoding='utf-8') as outfile:
    for line in infile:
        if not line.strip():
            continue  # skip blank lines
        try:
            entry = json.loads(line)
            chat_entry = parse_entry(entry)
            outfile.write(json.dumps(chat_entry) + '\n')
            processed += 1
        except Exception as e:
            errors += 1
            # processed + errors is the 1-based index among non-blank lines.
            print(f"Error processing entry {processed + errors}: {e}")

print(f"Processed: {processed}, Errors: {errors}")
print(f"Output saved to: {output_path}")

Processed: 50000, Errors: 0
Output saved to: cleaned_data/scienceworld_singlemove_50k.jsonl


### Process Data (Multimove)

In [9]:
# Source JSONL file (one JSON object per line), resolved relative to DATA_FOLDER.
INPUT_FILENAME = "scienceworld_multimove_10_1_25_20000.jsonl"
# Destination JSONL file for the converted chat records, written to DATA_FOLDER.
OUTPUT_FILENAME = "scienceworld_multimove_20k.jsonl"

def parse_entry(
    entry,
    game_name="Scienceworld",
    game_goal="accomplish various science-related experiments",
    example_score="30/100",
):
    """Convert one raw trajectory record into a single-response chat record.

    NOTE: this redefines the ``parse_entry`` from the single-move cell; once
    this cell has run, the name refers to this multimove variant.

    ``entry['data']`` must be a non-empty sequence of ``[role, content]``
    pairs: a leading ``"Full Observation"`` item followed by strictly
    alternating ``"User"`` / ``"Environment"`` items. The whole exchange is
    rendered into ONE assistant turn made of ``player: ...`` and
    ``environment: ...`` lines, and the user prompt asks for that many plies
    (the number of User plus Environment items).

    Args:
        entry: Mapping with a ``'data'`` key as described above.
        game_name: Game title interpolated into the prompt.
        game_goal: Phrase completing "a text-based game that requires you
            to ..." in the prompt.
        example_score: Score shown in the prompt's mock example line, e.g.
            ``"3/9"`` for ``game_name="Cooking World"`` with
            ``game_goal="assemble a recipe and eat a meal"``.

    Returns:
        ``{"chat": turns}`` with exactly three ``[role, text]`` pairs: the
        system turn (whose text is the literal string
        ``"generic_sysprompt.txt"``), the user prompt, and the assistant
        response. Every line of the response, including the last, ends
        with a newline.

    Raises:
        ValueError: If ``data`` is empty, does not start with a
            ``"Full Observation"`` item, or the remaining roles do not
            alternate User, Environment, User, ...
    """
    data = entry['data']

    # Fail with a descriptive error instead of an opaque IndexError on data[0].
    if not data:
        raise ValueError("Entry has no data; expected a leading 'Full Observation' item")

    # Check first element is "Full Observation"
    if data[0][0] != "Full Observation":
        raise ValueError(f"First element must be 'Full Observation', got: {data[0][0]}")

    full_observation = data[0][1]

    # Everything after the Full Observation is the User/Environment exchange.
    user_env_pairs = data[1:]

    # Validate pattern: User, Environment, User, Environment, ...
    for i, (role, _content) in enumerate(user_env_pairs):
        expected_role = "User" if i % 2 == 0 else "Environment"
        if role != expected_role:
            # i + 1 is the item's position in `data` (index 0 is the observation).
            raise ValueError(f"Expected {expected_role} at index {i+1}, got: {role}")

    # The prompt is assembled from explicit literals so that its trailing
    # space and whitespace-only lines are visible and cannot be lost to an
    # editor that strips trailing whitespace. With the default arguments the
    # text is identical to the original triple-quoted prompt; the "seprated"
    # spelling is kept on purpose so prompts match already-generated data.
    prompt = (
        f"You are playing {game_name}, a text-based game that requires you to {game_goal}. "
        "This follows typical text-based adventure game formats -- you should attempt to output "
        "the best action to play directly given the current state of the game. \n"
        "        \n"
        "Additionally, I want you to predict what the environment will respond with. "
        "You should format your response with 'role: action/response' for the roles: "
        "{player, environment} seprated by a newline. "
        "I will specify how long you should continue the pattern for -- "
        "as a mock example, 3 plies would be:\n"
        "\n"
        "player: Take jamberries from fridge\n"
        f"environment: You take the jamberries from the fridge. Score = {example_score}\n"
        "player: Cook jamberries with stove\n"
        "         \n"
        "The current state is the following:\n"
        f"{full_observation}\n"
        f"Please predict the optimal line for {len(user_env_pairs)} plies starting with the player's action."
    )

    # The alternation was validated above, so even positions are player
    # actions and odd positions are environment replies.
    speakers = ("player", "environment")
    asst_response = "".join(
        f"{speakers[i % 2]}: {content}\n"
        for i, (_role, content) in enumerate(user_env_pairs)
    )

    chat = [
        ["system", "generic_sysprompt.txt"],
        ["user", prompt],
        ["assistant", asst_response],
    ]
    return {"chat": chat}

# Process file: convert each non-blank JSONL line with parse_entry and write the
# resulting chat record as one JSON line. Lines that fail to decode or validate
# are reported and skipped rather than aborting the whole run.
input_path = os.path.join(DATA_FOLDER, INPUT_FILENAME)
output_path = os.path.join(DATA_FOLDER, OUTPUT_FILENAME)

processed = 0
errors = 0

# Explicit UTF-8 so reading/writing does not depend on the platform's locale
# default and matches the encoding used by the aggregation cell.
with open(input_path, 'r', encoding='utf-8') as infile, \
        open(output_path, 'w', encoding='utf-8') as outfile:
    for line in infile:
        if not line.strip():
            continue  # skip blank lines
        try:
            entry = json.loads(line)
            chat_entry = parse_entry(entry)
            outfile.write(json.dumps(chat_entry) + '\n')
            processed += 1
        except Exception as e:
            errors += 1
            # processed + errors is the 1-based index among non-blank lines.
            print(f"Error processing entry {processed + errors}: {e}")

print(f"Processed: {processed}, Errors: {errors}")
print(f"Output saved to: {output_path}")

Processed: 20000, Errors: 0
Output saved to: cleaned_data/scienceworld_multimove_20k.jsonl


### Analyze distributions

### Aggregate multiple files into one

In [4]:
# Folder that holds the shard files listed in FILES; the aggregation cell also
# writes the merged file here.
# NOTE: this rebinds the DATA_FOLDER used by the "Process Data" cells. The
# execution counts (In [4]/[5] here vs. In [7]-[9] above) indicate this cell
# ran before the imports cell reset it to "cleaned_data".
DATA_FOLDER = "generated_data"
# Shard files to concatenate, in this order.
FILES = [
    "scienceworld_singlemove_20251001_022903_25000.jsonl",
    "scienceworld_singlemove_20251001_085945_25000.jsonl",
]
# Disabled alternative FILES list (cooking multimove shards, judging by the
# filenames) -- presumably kept for switching datasets; confirm before removing.
# FILES = [
#     "cooking_multimove_20250928_231710_10000.jsonl",
#     "cooking_multimove_20250929_013858_10000.jsonl",
#     "cooking_multimove_20250929_025654_10000.jsonl",
#     "cooking_multimove_20250929_041338_10000.jsonl",
#     "cooking_multimove_20250929_053040_10000.jsonl",
#     "cooking_multimove_20250929_064903_10000.jsonl",
# ]

In [5]:
output_filename = "scienceworld_singlemove_10_1_25"
all_data = []

# Gather the parsed records of every shard, preserving FILES order and the
# line order within each shard; blank lines are ignored.
for filename in FILES:
    source_path = os.path.join(DATA_FOLDER, filename)
    print(f"Processing {filename}...")

    with open(source_path, 'r', encoding='utf-8') as src:
        all_data.extend(json.loads(raw_line) for raw_line in src if raw_line.strip())

# The merged file's name embeds the number of records, so it can only be
# computed after everything has been read.
total_count = len(all_data)
output_filepath = os.path.join(DATA_FOLDER, f"{output_filename}_{total_count}.jsonl")

# Re-serialise each record as one JSON document per line.
print(f"Writing {total_count} entries to {output_filepath}...")
with open(output_filepath, 'w', encoding='utf-8') as dst:
    dst.writelines(json.dumps(record) + '\n' for record in all_data)

print(f"Successfully created {output_filepath} with {total_count} entries")

Processing scienceworld_singlemove_20251001_022903_25000.jsonl...
Processing scienceworld_singlemove_20251001_085945_25000.jsonl...
Writing 50000 entries to generated_data/scienceworld_singlemove_10_1_25_50000.jsonl...
Successfully created generated_data/scienceworld_singlemove_10_1_25_50000.jsonl with 50000 entries
