In [1]:
import json, os
from typing import Optional, Dict, Any, List
import utils as util

from collections import Counter
import matplotlib.pyplot as plt

### Convert the raw file into the chat format used for training

In [13]:
# Folder that holds both the raw input file and the converted output file.
DATA_FOLDER = "cleaned_data"
# Raw trajectory file to convert, and the name of the chat-format file to write.
INPUT_FILENAME = "cooking_multimove_9_29_25_60000.jsonl"
OUTPUT_FILENAME = "cooking_multimove_60k.jsonl"
# Alternative pair for the single-move dataset; swap with the two lines above to convert it instead.
# INPUT_FILENAME = "cooking_singlemove_9_29_25_150000.jsonl"
# OUTPUT_FILENAME = "cooking_singlemove_150k.jsonl"

def parse_entry(entry):
    """Convert one raw trajectory record into the chat format used for training.

    Args:
        entry: Decoded JSON object with a 'data' key holding a list of
            [role, content] pairs. The first pair must have the role
            "Full Observation"; the remaining pairs must alternate
            "User", "Environment", "User", ...

    Returns:
        {"chat": [...]} where the chat starts with a system turn and a user
        turn that embeds the full observation, followed by every "User"
        content as an "assistant" turn and every "Environment" content as a
        "user" turn, in their original order.

    Raises:
        KeyError: if entry has no 'data' key.
        ValueError: if 'data' is empty, does not start with
            "Full Observation", or the roles do not alternate as expected.
    """
    data = entry['data']

    # Fail with an explicit ValueError on an empty record instead of the
    # opaque IndexError that data[0] would raise.
    if not data:
        raise ValueError("Entry 'data' is empty; expected a 'Full Observation' element first")

    # Check first element is "Full Observation"
    if data[0][0] != "Full Observation":
        raise ValueError(f"First element must be 'Full Observation', got: {data[0][0]}")

    full_observation = data[0][1]

    # The prompt text is split across lines for readability only; the
    # concatenated result is the same single string as before.
    opening_prompt = (
        "You are playing Cooking World, a text-based game that requires you to "
        "assemble a recipe and eat a meal. This follows typical text-based "
        "adventure game formats -- you should attempt to output the best action "
        "to play directly given the current state of the game. I will provide "
        "you with the output from the environment. The current state is the "
        f"following:\n{full_observation}\nWhat is your first action?"
    )

    # Build chat format
    chat = [
        ["system", "generic_sysprompt.txt"],
        ["user", opening_prompt],
    ]

    # Everything after the Full Observation must follow the pattern
    # User, Environment, User, Environment, ...
    user_env_pairs = data[1:]

    # The player's action ("User") becomes the assistant turn and the game's
    # reply ("Environment") becomes the next user turn.
    chat_role_for = {"User": "assistant", "Environment": "user"}

    for i, (role, content) in enumerate(user_env_pairs):
        expected_role = "User" if i % 2 == 0 else "Environment"
        if role != expected_role:
            raise ValueError(f"Expected {expected_role} at index {i+1}, got: {role}")
        chat.append([chat_role_for[expected_role], content])

    return {"chat": chat}

# Process file: convert every non-blank JSONL record of the input into chat
# format and write one JSON object per line to the output file.
input_path = os.path.join(DATA_FOLDER, INPUT_FILENAME)
output_path = os.path.join(DATA_FOLDER, OUTPUT_FILENAME)

processed = 0  # records converted and written
errors = 0     # records skipped because decoding or parsing failed

# Open both files as UTF-8 explicitly, matching the other cells in this
# notebook, so the result does not depend on the platform's default encoding.
with open(input_path, 'r', encoding='utf-8') as infile, \
        open(output_path, 'w', encoding='utf-8') as outfile:
    for line in infile:
        if not line.strip():
            continue  # skip blank lines
        try:
            entry = json.loads(line)
            chat_entry = parse_entry(entry)
            outfile.write(json.dumps(chat_entry) + '\n')
            processed += 1
        except Exception as e:
            # Best effort: report the bad record and keep going. The number
            # printed counts non-blank records seen so far, not file lines.
            errors += 1
            print(f"Error processing entry {processed + errors}: {e}")

print(f"Processed: {processed}, Errors: {errors}")
print(f"Output saved to: {output_path}")

Processed: 60000, Errors: 0
Output saved to: cleaned_data/cooking_multimove_60k.jsonl


### Analyze distributions

In [None]:
# Folder and file whose records are analyzed by this cell.
DATA_FOLDER = "cleaned_data"
FILENAME = "cooking_multimove_9_29_25_60000.jsonl"
# Alternative: analyze the single-move dataset instead.
# NOTE(review): the output currently saved with this cell reports the
# single-move file, not the multi-move file selected above -- re-run to refresh it.
# FILENAME = "cooking_singlemove_9_29_25_150000.jsonl"

# Load our data in -- for each line we have something like the following:
#{"data": [["Full Observation", "Move 1 | Score = 0/15\nActionable Verbs: ['chop', 'close', 'cook', 'dice', 'drink', 'drop', 'eat', 'examine', 'go', 'insert', 'inventory', 'lock', 'look', 'open', 'prepare', 'put', 'slice', 'take', 'unlock']\nObjective: You are hungry! Let's cook a delicious meal. Check the cookbook in the kitchen for the recipe. Once done, enjoy your meal!\nLocation: Pantry\nInventory: You are carrying: a yellow bell pepper, a banana, an orange bell pepper, a raw purple potato, a red hot pepper and a raw yellow potato.\nCurrent Observation: You've entered a pantry.\n\nOh, great. Here's a shelf. The shelf is wooden. But oh no! there's nothing on this piece of garbage. Aw, and here you were, all excited for there to be things on it!\n\nThere is a closed plain door leading east.\nNecessary Context: Recipe #1\n---------\nGather all following ingredients and follow the directions to prepare this tasty meal.\n\nIngredients:\nblock of cheese\n  red potato\n  banana\n  yellow bell pepper\n  red onion\n\nDirections:\nchop the block of cheese\n  roast the block of cheese\n  slice the red potato\n  roast the red potato\n  dice the banana\n  roast the banana\n  dice the yellow bell pepper\n  roast the yellow bell pepper\n  dice the red onion\n  roast the red onion\n  prepare meal\n"], ["User", "Open plain door"], ["Environment", "You open plain door. [Move 2 | Score = 0/15]"], ["User", "Go east"], ["Environment", "-= Kitchen =-\nYou've just sauntered into a kitchen. Okay, just remember what you're here to do, and everything will go great.\n\nYou can make out a closed fridge. You can see a closed oven. You can see a table. The table is massive. On the table you see a knife. As if things weren't amazing enough already, you can even see a counter. You see a raw red potato, a red apple and a cookbook on the counter. You see a gleam over in a corner, where you can see a stove. But the thing is empty.\n\nThere is an open plain door leading west. 
There is an exit to the north. There is an exit to the south. [Move 3 | Score = 0/15]"], ["User", "Examine cookbook"]], "info": {"move_idx": 1, "generation_args": {"filename": "custom_cooking", "folder": "tw_games", "fmt": "z8", "split": "train", "seed": 1482702892, "recipe_seed": 606939935, "recipe": 5, "take": 1, "go": 6, "open_": true, "cook": true, "cut": true, "drop": false}}}

# First let's get a distribution of length of our 'data' array
# Let's also go through and get a distribution of move_idx

def _print_distribution(values, tally, heading, column_header, stats_heading, key_width):
    """Print a count/percentage table for `tally`, then min/max/mean of `values`.

    `key_width` is the field width used for the key column of each table row.
    """
    print(heading)
    print(column_header)
    for key in sorted(tally.keys()):
        hits = tally[key]
        share = (hits / len(values)) * 100 if values else 0
        print(f"{key:{key_width}d} -> {hits:6d} ({share:5.1f}%)")

    print(stats_heading)
    if values:
        print(f"  Min: {min(values)}")
        print(f"  Max: {max(values)}")
        print(f"  Mean: {sum(values) / len(values):.1f}")


# Gather the per-record statistics in a single pass over the file.
data_lengths = []   # len(entry['data']) for every record that has 'data'
move_indices = []   # entry['info']['move_idx'] for every record that has it
total_entries = 0

filepath = os.path.join(DATA_FOLDER, FILENAME)
print(f"Analyzing {filepath}...")

with open(filepath, 'r', encoding='utf-8') as handle:
    for raw_line in handle:
        if not raw_line.strip():
            continue  # ignore blank lines
        entry = json.loads(raw_line)
        total_entries += 1

        if 'data' in entry:
            data_lengths.append(len(entry['data']))

        if 'info' in entry and 'move_idx' in entry['info']:
            move_indices.append(entry['info']['move_idx'])

# Tally how often each value occurs.
data_length_dist = Counter(data_lengths)
move_idx_dist = Counter(move_indices)

print(f"\nTotal entries processed: {total_entries}")
print(f"Entries with 'data': {len(data_lengths)}")
print(f"Entries with 'move_idx': {len(move_indices)}")

_print_distribution(
    data_lengths,
    data_length_dist,
    heading="\n=== Data Length Distribution ===",
    column_header="Length -> Count (Percentage)",
    stats_heading="\nData length stats:",
    key_width=6,
)

_print_distribution(
    move_indices,
    move_idx_dist,
    heading="\n=== Move Index Distribution ===",
    column_header="Move Index -> Count (Percentage)",
    stats_heading="\nMove index stats:",
    key_width=10,
)

Analyzing cleaned_data/cooking_singlemove_9_29_25_150000.jsonl...

Total entries processed: 150000
Entries with 'data': 150000
Entries with 'move_idx': 150000

=== Data Length Distribution ===
Length -> Count (Percentage)
     2 -> 150000 (100.0%)

Data length stats:
  Min: 2
  Max: 2
  Mean: 2.0

=== Move Index Distribution ===
Move Index -> Count (Percentage)
         0 ->   5260 (  3.5%)
         1 ->   5360 (  3.6%)
         2 ->   5296 (  3.5%)
         3 ->   5299 (  3.5%)
         4 ->   5326 (  3.6%)
         5 ->   5304 (  3.5%)
         6 ->   5316 (  3.5%)
         7 ->   5251 (  3.5%)
         8 ->   5230 (  3.5%)
         9 ->   5189 (  3.5%)
        10 ->   5085 (  3.4%)
        11 ->   4965 (  3.3%)
        12 ->   4914 (  3.3%)
        13 ->   4861 (  3.2%)
        14 ->   4730 (  3.2%)
        15 ->   4548 (  3.0%)
        16 ->   4385 (  2.9%)
        17 ->   4264 (  2.8%)
        18 ->   4035 (  2.7%)
        19 ->   3789 (  2.5%)
        20 ->   3530 (  2.4%)
      

### Aggregate multiple files into one

In [9]:
# Folder containing the per-run JSONL files listed in FILES; the aggregation
# cell also writes the merged file into this folder.
# NOTE(review): the conversion and analysis cells read from "cleaned_data" --
# presumably the merged file is moved or cleaned there by a separate step; confirm.
DATA_FOLDER = "generated_data/archive"
# Per-run single-move files to merge, in the order they are read.
FILES = [
    "cooking_singlemove_20250928_231710_25000.jsonl",
    "cooking_singlemove_20250929_013858_25000.jsonl",
    "cooking_singlemove_20250929_025654_25000.jsonl",
    "cooking_singlemove_20250929_041338_25000.jsonl",
    "cooking_singlemove_20250929_053040_25000.jsonl",
    "cooking_singlemove_20250929_064903_25000.jsonl",
]
# Alternative input set for the multi-move dataset; swap with the list above
# to merge those files instead.
# FILES = [
#     "cooking_multimove_20250928_231710_10000.jsonl",
#     "cooking_multimove_20250929_013858_10000.jsonl",
#     "cooking_multimove_20250929_025654_10000.jsonl",
#     "cooking_multimove_20250929_041338_10000.jsonl",
#     "cooking_multimove_20250929_053040_10000.jsonl",
#     "cooking_multimove_20250929_064903_10000.jsonl",
# ]

In [10]:
# Base name of the merged file; the total record count is appended to it below.
output_filename = "cooking_singlemove_9_29_25"
all_data = []

# Pull every non-blank JSONL record from each input file into memory.
for filename in FILES:
    filepath = os.path.join(DATA_FOLDER, filename)
    print(f"Processing {filename}...")

    with open(filepath, 'r', encoding='utf-8') as source:
        all_data.extend(json.loads(row) for row in source if row.strip())

# The record count is part of the output file name, so it has to be known
# before the output file can be opened.
total_count = len(all_data)
output_filepath = os.path.join(DATA_FOLDER, f"{output_filename}_{total_count}.jsonl")

# Emit one JSON object per line.
print(f"Writing {total_count} entries to {output_filepath}...")
with open(output_filepath, 'w', encoding='utf-8') as sink:
    sink.writelines(json.dumps(record) + '\n' for record in all_data)

print(f"Successfully created {output_filepath} with {total_count} entries")

Processing cooking_singlemove_20250928_231710_25000.jsonl...
Processing cooking_singlemove_20250929_013858_25000.jsonl...
Processing cooking_singlemove_20250929_013858_25000.jsonl...
Processing cooking_singlemove_20250929_025654_25000.jsonl...
Processing cooking_singlemove_20250929_025654_25000.jsonl...
Processing cooking_singlemove_20250929_041338_25000.jsonl...
Processing cooking_singlemove_20250929_041338_25000.jsonl...
Processing cooking_singlemove_20250929_053040_25000.jsonl...
Processing cooking_singlemove_20250929_053040_25000.jsonl...
Processing cooking_singlemove_20250929_064903_25000.jsonl...
Processing cooking_singlemove_20250929_064903_25000.jsonl...
Writing 150000 entries to generated_data/archive/cooking_singlemove_9_29_25_150000.jsonl...
Writing 150000 entries to generated_data/archive/cooking_singlemove_9_29_25_150000.jsonl...
Successfully created generated_data/archive/cooking_singlemove_9_29_25_150000.jsonl with 150000 entries
Successfully created generated_data/archi