# RL seeding playground (longer runs)
Use this notebook to train a simple bandit over seeding strategies and observe how the best fitness (lower is better) changes across episodes.


## Imports and setup


In [None]:
import random
from pathlib import Path
import sys
import json
import matplotlib.pyplot as plt
import numpy as np

# Locate the repository root by walking upward from the working directory
# until a folder containing backend/src is found, then make that source
# folder importable.
_start_dir = Path.cwd().resolve()
PROJECT_ROOT = next(
    (
        folder
        for folder in (_start_dir, *_start_dir.parents)
        if (folder / "backend" / "src").exists()
    ),
    None,
)
if PROJECT_ROOT is None:
    raise RuntimeError("Run this notebook from inside the repo")

# Prepend (rather than append) so this checkout's sources win over any
# other copy that may already be on sys.path.
_src_entry = str(PROJECT_ROOT / "backend" / "src")
if _src_entry not in sys.path:
    sys.path.insert(0, _src_entry)

from ver0.evolver import evolve, EAConfig, mutate
from ver0.fitness import Weights
from ver0.vars import (
    DEFAULT_GRID_SIZE,
    POPULATION_SIZE,
    GENERATIONS,
    CROSSOVER_RATE,
    MUTATION_RATE,
    TOURNAMENT_K,
    ELITE_FRACTION,
    RANDOM_SEED,
    QUADRANT_WEIGHT,
    OVERLAP_WEIGHT,
    AREA_WEIGHT,
    COMPACTNESS_WEIGHT,
    ADJACENCY_WEIGHT,
    LOCATION_WEIGHT,
    SECTION_WEIGHT,
    DISPERSION_WEIGHT,
    ROOM_USAGE_WEIGHT,
    MASK_WEIGHT,
    BUDGET_WEIGHT,
    SECTION_BBOX_WEIGHT,
    RELATIONSHIP_WEIGHT,
    REALISM_WEIGHT,
    REALISM_THRESHOLD,
    NO_CHANGE_PENALTY,
    ROTATE_IMAGE_K,
)
from ver0.grid_encoder import encode_floorplan_to_grid
from ver0.rl_bandit import make_seed_bandit
from ver0.seeders import SEEDING_REGISTRY



## Parameters


In [None]:
# --- Training schedule ---
# Each episode is one EA run whose outcome is used to train the bandit.
EPISODES = 30  # increase for smoother learning on M3 Max 64GB
GENS = 100  # per-episode generations

# --- Grid encoding (taken from the project defaults) ---
GRID_SIZE, ROTATE_K = DEFAULT_GRID_SIZE, ROTATE_IMAGE_K

# --- Bandit exploration ---
EPSILON = 0.15  # initial exploration rate for bandit
EPSILON_DECAY = 0.98  # decay per run if you re-run the notebook

# --- Persistent files (bandit state and per-episode log share a folder) ---
_RL_DATA_DIR = PROJECT_ROOT / "backend" / "data" / "rl"
RL_STATE_PATH = _RL_DATA_DIR / "seed_bandit.json"
EPISODE_LOG_PATH = _RL_DATA_DIR / "episode_log.jsonl"

# --- Floor selection ---
# Fixed floor ids to reduce variance across episodes
USE_FIXED_FLOORS = True
FIXED_FLOOR_IDS = [101, 205, 309, 412, 523, 634, 745, 856, 967, 120, 230, 340]

# --- EA hyper-parameters ---
# NOTE: these rebind names imported from ver0.vars above, so the notebook
# values below take precedence over the project defaults from here on.
POPULATION_SIZE = 80
TOURNAMENT_K = 3
ELITE_FRACTION = 0.06
CROSSOVER_RATE = 0.7
MUTATION_RATE = 0.30

# Fitness-term weights for the EA, all taken unchanged from ver0.vars.
# Keys are the keyword arguments accepted by Weights.
_weight_by_term = {
    "quadrant": QUADRANT_WEIGHT,
    "overlap": OVERLAP_WEIGHT,
    "area": AREA_WEIGHT,
    "compactness": COMPACTNESS_WEIGHT,
    "adjacency": ADJACENCY_WEIGHT,
    "location": LOCATION_WEIGHT,
    "section": SECTION_WEIGHT,
    "dispersion": DISPERSION_WEIGHT,
    "room_usage": ROOM_USAGE_WEIGHT,
    "budget": BUDGET_WEIGHT,
    "section_bbox": SECTION_BBOX_WEIGHT,
    "mask": MASK_WEIGHT,
    "relationships": RELATIONSHIP_WEIGHT,
    "realism": REALISM_WEIGHT,
}
EA_WEIGHTS = Weights(**_weight_by_term)

# EA configuration shared by every episode. The first group comes from the
# parameters defined in this notebook; the second group is a set of literal
# tuning values whose exact semantics are defined by ver0.evolver.EAConfig
# (NOTE(review): confirm there before changing them).
_ea_settings = {
    "population_size": POPULATION_SIZE,
    "generations": GENS,
    "crossover_rate": CROSSOVER_RATE,
    "mutation_rate": MUTATION_RATE,
    "tournament_k": TOURNAMENT_K,
    "elite_fraction": ELITE_FRACTION,
    "random_seed": RANDOM_SEED,
    "weights": EA_WEIGHTS,
}
_ea_settings.update(
    stagnation_threshold=20,
    restart_fraction=0.30,
    mutation_boost=1.5,
    mutation_floor=0.05,
    mutation_ceiling=0.8,
    no_change_penalty=NO_CHANGE_PENALTY,
)
EA_CONFIG = EAConfig(**_ea_settings)



In [None]:
# Optional clean slate: flip DO_RESET to True to delete the saved bandit
# state and the episode log before training (useful after seeder/logic
# changes, when previously learned values no longer apply).
DO_RESET = False
if DO_RESET:
    for stale_file in (RL_STATE_PATH, EPISODE_LOG_PATH):
        if not stale_file.exists():
            print(f"No existing file: {stale_file}")
            continue
        stale_file.unlink()
        print(f"Deleted {stale_file}")



## Helper: load a sample


In [None]:
def load_sample(floor_id: int):
    """Encode one processed floor plan as a grid.

    The plan is read from ``backend/data/processed/floor_plans/floorNNN``
    under ``PROJECT_ROOT``, where ``NNN`` is ``floor_id`` zero-padded to at
    least three digits.

    Args:
        floor_id: Numeric id of the floor plan directory to load.

    Returns:
        The result of ``encode_floorplan_to_grid`` for that directory, using
        the notebook-level ``GRID_SIZE`` and ``ROTATE_K`` settings.
    """
    plans_root = PROJECT_ROOT / "backend" / "data" / "processed" / "floor_plans"
    folder_name = f"floor{floor_id:03d}"
    return encode_floorplan_to_grid(
        plans_root / folder_name,
        grid_size=GRID_SIZE,
        rotate_k=ROTATE_K,
    )



## Train bandit over seeding strategies


In [None]:
import concurrent.futures
import json
import os
from dataclasses import asdict

from ver0.rl_runner import run_episode as process_run_episode

def _floor_for_episode(episode_index):
    """Return the floor id to use for the given episode index."""
    if not (USE_FIXED_FLOORS and FIXED_FLOOR_IDS):
        return random.randint(1, 970)
    # Cycle through the fixed list so every episode count is supported.
    return FIXED_FLOOR_IDS[episode_index % len(FIXED_FLOOR_IDS)]


# Build the bandit from RL_STATE_PATH with a seeded RNG.
bandit = make_seed_bandit(RL_STATE_PATH, epsilon=EPSILON, rng=random.Random(RANDOM_SEED))
# Decay exploration once per notebook run, never dropping below 0.05.
# NOTE(review): presumably the decayed value is stored by bandit.save()
# and restored on the next run — confirm in ver0.rl_bandit.
bandit.epsilon = max(0.05, bandit.epsilon * EPSILON_DECAY)

# The worker processes receive plain, picklable values only.
project_root_str = str(PROJECT_ROOT)
cfg_dict = asdict(EA_CONFIG)

# Draw every episode's (floor, seeding strategy) pair up front with the
# current policy; the bandit is only updated after all episodes finish.
jobs = []
for ep in range(EPISODES):
    floor_id = _floor_for_episode(ep)
    seed_name, _ = bandit.select()
    job = (ep, floor_id, seed_name, cfg_dict, project_root_str, GRID_SIZE, ROTATE_K)
    jobs.append(job)

def _report_episode(rec):
    """Print the one-line progress summary for a finished episode record."""
    print(f"Episode {rec['episode']:02d} | floor {rec['floor_id']:03d} | seed {rec['seed']} | best {rec['best_fitness']:.2f} | {rec.get('duration_s',0):.1f}s")


# Use about half the logical CPUs. os.cpu_count() returns None when the
# count cannot be determined, so fall back to 1 instead of failing on
# ``None // 2``.
max_workers = max(1, (os.cpu_count() or 1) // 2)
print(f"Running {len(jobs)} episodes with up to {max_workers} workers...")

results = []
try:
    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as ex:
        # Executor.map yields results in the order of ``jobs``, so at any
        # point ``results`` holds exactly the first len(results) jobs.
        for rec in ex.map(process_run_episode, jobs, chunksize=1):
            _report_episode(rec)
            results.append(rec)
except Exception as e:
    # Best-effort fallback: run the remaining work in this process. If the
    # failure came from the episode itself, it is raised again here.
    print(f"Process pool failed ({e!r}); falling back to sequential execution.")
    # Resume after the episodes that already completed; re-running them
    # would put duplicate records in ``results``.
    for job in jobs[len(results):]:
        rec = process_run_episode(job)
        _report_episode(rec)
        results.append(rec)

# Feed every outcome back to the bandit. Fitness is minimised, so the
# reward is its negation (a lower best fitness gives a higher reward).
for outcome in results:
    bandit.update(outcome["seed"], -outcome["best_fitness"])
bandit.save()

# Keep the records in episode order for logging and plotting.
episodes = list(results)
episodes.sort(key=lambda item: item["episode"])

# Append this run's records to the episode log for further analysis.
# The file is JSON Lines: one JSON object per line, so every record must
# end with a newline or the records run together and cannot be parsed.
log_path = EPISODE_LOG_PATH
log_path.parent.mkdir(parents=True, exist_ok=True)
# Explicit encoding keeps the log readable regardless of the platform default.
with log_path.open("a", encoding="utf-8") as f:
    f.writelines(json.dumps(rec) + "\n" for rec in episodes)

def _decorate_and_show(x_label, y_label, heading, *, legend=False):
    """Label the current figure, add grid (and optional legend), then show it."""
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(heading)
    if legend:
        plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()


# Figure 1: best fitness reached in each episode.
episode_ids = [rec["episode"] for rec in episodes]
best_scores = [rec["best_fitness"] for rec in episodes]
plt.figure(figsize=(7,4))
plt.plot(episode_ids, best_scores, marker='o')
_decorate_and_show('Episode', 'Best fitness (lower better)', 'Bandit training: best fitness per episode')

# Figure 2: per-generation trajectory of the last episode, drawn only when
# that episode carries a non-empty history.
last_hist = episodes[-1]["history"] if episodes else None
if last_hist:
    plt.figure(figsize=(7,4))
    for series_name in ("best", "mean"):
        plt.plot(last_hist.get(series_name, []), label=series_name)
    _decorate_and_show('Generation', 'Fitness', 'Trajectory of last episode', legend=True)

