In [None]:
%%writefile env.py
"""
Waffle Environment - DENSE REWARD SHAPING for RL
"""

import numpy as np
import json
from typing import Tuple, List, Optional

class Waffle(object):
    """Waffle puzzle environment with dense reward shaping for RL.

    The board is stored as 21 letter codes (1 = 'a' ... 26 = 'z') in a flat
    layout: three full rows of five tiles interleaved with two partial rows
    of three tiles. An action ``a`` swaps the tiles at flat positions
    ``a // 21`` and ``a % 21``.
    """

    def __init__(self, puzzles_file: str = "/kaggle/input/waffle/waffles_shuffled_2k.jsonl", 
                 embeddings_file: str = "/kaggle/input/waffle/letter_embeddings.npy",
                 vocab_file: str = "/kaggle/input/waffle/words.txt") -> None:
        """Load puzzles, letter embeddings and vocabulary, then reset to puzzle 0.

        Args:
            puzzles_file: JSONL file with one puzzle per line.
            embeddings_file: ``.npy`` file with the letter embedding table.
            vocab_file: Text file with one word per line.
        """
        self.puzzles_file = puzzles_file
        self.puzzles = self._load_puzzles()
        self.embeddings = self._load_embeddings(embeddings_file)
        self.grid_size = 21
        self.current_puzzle = None
        self.current_state = None
        self.target_state = None
        self.fixed_indices = None

        # Flat indices of the three horizontal and three vertical words.
        self.row_words = [
            [0, 1, 2, 3, 4],
            [8, 9, 10, 11, 12],
            [16, 17, 18, 19, 20]
        ]
        self.col_words = [
            [0, 5, 8, 13, 16],
            [2, 6, 10, 14, 18],
            [4, 7, 12, 15, 20]
        ]

        self.vocab = self._load_vocab(vocab_file)
        self.reset(0)

    def _load_puzzles(self) -> List[dict]:
        """Read the JSONL puzzle file, ignoring blank lines."""
        puzzles = []
        with open(self.puzzles_file, "r", encoding="utf-8") as f:
            for line in f:
                # A trailing or stray empty line must not abort loading.
                if line.strip():
                    puzzles.append(json.loads(line))
        return puzzles

    def _load_vocab(self, vocab_file: str) -> set:
        """Return the set of lower-cased five-letter words in ``vocab_file``."""
        with open(vocab_file, 'r', encoding="utf-8") as f:
            words = (line.strip().lower() for line in f)
            vocab = {word for word in words if len(word) == 5}
        print(f"Loaded vocabulary with {len(vocab)} words")
        return vocab

    def _load_embeddings(self, embeddings_file: str) -> np.ndarray:
        """Load the embedding table as float32; fall back to random (27, 8) values."""
        try:
            embeddings = np.load(embeddings_file)
        except FileNotFoundError:
            print("Creating random embeddings")
            return np.random.randn(27, 8).astype(np.float32)
        print(f"Loaded embeddings: {embeddings.shape}")
        return embeddings.astype(np.float32)

    def _string_to_array(self, flat21: str) -> np.ndarray:
        """Encode a lowercase string as int32 letter codes ('a' -> 1)."""
        return np.array([ord(c) - ord('a') + 1 for c in flat21], dtype=np.int32)

    def _array_to_string(self, arr: np.ndarray) -> str:
        """Decode letter codes (1 -> 'a') back into a string."""
        return "".join([chr(int(x) + ord('a') - 1) for x in arr])

    def _get_word_for_position(self, pos: int) -> Optional[List[int]]:
        """Return the first word (rows are searched before columns) containing ``pos``."""
        words = self._get_words_for_position(pos)
        return words[0] if words else None

    def _get_words_for_position(self, pos: int) -> List[List[int]]:
        """Return every word containing ``pos``; intersection tiles yield two."""
        return [word for word in self.row_words + self.col_words if pos in word]

    def _calculate_colors_and_embeddings(self, state: np.ndarray) -> np.ndarray:
        """Build the (21, 10) per-tile feature matrix for ``state``.

        Channel 0 is 1.0 when the tile already holds its target letter
        (green). Channel 1 is 1.0 when the tile is not green but its letter
        occurs in the target of any word passing through the tile (yellow).
        Channels 2-9 hold the embedding row of the tile's letter.
        """
        features = np.zeros((21, 10), dtype=np.float32)

        for i in range(21):
            current_letter = state[i]

            if current_letter == self.target_state[i]:
                features[i, 0] = 1.0
            else:
                # Intersection tiles belong to a row AND a column, so both
                # target words have to be consulted.
                for word_positions in self._get_words_for_position(i):
                    if np.any(self.target_state[word_positions] == current_letter):
                        features[i, 1] = 1.0
                        break

            # Codes below 1 are mapped to embedding row 26.
            letter_idx = int(current_letter) - 1 if current_letter >= 1 else 26
            features[i, 2:10] = self.embeddings[letter_idx]

        return features

    def _count_correct_positions(self, state: np.ndarray) -> int:
        """Return how many tiles of ``state`` equal the target."""
        return int(np.sum(state == self.target_state))

    def _count_valid_words(self, state: np.ndarray) -> int:
        """Return how many of the six words in ``state`` are in the vocabulary."""
        valid_count = 0
        for indices in self.row_words + self.col_words:
            word_str = ''.join(chr(int(state[i]) + ord('a') - 1) for i in indices)
            if word_str in self.vocab:
                valid_count += 1
        return valid_count

    def _count_yellow_tiles(self, state: np.ndarray) -> int:
        """Count yellow letters"""
        features = self._calculate_colors_and_embeddings(state)
        return int(np.sum(features[:, 1]))

    def reset(self, index: int) -> None:
        """Start puzzle ``index``: load its states and reset move and reward bookkeeping."""
        self.current_puzzle = self.puzzles[index]
        self.target_state = self._string_to_array(self.current_puzzle["target_flat21"])
        self.current_state = self._string_to_array(self.current_puzzle["shuffled_flat21"])
        self.fixed_indices = set(self.current_puzzle["fixed_indices"])
        self.moves = 0
        self.max_moves = 50
        # Baselines against which the next reward is computed.
        self.prev_correct_count = self._count_correct_positions(self.current_state)
        self.prev_word_count = self._count_valid_words(self.current_state)
        self.prev_yellow_count = self._count_yellow_tiles(self.current_state)
        self.prev_features = self._calculate_colors_and_embeddings(self.current_state)

    def _update_state(self, action: int) -> None:
        """Swap the two tiles encoded by ``action`` and count the move."""
        pos1, pos2 = divmod(action, 21)
        temp = self.current_state[pos1]
        self.current_state[pos1] = self.current_state[pos2]
        self.current_state[pos2] = temp
        self.moves += 1

    def _get_reward(self, action: int) -> float:
        """
        Reward shaping: every improvement gets rewarded.

        Returns 20.0 when the board is solved. Otherwise starts from a -0.05
        step penalty and adds +2.0 / -1.0 per green tile gained / lost,
        +3.0 / -1.5 per valid word gained / lost and +0.2 per yellow tile
        gained; the total is clipped to [-5.0, 20.0]. ``action`` is unused.
        """
        reward = -0.05  # step penalty

        # Final reward
        if np.array_equal(self.current_state, self.target_state):
            return 20.0

        current_correct = self._count_correct_positions(self.current_state)
        current_words = self._count_valid_words(self.current_state)
        current_yellow = self._count_yellow_tiles(self.current_state)

        correct_change = current_correct - self.prev_correct_count
        word_change = current_words - self.prev_word_count
        yellow_change = current_yellow - self.prev_yellow_count

        # For Green
        if correct_change > 0:
            reward += 2.0 * correct_change
        elif correct_change < 0:
            reward += -1.0 * abs(correct_change)

        # Completing word
        if word_change > 0:
            reward += 3.0 * word_change
        elif word_change < 0:
            reward += -1.5 * abs(word_change)

        # Yellow tiles (losing yellows is not penalised)
        if yellow_change > 0:
            reward += 0.2 * yellow_change

        self.prev_correct_count = current_correct
        self.prev_word_count = current_words
        self.prev_yellow_count = current_yellow
        self.prev_features = self._calculate_colors_and_embeddings(self.current_state)

        return float(np.clip(reward, -5.0, 20.0))

    def _is_over(self) -> bool:
        """Return True when the puzzle is solved or the move budget is used up."""
        return self._is_win() or self.moves >= self.max_moves

    def _is_win(self) -> bool:
        """Return True when the current board equals the target."""
        return bool(np.array_equal(self.current_state, self.target_state))

    def observe(self) -> np.ndarray:
        """Return the current board as a (1, 5, 5, 10) float32 feature grid.

        The four cells that hold no tile carry zero colour channels and
        embedding row 26.
        """
        features = self._calculate_colors_and_embeddings(self.current_state)
        grid = np.zeros((5, 5, 10), dtype=np.float32)

        blank_feature_vector = np.zeros(10, dtype=np.float32)
        blank_feature_vector[2:10] = self.embeddings[26]
        for r, c in [(1, 1), (1, 3), (3, 1), (3, 3)]:
            grid[r, c, :] = blank_feature_vector

        # Flat tile index -> (row, column) in the 5x5 grid.
        position_to_grid = [
            (0,0), (0,1), (0,2), (0,3), (0,4),
            (1,0), (1,2), (1,4),
            (2,0), (2,1), (2,2), (2,3), (2,4),
            (3,0), (3,2), (3,4),
            (4,0), (4,1), (4,2), (4,3), (4,4)
        ]

        for flat_idx, (r, c) in enumerate(position_to_grid):
            grid[r, c, :] = features[flat_idx]

        return grid.reshape(1, 5, 5, 10)

    def act(self, action: int) -> Tuple[np.ndarray, float, bool, bool]:
        """Apply ``action`` and return ``(observation, reward, game_over, game_win)``."""
        self._update_state(action)
        reward = self._get_reward(action)
        game_over = self._is_over()
        game_win = self._is_win()
        return self.observe(), reward, game_over, game_win

    def get_valid_actions(self) -> List[int]:
        """Return every ordered swap between two distinct tiles that are not fixed."""
        fixed = self.fixed_indices if self.fixed_indices is not None else ()
        movable = [p for p in range(21) if p not in fixed]
        return [pos1 * 21 + pos2 for pos1 in movable for pos2 in movable if pos1 != pos2]

    def get_num_actions(self) -> int:
        """Return the size of the (unmasked) action space."""
        return 21 * 21

    def get_current_state_letters(self) -> np.ndarray:
        """Return the live array of current letter codes (not a copy)."""
        return self.current_state


Writing env.py


In [None]:
%%writefile experience_replay.py
"""
Prioritized Experience Replay - Focuses on important experiences
"""
import numpy as np
import tensorflow as tf
from tensorflow import keras
from typing import Tuple

class PrioritizedExperienceReplay(object):
    """Replay buffer that samples transitions in proportion to their priority.

    ``memory[i]`` is ``[sars, game_over, unmovable_positions]`` and
    ``priorities[i]`` is the matching priority already raised to ``alpha``.
    The oldest entry is dropped once ``max_memory`` is exceeded.
    """

    # Floor applied before exponentiation: a zero priority would get zero
    # sampling probability (which can make sampling without replacement
    # fail) and a negative one would turn complex under a fractional alpha.
    _MIN_PRIORITY = 1e-5

    def __init__(self, max_memory: int = 20000, discount: float = 0.95, alpha: float = 0.6) -> None:
        """Create an empty buffer.

        Args:
            max_memory: Maximum number of stored transitions.
            discount: Discount factor used for the bootstrapped targets.
            alpha: Exponent applied to priorities.
        """
        self.max_memory = max_memory
        self.memory = list()
        self.priorities = list()
        self.discount = discount
        self.alpha = alpha

    def add_experience(self, sars: list, game_over: bool, unmovable_positions = None, priority: float = 1.0) -> None:
        """Append one transition, evicting the oldest when the buffer is full.

        Args:
            sars: ``[previous_state, action, reward, current_state]``.
            game_over: Whether the transition ended the episode.
            unmovable_positions: Stored alongside the transition; not read
                by this class.
            priority: Initial priority; values below 1e-5 are raised to it.
        """
        self.memory.append([sars, game_over, unmovable_positions])
        self.priorities.append(max(float(priority), self._MIN_PRIORITY) ** self.alpha)

        if len(self.memory) > self.max_memory:
            del self.memory[0]
            del self.priorities[0]

    def get_qlearning_batch(
        self, model: "keras.Model", target_model: "keras.Model", batch_size: int = 64
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Sample a prioritised batch and build Double-DQN regression targets.

        The online ``model`` picks the best next action and ``target_model``
        evaluates it. The sampled entries' priorities are refreshed from the
        absolute TD error.

        Args:
            model: Online network; called on state batches.
            target_model: Target network; called on next-state batches.
            batch_size: Maximum number of transitions to sample.

        Returns:
            ``(previous_states, targets)`` where ``targets`` are the online
            predictions with the taken action's entry replaced by the target.

        Raises:
            ValueError: If the memory is empty.
        """
        memory_length = len(self.memory)
        if memory_length == 0:
            raise ValueError("Cannot sample a training batch from an empty replay memory")
        num_inputs = min(memory_length, batch_size)

        priorities = np.asarray(self.priorities, dtype=np.float64)
        probs = priorities / priorities.sum()
        ids = np.random.choice(memory_length, size=num_inputs, replace=False, p=probs)

        # Transpose the sampled [s, a, r, s'] records into four columns;
        # array-valued columns are concatenated, scalar columns are stacked.
        sars = list(zip(*[self.memory[id_][0] for id_ in ids]))
        previous_states, action_ts, rewards, current_states = (
            np.concatenate(e) if isinstance(e[0], np.ndarray) else np.stack(e)
            for e in sars
        )
        game_over = np.array([self.memory[id_][1] for id_ in ids], dtype=bool)

        previous_states_tensor = tf.constant(previous_states, dtype=tf.float32)
        current_states_tensor = tf.constant(current_states, dtype=tf.float32)

        targets = model(previous_states_tensor, training=False).numpy()

        # Double DQN: action selection by the online net, evaluation by the target net.
        next_q_main = model(current_states_tensor, training=False).numpy()
        best_actions = np.argmax(next_q_main, axis=1)

        next_q_target = target_model(current_states_tensor, training=False).numpy()
        rows = np.arange(num_inputs)
        Q_sa = next_q_target[rows, best_actions]

        actions = np.asarray(action_ts, dtype=np.int64)
        old_q = targets[rows, actions].astype(np.float64)
        # Terminal transitions do not bootstrap from the next state.
        new_q = (
            np.asarray(rewards, dtype=np.float64)
            + self.discount * Q_sa.astype(np.float64) * ~game_over
        )
        td_errors = np.abs(new_q - old_q)
        targets[rows, actions] = new_q

        for idx, td_error in zip(ids, td_errors):
            self.priorities[idx] = float((td_error + 1e-5) ** self.alpha)

        return previous_states, targets


Writing experience_replay.py


In [None]:
import tensorflow as tf
# Sanity check: report the TensorFlow version and how many GPUs it can see.
print(f"TensorFlow version: {tf.__version__}")
print(f"GPUs available: {len(tf.config.list_physical_devices('GPU'))}")

2025-11-22 20:17:17.281692: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763842637.508792      48 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763842637.578951      48 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

TensorFlow version: 2.18.0
GPUs available: 2


In [None]:
%%writefile model_definition.py
"""
Simplified stable model - gradient-norm clipping only (no other clipping)
"""
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

def define_model(input_shape, hidden_size, num_actions, learning_rate, conv_filters=32):
    """Build and compile the Q-network.

    Two 3x3 same-padded ReLU convolutions feed a flattened two-layer ReLU
    MLP (``hidden_size`` then ``hidden_size // 2`` units) and a linear head
    with one output per action. The model is compiled with MSE loss and an
    Adam optimizer whose gradients are clipped to norm 0.5.
    """
    inputs = layers.Input(shape=input_shape)

    # Convolutional trunk.
    features = inputs
    for _ in range(2):
        features = layers.Conv2D(
            conv_filters, (3, 3), activation='relu', padding='same'
        )(features)

    # Fully connected layers on the flattened feature map.
    features = layers.Flatten()(features)
    for units in (hidden_size, hidden_size // 2):
        features = layers.Dense(units, activation='relu')(features)

    q_head = layers.Dense(num_actions, activation='linear')(features)
    model = keras.Model(inputs=inputs, outputs=q_head)

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate, clipnorm=0.5),
        loss='mse',
    )
    return model


Overwriting model_definition.py


In [None]:
"""
GUIDED EXPLORATION DQN - Helps agent discover good actions
"""
import os
import json
import numpy as np
import tensorflow as tf
import logging
import sys

# Report the TensorFlow build and the number of GPUs it can see.
print(f"TensorFlow version: {tf.__version__}")
gpu_devices = tf.config.list_physical_devices('GPU')
print(f"GPUs available: {len(gpu_devices)}")

# Input files for the environment.
dataset_path = "/kaggle/input/waffle/waffles_shuffled_2k.jsonl"
embeddings_path = "/kaggle/input/waffle/letter_embeddings.npy"
vocab_path = "/kaggle/input/waffle/words.txt"

# Drop cached copies of the freshly written modules so the imports below
# pick up the current files.
for mod in ('env', 'experience_replay', 'model_definition'):
    sys.modules.pop(mod, None)

from env import Waffle
from experience_replay import PrioritizedExperienceReplay
from model_definition import define_model

def guided_action_selection(env, epsilon, q_values, valid_actions, num_actions):
    """
    GUIDED EXPLORATION: Mix random, greedy-lookahead, and Q-learning

    With probability ``1 - epsilon`` the valid action with the highest
    Q-value is returned. Otherwise the function explores: half of the time
    with a uniformly random valid action, and half of the time with a
    one-step look-ahead over up to 20 distinct valid swaps, scored as
    ``2 * correct tiles + 3 * valid words``.

    Args:
        env: Environment providing ``current_state``,
            ``_count_correct_positions`` and ``_count_valid_words``.
        epsilon: Exploration probability.
        q_values: 1-D array of Q-values indexed by action id.
        valid_actions: Action ids (``pos1 * 21 + pos2``) that may be taken.
        num_actions: Size of the full action space.

    Returns:
        The chosen action id as a Python ``int``.
    """
    # Exploit: arg-max over the valid actions only.
    if np.random.rand() >= epsilon:
        masked_q = np.full(num_actions, -np.inf)
        masked_q[valid_actions] = q_values[valid_actions]
        return int(np.argmax(masked_q))

    # Explore, variant 1: uniformly random valid swap.
    if np.random.rand() < 0.5:
        return int(np.random.choice(valid_actions))

    # Explore, variant 2: greedy look-ahead. Candidates are drawn without
    # replacement so no swap is evaluated twice.
    num_candidates = min(20, len(valid_actions))
    candidates = np.random.choice(valid_actions, size=num_candidates, replace=False)

    # Work on a copy so the environment is never left half-swapped.
    trial_state = env.current_state.copy()
    best_action = None
    best_score = -np.inf

    for candidate in candidates:
        action = int(candidate)
        pos1, pos2 = divmod(action, 21)

        trial_state[pos1], trial_state[pos2] = trial_state[pos2], trial_state[pos1]
        score = (env._count_correct_positions(trial_state) * 2 +
                 env._count_valid_words(trial_state) * 3)
        # Undo the swap before trying the next candidate.
        trial_state[pos1], trial_state[pos2] = trial_state[pos2], trial_state[pos1]

        if score > best_score:
            best_score = score
            best_action = action

    if best_action is None:
        return int(np.random.choice(valid_actions))
    return best_action

def _bin_puzzles_by_initial_correctness(env):
    """Group puzzle indices by how many tiles start in their target position."""
    bins = {'veryeasy': [], 'easy': [], 'medium': [], 'hard': []}
    for idx, p in enumerate(env.puzzles):
        state = env._string_to_array(p["shuffled_flat21"])
        target = env._string_to_array(p["target_flat21"])
        correct = int(np.sum(state == target))
        if correct >= 12:
            bins['veryeasy'].append(idx)
        elif correct >= 9:
            bins['easy'].append(idx)
        elif correct >= 7:
            bins['medium'].append(idx)
        else:
            bins['hard'].append(idx)
    return bins


def _movable_swap_actions(fixed_positions, num_positions=21):
    """Return every ordered swap id between two distinct non-fixed tiles."""
    movable = [p for p in range(num_positions) if p not in fixed_positions]
    return [pos1 * num_positions + pos2
            for pos1 in movable for pos2 in movable if pos1 != pos2]


def train_model(model, target_model, env, epochs, experience_replay, 
               epsilon_start, epsilon_end, epsilon_decay, batch_size, 
               puzzles_per_epoch, target_update_freq):
    """Train ``model`` with guided exploration on a difficulty curriculum.

    Each epoch plays ``puzzles_per_epoch`` puzzles sampled from pools that
    move from easier to harder as training progresses. Transitions go into
    ``experience_replay``; every third step of an episode a batch is drawn
    and fitted, and ``target_model`` is synchronised every
    ``target_update_freq`` training steps. Epsilon is multiplied by
    ``epsilon_decay`` after each epoch, never dropping below ``epsilon_end``.
    Weights are checkpointed every 50 epochs.

    Returns:
        ``model``; returned early if a batch loss is NaN or exceeds 5000.
    """
    logging.info("Starting GUIDED EXPLORATION training...")
    win_count = 0
    total_rewards = []
    epsilon = epsilon_start
    num_actions = env.get_num_actions()
    training_steps = 0

    bins = _bin_puzzles_by_initial_correctness(env)

    print(f"\n{'='*70}")
    print(f"GUIDED EXPLORATION + ULTRA-DENSE REWARDS")
    print(f"{'='*70}")
    print(f"Very Easy (≥12): {len(bins['veryeasy'])} puzzles")
    print(f"Easy (9-11):     {len(bins['easy'])} puzzles")
    print(f"Medium (7-8):    {len(bins['medium'])} puzzles")
    print(f"Hard (<7):       {len(bins['hard'])} puzzles")
    print(f"{'='*70}\n")

    def sample_puzzles(epoch):
        """Draw this epoch's puzzle indices from the curriculum stage's pool."""
        fraction = epoch / epochs
        if fraction < 0.60:
            pool = bins['veryeasy'] + bins['easy']
        elif fraction < 0.85:
            pool = bins['easy'] + bins['medium']
        else:
            pool = bins['easy'] + bins['medium'] + bins['hard']
        if not pool:
            # np.random.choice cannot sample from an empty pool; fall back
            # to the whole data set.
            pool = list(range(len(env.puzzles)))
        return np.random.choice(pool, size=puzzles_per_epoch, replace=True)

    @tf.function
    def predict_q(state):
        return model(state, training=False)

    for epoch in range(epochs):
        loss = 0.0  # sum of batch losses over the epoch
        final_correct = []
        final_words = []
        epoch_wins = 0

        puzzle_indices = sample_puzzles(epoch)

        for puzzle_idx in puzzle_indices:
            env.reset(puzzle_idx)

            # Nothing to learn from a puzzle that starts solved.
            if np.array_equal(env.current_state, env.target_state):
                continue

            current_state = env.observe()
            game_over = False
            game_win = False
            episode_reward = 0
            step_count = 0

            fixed_positions = env.fixed_indices
            # The fixed tiles do not change during an episode, so the set of
            # legal swaps is computed once instead of on every step.
            valid_actions = _movable_swap_actions(fixed_positions)

            # Stop after `patience` consecutive steps without a new best
            # correct-tile count.
            patience = 60 
            no_progress_steps = 0
            best_correct = env._count_correct_positions(env.current_state)

            while not game_over:
                current_correct = env._count_correct_positions(env.current_state)
                if current_correct > best_correct:
                    no_progress_steps = 0
                    best_correct = current_correct
                else:
                    no_progress_steps += 1

                if no_progress_steps >= patience:
                    game_over = True
                    break

                previous_state = current_state

                q = predict_q(tf.constant(previous_state, dtype=tf.float32)).numpy()[0]
                action = guided_action_selection(env, epsilon, q, valid_actions, num_actions)

                current_state, reward, game_over, game_win = env.act(action)
                episode_reward += reward
                step_count += 1

                # Transitions with large |reward| start with a higher priority.
                priority = max(1.0, abs(reward))
                experience_replay.add_experience(
                    [previous_state, int(action), reward, current_state], 
                    game_over,
                    fixed_positions,
                    priority=priority
                )

                if len(experience_replay.memory) >= batch_size and step_count % 3 == 0:
                    inputs, targets = experience_replay.get_qlearning_batch(
                        model, target_model, batch_size=batch_size
                    )
                    batch_loss = model.train_on_batch(inputs, targets)

                    if np.isnan(batch_loss) or batch_loss > 5000:
                        print(f"  ⚠️  Training unstable, stopping")
                        return model

                    loss += batch_loss
                    training_steps += 1

                    if training_steps % target_update_freq == 0:
                        target_model.set_weights(model.get_weights())

            total_rewards.append(episode_reward)
            final_correct.append(env._count_correct_positions(env.current_state))
            final_words.append(env._count_valid_words(env.current_state))

            if game_win:
                epoch_wins += 1
                print(f"  ★ WIN! Puzzle {puzzle_idx} in {step_count} moves")

        win_count += epoch_wins
        epsilon = max(epsilon_end, epsilon * epsilon_decay)

        if (epoch + 1) % 5 == 0 or epoch < 10:
            recent = min(150, len(total_rewards))
            # Guard the means: every sampled puzzle may have been skipped.
            avg_reward = np.mean(total_rewards[-recent:]) if total_rewards else 0.0
            win_rate = win_count / ((epoch + 1) * puzzles_per_epoch) * 100
            avg_correct = np.mean(final_correct[-recent:]) if final_correct else 0.0
            avg_words = np.mean(final_words[-recent:]) if final_words else 0.0

            print(f"Ep {epoch+1:4d}/{epochs} | Loss: {loss:6.1f} | ε: {epsilon:.3f} | "
                  f"Wins: {win_count:3d} ({win_rate:4.1f}%) | R: {avg_reward:5.1f} | "
                  f"Correct: {avg_correct:.1f}/21 | Words: {avg_words:.1f}/6")

        if (epoch + 1) % 50 == 0:
            checkpoint_dir = '/kaggle/working/model_checkpoints'
            os.makedirs(checkpoint_dir, exist_ok=True)
            model.save_weights(f'{checkpoint_dir}/model_epoch_{epoch+1}.weights.h5')
            print(f"  ✓ Saved checkpoint")

    return model

# ---- Hyperparameters ----
EPOCHS = 250 
EPSILON_START = 0.95 
EPSILON_END = 0.3  
TARGET_EPSILON_EPOCH = 200
# Per-epoch multiplicative decay chosen so that epsilon falls from
# EPSILON_START to EPSILON_END after TARGET_EPSILON_EPOCH epochs.
EPSILON_DECAY = (EPSILON_END / EPSILON_START) ** (1 / TARGET_EPSILON_EPOCH)
MAX_MEMORY = 20000
HIDDEN_SIZE = 64 
CONV_FILTERS = 16
BATCH_SIZE = 32
DISCOUNT = 0.95
LEARNING_RATE = 0.0005
PUZZLES_PER_EPOCH = 100
# Number of training steps between target-network synchronisations.
TARGET_UPDATE_FREQUENCY = 500

print(f"\n{'='*70}")
print(f"HYPERPARAMETERS - GUIDED EXPLORATION")
print(f"{'='*70}")
print(f"Epochs: {EPOCHS} | LR: {LEARNING_RATE} | Discount: {DISCOUNT}")
print(f"Epsilon: {EPSILON_START}→{EPSILON_END} | Hidden: {HIDDEN_SIZE}")
print(f"Exploration: 50% random + 50% greedy lookahead")
print(f"Prioritized replay: HIGH reward experiences trained more")
print(f"{'='*70}\n")

# ---- Environment ----
env = Waffle(puzzles_file=dataset_path, embeddings_file=embeddings_path, vocab_file=vocab_path)
num_actions = env.get_num_actions()
# Matches the trailing dimensions of the array returned by env.observe().
input_shape = (5, 5, 10)

print(f"✓ Environment loaded ({len(env.puzzles)} puzzles)\n")

# ---- Online and target networks (target starts as a copy of the online net) ----
print("Building models...")
model = define_model(input_shape, HIDDEN_SIZE, num_actions, LEARNING_RATE, CONV_FILTERS)
target_model = define_model(input_shape, HIDDEN_SIZE, num_actions, LEARNING_RATE, CONV_FILTERS)
target_model.set_weights(model.get_weights())
print("✓ Models built\n")

exp_replay = PrioritizedExperienceReplay(max_memory=MAX_MEMORY, discount=DISCOUNT)
logging.basicConfig(level=logging.INFO)

print("="*70)
print("STARTING GUIDED EXPLORATION TRAINING")
print("="*70)

# ---- Training ----
trained_model = train_model(
    model, target_model, env, EPOCHS, exp_replay,
    EPSILON_START, EPSILON_END, EPSILON_DECAY,
    BATCH_SIZE, PUZZLES_PER_EPOCH, TARGET_UPDATE_FREQUENCY
)

# ---- Persist the final weights ----
output_dir = '/kaggle/working/model'
os.makedirs(output_dir, exist_ok=True)
trained_model.save_weights(f'{output_dir}/model.weights.h5', overwrite=True)
print(f"\n✓ Saved to {output_dir}/")


INFO:root:Starting GUIDED EXPLORATION training...


TensorFlow version: 2.18.0
GPUs available: 2

HYPERPARAMETERS - GUIDED EXPLORATION
Epochs: 250 | LR: 0.0005 | Discount: 0.95
Epsilon: 0.95→0.3 | Hidden: 64
Exploration: 50% random + 50% greedy lookahead
Prioritized replay: HIGH reward experiences trained more

Loaded embeddings: (27, 8)
Loaded vocabulary with 5757 words
✓ Environment loaded (2000 puzzles)

Building models...
✓ Models built

STARTING GUIDED EXPLORATION TRAINING

GUIDED EXPLORATION + ULTRA-DENSE REWARDS
Very Easy (≥12): 5 puzzles
Easy (9-11):     260 puzzles
Medium (7-8):    964 puzzles
Hard (<7):       771 puzzles



I0000 00:00:1763842738.593444      92 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1763842740.810835      48 service.cc:148] XLA service 0x21419810 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1763842740.811948      48 service.cc:156]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1763842740.811973      48 service.cc:156]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1763842742.852549      48 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


  ★ WIN! Puzzle 1971 in 39 moves
  ★ WIN! Puzzle 1867 in 37 moves
Ep    1/250 | Loss: 19012.4 | ε: 0.945 | Wins:   2 ( 2.0%) | R:  42.6 | Correct: 12.9/21 | Words: 1.3/6
  ★ WIN! Puzzle 606 in 44 moves
  ★ WIN! Puzzle 1080 in 46 moves
  ★ WIN! Puzzle 494 in 45 moves
  ★ WIN! Puzzle 973 in 27 moves
  ★ WIN! Puzzle 1213 in 22 moves
Ep    2/250 | Loss: 5287.0 | ε: 0.939 | Wins:   7 ( 3.5%) | R:  42.7 | Correct: 12.9/21 | Words: 1.3/6
  ★ WIN! Puzzle 1 in 47 moves
  ★ WIN! Puzzle 296 in 12 moves
  ★ WIN! Puzzle 1511 in 50 moves
  ★ WIN! Puzzle 887 in 38 moves
  ★ WIN! Puzzle 1511 in 34 moves
  ★ WIN! Puzzle 1252 in 47 moves
Ep    3/250 | Loss: 3260.9 | ε: 0.934 | Wins:  13 ( 4.3%) | R:  42.4 | Correct: 12.6/21 | Words: 1.2/6
  ★ WIN! Puzzle 1018 in 12 moves
  ★ WIN! Puzzle 1262 in 47 moves
  ★ WIN! Puzzle 954 in 36 moves
  ★ WIN! Puzzle 121 in 37 moves
Ep    4/250 | Loss: 2522.6 | ε: 0.928 | Wins:  17 ( 4.2%) | R:  42.5 | Correct: 12.8/21 | Words: 1.2/6
  ★ WIN! Puzzle 793 in 21 moves
  ★ 