In [1]:
%%writefile env.py
"""
Waffle Game Environment Definition Class - With Letter Embeddings (5×5×10 input)
"""

import numpy as np
import json
from typing import Tuple, List, Optional

class Waffle(object):
    def __init__(self, puzzles_file: str = "waffles_shuffled_2k.jsonl", 
                 embeddings_file: str = "letter_embeddings.npy",
                 vocab_file: str = "/kaggle/input/waffle/words.txt") -> None:
        self.puzzles_file = puzzles_file
        self.puzzles = self._load_puzzles()
        self.embeddings = self._load_embeddings(embeddings_file)
        self.grid_size = 21
        self.current_puzzle = None
        self.current_state = None
        self.target_state = None
        self.fixed_indices = None
        
        self.row_words = [
            [0, 1, 2, 3, 4],
            [8, 9, 10, 11, 12],
            [16, 17, 18, 19, 20]
        ]
        self.col_words = [
            [0, 5, 8, 13, 16],
            [2, 6, 10, 14, 18],
            [4, 7, 12, 15, 20]
        ]

        # Load vocabulary and precompute n-grams
        self.vocab = self._load_vocab(vocab_file)
        self.bigram_set = set()
        self.trigram_set = set()
        for word in self.vocab:
            self.bigram_set.update([word[i:i+2] for i in range(len(word)-1)])
            self.trigram_set.update([word[i:i+3] for i in range(len(word)-2)])

        self.reset(0)

    def _load_puzzles(self) -> List[dict]:
        puzzles = []
        with open(self.puzzles_file, "r") as f:
            for line in f:
                puzzles.append(json.loads(line))
        return puzzles

    def _load_vocab(self, vocab_file: str) -> set:
        vocab = set()
        with open(vocab_file, 'r') as f:
            for line in f:
                word = line.strip().lower()
                if len(word) == 5:
                    vocab.add(word)
        print(f"Loaded vocabulary with {len(vocab)} words")
        return vocab

    def _load_embeddings(self, embeddings_file: str) -> np.ndarray:
        try:
            embeddings = np.load(embeddings_file)
            print(f"Loaded embeddings: {embeddings.shape}")
            return embeddings.astype(np.float32)
        except FileNotFoundError:
            print(f"Creating random embeddings")
            return np.random.randn(27, 8).astype(np.float32)

    def _string_to_array(self, flat21: str) -> np.ndarray:
        return np.array([ord(c) - ord('a') + 1 for c in flat21], dtype=np.int32)

    def _array_to_string(self, arr: np.ndarray) -> str:
        return "".join([chr(int(x) + ord('a') - 1) for x in arr])

    def _get_word_for_position(self, pos: int) -> Optional[List[int]]:
        for word in self.row_words:
            if pos in word:
                return word
        for word in self.col_words:
            if pos in word:
                return word
        return None

    def _calculate_colors_and_embeddings(self, state: np.ndarray) -> np.ndarray:
        features = np.zeros((21, 10), dtype=np.float32)
        
        for i in range(21):
            current_letter = state[i]
            target_letter = self.target_state[i]
            
            if current_letter == target_letter:
                features[i, 0] = 1.0  # Green channel
            else:
                word_positions = self._get_word_for_position(i)
                if word_positions:
                    target_word_letters = [self.target_state[p] for p in word_positions]
                    if current_letter in target_word_letters:
                        features[i, 1] = 1.0  # Yellow channel
            
            letter_idx = int(current_letter) - 1 if current_letter >= 1 else 26
            features[i, 2:10] = self.embeddings[letter_idx]
        
        return features

    def _count_correct_positions(self, state: np.ndarray) -> int:
        return np.sum(state == self.target_state)
        
    def reset(self, index: int) -> None:
        self.current_puzzle = self.puzzles[index]
        self.target_state = self._string_to_array(self.current_puzzle["target_flat21"])
        self.current_state = self._string_to_array(self.current_puzzle["shuffled_flat21"])
        self.fixed_indices = set(self.current_puzzle["fixed_indices"])
        self.moves = 0
        self.max_moves = 50
        self.prev_correct_count = self._count_correct_positions(self.current_state)
        self.prev_features = self._calculate_colors_and_embeddings(self.current_state)

    def _update_state(self, action: int) -> None:
        pos1 = action // 21
        pos2 = action % 21
        temp = self.current_state[pos1]
        self.current_state[pos1] = self.current_state[pos2]
        self.current_state[pos2] = temp
        self.moves += 1

    def _get_reward(self, action: int) -> float:
        pos1 = action // 21
        pos2 = action % 21
        reward = -0.1
        
        if np.array_equal(self.current_state, self.target_state):
            return 1000.0 + reward
        
        current_correct = self._count_correct_positions(self.current_state)
        correct_change = current_correct - self.prev_correct_count
        current_features = self._calculate_colors_and_embeddings(self.current_state)
        
        if self.current_state[pos1] == self.current_state[pos2]:
            reward += -5
        
        if correct_change > 0:
            reward += 3 * correct_change
        elif correct_change < 0:
            reward += -3 * abs(correct_change)
        
        for pos in [pos1, pos2]:
            prev_green = self.prev_features[pos, 0]
            prev_yellow = self.prev_features[pos, 1]
            curr_green = current_features[pos, 0]
            curr_yellow = current_features[pos, 1]
            
            if prev_green == 0 and prev_yellow == 0 and curr_yellow == 1.0:
                reward += 0.5
            elif prev_yellow == 1.0 and curr_green == 0 and curr_yellow == 0:
                reward += -0.5
            # elif prev_yellow == 1.0 and curr_green == 1.0:
            #     reward += 3

        current_letters = [chr(c + ord('a') - 1) for c in self.current_state]
        
        def is_word(s):
            return s in self.vocab
        
        def count_valid_ngrams(s, ngram_set, n):
            return sum(1 for i in range(len(s) - n + 1) if s[i:i+n] in ngram_set)
        
        for indices in self.row_words + self.col_words:
            word_str = ''.join([current_letters[i] for i in indices])
            if is_word(word_str):
                reward += 5.0
            reward += 0.5 * count_valid_ngrams(word_str, self.bigram_set, 2)
            reward += 0.2 * count_valid_ngrams(word_str, self.trigram_set, 3)
        
        self.prev_correct_count = current_correct
        self.prev_features = current_features
        return reward

    def _is_over(self) -> bool:
        solved = np.array_equal(self.current_state, self.target_state)
        max_moves_reached = self.moves >= self.max_moves
        return solved or max_moves_reached
    
    def _is_win(self) -> bool:
        return np.array_equal(self.current_state, self.target_state)

    def observe(self) -> np.ndarray:
        features = self._calculate_colors_and_embeddings(self.current_state)
        grid = np.zeros((5, 5, 10), dtype=np.float32)

        blank_embedding = self.embeddings[26]
        blank_feature_vector = np.zeros(10, dtype=np.float32)
        blank_feature_vector[2:10] = blank_embedding
        blank_positions = [(1,1), (1,3), (3,1), (3,3)]
        for r, c in blank_positions:
            grid[r, c, :] = blank_feature_vector
        
        position_to_grid = [
            (0,0), (0,1), (0,2), (0,3), (0,4),
            (1,0), (1,2), (1,4),
            (2,0), (2,1), (2,2), (2,3), (2,4),
            (3,0), (3,2), (3,4),
            (4,0), (4,1), (4,2), (4,3), (4,4)
        ]
        
        for flat_idx in range(21):
            r, c = position_to_grid[flat_idx]
            grid[r, c, :] = features[flat_idx]
        
        return grid.reshape(1, 5, 5, 10)

    def act(self, action: int) -> Tuple[np.ndarray, float, bool, bool]:
        self._update_state(action)
        reward = self._get_reward(action)
        game_over = self._is_over()
        game_win = self._is_win()
        return self.observe(), reward, game_over, game_win

    def get_valid_actions(self) -> List[int]:
        valid_actions = []
        for pos1 in range(21):
            for pos2 in range(21):
                if pos1 != pos2:
                    action = pos1 * 21 + pos2
                    valid_actions.append(action)
        return valid_actions

    def get_num_actions(self) -> int:
        return 21 * 21

    def get_current_state_letters(self) -> np.ndarray:
        return self.current_state

Writing env.py


In [2]:
%%writefile experience_replay.py
"""
Waffle Game Experience Replay Class - Double DQN without Action Masking
"""
import numpy as np
import tensorflow as tf
from tensorflow import keras
from typing import Tuple

class ExperienceReplay(object):
    def __init__(self, max_memory: int = 10000, discount: float = 0.99) -> None:
        self.max_memory = max_memory
        self.memory = list()
        self.discount = discount

    def add_experience(self, sars: list, game_over: bool, unmovable_positions = None) -> None:
        """
        Store experience.
        
        Args:
            sars: [state, action, reward, next_state]
            game_over: Whether episode ended
            unmovable_positions: Not used now
        """
        self.memory.append([sars, game_over, unmovable_positions])
        if len(self.memory) > self.max_memory:
            del self.memory[0]

    def get_qlearning_batch(
        self, model: keras.Model, target_model: keras.Model, batch_size: int = 128
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Generate training batch for Double DQN without action masking.
        
        Args:
            model: Main Q-network (for action selection)
            target_model: Target Q-network (for action evaluation)
            batch_size: Number of experiences to sample
            
        Returns:
            (states, target_q_values) for training
        """
        memory_length = len(self.memory)
        num_inputs = min(memory_length, batch_size)
        ids = np.random.choice(memory_length, size=num_inputs, replace=False)
        
        # Unpack experiences
        sars = list(zip(*[self.memory[id_][0] for id_ in ids]))
        previous_states, action_ts, rewards, current_states = (
            np.concatenate(e) if isinstance(e[0], np.ndarray) else np.stack(e)
            for e in sars
        )
        game_over = np.array([self.memory[id_][1] for id_ in ids], dtype=bool)
        
        # Convert to tensors
        previous_states_tensor = tf.constant(previous_states, dtype=tf.float32)
        current_states_tensor = tf.constant(current_states, dtype=tf.float32)
        
        # Get current Q-values for previous states
        targets = model(previous_states_tensor, training=False).numpy()
        
        # DOUBLE DQN:
        # Step 1: Main network selects best action for next state (no masking)
        next_q_main = model(current_states_tensor, training=False).numpy()
        best_actions = np.argmax(next_q_main, axis=1)
        
        # Step 2: Target network evaluates the chosen actions
        next_q_target = target_model(current_states_tensor, training=False).numpy()
        Q_sa = next_q_target[np.arange(num_inputs), best_actions]
        
        # Bellman update
        targets[np.arange(num_inputs), action_ts] = (
            rewards + self.discount * Q_sa * (~game_over)
        )
        
        return previous_states, targets


Writing experience_replay.py


In [3]:
# Usually not needed, TensorFlow is pre-installed on Kaggle
import tensorflow as tf
print(f"TensorFlow version: {tf.__version__}")
print(f"GPUs available: {len(tf.config.list_physical_devices('GPU'))}")

2025-11-22 18:20:58.484609: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763835658.714007      48 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763835658.784462      48 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

TensorFlow version: 2.18.0
GPUs available: 2


In [4]:
import os
import json
import numpy as np
import logging

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout
from tensorflow.keras.optimizers import Adam

from experience_replay import ExperienceReplay
from env import Waffle

# GPU Configuration
print("="*60)
print("SYSTEM CHECK")
print("="*60)
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"✓ {len(gpus)} GPU(s) configured")
    except RuntimeError as e:
        print(f"GPU warning (ignorable): {e}")
else:
    print("No GPU")
print("="*60)

# Find dataset
dataset_path = None
embeddings_path = '/kaggle/input/letter_embeddings.npy'

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        if filename == 'waffles_shuffled_2k.jsonl':
            dataset_path = os.path.join(dirname, filename)
            print(f"\n✓ Found dataset: {dataset_path}")
        if filename == 'letter_embeddings.npy':
            embeddings_path = os.path.join(dirname, filename)
            print(f"✓ Found embeddings: {embeddings_path}")

if not dataset_path:
    raise FileNotFoundError("waffles_shuffled_2k.jsonl not found")

# Model Definition - Conv2D for 5×5 grid
def define_model(input_shape, hidden_size, num_actions, 
                learning_rate=0.001, conv_filters=64, loss="mse"):
    policy = tf.keras.mixed_precision.Policy('mixed_float16')
    tf.keras.mixed_precision.set_global_policy(policy)
    
    model = Sequential()
    model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', 
                     padding='same', input_shape=input_shape, dtype='float32'))
    model.add(Conv2D(64, kernel_size=(3, 3), activation='relu', padding='same'))
    model.add(Flatten())
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(num_actions, dtype='float32'))
    model.compile(Adam(learning_rate=learning_rate), loss)
    return model

def train_model(model, target_model, env, epochs, experience_replay, 
               epsilon_start, epsilon_end, epsilon_decay, batch_size, 
               puzzles_per_epoch, target_update_freq):
    import numpy as np
    import logging
    logging.info("Starting training...")
    win_count = 0
    total_rewards = []
    epsilon = epsilon_start
    num_actions = env.get_num_actions()
    training_steps = 0
    
    # Precompute initial correctness for all puzzles
    initial_correctness_list = []
    for idx, p in enumerate(env.puzzles):
        state = env._string_to_array(p["shuffled_flat21"])
        target = env._string_to_array(p["target_flat21"])
        correct = np.sum(state == target)
        initial_correctness_list.append((idx, correct))

    bins = {
        'easy':   [idx for idx, c in initial_correctness_list if c >= 9],
        'medium': [idx for idx, c in initial_correctness_list if 7 <= c < 9],
        'hard':   [idx for idx, c in initial_correctness_list if 5 <= c < 7],
        'harder': [idx for idx, c in initial_correctness_list if c < 5]
    }

    def sample_puzzles(epoch):
        # Divide training epochs into four phases
        fraction = epoch / epochs
        if fraction < 0.25:
            return np.random.choice(bins['easy'], size=puzzles_per_epoch, replace=True)
        elif fraction < 0.50:
            mixed = bins['easy'] + bins['medium']
            return np.random.choice(mixed, size=puzzles_per_epoch, replace=True)
        elif fraction < 0.75:
            mixed = bins['medium'] + bins['hard']
            return np.random.choice(mixed, size=puzzles_per_epoch, replace=True)
        else:
            mixed = bins['hard'] + bins['harder']
            return np.random.choice(mixed, size=puzzles_per_epoch, replace=True)
    
    @tf.function
    def predict_q(state):
        return model(state, training=False)
    
    for epoch in range(epochs):
        loss = 0.0
        final_correct_positions = []
        num_moves = []
        epoch_wins = 0
        
        puzzle_indices = sample_puzzles(epoch)
        
        for puzzle_idx in puzzle_indices:
            env.reset(puzzle_idx)
            current_state = env.observe()
            game_over = False
            episode_reward = 0
            step_count = 0

            if np.array_equal(env.current_state, env.target_state):
                print(f"Puzzle {puzzle_idx} is already solved at reset!")

            initial_correct = env._count_correct_positions(env.current_state)
            if initial_correct == 21:
                print(f"Puzzle {puzzle_idx} has all 21 correct at start!")
            
            fixed_positions = env.fixed_indices

            patience = 20
            no_progress_steps = 0
            prev_green_count = env._count_correct_positions(env.current_state)
            
            while not game_over:
                current_green_count = env._count_correct_positions(env.current_state)
                if current_green_count > prev_green_count:
                    no_progress_steps = 0
                    prev_green_count = current_green_count
                else:
                    no_progress_steps += 1

                if no_progress_steps >= patience:
                    # Early stop episode due to stagnation
                    game_over = True
                    # Optional: penalty reward for failing to progress
                    reward -= 10.0  
                    # Store experience for final state
                
                previous_state = current_state
                movable_positions = [pos for pos in range(21) if pos not in fixed_positions]
                valid_actions = []
                for i, pos1 in enumerate(movable_positions):
                    for pos2 in movable_positions[i+1:]:
                        action = pos1 * 21 + pos2
                        valid_actions.append(action)
                
                if np.random.rand() <= epsilon:
                    action = np.random.choice(valid_actions)
                else:
                    q = predict_q(tf.constant(previous_state, dtype=tf.float32)).numpy()[0]
                    masked_q = np.full(num_actions, -np.inf)
                    masked_q[valid_actions] = q[valid_actions]
                    action = int(np.argmax(masked_q))

                current_state, reward, game_over, game_win = env.act(action)
                episode_reward += reward
                step_count += 1
                
                experience_replay.add_experience(
                    [previous_state, int(action), reward, current_state], 
                    game_over,
                    fixed_positions
                )
                
                if len(experience_replay.memory) >= batch_size and step_count % 5 == 0:
                    inputs, targets = experience_replay.get_qlearning_batch(
                        model, target_model, batch_size=batch_size
                    )
                    loss += model.train_on_batch(inputs, targets)
                    training_steps += 1
                    
                    if training_steps % target_update_freq == 0:
                        target_model.set_weights(model.get_weights())
                        print(f"  [Target network updated at step {training_steps}]")
            
            total_rewards.append(episode_reward)
            final_correct = env._count_correct_positions(env.current_state)
            final_correct_positions.append(final_correct)
            num_moves.append(step_count)
            if game_win:
                epoch_wins += 1
        
        win_count += epoch_wins
        epsilon = max(epsilon_end, epsilon * epsilon_decay)
        
        if (epoch + 1) % 10 == 0 or epoch < 10:
            recent_window = min(400, len(total_rewards))
            avg_reward = np.mean(total_rewards[-recent_window:])
            win_rate = win_count / ((epoch + 1) * puzzles_per_epoch) * 100
            avg_correct = np.mean(final_correct_positions[-recent_window:])
            avg_moves = np.mean(num_moves[-recent_window:])
            print(f"Ep {epoch+1:4d}/{epochs} | Loss: {loss:8.4f} | ε: {epsilon:.4f} | "
                  f"Wins: {win_count} ({win_rate:5.2f}%) | AvgR: {avg_reward:7.2f} | AvgCorrect: {avg_correct:.1f}/21 | AvgMoves: {avg_moves:.1f}")
        
        if (epoch + 1) % 50 == 0:
            print(f"\n{'='*70}")
            print(f"CHECKPOINT - Epoch {epoch+1}/{epochs}")
            print(f"Total wins: {win_count}/{(epoch+1)*puzzles_per_epoch} ({win_rate:.2f}%)")
            print(f"Avg reward (recent): {avg_reward:.2f}")
            print(f"Avg correct (recent): {avg_correct:.1f}/21")
            print(f"Training steps: {training_steps}")
            print(f"{'='*70}\n")

            checkpoint_dir = '/kaggle/working/model_checkpoints'
            os.makedirs(checkpoint_dir, exist_ok=True)
            
            model.save_weights(f'{checkpoint_dir}/model_epoch_{epoch+1}.weights.h5')
            target_model.save_weights(f'{checkpoint_dir}/target_model_epoch_{epoch+1}.weights.h5')
            with open(f'{checkpoint_dir}/model_epoch_{epoch+1}.json', 'w') as f:
                json.dump(model.to_json(), f)
            print(f"✓ Saved checkpoint at epoch {epoch+1}")
    
    total_episodes = epochs * puzzles_per_epoch
    final_win_rate = win_count / total_episodes * 100
    print(f"\nTraining complete!")
    print(f"Total wins: {win_count}/{total_episodes} ({final_win_rate:.2f}%)")
    return model

# Hyperparameters
EPOCHS = 40
EPSILON_START = 1.0
EPSILON_END = 0.01
TARGET_EPSILON_EPOCH = 100  # Reach epsilon_end by this epoch
EPSILON_DECAY = (EPSILON_END / EPSILON_START) ** (1 / TARGET_EPSILON_EPOCH)
MAX_MEMORY = 10000
HIDDEN_SIZE = 256  # Reduced from 512
CONV_FILTERS = 64
BATCH_SIZE = 128
DISCOUNT = 0.99
LEARNING_RATE = 0.001  # Increased from 0.0005
PUZZLES_PER_EPOCH = 400
TARGET_UPDATE_FREQUENCY = 1000

print(f"\n{'='*70}")
print(f"EPSILON SCHEDULE")
print(f"{'='*70}")
print(f"Start: {EPSILON_START}")
print(f"End: {EPSILON_END}")
print(f"Target epoch: {TARGET_EPSILON_EPOCH}")
print(f"Decay rate: {EPSILON_DECAY:.6f}")
print(f"{'='*70}")

# Initialize Environment
env = Waffle(puzzles_file=dataset_path, embeddings_file=embeddings_path)
num_actions = env.get_num_actions()
input_shape = (5, 5, 10)

print(f"\n{'='*70}")
print(f"ENVIRONMENT SETUP")
print(f"{'='*70}")
print(f"Puzzles loaded:      {len(env.puzzles)}")
print(f"Puzzles per epoch:   {PUZZLES_PER_EPOCH}")
print(f"Total episodes:      {EPOCHS * PUZZLES_PER_EPOCH}")
print(f"Action space:        {num_actions}")
print(f"Input shape:         {input_shape} (5×5 grid × 10 features)")
print(f"{'='*70}\n")

print("="*70)
print("PUZZLE VALIDATION CHECK (First 10 puzzles)")
print("="*70)
for i in range(min(10, len(env.puzzles))):
    env.reset(i)
    initial_correct = env._count_correct_positions(env.current_state)
    is_solved = np.array_equal(env.current_state, env.target_state)
    num_fixed = len(env.fixed_indices)
    valid_actions = len(env.get_valid_actions())
    
    status = "✓ OK" if not is_solved else "PRE-SOLVED"
    print(f"Puzzle {i:3d}: Correct={initial_correct:2d}/21, Fixed={num_fixed:2d}, "
          f"ValidActions={valid_actions:3d} | {status}")


# Build Main Model
print("Building main model...")
model = define_model(
    input_shape=input_shape,
    hidden_size=HIDDEN_SIZE,
    num_actions=num_actions,
    learning_rate=LEARNING_RATE,
    conv_filters=CONV_FILTERS
)
model.summary()

# Build Target Model (Double DQN)
print("\nBuilding target model...")
target_model = define_model(
    input_shape=input_shape,
    hidden_size=HIDDEN_SIZE,
    num_actions=num_actions,
    learning_rate=LEARNING_RATE,
    conv_filters=CONV_FILTERS
)
target_model.set_weights(model.get_weights())
print("✓ Target network initialized with main network weights")

# GPU Test
test_input = tf.random.normal([1, 5, 5, 10])
test_output = model(test_input, training=False)
print(f"\n✓ GPU test passed")
print(f"  Input: {test_input.device}")
print(f"  Output: {test_output.device}\n")

# Train
exp_replay = ExperienceReplay(max_memory=MAX_MEMORY, discount=DISCOUNT)
logging.basicConfig(level=logging.INFO)

trained_model = train_model(
    model=model,
    target_model=target_model,
    env=env,
    epochs=EPOCHS,
    experience_replay=exp_replay,
    epsilon_start=EPSILON_START,
    epsilon_end=EPSILON_END,
    epsilon_decay=EPSILON_DECAY,
    batch_size=BATCH_SIZE,
    puzzles_per_epoch=PUZZLES_PER_EPOCH,
    target_update_freq=TARGET_UPDATE_FREQUENCY
)

# Save Models
output_dir = '/kaggle/working/model'
os.makedirs(output_dir, exist_ok=True)

trained_model.save_weights(f'{output_dir}/model.weights.h5', overwrite=True)
target_model.save_weights(f'{output_dir}/target_model.weights.h5', overwrite=True)
with open(f'{output_dir}/model.json', 'w') as f:
    json.dump(trained_model.to_json(), f)

print(f"\n✓ Models saved to {output_dir}/")
print(f"  - model.weights.h5 (main network)")
print(f"  - target_model.weights.h5 (target network)")
print(f"  - model.json (architecture)")

SYSTEM CHECK
✓ 2 GPU(s) configured

✓ Found dataset: /kaggle/input/waffle/waffles_shuffled_2k.jsonl
✓ Found embeddings: /kaggle/input/waffle/letter_embeddings.npy

EPSILON SCHEDULE
Start: 1.0
End: 0.01
Target epoch: 100
Decay rate: 0.954993
Loaded embeddings: (27, 8)
Loaded vocabulary with 5757 words

ENVIRONMENT SETUP
Puzzles loaded:      2000
Puzzles per epoch:   400
Total episodes:      16000
Action space:        441
Input shape:         (5, 5, 10) (5×5 grid × 10 features)

PUZZLE VALIDATION CHECK (First 10 puzzles)
Puzzle   0: Correct= 8/21, Fixed= 5, ValidActions=420 | ✓ OK
Puzzle   1: Correct= 9/21, Fixed= 5, ValidActions=420 | ✓ OK
Puzzle   2: Correct= 6/21, Fixed= 5, ValidActions=420 | ✓ OK
Puzzle   3: Correct= 7/21, Fixed= 5, ValidActions=420 | ✓ OK
Puzzle   4: Correct= 6/21, Fixed= 5, ValidActions=420 | ✓ OK
Puzzle   5: Correct= 7/21, Fixed= 5, ValidActions=420 | ✓ OK
Puzzle   6: Correct= 6/21, Fixed= 5, ValidActions=420 | ✓ OK
Puzzle   7: Correct= 6/21, Fixed= 5, ValidAction

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
I0000 00:00:1763835675.488974      48 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1763835675.489714      48 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5



Building target model...
✓ Target network initialized with main network weights


I0000 00:00:1763835677.001676      48 cuda_dnn.cc:529] Loaded cuDNN version 90300
INFO:root:Starting training...



✓ GPU test passed
  Input: /job:localhost/replica:0/task:0/device:GPU:0
  Output: /job:localhost/replica:0/task:0/device:GPU:0



I0000 00:00:1763835680.016056      48 service.cc:148] XLA service 0x2b7c4de0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1763835680.017161      48 service.cc:156]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1763835680.017180      48 service.cc:156]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1763835682.950282      48 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


  [Target network updated at step 1000]
  [Target network updated at step 2000]
  [Target network updated at step 3000]
Ep    1/40 | Loss: 249252.5627 | ε: 0.9550 | Wins: 0 ( 0.00%) | AvgR:  650.72 | AvgCorrect: 7.2/21 | AvgMoves: 50.0
  [Target network updated at step 4000]
  [Target network updated at step 5000]
  [Target network updated at step 6000]
  [Target network updated at step 7000]
Ep    2/40 | Loss: 32146.4060 | ε: 0.9120 | Wins: 0 ( 0.00%) | AvgR:  652.17 | AvgCorrect: 7.2/21 | AvgMoves: 50.0
  [Target network updated at step 8000]
  [Target network updated at step 9000]
  [Target network updated at step 10000]
  [Target network updated at step 11000]
Ep    3/40 | Loss: 19098.3089 | ε: 0.8710 | Wins: 0 ( 0.00%) | AvgR:  652.18 | AvgCorrect: 7.2/21 | AvgMoves: 50.0
  [Target network updated at step 12000]
  [Target network updated at step 13000]
  [Target network updated at step 14000]
  [Target network updated at step 15000]
Ep    4/40 | Loss: 14115.0567 | ε: 0.8318 | Wins

KeyboardInterrupt: 

In [5]:
def visualize_episode(model, env, puzzle_index=0, epsilon=0.0, max_steps=30):
    env.reset(puzzle_index)
    state = env.observe()
    total_reward = 0
    
    for step in range(max_steps):
        q_values = model(tf.constant(state, dtype=tf.float32)).numpy()[0]
        valid_actions = env.get_valid_actions()
        
        masked_q = np.full(env.get_num_actions(), -np.inf)
        masked_q[valid_actions] = q_values[valid_actions]
        action = int(np.argmax(masked_q))
        
        state, reward, done, win = env.act(action)
        total_reward += reward
        
        print(f"Step {step+1}: Action={action}, Reward={reward:.2f}, Done={done}, Win={win}")
        print(f"Current state letters: {env.get_current_state_letters()}")
        
        if done:
            print(f"Episode finished after {step+1} steps with total reward {total_reward:.2f}")
            break

visualize_episode(model, env, puzzle_index=0, epsilon=0.0, max_steps=30)

Step 1: Action=244, Reward=7.40, Done=False, Win=False
Current state letters: [14  1  1 14 15 11  1 20 19 20  6  9 19  5  1  9  5  1 14 20 19]
Step 2: Action=244, Reward=19.40, Done=False, Win=False
Current state letters: [14  1  1 14 15 11  1 20 19 20  6  5 19  9  1  9  5  1 14 20 19]
Step 3: Action=244, Reward=7.40, Done=False, Win=False
Current state letters: [14  1  1 14 15 11  1 20 19 20  6  9 19  5  1  9  5  1 14 20 19]
Step 4: Action=244, Reward=19.40, Done=False, Win=False
Current state letters: [14  1  1 14 15 11  1 20 19 20  6  5 19  9  1  9  5  1 14 20 19]
Step 5: Action=244, Reward=7.40, Done=False, Win=False
Current state letters: [14  1  1 14 15 11  1 20 19 20  6  9 19  5  1  9  5  1 14 20 19]
Step 6: Action=244, Reward=19.40, Done=False, Win=False
Current state letters: [14  1  1 14 15 11  1 20 19 20  6  5 19  9  1  9  5  1 14 20 19]
Step 7: Action=244, Reward=7.40, Done=False, Win=False
Current state letters: [14  1  1 14 15 11  1 20 19 20  6  9 19  5  1  9  5  1 14 20 

In [None]:
def manual_play(env, puzzle_index=0):
    """
    Allows a human user to play by typing swap actions.
    Displays the current letter grid after each move, showing green and yellow hints.

    Args:
        env: Waffle environment instance
        puzzle_index: which puzzle to load and play
    """
    env.reset(puzzle_index)
    
    def display_grid_with_colors(state_letters, features):
        """
        Displays the waffle grid with colors encoded as:
        - Green: uppercase letters (correct position)
        - Yellow: lowercase letters (right letter wrong position)
        - Grey: lowercase letters with dot prefix (letter does not belong here)
        - Blank tiles are spaces
        """
        position_to_grid = [
            (0,0), (0,1), (0,2), (0,3), (0,4),
            (1,0),   (1,2),   (1,4),
            (2,0), (2,1), (2,2), (2,3), (2,4),
            (3,0),   (3,2),   (3,4),
            (4,0), (4,1), (4,2), (4,3), (4,4)
        ]
        
        grid = [[' ' for _ in range(5)] for _ in range(5)]
        print(state_letters)
    
        for flat_idx in range(len(state_letters)):
            pos = position_to_grid[flat_idx]
            if pos is not None:
                r, c = pos
                letter_code = state_letters[flat_idx]
                if letter_code > 0:
                    letter_char = chr(letter_code + ord('a') - 1)
                    green = features[flat_idx, 0] >= 0.5
                    yellow = features[flat_idx, 1] >= 0.5
                    if green:
                        display_char = letter_char.upper()
                    elif yellow:
                        display_char = letter_char.lower()
                    else:
                        # Grey letter shown with dot prefix
                        display_char = '.' + letter_char.lower()
                else:
                    display_char = ' '
                grid[r][c] = display_char
    
        print("Current Waffle Grid (Green=uppercase, Yellow=lowercase, Grey=.letter):")
        for row in grid:
            print(' '.join(row))
        print()
    
    print(f"Starting manual play for puzzle {puzzle_index}")
    features = env._calculate_colors_and_embeddings(env.current_state)[..., :2]  # green, yellow only
    display_grid_with_colors(env.get_current_state_letters(), features)
    
    while True:
        valid_actions = env.get_valid_actions()
        print(f"Valid moves: {len(valid_actions)} possible swaps")
        print("Enter your swap action as two positions (0-20), separated by space (e.g., '3 15')")
        print("Positions map as per the 21 playable tiles, corners are not valid positions.")
        
        user_input = input("Your swap (or 'quit' to exit): ")
        if user_input.strip().lower() == 'quit':
            print("Exiting manual play.")
            break
        
        try:
            pos1, pos2 = map(int, user_input.strip().split())
        except Exception:
            print("Invalid input format. Please enter two integers separated by space.")
            continue
        
        if pos1 < 0 or pos1 >= 21 or pos2 < 0 or pos2 >= 21:
            print("Positions must be integers in range 0-20.")
            continue
        
        # Convert swap to action index (order doesn’t matter, but model expects pos1*21 + pos2)
        action = pos1 * 21 + pos2
        if action not in valid_actions:
            # Try reversed order in case it's the other way
            reversed_action = pos2 * 21 + pos1
            if reversed_action in valid_actions:
                action = reversed_action
            else:
                print("Invalid action (not allowed to swap fixed/green positions or swapping the same). Try different positions.")
                continue
        
        _, reward, done, win = env.act(action)
        print(f"Performed swap {pos1} <-> {pos2}, Reward: {reward:.2f}")
        features = env._calculate_colors_and_embeddings(env.current_state)[..., :2]
        display_grid_with_colors(env.get_current_state_letters(), features)
        
        if done:
            if win:
                print("Congratulations! You solved the puzzle!")
            else:
                print(f"Game over. Reached maximum moves ({env.max_moves}).")
            break


# Example usage:
from env import Waffle
env = Waffle(puzzles_file='/kaggle/input/waffle/waffles_shuffled_2k.jsonl', embeddings_file='/kaggle/input/waffle/letter_embeddings.npy')
manual_play(env, puzzle_index=1500)