In [1]:
%%writefile env.py
"""
Waffle Game Environment Definition Class - With Letter Embeddings
"""

import numpy as np
import json
from typing import Tuple, List, Optional

class Waffle(object):
    def __init__(self, puzzles_file: str = "waffles_shuffled.jsonl", 
                 embeddings_file: str = "letter_embeddings.npy") -> None:
        self.puzzles_file = puzzles_file
        self.puzzles = self._load_puzzles()
        self.embeddings = self._load_embeddings(embeddings_file)
        self.grid_size = 21
        self.current_puzzle = None
        self.current_state = None
        self.target_state = None
        self.fixed_indices = None
        
        self.row_words = [
            [0, 1, 2, 3, 4],
            [8, 9, 10, 11, 12],
            [16, 17, 18, 19, 20]
        ]
        self.col_words = [
            [0, 5, 8, 13, 16],
            [2, 6, 10, 14, 18],
            [4, 7, 12, 15, 20]
        ]
        self.reset(0)

    def _load_puzzles(self) -> List[dict]:
        puzzles = []
        with open(self.puzzles_file, "r") as f:
            for line in f:
                puzzles.append(json.loads(line))
        return puzzles

    def _load_embeddings(self, embeddings_file: str) -> np.ndarray:
        try:
            embeddings = np.load(embeddings_file)
            print(f"✓ Loaded embeddings: {embeddings.shape}")
            return embeddings.astype(np.float32)
        except FileNotFoundError:
            print(f"⚠️ Creating random embeddings")
            return np.random.randn(27, 8).astype(np.float32)

    def _string_to_array(self, flat21: str) -> np.ndarray:
        return np.array([ord(c) - ord('a') + 1 for c in flat21], dtype=np.int32)

    def _array_to_string(self, arr: np.ndarray) -> str:
        return "".join([chr(int(x) + ord('a') - 1) for x in arr])

    def _get_word_for_position(self, pos: int) -> Optional[List[int]]:
        for word in self.row_words:
            if pos in word:
                return word
        for word in self.col_words:
            if pos in word:
                return word
        return None

    def _calculate_colors_and_embeddings(self, state: np.ndarray) -> np.ndarray:
        """
        Calculate green/yellow colors + letter embeddings for current state.
        
        Returns:
            np.ndarray: Shape (21, 10) - [green, yellow, embedding_dim1, ..., embedding_dim8]
        """
        features = np.zeros((21, 10), dtype=np.float32)
        
        for i in range(21):
            current_letter = state[i]
            target_letter = self.target_state[i]
            
            # Green: correct letter in correct position
            if current_letter == target_letter:
                features[i, 0] = 1.0  # Green channel
            else:
                # Find which word this position belongs to
                word_positions = self._get_word_for_position(i)
                if word_positions:
                    target_word_letters = [self.target_state[p] for p in word_positions]
                    # Yellow: letter exists in the same word but wrong position
                    if current_letter in target_word_letters:
                        features[i, 1] = 1.0  # Yellow channel
                    # If neither green nor yellow, it's automatically grey (both 0)
            
            # Add letter embedding (8 dimensions)
            # Letter encoding: a=1, b=2, ..., z=26, blank=0
            letter_idx = int(current_letter) if current_letter <= 26 else 0
            features[i, 2:10] = self.embeddings[letter_idx]
        
        return features

    def _count_correct_positions(self, state: np.ndarray) -> int:
        return np.sum(state == self.target_state)

    # def reset(self) -> None:
    #     self.current_puzzle = self.puzzles[np.random.randint(len(self.puzzles))]
    #     self.target_state = self._string_to_array(self.current_puzzle["target_flat21"])
    #     self.current_state = self._string_to_array(self.current_puzzle["shuffled_flat21"])
    #     self.fixed_indices = set(self.current_puzzle["fixed_indices"])
    #     self.moves = 0
    #     self.max_moves = 200
    #     self.prev_correct_count = self._count_correct_positions(self.current_state)
    #     self.prev_features = self._calculate_colors_and_embeddings(self.current_state)
        
    def reset(self, index: int) -> None:
        self.current_puzzle = self.puzzles[index]
        self.target_state = self._string_to_array(self.current_puzzle["target_flat21"])
        self.current_state = self._string_to_array(self.current_puzzle["shuffled_flat21"])
        self.fixed_indices = set(self.current_puzzle["fixed_indices"])
        self.moves = 0
        self.max_moves = 200
        self.prev_correct_count = self._count_correct_positions(self.current_state)
        self.prev_features = self._calculate_colors_and_embeddings(self.current_state)

    def _update_state(self, action: int) -> None:
        pos1 = action // 21
        pos2 = action % 21
        if pos1 not in self.fixed_indices and pos2 not in self.fixed_indices:
            temp = self.current_state[pos1]
            self.current_state[pos1] = self.current_state[pos2]
            self.current_state[pos2] = temp
        self.moves += 1

    def _get_reward(self, action: int) -> float:
        pos1 = action // 21
        pos2 = action % 21
        reward = -0.1
        
        if np.array_equal(self.current_state, self.target_state):
            return 1000.0 + reward
        
        current_correct = self._count_correct_positions(self.current_state)
        correct_change = current_correct - self.prev_correct_count
        current_features = self._calculate_colors_and_embeddings(self.current_state)
        
        if self.current_state[pos1] == self.current_state[pos2]:
            reward += -1
        
        if (self.current_state[pos2] == self.target_state[pos1] or 
            self.current_state[pos1] == self.target_state[pos2]):
            reward += -10
        
        if correct_change > 0:
            reward += 1 * correct_change
        elif correct_change < 0:
            reward += -2 * abs(correct_change)
        
        # Check grey→yellow and yellow→grey transitions
        for pos in [pos1, pos2]:
            prev_green = self.prev_features[pos, 0]
            prev_yellow = self.prev_features[pos, 1]
            curr_green = current_features[pos, 0]
            curr_yellow = current_features[pos, 1]
            
            # Grey → Yellow (neither green nor yellow before, yellow now)
            if prev_green == 0 and prev_yellow == 0 and curr_yellow == 1.0:
                reward += 1
            # Yellow → Grey (yellow before, neither green nor yellow now)
            elif prev_yellow == 1.0 and curr_green == 0 and curr_yellow == 0:
                reward += -1
        
        self.prev_correct_count = current_correct
        self.prev_features = current_features
        return reward

    def _is_over(self) -> bool:
        solved = np.array_equal(self.current_state, self.target_state)
        max_moves_reached = self.moves >= self.max_moves
        return solved or max_moves_reached
    
    def _is_win(self) -> bool:
        return np.array_equal(self.current_state, self.target_state)

    def observe(self) -> np.ndarray:
        """
        Return state with colors + embeddings.
        
        Returns:
            np.ndarray: Shape (1, 21, 10) - 21 positions × 10 features (2 colors + 8 embedding dims)
        """
        features = self._calculate_colors_and_embeddings(self.current_state)
        return features.reshape(1, 21, 10)

    def act(self, action: int) -> Tuple[np.ndarray, float, bool, bool]:
        self._update_state(action)
        reward = self._get_reward(action)
        game_over = self._is_over()
        game_win = self._is_win()
        return self.observe(), reward, game_over, game_win

    def get_valid_actions(self) -> List[int]:
        valid_actions = []
        movable_positions = [i for i in range(21) if i not in self.fixed_indices]
        for i, pos1 in enumerate(movable_positions):
            for pos2 in movable_positions[i+1:]:
                action = pos1 * 21 + pos2
                valid_actions.append(action)
        return valid_actions

    def get_num_actions(self) -> int:
        return 21 * 21

    def get_current_state_letters(self) -> np.ndarray:
        return self.current_state

Writing env.py


In [2]:
%%writefile experience_replay.py
"""
Waffle Game Experience Replay Class Definition - GPU Optimized
"""
import numpy as np
import tensorflow as tf
from tensorflow import keras
from typing import Tuple

class ExperienceReplay(object):
    def __init__(self, max_memory: int = 5000, discount: float = 0.95) -> None:
        self.max_memory = max_memory
        self.memory = list()
        self.discount = discount

    def add_experience(self, sars: list, game_over: bool) -> None:
        self.memory.append([sars, game_over])
        if len(self.memory) > self.max_memory:
            del self.memory[0]

    def get_qlearning_batch(
        self, model: keras.Model, batch_size: int = 32
    ) -> Tuple[np.ndarray, np.ndarray]:
        memory_length = len(self.memory)
        first_state = self.memory[0][0][0]
        state_shape = first_state.shape[1:]
        num_inputs = min(memory_length, batch_size)
        ids = np.random.choice(memory_length, size=num_inputs, replace=False)
        sars = list(zip(*[self.memory[id_][0] for id_ in ids]))
        previous_states, action_ts, rewards, current_states = (
            np.concatenate(e) if isinstance(e[0], np.ndarray) else np.stack(e)
            for e in sars
        )
        game_over = np.stack([self.memory[id_][1] for id_ in ids])
        
        previous_states_tensor = tf.constant(previous_states, dtype=tf.float32)
        current_states_tensor = tf.constant(current_states, dtype=tf.float32)
        
        targets = model(previous_states_tensor, training=False).numpy()
        Q_sa = tf.reduce_max(model(current_states_tensor, training=False), axis=1).numpy()
        targets[np.arange(num_inputs), action_ts] = (
            rewards + self.discount * Q_sa * ~game_over
        )
        return previous_states, targets

Writing experience_replay.py


In [3]:
# Usually not needed, TensorFlow is pre-installed on Kaggle
import tensorflow as tf
print(f"TensorFlow version: {tf.__version__}")
print(f"GPUs available: {len(tf.config.list_physical_devices('GPU'))}")

2025-11-15 15:38:07.559048: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763221087.951910      48 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763221088.052024      48 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

TensorFlow version: 2.18.0
GPUs available: 2


In [None]:
import os
import json
import numpy as np
import logging

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, Dropout
from tensorflow.keras.optimizers import Adam

from experience_replay import ExperienceReplay
from env import Waffle

# GPU Configuration
print("="*60)
print("SYSTEM CHECK")
print("="*60)
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"✓ {len(gpus)} GPU(s) configured")
    except RuntimeError as e:
        print(f"GPU warning (ignorable): {e}")
else:
    print("⚠️ No GPU")
print("="*60)

# Find dataset
dataset_path = None
embeddings_path = '/kaggle/input/letter_embeddings.npy'  # Or your uploaded path

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        if filename == 'waffles_shuffled.jsonl':
            dataset_path = os.path.join(dirname, filename)
            print(f"\n✓ Found dataset: {dataset_path}")
        if filename == 'letter_embeddings.npy':
            embeddings_path = os.path.join(dirname, filename)
            print(f"✓ Found embeddings: {embeddings_path}")

if not dataset_path:
    raise FileNotFoundError("waffles_shuffled.jsonl not found")

# Model Definition - CHANGED INPUT SHAPE
def define_model(input_shape, hidden_size, num_actions, 
                learning_rate=0.001, conv_filters=64, loss="mse"):
    policy = tf.keras.mixed_precision.Policy('mixed_float16')
    tf.keras.mixed_precision.set_global_policy(policy)
    
    model = Sequential()
    model.add(Conv1D(conv_filters, kernel_size=3, activation='relu', 
                     padding='same', input_shape=input_shape, dtype='float32'))
    model.add(Conv1D(conv_filters, kernel_size=3, activation='relu', padding='same'))
    model.add(Conv1D(conv_filters * 2, kernel_size=5, activation='relu', padding='same'))
    model.add(Flatten())
    model.add(Dense(hidden_size, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(hidden_size // 2, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(num_actions, dtype='float32'))
    model.compile(Adam(learning_rate=learning_rate), loss)
    return model

# Training Function
def train_model(model, env, epochs, experience_replay, 
               epsilon_start, epsilon_end, epsilon_decay, batch_size):
    logging.info("Starting training...")
    win_count = 0
    total_rewards = []
    epsilon = epsilon_start
    num_actions = env.get_num_actions()
    num_puzzles = len(env.puzzles)
    
    @tf.function
    def predict_q(state):
        return model(state, training=False)
    
    for epoch in range(epochs):
        loss = 0.0
        for i in range(num_puzzles):
            env.reset(i)
            current_state = env.observe()
            game_over = False
            episode_reward = 0
            step_count = 0
            
            while not game_over:
                previous_state = current_state
                
                if np.random.rand() <= epsilon:
                    valid_actions = env.get_valid_actions()
                    action = np.random.choice(valid_actions) if valid_actions else np.random.randint(0, num_actions)
                else:
                    q = predict_q(tf.constant(previous_state, dtype=tf.float32))
                    action = int(tf.argmax(q[0]).numpy())
                
                current_state, reward, game_over, game_win = env.act(action)
                episode_reward += reward
                step_count += 1
                
                if game_win:
                    win_count += 1
                
                experience_replay.add_experience(
                    [previous_state, int(action), reward, current_state], game_over
                )
                
                if len(experience_replay.memory) >= batch_size and step_count % 5 == 0:
                    inputs, targets = experience_replay.get_qlearning_batch(model, batch_size=batch_size)
                    loss += model.train_on_batch(inputs, targets)

            total_rewards.append(episode_reward)
        
        epsilon = max(epsilon_end, epsilon * epsilon_decay)
        
        if (epoch + 1) % 10 == 0 or epoch < 10:
            avg_reward = np.mean(total_rewards[-100:]) if len(total_rewards) >= 100 else np.mean(total_rewards)
            win_rate = win_count / (epoch + 1) * 100
            print(f"Ep {epoch+1:4d}/{epochs} | Loss: {loss:8.4f} | ε: {epsilon:.4f} | "
                  f"Wins: {win_count:3d} ({win_rate:5.1f}%) | AvgR: {avg_reward:7.2f}")
        
        if (epoch + 1) % 100 == 0:
            print(f"\n{'='*70}")
            print(f"CHECKPOINT - Epoch {epoch+1}/{epochs}")
            print(f"Wins: {win_count}/{epoch+1} ({win_rate:.1f}%)")
            print(f"Avg reward (last 100): {np.mean(total_rewards[-100:]):.2f}")
            print(f"{'='*70}\n")
    
    print(f"\nTraining complete! Win rate: {win_count}/{epochs} ({win_count/epochs*100:.1f}%)")
    return model

# Hyperparameters
EPOCHS = 200
EPSILON_START = 1.0
EPSILON_END = 0.01
EPSILON_DECAY = 0.995
MAX_MEMORY = 10000
HIDDEN_SIZE = 512
CONV_FILTERS = 64
BATCH_SIZE = 128
DISCOUNT = 0.99
LEARNING_RATE = 0.0005

# Initialize Environment with embeddings
env = Waffle(puzzles_file=dataset_path, embeddings_file=embeddings_path)
num_actions = env.get_num_actions()
input_shape = (21, 10)  # CHANGED: 21 positions × 10 features (2 colors + 8 embeddings)

print(f"\n{'='*70}")
print(f"ENVIRONMENT SETUP")
print(f"{'='*70}")
print(f"Puzzles loaded:      {len(env.puzzles)}")
print(f"Action space:        {num_actions}")
print(f"Input shape:         {input_shape} (21 pos × [2 colors + 8 embed])")
print(f"{'='*70}\n")

# Build Model
model = define_model(
    input_shape=input_shape,
    hidden_size=HIDDEN_SIZE,
    num_actions=num_actions,
    learning_rate=LEARNING_RATE,
    conv_filters=CONV_FILTERS
)

model.summary()

# GPU Test
test_input = tf.random.normal([1, 21, 10])
test_output = model(test_input, training=False)
print(f"\n✓ GPU test passed")
print(f"  Input: {test_input.device}")
print(f"  Output: {test_output.device}\n")

# Train
exp_replay = ExperienceReplay(max_memory=MAX_MEMORY, discount=DISCOUNT)
logging.basicConfig(level=logging.INFO)

trained_model = train_model(
    model=model,
    env=env,
    epochs=EPOCHS,
    experience_replay=exp_replay,
    epsilon_start=EPSILON_START,
    epsilon_end=EPSILON_END,
    epsilon_decay=EPSILON_DECAY,
    batch_size=BATCH_SIZE
)

# Save Model - FIXED
output_dir = '/kaggle/working/model'
os.makedirs(output_dir, exist_ok=True)

# FIX: Use .weights.h5 extension
trained_model.save_weights(f'{output_dir}/model.weights.h5', overwrite=True)
with open(f'{output_dir}/model.json', 'w') as f:
    json.dump(trained_model.to_json(), f)

print(f"\n✓ Model saved to {output_dir}/")
print(f"  - model.weights.h5")
print(f"  - model.json")

SYSTEM CHECK
✓ 2 GPU(s) configured

✓ Found dataset: /kaggle/input/waffle/waffles_shuffled.jsonl
✓ Found embeddings: /kaggle/input/waffle/letter_embeddings.npy
✓ Loaded embeddings: (27, 8)

ENVIRONMENT SETUP
Puzzles loaded:      100
Action space:        441
Input shape:         (21, 10) (21 pos × [2 colors + 8 embed])



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
I0000 00:00:1763221110.560302      48 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1763221110.561085      48 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


I0000 00:00:1763221112.495289      48 cuda_dnn.cc:529] Loaded cuDNN version 90300
INFO:root:Starting training...



✓ GPU test passed
  Input: /job:localhost/replica:0/task:0/device:GPU:0
  Output: /job:localhost/replica:0/task:0/device:GPU:0



I0000 00:00:1763221116.325126      48 service.cc:148] XLA service 0x564cc660 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1763221116.326839      48 service.cc:156]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1763221116.326864      48 service.cc:156]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1763221120.301907      48 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Ep    1/200 | Loss: 367116.5978 | ε: 0.9950 | Wins:   0 (  0.0%) | AvgR: -541.08
Ep    2/200 | Loss: 38292.4113 | ε: 0.9900 | Wins:   0 (  0.0%) | AvgR: -530.81
Ep    3/200 | Loss: 22472.1784 | ε: 0.9851 | Wins:   0 (  0.0%) | AvgR: -538.78
Ep    4/200 | Loss: 16012.6617 | ε: 0.9801 | Wins:   0 (  0.0%) | AvgR: -522.67
Ep    5/200 | Loss: 12481.0264 | ε: 0.9752 | Wins:   0 (  0.0%) | AvgR: -540.10
Ep    6/200 | Loss: 10240.6189 | ε: 0.9704 | Wins:   0 (  0.0%) | AvgR: -539.55
Ep    7/200 | Loss: 8694.9939 | ε: 0.9655 | Wins:   0 (  0.0%) | AvgR: -537.24
Ep    8/200 | Loss: 7563.5463 | ε: 0.9607 | Wins:   0 (  0.0%) | AvgR: -523.69
Ep    9/200 | Loss: 6698.8280 | ε: 0.9559 | Wins:   0 (  0.0%) | AvgR: -523.02
Ep   10/200 | Loss: 6016.9142 | ε: 0.9511 | Wins:   0 (  0.0%) | AvgR: -525.09
Ep   20/200 | Loss: 3053.5714 | ε: 0.9046 | Wins:   0 (  0.0%) | AvgR: -519.45
Ep   30/200 | Loss: 2098.0295 | ε: 0.8604 | Wins:   0 (  0.0%) | AvgR: -515.55
Ep   40/200 | Loss: 1626.9329 | ε: 0.8183 | W