In [3]:
%%writefile env.py

# CREATING WAFFLE ENVIRONMENT

import numpy as np
import json
from typing import Tuple, List, Optional, Set

class Waffle(object):
    """Swap-based Waffle puzzle environment for Q-learning.

    The board is a flat array of 21 letter codes (1 = 'a' ... 26 = 'z') placed
    on a 5x5 grid whose cells (1,1), (1,3), (3,1) and (3,3) are blank.
    An action is the integer ``pos1 * 21 + pos2`` and swaps the letters at the
    two flat positions. An episode ends when the board equals the target or
    after ``max_moves`` swaps.
    """

    # Number of letter cells on the board (also the base of the action code).
    NUM_POSITIONS = 21
    # Grid cells that never hold a letter.
    _BLANK_CELLS = ((1, 1), (1, 3), (3, 1), (3, 3))
    # Flat position -> (row, col) on the 5x5 grid.
    _POSITION_TO_GRID = (
        (0, 0), (0, 1), (0, 2), (0, 3), (0, 4),
        (1, 0), (1, 2), (1, 4),
        (2, 0), (2, 1), (2, 2), (2, 3), (2, 4),
        (3, 0), (3, 2), (3, 4),
        (4, 0), (4, 1), (4, 2), (4, 3), (4, 4),
    )

    def __init__(self, puzzles_file: str,
                 embeddings_file: str,
                 vocab_file: str = "/kaggle/input/waffler/words.txt") -> None:
        """Load puzzles, letter embeddings and vocabulary, then reset to puzzle 0.

        Args:
            puzzles_file: Path to a JSON-lines file; each record must provide
                ``target_flat21``, ``shuffled_flat21`` and ``fixed_indices``.
            embeddings_file: Path to a ``.npy`` array of letter embeddings.
                Random embeddings are used when the file does not exist.
            vocab_file: Path to a word list with one word per line.
        """
        self.puzzles_file = puzzles_file
        self.puzzles = self._load_puzzles()
        self.embeddings = self._load_embeddings(embeddings_file)
        self.grid_size = self.NUM_POSITIONS
        self.current_puzzle = None
        self.current_state = None
        self.target_state = None
        self.fixed_indices = None

        # Flat positions making up the three horizontal and three vertical words.
        self.row_words = [
            [0, 1, 2, 3, 4],
            [8, 9, 10, 11, 12],
            [16, 17, 18, 19, 20]
        ]
        self.col_words = [
            [0, 5, 8, 13, 16],
            [2, 6, 10, 14, 18],
            [4, 7, 12, 15, 20]
        ]

        self.vocab = self._load_vocab(vocab_file)

        self.reset(0)

    def _load_puzzles(self) -> List[dict]:
        """Parse ``self.puzzles_file`` as JSON lines, skipping blank lines."""
        puzzles = []
        with open(self.puzzles_file, "r", encoding="utf-8") as f:
            for line in f:
                # A trailing or stray empty line must not abort the whole load.
                if line.strip():
                    puzzles.append(json.loads(line))
        return puzzles

    def _load_vocab(self, vocab_file: str) -> Set[str]:
        """Return the set of lower-cased five-letter words found in ``vocab_file``."""
        with open(vocab_file, 'r', encoding="utf-8") as f:
            words = (line.strip().lower() for line in f)
            vocab = {word for word in words if len(word) == 5}
        print(f"Loaded vocabulary with {len(vocab)} words")
        return vocab

    def _load_embeddings(self, embeddings_file: str) -> np.ndarray:
        """Load float32 letter embeddings, or create random (27, 8) ones if missing.

        NOTE(review): ``observe`` indexes rows 0..26 and writes each row into
        8 feature channels, so a loaded array is presumably (27, 8) — confirm.
        """
        try:
            embeddings = np.load(embeddings_file)
        except FileNotFoundError:
            print("Creating random embeddings")
            return np.random.randn(27, 8).astype(np.float32)
        print(f"Loaded embeddings: {embeddings.shape}")
        return embeddings.astype(np.float32)

    def _string_to_array(self, flat21: str) -> np.ndarray:
        """Encode a lowercase string as int32 letter codes (1 = 'a')."""
        return np.array([ord(c) - ord('a') + 1 for c in flat21], dtype=np.int32)

    def _array_to_string(self, arr: np.ndarray) -> str:
        """Decode letter codes (1 = 'a') back into a string."""
        return "".join(chr(int(x) + ord('a') - 1) for x in arr)

    def reset(self, index: int) -> None:
        """Start a new episode on ``self.puzzles[index]``."""
        self.current_puzzle = self.puzzles[index]
        self.target_state = self._string_to_array(self.current_puzzle["target_flat21"])
        self.current_state = self._string_to_array(self.current_puzzle["shuffled_flat21"])
        self.fixed_indices = set(self.current_puzzle["fixed_indices"])
        self.moves = 0
        self.max_moves = 50
        # Recent board snapshots, used to detect a swap that undoes the previous one.
        self.state_history = [self.current_state.copy()]
        self.prev_word_count = self._count_words(self.current_state)

    def _update_state(self, action: int) -> None:
        """Apply the swap encoded by ``action`` and count the move."""
        pos1, pos2 = divmod(action, self.NUM_POSITIONS)
        state = self.current_state
        state[pos1], state[pos2] = state[pos2], state[pos1]
        self.moves += 1

    def _count_words(self, state: np.ndarray) -> int:
        """Count how many of the six row/column words of ``state`` are in the vocabulary."""
        letters = [chr(int(c) + ord('a') - 1) for c in state]
        return sum(
            ''.join(letters[i] for i in indices) in self.vocab
            for indices in self.row_words + self.col_words
        )

    def _is_over(self) -> bool:
        """Return True once the puzzle is solved or the move budget is spent."""
        return self._is_win() or self.moves >= self.max_moves

    def _is_win(self) -> bool:
        """Return True when the board matches the target exactly."""
        return np.array_equal(self.current_state, self.target_state)

    def observe(self) -> np.ndarray:
        """Return the board as a float32 feature grid of shape (1, 5, 5, 10).

        Per letter cell: channel 0 is 1.0 when the letter matches the target,
        channel 1 is left at zero, and channels 2-9 hold the letter embedding.
        Blank cells carry zeros plus the embedding stored at index 26.
        """
        features = np.zeros((self.NUM_POSITIONS, 10), dtype=np.float32)
        for i, (current_letter, target_letter) in enumerate(
                zip(self.current_state, self.target_state)):
            features[i, 0] = float(current_letter == target_letter)
            # Codes below 1 fall back to the blank embedding row.
            letter_idx = int(current_letter) - 1 if current_letter >= 1 else 26
            features[i, 2:10] = self.embeddings[letter_idx]

        grid = np.zeros((5, 5, 10), dtype=np.float32)
        blank_feature_vector = np.zeros(10, dtype=np.float32)
        blank_feature_vector[2:10] = self.embeddings[26]
        for r, c in self._BLANK_CELLS:
            grid[r, c, :] = blank_feature_vector
        for flat_idx, (r, c) in enumerate(self._POSITION_TO_GRID):
            grid[r, c, :] = features[flat_idx]
        return grid.reshape(1, 5, 5, 10)

    def get_valid_actions(self) -> List[int]:
        """List the action codes for every unordered pair of non-fixed positions."""
        positions = [pos for pos in range(self.NUM_POSITIONS)
                     if pos not in self.fixed_indices]
        return [
            pos1 * self.NUM_POSITIONS + pos2
            for i, pos1 in enumerate(positions)
            for pos2 in positions[i + 1:]
        ]

    def get_num_actions(self) -> int:
        """Return the size of the action space (all ordered position pairs)."""
        return self.NUM_POSITIONS * self.NUM_POSITIONS

    # ---- PRIMARY: Revised reward system ----
    def _get_reward(self, action: int) -> float:
        """Score the swap that was just applied and record the new board.

        Returns +1.0 for a solved board and -1.0 when the move budget runs out.
        Otherwise the reward is 0.2 per valid word gained (negative when
        words are lost), minus a 0.01 step cost, minus 0.05 when the swap
        restores the board from before the previous move.
        """
        word_count = self._count_words(self.current_state)
        if self._is_win():
            reward = 1.0
        elif self.moves >= self.max_moves:
            reward = -1.0
        else:
            # Reward only net gain in valid word count, penalize loss
            reward = 0.2 * (word_count - self.prev_word_count) - 0.01
            # Heavier penalty for "undoing" (flip-flop) swaps. The history does
            # not yet contain the current board, so [-2] is the board before
            # the previous move; two entries are enough to compare against it.
            if len(self.state_history) >= 2 and np.array_equal(
                    self.current_state, self.state_history[-2]):
                reward -= 0.05
        self.prev_word_count = word_count
        self.state_history.append(self.current_state.copy())
        if len(self.state_history) > 10:
            self.state_history.pop(0)
        return reward

    def act(self, action: int) -> Tuple[np.ndarray, float, bool, bool]:
        """Apply ``action`` and return ``(observation, reward, game_over, game_win)``."""
        self._update_state(action)
        reward = self._get_reward(action)
        game_over = self._is_over()
        game_win = self._is_win()
        return self.observe(), reward, game_over, game_win

    def get_current_state_letters(self) -> np.ndarray:
        """Return the live letter-code array (not a copy)."""
        return self.current_state

Writing env.py


In [4]:
%%writefile experience_replay.py

# WRITING EXPERIENCE REPLAY LOGIC

import numpy as np
import tensorflow as tf
from tensorflow import keras

class ExperienceReplay(object):
    """Bounded FIFO store of transitions that builds Double-DQN training targets."""

    def __init__(self, max_memory: int = 10000, discount: float = 0.99) -> None:
        """Create an empty buffer holding at most ``max_memory`` transitions."""
        self.max_memory = max_memory
        self.memory = []
        self.discount = discount

    def add_experience(self, sars: list, game_over: bool) -> None:
        """Append ``[state, action, reward, next_state]`` and evict the oldest entry if full."""
        self.memory.append([sars, game_over])
        if len(self.memory) > self.max_memory:
            self.memory.pop(0)

    def get_qlearning_batch(self, model, target_model, batch_size: int = 128):
        """Sample transitions without replacement and return ``(inputs, targets)``.

        The targets start as the online model's predictions for the sampled
        states. The entry for the action actually taken is replaced with
        ``reward + discount * Q_target(next_state, argmax_a Q_online(next_state, a))``,
        and the bootstrap term is dropped for terminal transitions.
        """
        stored = len(self.memory)
        num_inputs = min(stored, batch_size)
        chosen = np.random.choice(stored, size=num_inputs, replace=False)
        batch = [self.memory[idx] for idx in chosen]

        def collate(column):
            # State arrays carry a leading batch axis and are concatenated;
            # scalar actions and rewards are stacked into 1-D arrays.
            if isinstance(column[0], np.ndarray):
                return np.concatenate(column)
            return np.stack(column)

        columns = list(zip(*[entry[0] for entry in batch]))
        previous_states, action_ts, rewards, current_states = map(collate, columns)
        game_over = np.array([entry[1] for entry in batch], dtype=bool)

        previous_states_tensor = tf.constant(previous_states, dtype=tf.float32)
        current_states_tensor = tf.constant(current_states, dtype=tf.float32)

        targets = model(previous_states_tensor, training=False).numpy()
        # The online network selects the next action; the target network scores it.
        next_q_main = model(current_states_tensor, training=False).numpy()
        best_actions = np.argmax(next_q_main, axis=1)
        next_q_target = target_model(current_states_tensor, training=False).numpy()

        rows = np.arange(num_inputs)
        Q_sa = next_q_target[rows, best_actions]
        bootstrap = self.discount * Q_sa * np.logical_not(game_over)
        targets[rows, action_ts] = rewards + bootstrap
        return previous_states, targets


Writing experience_replay.py


In [2]:
import tensorflow as tf
# Query the visible GPUs once. tf.test.is_gpu_available() is deprecated and
# TensorFlow itself recommends tf.config.list_physical_devices('GPU') instead.
gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available:", len(gpus))
print("TensorFlow running on GPU:", bool(gpus))

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Num GPUs Available: 2
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
TensorFlow running on GPU: True


I0000 00:00:1763841587.030596      48 gpu_device.cc:2022] Created device /device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1763841587.031342      48 gpu_device.cc:2022] Created device /device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


In [None]:
import os
import json
import numpy as np
import logging
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten
from tensorflow.keras.optimizers import Adam

from experience_replay import ExperienceReplay
from env import Waffle

# Ask TensorFlow's native logger to hide low-severity messages.
# NOTE(review): this runs after `import tensorflow` above; the variable
# presumably has to be set before TensorFlow is first imported to take
# effect — confirm and move it above the import if so.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

def define_model(input_shape, hidden_size, num_actions, learning_rate=0.0001):
    """Build and compile the convolutional Q-network.

    Two same-padded 3x3 convolutions (32 and 64 filters) feed two dense
    layers of ``hidden_size`` and ``hidden_size // 2`` units, followed by a
    linear output with one value per action. The model is compiled with
    Huber loss and Adam with gradient-norm clipping at 1.0.
    """
    layers = [
        Conv2D(32, (3, 3), activation='relu', padding='same', input_shape=input_shape, dtype='float32'),
        Conv2D(64, (3, 3), activation='relu', padding='same'),
        Flatten(),
        Dense(hidden_size, activation='relu'),
        Dense(hidden_size//2, activation='relu'),
        Dense(num_actions, dtype='float32'),
    ]
    network = Sequential(layers)
    # Huber loss for DQN stability; clipnorm keeps gradient steps bounded.
    network.compile(Adam(learning_rate=learning_rate, clipnorm=1.0), loss='huber')
    return network

# == Find dataset == #
# Walk the Kaggle input tree looking for the puzzle file and the optional
# letter embeddings. When a file name occurs more than once, the last
# directory visited wins.
dataset_path = None
embeddings_path = '/kaggle/input/letter_embeddings.npy'
for dirname, _, filenames in os.walk('/kaggle/input'):
    if 'waffles_shuffled_2k.jsonl' in filenames:
        dataset_path = os.path.join(dirname, 'waffles_shuffled_2k.jsonl')
    if 'letter_embeddings.npy' in filenames:
        embeddings_path = os.path.join(dirname, 'letter_embeddings.npy')
if dataset_path is None:
    raise FileNotFoundError("waffles_shuffled_2k.jsonl not found")

# == Hyperparameters == #
EPOCHS = 40                  # number of training epochs passed to train_model
EPSILON_START = 1.0          # initial exploration rate
EPSILON_END = 0.01           # floor for the exploration rate
# Epoch at which epsilon would reach EPSILON_END under the decay below.
# NOTE(review): this is 100 while EPOCHS is 40, so epsilon ends near
# 0.01 ** 0.4 ~= 0.16 rather than EPSILON_END — confirm this is intended.
TARGET_EPSILON_EPOCH = 100
# Per-epoch multiplicative factor: EPSILON_START * EPSILON_DECAY ** TARGET_EPSILON_EPOCH == EPSILON_END.
EPSILON_DECAY = (EPSILON_END / EPSILON_START) ** (1 / TARGET_EPSILON_EPOCH)
MAX_MEMORY = 10000           # replay buffer capacity (transitions)
HIDDEN_SIZE = 256            # width of the first dense layer in define_model
BATCH_SIZE = 128             # transitions sampled per training step
DISCOUNT = 0.99              # discount applied to the bootstrapped next-state value
LEARNING_RATE = 0.0001       # Lowered!
PUZZLES_PER_EPOCH = 400      # episodes played per epoch
TARGET_UPDATE_FREQUENCY = 1000  # training steps between target-network weight syncs

# == Environment & Models == #
# Constructing the environment loads puzzles, embeddings and vocabulary and
# resets it to puzzle 0.
env = Waffle(puzzles_file=dataset_path, embeddings_file=embeddings_path)
num_actions = env.get_num_actions()
# Per-sample shape of env.observe() (which returns (1, 5, 5, 10)).
input_shape = (5, 5, 10)

# Online network and a structurally identical target network that starts
# from the same weights.
model = define_model(input_shape, HIDDEN_SIZE, num_actions, LEARNING_RATE)
target_model = define_model(input_shape, HIDDEN_SIZE, num_actions, LEARNING_RATE)
target_model.set_weights(model.get_weights())

exp_replay = ExperienceReplay(max_memory=MAX_MEMORY, discount=DISCOUNT)
logging.basicConfig(level=logging.INFO)

def sample_puzzles(epoch, env, epochs, puzzles_per_epoch):
    """Draw puzzle indices for one epoch following a difficulty curriculum.

    Puzzles are binned by how many letters already sit in the right place in
    the shuffled board: easy (>= 9), medium (7-8), hard (5-6), harder (< 5).
    The pool moves from easy to harder as ``epoch / epochs`` crosses 0.25,
    0.50 and 0.75.

    Args:
        epoch: Zero-based index of the current epoch.
        env: Object exposing ``puzzles``, a list of dicts with
            ``shuffled_flat21`` and ``target_flat21`` strings.
        epochs: Total number of epochs.
        puzzles_per_epoch: Number of indices to draw (with replacement).

    Returns:
        A NumPy array of ``puzzles_per_epoch`` indices into ``env.puzzles``.

    Raises:
        ValueError: If ``env.puzzles`` is empty.
    """
    easy, medium, hard, harder = [], [], [], []
    for idx, puzzle in enumerate(env.puzzles):
        correct = sum(
            have == want
            for have, want in zip(puzzle["shuffled_flat21"], puzzle["target_flat21"])
        )
        if correct >= 9:
            easy.append(idx)
        elif correct >= 7:
            medium.append(idx)
        elif correct >= 5:
            hard.append(idx)
        else:
            harder.append(idx)

    fraction = epoch / epochs
    if fraction < 0.25:
        pool = easy
    elif fraction < 0.50:
        pool = easy + medium
    elif fraction < 0.75:
        pool = medium + hard
    else:
        pool = hard + harder

    # A dataset may have no puzzle at the current difficulty; sampling from an
    # empty pool would raise, so fall back to the whole dataset instead.
    if not pool:
        pool = list(range(len(env.puzzles)))
    return np.random.choice(pool, size=puzzles_per_epoch, replace=True)

def train_model(model, target_model, env, epochs, experience_replay,
               epsilon_start, epsilon_end, epsilon_decay, batch_size,
               puzzles_per_epoch, target_update_freq):
    """Train the Q-network with epsilon-greedy play and experience replay.

    Each epoch plays ``puzzles_per_epoch`` episodes chosen by
    ``sample_puzzles``. Every transition is stored in ``experience_replay``.
    Once the buffer holds at least ``batch_size`` transitions, the model is
    trained on a sampled batch on every fifth step of an episode. The target
    network is synced every ``target_update_freq`` training steps, and
    epsilon is multiplied by ``epsilon_decay`` after each epoch, never
    dropping below ``epsilon_end``.

    Returns:
        The trained ``model`` (the same object that was passed in).
    """
    total_rewards = []
    epsilon = epsilon_start
    num_actions = env.get_num_actions()
    training_steps = 0
    for epoch in range(epochs):
        # Sum of the batch losses over the epoch (reported as-is, not averaged).
        loss = 0.0
        epoch_wins = 0
        avg_moves = []
        correct_letters = []
        puzzle_indices = sample_puzzles(epoch, env, epochs, puzzles_per_epoch)
        for puzzle_idx in puzzle_indices:
            env.reset(puzzle_idx)
            current_state = env.observe()
            game_over = False
            episode_reward = 0
            step_count = 0

            while not game_over:
                valid_actions = env.get_valid_actions()
                if np.random.rand() <= epsilon:
                    action = np.random.choice(valid_actions)
                else:
                    # Call the model directly for a single observation;
                    # predict() adds per-call overhead inside a step loop.
                    q = model(current_state, training=False).numpy()[0]
                    # Mask out swaps that touch fixed positions before the argmax.
                    masked_q = np.full(num_actions, -np.inf)
                    masked_q[valid_actions] = q[valid_actions]
                    action = int(np.argmax(masked_q))
                next_state, reward, game_over, game_win = env.act(action)
                episode_reward += reward
                step_count += 1
                experience_replay.add_experience(
                    [current_state, int(action), reward, next_state],
                    game_over
                )
                current_state = next_state

                if len(experience_replay.memory) >= batch_size and step_count % 5 == 0:
                    inputs, targets = experience_replay.get_qlearning_batch(
                        model, target_model, batch_size=batch_size)
                    loss += model.train_on_batch(inputs, targets)
                    training_steps += 1
                    if training_steps % target_update_freq == 0:
                        target_model.set_weights(model.get_weights())
                        print(f"  [Target network updated at step {training_steps}]")
            total_rewards.append(episode_reward)
            if game_win:
                epoch_wins += 1
            avg_moves.append(step_count)
            num_correct = np.sum(env.current_state == env.target_state)
            correct_letters.append(num_correct)
        avg_moves_epoch = np.mean(avg_moves)
        avg_correct_epoch = np.mean(correct_letters)
        epsilon = max(epsilon_end, epsilon * epsilon_decay)
        # Average reward over the most recent (up to 400) episodes.
        recent_window = min(400, len(total_rewards))
        avg_reward = np.mean(total_rewards[-recent_window:])
        win_percent = epoch_wins / puzzles_per_epoch * 100
        print(f"Ep {epoch+1}/{epochs} | Loss: {loss:8.4f} | ε: {epsilon:.4f} | "
              f"Wins in epoch: {epoch_wins} ({win_percent:5.2f}%) | "
              f"AvgMoves: {avg_moves_epoch:.1f} | "
              f"AvgCorrectAtEnd: {avg_correct_epoch:.1f}/21 | "
              f"AvgR: {avg_reward:7.2f}")
    # Hand the trained network back so callers can bind the result.
    return model

# Run the training loop with the environment, networks, replay buffer and
# hyperparameters defined above.
trained_model = train_model(
    model=model,
    target_model=target_model,
    env=env,
    epochs=EPOCHS,
    experience_replay=exp_replay,
    epsilon_start=EPSILON_START,
    epsilon_end=EPSILON_END,
    epsilon_decay=EPSILON_DECAY,
    batch_size=BATCH_SIZE,
    puzzles_per_epoch=PUZZLES_PER_EPOCH,
    target_update_freq=TARGET_UPDATE_FREQUENCY
)


Loaded embeddings: (27, 8)
Loaded vocabulary with 5757 words
  [Target network updated at step 1000]
  [Target network updated at step 2000]
  [Target network updated at step 3000]
Ep 1/40 | Loss: 318.6378 | ε: 0.9550 | Wins in epoch: 0 ( 0.00%) | AvgMoves: 50.0 | AvgCorrectAtEnd: 7.2/21 | AvgR:   -1.55
  [Target network updated at step 4000]
  [Target network updated at step 5000]
  [Target network updated at step 6000]
  [Target network updated at step 7000]
Ep 2/40 | Loss: 1184.1132 | ε: 0.9120 | Wins in epoch: 0 ( 0.00%) | AvgMoves: 50.0 | AvgCorrectAtEnd: 7.1/21 | AvgR:   -1.55
  [Target network updated at step 8000]
  [Target network updated at step 9000]
  [Target network updated at step 10000]
  [Target network updated at step 11000]
Ep 3/40 | Loss: 4579.7545 | ε: 0.8710 | Wins in epoch: 0 ( 0.00%) | AvgMoves: 50.0 | AvgCorrectAtEnd: 7.2/21 | AvgR:   -1.56
  [Target network updated at step 12000]
  [Target network updated at step 13000]
  [Target network updated at step 14000]
