# Tessellate RL

Train an RL agent on a 2-player board game with multiplicative scoring.

In [None]:
# Install dependencies
!pip install torch numpy -q

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import random

# Import game environment
from tessellate_env import TessellateEnv

## Quick Demo

In [None]:
# Build an environment and step through five uniformly random valid moves
env = TessellateEnv(reward_mode='mixed')
obs = env.reset()

print(f"Observation shape: {obs.shape}")
print(f"Action space: 0-99 (board positions)\n")

for move_number in range(1, 6):
    # Only sample among the actions the environment currently reports as valid.
    valid = env.get_valid_actions()
    action = np.random.choice(valid)
    obs, reward, done, info = env.step(action)
    scores = info['current_scores']

    print(f"Move {move_number}: Action={action}, Reward={reward:.3f}")
    print(f"  Scores: Red={scores[1]}, Blue={scores[2]}")

## DQN Agent

In [None]:
class DQN(nn.Module):
    """Fully connected network with two ReLU hidden layers and a linear output.

    Args:
        input_size: Length of the flat input vector.
        hidden_size: Width of both hidden layers.
        output_size: Number of output values (one per action).
    """

    def __init__(self, input_size=101, hidden_size=128, output_size=100):
        super().__init__()
        # The attribute names are also the state_dict keys, so they stay fc1/fc2/fc3.
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.fc3 = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Return the raw output of the final linear layer for input ``x``."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden)

class DQNAgent:
    """Epsilon-greedy DQN agent with a replay buffer and a target network.

    Args:
        lr: Learning rate passed to the Adam optimizer.
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Probability that ``act`` picks a random valid action.
    """

    def __init__(self, lr=0.001, gamma=0.95, epsilon=0.1):
        self.q_net = DQN()
        self.target_net = DQN()
        self.optimizer = optim.Adam(self.q_net.parameters(), lr=lr)
        # Bounded buffer: once 10000 transitions are stored, the oldest are dropped.
        self.memory = deque(maxlen=10000)
        self.gamma = gamma
        self.epsilon = epsilon
        # Start the target network as an exact copy of the online network.
        # Otherwise every replay step before the first update_target() call
        # bootstraps from an unrelated, randomly initialised network.
        self.update_target()

    def act(self, state, valid_actions):
        """Choose one entry of ``valid_actions`` for ``state``.

        With probability ``self.epsilon`` a random valid action is returned;
        otherwise the valid action with the highest predicted Q-value is.

        Args:
            state: Observation accepted by the Q-network (array-like of numbers).
            valid_actions: Non-empty list of action indices allowed in ``state``.

        Returns:
            The chosen element of ``valid_actions``.
        """
        if random.random() < self.epsilon:
            return random.choice(valid_actions)

        with torch.no_grad():
            q_values = self.q_net(torch.as_tensor(state, dtype=torch.float32))
        # Restrict the argmax to the valid actions, then map the position
        # inside that subset back to the real action index.
        best_position = q_values[valid_actions].argmax().item()
        return valid_actions[best_position]

    def remember(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition."""
        self.memory.append((state, action, reward, next_state, done))

    def replay(self, batch_size=32):
        """Run one gradient step on a random batch from the replay buffer.

        Does nothing while fewer than ``batch_size`` transitions are stored.

        Args:
            batch_size: Number of transitions sampled for the update.
        """
        if len(self.memory) < batch_size:
            return

        batch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        # Explicit dtypes instead of the legacy FloatTensor/LongTensor constructors.
        states = torch.as_tensor(np.asarray(states), dtype=torch.float32)
        actions = torch.as_tensor(np.asarray(actions), dtype=torch.int64)
        rewards = torch.as_tensor(np.asarray(rewards), dtype=torch.float32)
        next_states = torch.as_tensor(np.asarray(next_states), dtype=torch.float32)
        dones = torch.as_tensor(np.asarray(dones), dtype=torch.float32)

        # squeeze(1) drops only the gather dimension, so the result keeps shape
        # (batch_size,) even when batch_size == 1. A bare squeeze() would yield
        # a 0-d tensor there and mismatch the target's shape.
        current_q = self.q_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # NOTE(review): the max runs over every network output, not only the
        # actions valid in next_state, because transitions do not record which
        # actions are valid - confirm whether masking is wanted.
        with torch.no_grad():
            next_q = self.target_net(next_states).max(dim=1).values
        # Terminal transitions (done == 1) contribute the reward only.
        target_q = rewards + (1 - dones) * self.gamma * next_q

        loss = nn.MSELoss()(current_q, target_q)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target(self):
        """Copy the online network's weights into the target network."""
        self.target_net.load_state_dict(self.q_net.state_dict())

## Training

In [None]:
# Train the agent; it selects every move of every episode
env = TessellateEnv(reward_mode='mixed')
agent = DQNAgent(epsilon=0.3)
episodes = 100
rewards = []

for ep in range(episodes):
    state = env.reset()
    total_reward = 0
    done = False

    # An episode stops when the environment reports done or offers no valid action.
    while not done:
        valid = env.get_valid_actions()
        if not valid:
            break

        action = agent.act(state, valid)
        next_state, reward, done, _ = env.step(action)

        # Store the transition, then run a replay update on the buffer.
        agent.remember(state, action, reward, next_state, done)
        agent.replay()

        total_reward += reward
        state = next_state

    rewards.append(total_reward)

    # Every 10th episode (episode 0 included): sync the target network
    # and shrink the exploration rate by 10%.
    if ep % 10 == 0:
        agent.update_target()
        agent.epsilon *= 0.9

    # Progress report over the most recent (up to 20) episodes.
    if ep % 20 == 0:
        print(f"Episode {ep}: Avg Reward = {np.mean(rewards[-20:]):.3f}")

print(f"\nFinal avg reward: {np.mean(rewards[-20:]):.3f}")

## Evaluation

In [None]:
# Test trained agent vs random
def evaluate(agent, n_games=20):
    """Play games against a uniformly random opponent and return the win rate.

    The agent moves whenever ``env.game.current_turn == 1`` (RED); every other
    turn is played by picking a random valid action (BLUE). A game counts as a
    win only when the step that ends it reports ``info['winner'] == 1``.

    Args:
        agent: Object exposing ``act(state, valid_actions)``.
        n_games: Number of games to play.

    Returns:
        Percentage of games won by the agent as a float, or ``0.0`` when
        ``n_games`` is not positive (no games are played in that case).
    """
    # Guard against division by zero (and a "-0.0" result for negative counts).
    if n_games <= 0:
        return 0.0

    wins = 0

    for _ in range(n_games):
        # A fresh environment per game keeps games independent of each other.
        env = TessellateEnv()
        state = env.reset()

        while not env.is_terminal():
            valid = env.get_valid_actions()
            if not valid:
                break

            # Agent plays RED (player 1); random plays BLUE.
            if env.game.current_turn == 1:
                action = agent.act(state, valid)
            else:
                action = random.choice(valid)

            state, _, done, info = env.step(action)

            if done:
                if info['winner'] == 1:
                    wins += 1
                break

    return wins / n_games * 100

# With epsilon at 0 the random branch in agent.act never fires, so the agent
# always plays its highest-valued valid action. The assignment is not undone.
agent.epsilon = 0  # No exploration during evaluation
# evaluate() returns a percentage (wins / games * 100); uses its default game count.
win_rate = evaluate(agent)
print(f"Win rate vs random: {win_rate:.1f}%")

## Save & Next Steps

In [None]:
# Persist the online Q-network's weights (state_dict only, not the whole agent)
weights_file = 'tessellate_dqn.pt'
torch.save(agent.q_net.state_dict(), weights_file)
print("Model saved to tessellate_dqn.pt")

# Suggestions for follow-up experiments, printed one per line
next_steps = (
    "1. Try different reward modes: 'sparse', 'immediate', 'mixed'",
    "2. Implement self-play training",
    "3. Add PPO or A3C algorithms",
    "4. Visualize games: python -m http.server 8000 → browser.html",
)
print("\nNext steps:")
for suggestion in next_steps:
    print(suggestion)