# Alpha2048 - Training in Google Colab

This notebook demonstrates how to train a DQN agent to play 2048 using the alpha2048 package.

## 1. Install the Package

Install the package directly from its GitHub repository:

In [None]:
!pip install git+https://github.com/nnaakkaaii/alpha2048.git#subdirectory=python

## 2. Import Required Libraries

In [None]:
import random
import warnings

import matplotlib.pyplot as plt
import numpy as np
import torch
from IPython.display import clear_output

warnings.filterwarnings('ignore')

# Import alpha2048 components
from pkg.agents.dqn_agent import DQNAgent
from pkg.environments.game_2048_env import Game2048Env
from pkg.networks.cnn import CNN
from pkg.networks.dqn import DQN
from pkg.utils.replay_memory import ReplayMemory
from pkg.utils.state import get_state_flatten, get_state_one_hot, get_state_cnn

# Report the runtime: the PyTorch build and, when one is visible, the GPU in use
cuda_ok = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

## 3. Configure Training Parameters

In [None]:
# Training configuration: every tunable hyperparameter of the run lives in
# this single dict so later cells read their settings from one place.
config = {
    'episodes': 1000,           # Number of training episodes
    'batch_size': 64,           # Batch size for training
    'lr': 1e-4,                 # Learning rate
    'gamma': 0.99,              # Discount factor
    'epsilon_start': 1.0,       # Starting exploration rate
    'epsilon_end': 0.01,        # Minimum exploration rate
    'epsilon_decay': 0.995,     # Exploration decay rate
    'target_update': 10,        # Target network update frequency (in episodes)
    'memory_size': 10000,       # Experience replay buffer size
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    'network_type': 'cnn',      # 'mlp' or 'cnn'
    'use_double_dqn': True,     # Use Double DQN
    'seed': 42,                 # Seed for the Python / NumPy / PyTorch RNGs
}

# Seed the global random number generators so a fresh "Restart & Run All"
# starts from the same random state every time.
# NOTE(review): this assumes the environment and agent draw their randomness
# from these global generators - confirm against the alpha2048 package.
random.seed(config['seed'])
np.random.seed(config['seed'])
torch.manual_seed(config['seed'])

print(f"Training configuration:")
for key, value in config.items():
    print(f"  {key}: {value}")

## 4. Create Environment and Agent

In [None]:
# Build the 2048 environment the agent will interact with
env = Game2048Env()

# Pick the state encoding that matches the network: the CNN takes a
# (channels, height, width) input, every other setting gets the flat one-hot
# vector (16 positions × 16 possible values = 256).
use_cnn = config['network_type'] == 'cnn'
state_size = (16, 4, 4) if use_cnn else 256
get_state_fn = get_state_cnn if use_cnn else get_state_one_hot

# Collect the constructor arguments first, then build the agent in one call
agent_kwargs = dict(
    state_size=state_size,
    action_size=4,
    lr=config['lr'],
    gamma=config['gamma'],
    epsilon=config['epsilon_start'],
    epsilon_min=config['epsilon_end'],
    epsilon_decay=config['epsilon_decay'],
    memory_size=config['memory_size'],
    batch_size=config['batch_size'],
    target_update=config['target_update'],
    device=config['device'],
    network_type=config['network_type'],
    use_double_dqn=config['use_double_dqn'],
)
agent = DQNAgent(**agent_kwargs)

print(f"Agent created with {config['network_type'].upper()} network")
print(f"State size: {state_size}")
print(f"Action size: 4")

## 5. Training Loop

In [None]:
def plot_training_progress(scores, max_tiles, epsilon_values, losses, loss_window=1000):
    """Draw a 2x2 dashboard of the training metrics collected so far.

    Args:
        scores: Final game score of each finished episode.
        max_tiles: Largest tile of each finished episode.
        epsilon_values: Exploration rate recorded after each episode.
        losses: Loss value of each training step; may be empty.
        loss_window: Number of most recent losses shown in the loss panel.
    """
    fig, axes = plt.subplots(2, 2, figsize=(12, 8))

    # The three per-episode panels share the same layout.
    per_episode_panels = [
        (axes[0, 0], scores, 'Game Scores', 'Score'),
        (axes[0, 1], max_tiles, 'Maximum Tiles', 'Max Tile'),
        (axes[1, 0], epsilon_values, 'Exploration Rate (Epsilon)', 'Epsilon'),
    ]
    for ax, series, title, ylabel in per_episode_panels:
        ax.plot(series)
        ax.set_title(title)
        ax.set_xlabel('Episode')
        ax.set_ylabel(ylabel)

    # The loss panel stays empty until at least one training step has run.
    if losses:
        axes[1, 1].plot(losses[-loss_window:])
        axes[1, 1].set_title(f'Training Loss (Last {loss_window})')
        axes[1, 1].set_xlabel('Training Step')
        axes[1, 1].set_ylabel('Loss')

    plt.tight_layout()
    plt.show()
    # Close explicitly so figures created inside the training loop do not
    # accumulate on backends that keep them open after show().
    plt.close(fig)


# Training metrics
scores = []
max_tiles = []
epsilon_values = []
losses = []

LOG_EVERY = 10     # Print a text summary every N episodes
PLOT_EVERY = 100   # Redraw the dashboard every N episodes

# Training loop
for episode in range(config['episodes']):
    state = env.reset()
    state = get_state_fn(state)
    total_reward = 0
    done = False

    while not done:
        # Choose action
        action = agent.act(state)

        # Take action in environment
        next_state, reward, done, info = env.step(action)
        next_state = get_state_fn(next_state)

        # Store experience
        agent.remember(state, action, reward, next_state, done)

        # Update state
        state = next_state
        total_reward += reward

        # Train agent once the buffer holds more than one batch of experience
        if len(agent.memory) > config['batch_size']:
            loss = agent.replay()
            if loss is not None:
                losses.append(loss)

    # Update target network (frequency is counted in episodes)
    if episode % config['target_update'] == 0:
        agent.update_target_network()

    # Update epsilon
    agent.update_epsilon()

    # Record metrics from the info dict of the episode's last step
    scores.append(info['score'])
    max_tiles.append(info['max_tile'])
    epsilon_values.append(agent.epsilon)

    # Print progress
    if (episode + 1) % LOG_EVERY == 0:
        avg_score = np.mean(scores[-LOG_EVERY:])
        avg_max_tile = np.mean(max_tiles[-LOG_EVERY:])
        print(f"Episode {episode + 1}/{config['episodes']}")
        print(f"  Avg Score: {avg_score:.0f}")
        print(f"  Avg Max Tile: {avg_max_tile:.0f}")
        print(f"  Epsilon: {agent.epsilon:.3f}")
        if losses:
            print(f"  Avg Loss: {np.mean(losses[-100:]):.4f}")
        print()

    # Plot progress; clearing first replaces the previous dashboard and log
    if (episode + 1) % PLOT_EVERY == 0:
        clear_output(wait=True)
        plot_training_progress(scores, max_tiles, epsilon_values, losses)

print("Training completed!")
if scores:
    print(f"Final average score (last 100 episodes): {np.mean(scores[-100:]):.0f}")
    print(f"Final average max tile (last 100 episodes): {np.mean(max_tiles[-100:]):.0f}")
    print(f"Best score achieved: {max(scores):.0f}")
    print(f"Best tile achieved: {max(max_tiles):.0f}")
else:
    # With zero episodes configured there is nothing to average, and max() would raise.
    print("No episodes were run; nothing to summarize.")

## 6. Save the Trained Model

In [None]:
# Save the model
model_path = 'alpha2048_model.pth'

# Metrics and epsilon are converted to builtin Python numbers before saving.
# NumPy scalars would be pickled as objects, which torch.load rejects when
# weights_only=True is in effect (the default in recent PyTorch releases).
checkpoint = {
    'model_state_dict': agent.q_network.state_dict(),
    'optimizer_state_dict': agent.optimizer.state_dict(),
    'epsilon': float(agent.epsilon),
    'config': config,
    'scores': np.asarray(scores).tolist(),
    'max_tiles': np.asarray(max_tiles).tolist(),
}
torch.save(checkpoint, model_path)

print(f"Model saved to {model_path}")

# Download the model (for Colab); outside Colab the import fails and the
# file simply stays on local disk.
try:
    from google.colab import files
    files.download(model_path)
    print("Model downloaded!")
except ImportError:
    print("Not running in Colab, model saved locally.")

## 7. Test the Trained Agent

In [None]:
def test_agent(agent, env, num_games=10, visualize=True):
    """Test the trained agent."""
    test_scores = []
    test_max_tiles = []
    
    # Set agent to evaluation mode (no exploration)
    original_epsilon = agent.epsilon
    agent.epsilon = 0.0
    
    for game in range(num_games):
        state = env.reset()
        state = get_state_fn(state)
        done = False
        
        if visualize and game == 0:  # Visualize first game
            print(f"\nGame {game + 1}:")
            print("Initial board:")
            env.render()
        
        step = 0
        while not done:
            action = agent.act(state)
            next_state, reward, done, info = env.step(action)
            next_state = get_state_fn(next_state)
            state = next_state
            step += 1
            
            if visualize and game == 0 and (step % 50 == 0 or done):
                print(f"\nStep {step}, Action: {['Up', 'Down', 'Left', 'Right'][action]}")
                print(f"Score: {info['score']}, Max Tile: {info['max_tile']}")
                if not done:
                    env.render()
        
        test_scores.append(info['score'])
        test_max_tiles.append(info['max_tile'])
        
        if visualize and game == 0:
            print("\nFinal board:")
            env.render()
        
        print(f"Game {game + 1}: Score = {info['score']}, Max Tile = {info['max_tile']}")
    
    # Restore original epsilon
    agent.epsilon = original_epsilon
    
    print(f"\nTest Results ({num_games} games):")
    print(f"Average Score: {np.mean(test_scores):.0f} ± {np.std(test_scores):.0f}")
    print(f"Average Max Tile: {np.mean(test_max_tiles):.0f} ± {np.std(test_max_tiles):.0f}")
    print(f"Best Score: {max(test_scores)}")
    print(f"Best Tile: {max(test_max_tiles)}")
    
    return test_scores, test_max_tiles

# Evaluate the trained agent over ten games, rendering the first one
test_scores, test_max_tiles = test_agent(
    agent=agent,
    env=env,
    num_games=10,
    visualize=True,
)

## 8. Visualize Test Performance

In [None]:
# Create performance visualization
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Test scores: one bar per game plus a dashed line at the mean
mean_score = np.mean(test_scores)
axes[0].bar(range(len(test_scores)), test_scores)
axes[0].axhline(y=mean_score, color='r', linestyle='--', label=f'Mean: {mean_score:.0f}')
axes[0].set_title('Test Game Scores')
axes[0].set_xlabel('Game')
axes[0].set_ylabel('Score')
axes[0].legend()

# Max tiles distribution. The tiles are drawn as categories rather than on a
# log-scaled numeric axis: bar widths are in data units, so a default-width
# bar at x=512 on a log axis shrinks to a barely visible hairline.
unique_tiles, counts = np.unique(test_max_tiles, return_counts=True)
tile_labels = [str(int(tile)) for tile in unique_tiles]
axes[1].bar(tile_labels, counts)
axes[1].set_title('Maximum Tile Distribution')
axes[1].set_xlabel('Max Tile')
axes[1].set_ylabel('Count')

plt.tight_layout()
plt.show()

# Print tile achievement statistics: share of games whose best tile reached
# at least each milestone
print("\nTile Achievement Statistics:")
num_test_games = len(test_max_tiles)
for tile in [256, 512, 1024, 2048, 4096]:
    count = sum(1 for t in test_max_tiles if t >= tile)
    # Guard the division so an empty result list reports 0% instead of raising
    percentage = (count / num_test_games) * 100 if num_test_games else 0.0
    print(f"  Reached {tile}: {count}/{num_test_games} ({percentage:.1f}%)")

## 9. Load and Continue Training (Optional)

In [None]:
# Load a saved model to continue training
def load_model(agent, model_path):
    """Restore an agent from a checkpoint file and return its saved metrics.

    Loads the online network weights, the optimizer state and the exploration
    rate, then asks the agent to refresh its target network so resumed
    training does not bootstrap from the target network's pre-load weights.

    Args:
        agent: Agent exposing ``device``, ``q_network``, ``optimizer``,
            ``epsilon`` and ``update_target_network()``.
        model_path: Path of a checkpoint containing the keys
            ``'model_state_dict'``, ``'optimizer_state_dict'`` and
            ``'epsilon'``; ``'scores'`` and ``'max_tiles'`` are optional.

    Returns:
        Tuple ``(scores, max_tiles)`` read from the checkpoint; each is an
        empty list when the checkpoint does not contain it.
    """
    # NOTE: torch.load unpickles the file - only load checkpoints you trust.
    checkpoint = torch.load(model_path, map_location=agent.device)
    agent.q_network.load_state_dict(checkpoint['model_state_dict'])
    agent.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    agent.epsilon = checkpoint['epsilon']

    # The checkpoint holds only the online network. Without this call the
    # target network keeps its old weights until the next scheduled update.
    agent.update_target_network()

    print(f"Model loaded from {model_path}")
    print(f"Epsilon restored to: {agent.epsilon:.3f}")
    return checkpoint.get('scores', []), checkpoint.get('max_tiles', [])

# Example: Load and continue training
# scores, max_tiles = load_model(agent, 'alpha2048_model.pth')
# Continue training from here...

## 10. Hyperparameter Tuning Tips

- **Learning Rate**: Start with 1e-4, adjust based on loss convergence
- **Batch Size**: Larger batches (64-128) for more stable training
- **Epsilon Decay**: Slower decay (0.995-0.999) for better exploration
- **Network Type**: CNN generally performs better for 2048
- **Memory Size**: Larger buffer (10000+) for more diverse experiences
- **Target Update**: More frequent updates (every 5-10 episodes) for faster learning