In [5]:
import gymnasium as gym
import gymnasium_2048
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque, defaultdict, Counter
import random

# --- Setup ---
env = gym.make("gymnasium_2048/TwentyFortyEight-v0")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Hyperparameters ---
GAMMA = 0.99
LR = 1e-3
EPSILON_START = 1.0
EPSILON_END = 0.05
EPSILON_DECAY = 0.995
BATCH_SIZE = 64
MEMORY_SIZE = 10000
TARGET_UPDATE_FREQ = 10
ROLLING_WINDOW = 20
CONVERGENCE_SCORE = 500
MAX_EPISODES = 500

ACTION_MAP = {0: "Up", 1: "Right", 2: "Down", 3: "Left"}

# --- DQN Model ---
class DQN(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 4)
        )
    def forward(self, x):
        return self.net(x)

def preprocess(state):
    return torch.tensor(np.reshape(state, -1), dtype=torch.float32).to(device)

policy_net = DQN().to(device)
target_net = DQN().to(device)
target_net.load_state_dict(policy_net.state_dict())
optimizer = optim.Adam(policy_net.parameters(), lr=LR)
memory = deque(maxlen=MEMORY_SIZE)

epsilon = EPSILON_START
score_history = []
move_counter_global = defaultdict(int)
episode_bucket_counts = []

def sample_action(state, explore=True):
    if explore and np.random.rand() < epsilon:
        return env.action_space.sample()
    with torch.no_grad():
        return torch.argmax(policy_net(state)).item()

def optimize():
    if len(memory) < BATCH_SIZE:
        return
    batch = random.sample(memory, BATCH_SIZE)
    states, actions, rewards, next_states, dones = zip(*batch)

    states = torch.stack(states)
    actions = torch.tensor(actions).unsqueeze(1).to(device)
    rewards = torch.tensor(rewards).float().to(device)
    next_states = torch.stack(next_states)
    dones = torch.tensor(dones).float().to(device)

    q_values = policy_net(states).gather(1, actions).squeeze()
    next_q_values = target_net(next_states).max(1)[0].detach()
    target = rewards + GAMMA * next_q_values * (1 - dones)

    loss = nn.functional.mse_loss(q_values, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# --- Training Loop ---
episode = 0
while episode < MAX_EPISODES:
    obs, _ = env.reset()
    state = preprocess(obs)
    total_reward = 0
    move_counter_episode = defaultdict(int)

    while True:
        action = sample_action(state)
        move_name = ACTION_MAP[action]
        move_counter_global[move_name] += 1
        move_counter_episode[move_name] += 1

        next_obs, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        next_state = preprocess(next_obs)
        memory.append((state, action, reward, next_state, done))
        state = next_state
        total_reward += reward

        optimize()
        if done:
            break

    if episode % TARGET_UPDATE_FREQ == 0:
        target_net.load_state_dict(policy_net.state_dict())

    epsilon = max(EPSILON_END, epsilon * EPSILON_DECAY)
    score_history.append(total_reward)
    print(f"Episode {episode}, Score: {total_reward:.2f}, Epsilon: {epsilon:.3f}")

    # Check convergence
    if episode >= ROLLING_WINDOW:
        avg_score = np.mean(score_history[-ROLLING_WINDOW:])
        if avg_score >= CONVERGENCE_SCORE:
            print(f"\n✅ Training converged at episode {episode} with avg score {avg_score:.2f}")
            break

    if (episode + 1) % 100 == 0:
        episode_bucket_counts.append(dict(move_counter_global))

    episode += 1

# --- Final Move Stats ---
print("\n=== Final Move Frequency (Total) ===")
for move in ACTION_MAP.values():
    print(f"{move}: {move_counter_global[move]}")

print("\n=== Moves Per 100 Episodes ===")
for i, snapshot in enumerate(episode_bucket_counts):
    print(f"--- Episodes {i * 100 + 1} to {(i + 1) * 100} ---")
    if i == 0:
        base = snapshot
    else:
        base = {
            move: snapshot[move] - episode_bucket_counts[i - 1].get(move, 0)
            for move in ACTION_MAP.values()
        }
    for move in ACTION_MAP.values():
        print(f"{move}: {base[move]}")
    print()

# --- Evaluation ---
print("\n🔍 Running 100 Evaluation Episodes (no exploration)...")
eval_scores = []
tile_counts = Counter()
move_counter_eval = defaultdict(int)

for _ in range(100):
    obs, _ = env.reset()
    state = preprocess(obs)
    total_reward = 0
    while True:
        action = sample_action(state, explore=False)
        move_counter_eval[ACTION_MAP[action]] += 1
        next_obs, reward, terminated, truncated, _ = env.step(action)
        state = preprocess(next_obs)
        total_reward += reward
        if terminated or truncated:
            break
    eval_scores.append(total_reward)
    final_board = np.argmax(next_obs, axis=2)  # one-hot to tile value
    tile_counts[2 ** np.max(final_board)] += 1

print(f"\n✅ Evaluation complete. Results over 100 episodes:")
print(f"Average Score: {np.mean(eval_scores):.2f}")
print(f"Max Score: {np.max(eval_scores):.2f}")
print("\nMax Tile Achieved (frequency):")
for tile, count in sorted(tile_counts.items(), reverse=True):
    print(f"{tile}: {count} times")

print("\nEvaluation Move Distribution:")
for move in ACTION_MAP.values():
    print(f"{move}: {move_counter_eval[move]}")

  total_reward += reward


Episode 0, Score: 104.00, Epsilon: 0.995
Episode 1, Score: 196.00, Epsilon: 0.990
Episode 2, Score: 56.00, Epsilon: 0.985
Episode 3, Score: 248.00, Epsilon: 0.980
Episode 4, Score: 240.00, Epsilon: 0.975
Episode 5, Score: 160.00, Epsilon: 0.970
Episode 6, Score: 172.00, Epsilon: 0.966
Episode 7, Score: 4.00, Epsilon: 0.961
Episode 8, Score: 64.00, Epsilon: 0.956
Episode 9, Score: 52.00, Epsilon: 0.951
Episode 10, Score: 168.00, Epsilon: 0.946
Episode 11, Score: 44.00, Epsilon: 0.942
Episode 12, Score: 252.00, Epsilon: 0.937
Episode 13, Score: 96.00, Epsilon: 0.932
Episode 14, Score: 236.00, Epsilon: 0.928
Episode 15, Score: 92.00, Epsilon: 0.923
Episode 16, Score: 172.00, Epsilon: 0.918
Episode 17, Score: 4.00, Epsilon: 0.914
Episode 18, Score: 196.00, Epsilon: 0.909
Episode 19, Score: 0.00, Epsilon: 0.905
Episode 20, Score: 212.00, Epsilon: 0.900
Episode 21, Score: 216.00, Epsilon: 0.896
Episode 22, Score: 36.00, Epsilon: 0.891
Episode 23, Score: 240.00, Epsilon: 0.887
Episode 24, Sco

KeyboardInterrupt: 