In [1]:
import gymnasium as gym
import gymnasium_2048
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque, defaultdict, Counter
import random

# --- Setup --- 
# create and fully unwrap (removes TimeLimit wrapper)
env = gym.make("gymnasium_2048/TwentyFortyEight-v0")
env = env.unwrapped

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Hyperparameters ---
GAMMA = 0.99
LR = 1e-3
EPSILON_START = 1.0
EPSILON_END = 0.05
EPSILON_DECAY = 0.995
BATCH_SIZE = 64
MEMORY_SIZE = 10000
TARGET_UPDATE_FREQ = 10
ROLLING_WINDOW = 20
CONVERGENCE_SCORE = 2000
MAX_EPISODES = 10000
WARMUP_MEMORY = 1000
DEBUG_EPISODES = 0

ACTION_MAP = {0: "Up", 1: "Right", 2: "Down", 3: "Left"}

# --- DQN Model ---
class DQN(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(16, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, 4)
        )
    def forward(self, x):
        return self.net(x)

# --- Preprocess ---
def preprocess(state):
    # same as before: log2 scaling into float32
    if state.ndim == 3:
        exp_board = np.argmax(state, axis=2)
        board = exp_board.astype(np.float32)
    elif state.ndim == 2:
        board = np.where(state == 0, 0, np.log2(state)).astype(np.float32)
    else:
        raise ValueError(f"Unexpected state shape: {state.shape}")
    return torch.tensor(board.flatten(), dtype=torch.float32, device=device)

# --- Decode for debug & int‐casting to avoid uint8 wraparound ---
def decode_obs(obs):
    if obs.ndim == 3:
        exp_board = np.argmax(obs, axis=2)
        # cast to int32 so merges >256 don’t wrap
        return np.where(exp_board == 0, 0, 2 ** exp_board).astype(np.int32)
    # obs may be uint8; cast it
    return obs.astype(np.int32)

# --- Slide logic for valid moves ---
def slide_row(row):
    nonzeros = [int(v) for v in row if v]  # ensure Python ints
    merged = []
    i = 0
    while i < len(nonzeros):
        if i + 1 < len(nonzeros) and nonzeros[i] == nonzeros[i+1]:
            merged.append(nonzeros[i] * 2)
            i += 2
        else:
            merged.append(nonzeros[i])
            i += 1
    merged += [0] * (len(row) - len(merged))
    return merged

def slide(board, action):
    # board is assumed int32 numpy array
    b = board.copy().astype(np.int32)
    if action == 0:  # Up
        b = b.T
        for i in range(4):
            b[i] = slide_row(list(b[i]))
        b = b.T
    elif action == 2:  # Down
        b = b.T
        for i in range(4):
            rev = list(reversed(b[i]))
            slid = slide_row(rev)
            b[i] = list(reversed(slid))
        b = b.T
    elif action == 3:  # Left
        for i in range(4):
            b[i] = slide_row(list(b[i]))
    elif action == 1:  # Right
        for i in range(4):
            rev = list(reversed(b[i]))
            slid = slide_row(rev)
            b[i] = list(reversed(slid))
    return b

# --- Valid actions without env.step ---
def valid_actions(obs):
    board = decode_obs(obs)
    # try each action on integer board
    return [a for a in range(4) if not np.array_equal(board, slide(board, a))]

# --- Select action ---
def sample_action(state, obs, explore=True):
    valids = valid_actions(obs)
    if not valids:
        return None
    if explore and random.random() < epsilon:
        return random.choice(valids)
    with torch.no_grad():
        q_vals = policy_net(state).cpu().numpy()
        return max(valids, key=lambda a: q_vals[a])

# --- Networks & memory ---
policy_net = DQN().to(device)
target_net = DQN().to(device)
target_net.load_state_dict(policy_net.state_dict())
optimizer = optim.Adam(policy_net.parameters(), lr=LR)
memory    = deque(maxlen=MEMORY_SIZE)

epsilon = EPSILON_START
score_history = []
move_counter_global = defaultdict(int)
episode_bucket_counts = []

# --- Optimize ---
def optimize():
    if len(memory) < BATCH_SIZE:
        return
    batch = random.sample(memory, BATCH_SIZE)
    states, actions, rewards, next_states, dones = zip(*batch)
    states      = torch.stack(states)
    actions     = torch.tensor(actions).unsqueeze(1).to(device)
    rewards     = torch.tensor(rewards).float().to(device)
    next_states = torch.stack(next_states)
    dones       = torch.tensor(dones).float().to(device)

    q      = policy_net(states).gather(1, actions).squeeze()
    q_next = target_net(next_states).max(1)[0].detach()
    target = rewards + GAMMA * q_next * (1 - dones)

    loss = nn.functional.mse_loss(q, target)
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(policy_net.parameters(), 1.0)
    optimizer.step()

# --- Training Loop ---
global_max_tile = 0
episode = 0
episode_max_tile = 0
while episode < MAX_EPISODES:
    obs, _ = env.reset()
    # track the largest tile this episode
    episode_max_tile = 0
    state   = preprocess(obs)
    running_score = 0
    move_counter_episode = defaultdict(int)

    step = 0
    while True:
        action = sample_action(state, obs)
        if action is None:
            break

        move_counter_episode[ACTION_MAP[action]] += 1
        move_counter_global[ACTION_MAP[action]] += 1

        next_obs, reward, terminated, truncated, info = env.step(action)
        # decode board into ints and update max‐tile
        board = decode_obs(next_obs)
        episode_max_tile = max(episode_max_tile, int(board.max()))
        done = terminated or truncated
        next_state = preprocess(next_obs)

        memory.append((state, action, reward, next_state, done))
        state, obs = next_state, next_obs

        # force Python int accumulation
        running_score = int(running_score) + int(reward)

        if len(memory) > WARMUP_MEMORY:
            optimize()
        if done:
            break
        step += 1

    # Episode summary
    print(f"Episode {episode}, Score:{running_score}, MaxTile:{episode_max_tile}, Epsilon:{epsilon:.3f}")

    # Update global max tile
    global_max_tile = max(global_max_tile, episode_max_tile)

    # Target network update
    if episode % TARGET_UPDATE_FREQ == 0:
        target_net.load_state_dict(policy_net.state_dict())

    epsilon = max(EPSILON_END, epsilon * EPSILON_DECAY)
    score_history.append(running_score)

    # Check convergence
    if (episode >= ROLLING_WINDOW and
        np.mean(score_history[-ROLLING_WINDOW:]) >= CONVERGENCE_SCORE):
        print(f"\n✅ Converged at episode {episode}")
        break

    episode += 1

# Print best max tile over all episodes
print(f"🏆 Best MaxTile over all episodes: {global_max_tile}")



  self.total_score += self.step_score


Episode 0, Score:1252, MaxTile:128, Epsilon:1.000
Episode 1, Score:988, MaxTile:64, Epsilon:0.995
Episode 2, Score:828, MaxTile:64, Epsilon:0.990
Episode 3, Score:1568, MaxTile:128, Epsilon:0.985
Episode 4, Score:1872, MaxTile:128, Epsilon:0.980
Episode 5, Score:2048, MaxTile:128, Epsilon:0.975
Episode 6, Score:596, MaxTile:64, Epsilon:0.970
Episode 7, Score:1204, MaxTile:128, Epsilon:0.966
Episode 8, Score:720, MaxTile:64, Epsilon:0.961
Episode 9, Score:388, MaxTile:32, Epsilon:0.956
Episode 10, Score:1272, MaxTile:128, Epsilon:0.951
Episode 11, Score:328, MaxTile:32, Epsilon:0.946
Episode 12, Score:1300, MaxTile:128, Epsilon:0.942
Episode 13, Score:532, MaxTile:64, Epsilon:0.937
Episode 14, Score:1752, MaxTile:128, Epsilon:0.932
Episode 15, Score:600, MaxTile:64, Epsilon:0.928
Episode 16, Score:1444, MaxTile:128, Epsilon:0.923
Episode 17, Score:1344, MaxTile:128, Epsilon:0.918
Episode 18, Score:1036, MaxTile:128, Epsilon:0.914
Episode 19, Score:1312, MaxTile:128, Epsilon:0.909
Episod

  score += 2 ** (board[row, col] + 1)


Episode 8185, Score:1768, MaxTile:128, Epsilon:0.050
Episode 8186, Score:2036, MaxTile:128, Epsilon:0.050
Episode 8187, Score:672, MaxTile:64, Epsilon:0.050
Episode 8188, Score:1224, MaxTile:128, Epsilon:0.050
Episode 8189, Score:1304, MaxTile:128, Epsilon:0.050
Episode 8190, Score:828, MaxTile:64, Epsilon:0.050
Episode 8191, Score:756, MaxTile:64, Epsilon:0.050
Episode 8192, Score:1464, MaxTile:128, Epsilon:0.050
Episode 8193, Score:1192, MaxTile:64, Epsilon:0.050
Episode 8194, Score:276, MaxTile:32, Epsilon:0.050
Episode 8195, Score:616, MaxTile:64, Epsilon:0.050
Episode 8196, Score:516, MaxTile:32, Epsilon:0.050
Episode 8197, Score:2452, MaxTile:256, Epsilon:0.050
Episode 8198, Score:1456, MaxTile:128, Epsilon:0.050
Episode 8199, Score:2328, MaxTile:128, Epsilon:0.050
Episode 8200, Score:1064, MaxTile:64, Epsilon:0.050
Episode 8201, Score:412, MaxTile:32, Epsilon:0.050
Episode 8202, Score:516, MaxTile:32, Epsilon:0.050
Episode 8203, Score:920, MaxTile:64, Epsilon:0.050
Episode 8204,

In [2]:
# --- Final Stats ---
print("\n=== Move Frequencies Overall ===")
for m in ACTION_MAP.values():
    print(f"{m}: {move_counter_global[m]}")

# --- Evaluation (greedy, uncapped) ---
eval_scores, tile_counts, move_counter_eval = [], Counter(), defaultdict(int)
eval_max_tiles = []
for _ in range(100):
    obs, _ = env.reset()
    state = preprocess(obs)
    total = 0
    episode_max_tile = 0
    while True:
        action = sample_action(state, obs, explore=False)
        if action is None:
            break
        move_counter_eval[ACTION_MAP[action]] += 1

        old_board = decode_obs(obs)
        obs, r, term, trunc, _ = env.step(action)
        state = preprocess(obs)
        total = int(total) + int(r)
        
        # update episode max tile
        board = decode_obs(obs)
        episode_max_tile = max(episode_max_tile, int(board.max()))

        # stop only on true terminal
        if term or trunc or np.array_equal(decode_obs(obs), old_board):
            break

    eval_scores.append(int(total))
    tile_counts[2**int(np.max(decode_obs(obs)))] += 1
    eval_max_tiles.append(episode_max_tile)

best_tile = max(eval_max_tiles)
print(f"Eval Avg: {np.mean(eval_scores):.2f}, Max: {np.max(eval_scores)}")
print("Eval Tiles:", tile_counts)
print("Eval Moves:", move_counter_eval)
print("Per-Game MaxTiles:", eval_max_tiles)
print("Best MaxTile over 100 games:", best_tile)


=== Move Frequencies Overall ===
Up: 299912
Right: 315868
Down: 317975
Left: 309964
Eval Avg: 1442.64, Max: 3988
Eval Tiles: Counter({340282366920938463463374607431768211456: 51, 18446744073709551616: 28, 115792089237316195423570985008687907853269984665640564039457584007913129639936: 13, 4294967296: 6, 65536: 2})
Eval Moves: defaultdict(<class 'int'>, {'Down': 6919, 'Left': 3046, 'Up': 2282, 'Right': 2849})
Per-Game MaxTiles: [128, 64, 64, 256, 64, 128, 128, 32, 16, 128, 128, 64, 256, 128, 64, 64, 128, 128, 128, 128, 128, 16, 256, 32, 128, 256, 128, 128, 64, 32, 128, 128, 64, 128, 128, 32, 32, 64, 128, 64, 128, 128, 128, 128, 64, 64, 128, 128, 64, 64, 128, 128, 256, 64, 128, 128, 128, 128, 256, 64, 128, 128, 64, 256, 64, 128, 64, 256, 128, 128, 256, 32, 128, 64, 256, 128, 128, 256, 128, 256, 64, 128, 64, 128, 128, 128, 128, 128, 128, 64, 64, 256, 64, 128, 64, 64, 64, 128, 128, 128]
Best MaxTile over 100 games: 256


du_ma = np.convolve(dqn_scores, np.ones(100)/100, mode='valid')
print(du_ma.max())