In [None]:
!pip install gymnasium==0.29.1 pygame stable-baselines3[extra] sb3-contrib --quiet

import gymnasium as gym, numpy as np, random, pygame, cv2, torch
from gymnasium import spaces
from sb3_contrib import QRDQN
from stable_baselines3.common.monitor import Monitor
from collections import deque

# Playfield size in pixels.
SCREEN_W = 288
SCREEN_H = 512
# y coordinate of the ground line: 79% of the screen height, truncated to int.
BASEY = int(0.79 * SCREEN_H)
pygame.init()

def new_pipe():
    """Create one pipe pair just beyond the right edge of the screen.

    Returns:
        A two-element list ``[upper, lower]`` of dicts with integer ``'x'`` and
        ``'y'`` keys. Both share the same ``x`` (``SCREEN_W + 10``); the upper
        pipe's ``y`` is ``gap_y - 320`` and the lower pipe's ``y`` is
        ``gap_y + 100``, where ``gap_y`` is drawn uniformly from [100, 300]
        using the global ``random`` module.
    """
    gap_top = random.randint(100, 300)
    spawn_x = SCREEN_W + 10
    upper = {'x': spawn_x, 'y': gap_top - 320}
    lower = {'x': spawn_x, 'y': gap_top + 100}
    return [upper, lower]

class FlappyCore:
    """Gym-independent Flappy Bird state machine.

    Holds the bird's vertical position and velocity, the flat list of pipe
    dicts (``[upper, lower, upper, lower, ...]``) and the running score, so
    the same logic can back different observation wrappers.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Put the bird at mid-screen with an upward velocity and one pipe pair."""
        self.bird_y = SCREEN_H // 2
        self.vel_y = -9
        self.pipes = list(new_pipe())
        self.score = 0

    def step(self, action):
        """Advance the game by one frame.

        Args:
            action: truthy to flap, falsy to do nothing.

        Returns:
            ``(reward, terminated)`` where reward is -1.0 on a crash, 1.0 on
            the frame a pipe is cleared, and 0.0 otherwise.
        """
        # A flap is ignored once the bird is 24 px or more above the top edge.
        flap_allowed = self.bird_y > -24
        if action and flap_allowed:
            self.vel_y = -9

        # Gravity: +1 per frame, falling speed capped at 10.
        self.vel_y = min(self.vel_y + 1, 10)
        self.bird_y += self.vel_y

        # Scroll every pipe 4 px to the left.
        for pipe in self.pipes:
            pipe['x'] -= 4

        # Spawn a new pair once the newest pipe is far enough from the right edge.
        if self.pipes[-1]['x'] < SCREEN_W - 150:
            self.pipes.extend(new_pipe())
        # Drop the leading pair once it has scrolled fully off the left side.
        if self.pipes[0]['x'] < -52:
            self.pipes = self.pipes[2:]

        # Only the leading pair is checked for collision and scoring.
        up = self.pipes[0]
        lo = self.pipes[1]

        hit_ground = self.bird_y + 24 >= BASEY
        over_pipe = up['x'] < 60 < up['x'] + 52
        inside_gap = up['y'] + 320 < self.bird_y < lo['y'] - 24
        terminated = hit_ground or (over_pipe and not inside_gap)

        if terminated:
            return -1.0, terminated

        # Score on the frame where x=60 lies within 4 px past the pipe's right edge.
        just_cleared = up['x'] + 52 < 60 <= up['x'] + 56
        if just_cleared:
            self.score += 1
            return 1.0, terminated
        return 0.0, terminated

class FlappyPixelEnv(gym.Env):
    """Gymnasium environment exposing FlappyCore as stacked grayscale frames.

    Observations are the most recent ``frame_stack`` rendered frames, each
    converted to grayscale and resized to 84x84, returned as a ``uint8`` array
    of shape ``(frame_stack, 84, 84)``. The action space is ``Discrete(2)``;
    the action is forwarded to ``FlappyCore.step`` (truthy = flap).

    The reward returned by ``step`` is the game reward plus ``living_bonus``.
    Because that shaped value never equals the raw game reward, the unshaped
    reward and the running score are also reported in ``info`` so callers do
    not have to pattern-match on the shaped float.

    Args:
        frame_stack: number of consecutive frames in one observation.
        living_bonus: constant added to the game reward on every step
            (default 0.2).
    """
    metadata = {"render_modes": []}

    def __init__(self, frame_stack=4, living_bonus=0.2):
        super().__init__()
        self.game = FlappyCore()
        self.living_bonus = living_bonus
        self.stack = deque(maxlen=frame_stack)
        self.action_space = spaces.Discrete(2)
        self.observation_space = spaces.Box(
            0, 255, shape=(frame_stack, 84, 84), dtype=np.uint8)
        # Off-screen surface: frames are drawn here and never shown in a window.
        self.canvas = pygame.Surface((SCREEN_W, SCREEN_H))

    def _render_rgb(self):
        """Draw the current game state and return it as an 84x84 grayscale array."""
        self.canvas.fill((135, 206, 250))
        # Upper and lower pipes are drawn identically: 52x320 green rectangles.
        for pipe in self.game.pipes:
            pygame.draw.rect(self.canvas, (0, 255, 0),
                             pygame.Rect(pipe['x'], pipe['y'], 52, 320))
        pygame.draw.rect(self.canvas, (255, 255, 0),
                         pygame.Rect(60, self.game.bird_y, 34, 24))
        # surfarray is indexed (x, y, channel); swap to (row, col, channel) = HWC.
        arr = pygame.surfarray.array3d(self.canvas).swapaxes(0, 1)
        gray = cv2.cvtColor(arr, cv2.COLOR_RGB2GRAY)
        return cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)

    def _observation(self):
        """Return the current frame stack as a ``(frame_stack, 84, 84)`` uint8 array."""
        return np.array(self.stack, dtype=np.uint8)

    def reset(self, *, seed=None, options=None):
        """Start a new episode.

        Args:
            seed: optional seed. It is passed to ``gym.Env.reset`` and also
                used to seed the global ``random`` module, which is what the
                pipe generator draws from; without this the seed would have
                no effect on the pipe layout.
            options: accepted for API compatibility; unused.

        Returns:
            ``(obs, info)`` where ``obs`` is the first frame repeated
            ``frame_stack`` times and ``info`` holds the (zero) score.
        """
        super().reset(seed=seed)
        if seed is not None:
            random.seed(seed)
        self.game.reset()
        frame = self._render_rgb()
        self.stack.clear()
        for _ in range(self.stack.maxlen):
            self.stack.append(frame)
        return self._observation(), {"score": self.game.score}

    def step(self, action):
        """Apply ``action`` for one frame.

        Returns:
            ``(obs, reward, terminated, truncated, info)``. ``reward`` is the
            game reward plus ``living_bonus``; ``truncated`` is always False;
            ``info`` contains ``"score"`` (pipes cleared so far) and
            ``"game_reward"`` (the reward before the living bonus).
        """
        game_reward, terminated = self.game.step(int(action))
        self.stack.append(self._render_rgb())
        info = {"score": self.game.score, "game_reward": game_reward}
        reward = game_reward + self.living_bonus
        return self._observation(), reward, terminated, False, info

# Training environment, wrapped in SB3's Monitor.
env = Monitor(FlappyPixelEnv())

# QR-DQN hyper-parameters, gathered in one place.
qrdqn_kwargs = dict(
    learning_rate=5e-4,
    buffer_size=50_000,
    learning_starts=5_000,
    batch_size=32,
    target_update_interval=1_000,
    exploration_fraction=0.15,
    gamma=0.99,
    verbose=1,
)
dqn_pix = QRDQN("CnnPolicy", env, **qrdqn_kwargs)
dqn_pix.learn(total_timesteps=750_000, progress_bar=True)

def evaluate(agent, episodes=50):
    """Run ``agent`` greedily on fresh episodes and report pipes cleared.

    The pipe count is read from the game's own score counter. It must not be
    inferred from the step reward: ``FlappyPixelEnv.step`` adds a living bonus
    to every reward, so a pipe pass is never exactly ``1.0`` and an equality
    check on the reward would always report zero pipes.

    Args:
        agent: object with a ``predict(obs, deterministic=True)`` method that
            returns ``(action, state)``.
        episodes: number of episodes to play; must be at least 1.

    Returns:
        ``(mean_score, max_score)`` over the played episodes.

    Raises:
        ValueError: if ``episodes`` is less than 1.
    """
    if episodes < 1:
        raise ValueError("episodes must be at least 1")
    test_env = FlappyPixelEnv()
    scores = []
    for _ in range(episodes):
        obs, _ = test_env.reset()
        done = False
        while not done:
            act, _ = agent.predict(obs, deterministic=True)
            obs, _, terminated, truncated, _ = test_env.step(act)
            # Stop on either end-of-episode signal.
            done = terminated or truncated
        scores.append(test_env.game.score)
    return np.mean(scores), np.max(scores)

# Evaluate the trained agent with evaluate()'s default episode count and
# report the mean and best per-episode result.
avg, best = evaluate(dqn_pix)
print(f"\nPixel agent  —  average pipes: {avg:.1f}   |   best: {best}")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/953.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.3/953.9 kB[0m [31m6.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/92.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.8/92.8 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.5/184.5 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m127.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━

Output()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
| train/              |          |
|    learning_rate    | 0.0005   |
|    loss             | 14.9     |
|    n_updates        | 97894    |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 93.4     |
|    ep_rew_mean      | 18.4     |
|    exploration_rate | 0.01     |
| time/               |          |
|    episodes         | 6160     |
|    fps              | 168      |
|    time_elapsed     | 2351     |
|    total_timesteps  | 396911   |
| train/              |          |
|    learning_rate    | 0.0005   |
|    loss             | 12.6     |
|    n_updates        | 97977    |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 90.5     |
|    ep_rew_mean      | 17.8     |
|    exploration_rate | 0.01     |
| time/               |          |
|    episodes         | 6


Pixel agent  —  average pipes: 0.0   |   best: 0


In [None]:
def patched_step(self, action):
    """Replacement for ``FlappyCore.step`` with edge-crossing pipe scoring.

    Physics, pipe scrolling/spawning and collision are the same as in the
    original method. A pipe is scored on the frame where the leading pipe's
    right edge (``x + 52``) moves from ``>= 60`` to ``< 60``.

    NOTE(review): with the fixed 4 px scroll this appears to fire on the same
    frame as the original ``x+52 < 60 <= x+56`` window — confirm.

    Args:
        action: truthy to flap, falsy to do nothing.

    Returns:
        ``(reward, terminated)``: -1.0 on a crash, 1.0 when a pipe is
        cleared, 0.0 otherwise.
    """
    # Flap only while the bird is less than 24 px above the top edge.
    flap_allowed = self.bird_y > -24
    if action and flap_allowed:
        self.vel_y = -9
    # Gravity with a terminal falling speed of 10.
    self.vel_y = min(self.vel_y + 1, 10)
    self.bird_y += self.vel_y

    # Right edge of the leading pipe before this frame's scroll.
    prev_right = self.pipes[0]['x'] + 52
    for pipe in self.pipes:
        pipe['x'] -= 4
    if self.pipes[-1]['x'] < SCREEN_W - 150:
        self.pipes.extend(new_pipe())
    if self.pipes[0]['x'] < -52:
        self.pipes = self.pipes[2:]
    # Right edge of whichever pipe is leading after scroll/removal.
    curr_right = self.pipes[0]['x'] + 52

    up = self.pipes[0]
    lo = self.pipes[1]
    hit_ground = self.bird_y + 24 >= BASEY
    over_pipe = up['x'] < 60 < up['x'] + 52
    inside_gap = up['y'] + 320 < self.bird_y < lo['y'] - 24
    terminated = hit_ground or (over_pipe and not inside_gap)

    if terminated:
        return -1.0, terminated
    if prev_right >= 60 > curr_right:
        self.score += 1
        return 1.0, terminated
    return 0.0, terminated

# Monkey-patch at class level: every FlappyCore instance, including the one
# inside the already-built training env, now uses patched_step.
FlappyCore.step = patched_step

# Continue training the existing model; reset_num_timesteps=False keeps the
# timestep counter running from the previous learn() call instead of restarting.
dqn_pix.learn(total_timesteps=500_000, reset_num_timesteps=False, progress_bar=True)

# Re-evaluate after the additional training.
avg, best = evaluate(dqn_pix)
print(f"\nPixel agent — average pipes: {avg:.1f}   |   best episode: {best}")

Output()

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 363      |
|    ep_rew_mean      | 78.9     |
|    exploration_rate | 0.01     |
| time/               |          |
|    episodes         | 7492     |
|    fps              | 157      |
|    time_elapsed     | 3        |
|    total_timesteps  | 750580   |
| train/              |          |
|    learning_rate    | 0.0005   |
|    loss             | 12.3     |
|    n_updates        | 186394   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 378      |
|    ep_rew_mean      | 82.3     |
|    exploration_rate | 0.01     |
| time/               |          |
|    episodes         | 7496     |
|    fps              | 163      |
|    time_elapsed     | 18       |
|    total_timesteps  | 752953   |
| train/              |          |
|    learning_rate    | 0.0005   |
|    loss             | 15.5     |
|    n_updates      


Pixel agent — average pipes: 0.0   |   best episode: 0


In [None]:
def living_bonus(r):
    """Return reward ``r`` with a constant per-step survival bonus of 0.2 added.

    NOTE(review): FlappyPixelEnv.step already adds 0.2 to every reward, so
    applying this on top of that env gives +0.4 per step — confirm intended.
    """
    bonus = 0.2
    return r + bonus

from stable_baselines3.common.utils import get_linear_fn

# FlappyPixelEnv.step already adds the +0.2 living bonus to every reward, so
# the env is used directly; wrapping it in TransformReward with another +0.2
# would pay the bonus twice per step.
train_env = Monitor(FlappyPixelEnv())
dqn_pix.set_env(train_env)

# Assigning exploration_final_eps alone has no effect: the epsilon schedule
# was built from the constructor arguments when the model was set up. Rebuild
# it so the new floor of 0.10 is actually used during training.
dqn_pix.exploration_final_eps = 0.10
dqn_pix.exploration_schedule = get_linear_fn(
    dqn_pix.exploration_initial_eps,
    dqn_pix.exploration_final_eps,
    dqn_pix.exploration_fraction,
)

dqn_pix.learn(total_timesteps=2_000_000,
              reset_num_timesteps=False,
              progress_bar=True)

avg, best = evaluate(dqn_pix)
print(f"\nPixel agent — average pipes: {avg:.1f}   |   best: {best}")