In [None]:
!pip install gym
!pip install torch
!pip install torchvision
!pip install atari_py
!pip install ale-py
!pip install gym[atari]
!pip install gym[accept-rom-license]
!pip install tianshou

Collecting atari_py
  Downloading atari-py-0.2.9.tar.gz (540 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m540.6/540.6 kB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: atari_py
  Building wheel for atari_py (setup.py) ... [?25l[?25hdone
  Created wheel for atari_py: filename=atari_py-0.2.9-cp310-cp310-linux_x86_64.whl size=2872169 sha256=c996b818ede3ef51be6702c6d9cb7baa7c243b49bc3154eebd6efb811af85733
  Stored in directory: /root/.cache/pip/wheels/75/6f/04/1f3bf5255580101e16ff487564354dddcdd23ec3b43b775b7a
Successfully built atari_py
Installing collected packages: atari_py
Successfully installed atari_py-0.2.9
Collecting ale-py
  Downloading ale_py-0.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.6 kB)
Downloading ale_py-0.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
!pip install stable-baselines3[extra]


Collecting stable-baselines3[extra]
  Downloading stable_baselines3-2.4.0-py3-none-any.whl.metadata (4.5 kB)
Collecting ale-py>=0.9.0 (from stable-baselines3[extra])
  Using cached ale_py-0.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.6 kB)
Using cached ale_py-0.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
Downloading stable_baselines3-2.4.0-py3-none-any.whl (183 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ale-py, stable-baselines3
  Attempting uninstall: ale-py
    Found existing installation: ale-py 0.7.5
    Uninstalling ale-py-0.7.5:
      Successfully uninstalled ale-py-0.7.5
Successfully installed ale-py-0.10.1 stable-baselines3-2.4.0


In [None]:
import ale_py
import gymnasium as gym
from stable_baselines3 import DQN
from stable_baselines3.common.atari_wrappers import FireResetEnv
import numpy as np
from collections import deque

def _parse_reset_result(reset_result):
    contains_info = (
        isinstance(reset_result, tuple)
        and len(reset_result) == 2
        and isinstance(reset_result[1], dict)
    )
    if contains_info:
        return reset_result[0], reset_result[1], contains_info
    return reset_result, {}, contains_info


class FrameStack(gym.Wrapper):
    """Keep the most recent ``n_frames`` observations and expose them stacked.

    The stacked observation has shape ``(n_frames, *single_obs_shape)``.
    Both the 4-tuple and the 5-tuple ``step`` return conventions are passed
    through, as are both ``reset`` conventions (with or without an info dict).

    :param gym.Env env: the environment to wrap.
    :param int n_frames: the number of frames to stack.
    """

    def __init__(self, env, n_frames):
        super().__init__(env)
        self.n_frames = n_frames
        # Bounded deque: appending beyond n_frames silently drops the oldest.
        self.frames = deque(maxlen=n_frames)

        inner_space = env.observation_space
        self.observation_space = gym.spaces.Box(
            low=np.min(inner_space.low),
            high=np.max(inner_space.high),
            shape=(n_frames,) + tuple(inner_space.shape),
            dtype=inner_space.dtype,
        )

    def reset(self, **kwargs):
        """Reset the wrapped env and fill the whole stack with the first frame."""
        first_obs, info, has_info = _parse_reset_result(self.env.reset(**kwargs))
        # Filling all n_frames slots also evicts frames from a previous episode.
        self.frames.extend([first_obs] * self.n_frames)
        stacked = self._get_ob()
        if has_info:
            return stacked, info
        return stacked

    def step(self, action):
        """Step the wrapped env and return the result with a stacked observation."""
        outcome = self.env.step(action)
        if len(outcome) != 4:
            # Five-element convention: (obs, reward, terminated, truncated, info).
            frame, reward, terminated, truncated, info = outcome
            self.frames.append(frame)
            return self._get_ob(), reward, terminated, truncated, info

        # Four-element convention: (obs, reward, done, info).
        frame, reward, done, info = outcome
        self.frames.append(frame)
        return self._get_ob(), reward, done, info

    def _get_ob(self):
        """Materialise the buffered frames as one array stacked on axis 0."""
        # The wrapper this was adapted from used `LazyFrames`; with a plain
        # numpy buffer that indirection has no effect, so stack directly.
        return np.stack(self.frames, axis=0)

In [None]:
from gymnasium import ObservationWrapper
from scipy.ndimage import rotate

class RotateObservationWrapper(ObservationWrapper):
    """Rotate every observation by a fixed angle using ``scipy.ndimage.rotate``.

    The dtype and value bounds of the rotated observation space are taken
    from the wrapped environment instead of being hard-coded to
    ``uint8`` / ``[0, 255]``. For a ``uint8`` image env with bounds 0..255 the
    result is the same as before; for float observations the values are no
    longer truncated by an integer cast.

    :param env: the environment to wrap (must expose a Box-like
        ``observation_space`` with ``low``, ``high``, ``shape`` and ``dtype``).
    :param angle: rotation angle in degrees, forwarded to
        ``scipy.ndimage.rotate``.
    """

    def __init__(self, env, angle=90):
        super().__init__(env)
        self.angle = angle

        source_space = env.observation_space
        self._obs_dtype = source_space.dtype
        self._obs_low = float(np.min(source_space.low))
        self._obs_high = float(np.max(source_space.high))

        # Rotate a dummy array once to learn the output shape (reshape=True
        # may grow the canvas for angles that are not multiples of 90).
        rotated_shape = rotate(
            np.zeros(source_space.shape), angle, reshape=True
        ).shape
        self.observation_space = gym.spaces.Box(
            low=self._obs_low,
            high=self._obs_high,
            shape=rotated_shape,
            dtype=self._obs_dtype,
        )

    def observation(self, obs):
        """Return ``obs`` rotated by ``self.angle``, padded with zeros."""
        rotated_obs = rotate(obs, self.angle, reshape=True, mode='constant', cval=0)
        # Interpolation can overshoot the valid range; clip back into the
        # declared bounds before restoring the original dtype.
        return np.clip(rotated_obs, self._obs_low, self._obs_high).astype(self._obs_dtype)

  and should_run_async(code)


In [None]:
def make_env(env_name, **kwargs):
    """Build one fully wrapped Atari environment.

    Wrapper order: ``gym.make`` -> ``FireResetEnv`` -> ``AtariPreprocessing``
    (frame_skip=4, 84x84, grayscale, scale_obs=True, no no-op resets) ->
    the local ``FrameStack`` with 4 frames.

    :param env_name: environment id passed to ``gym.make``.
    :param kwargs: extra keyword arguments forwarded to ``gym.make``
        (e.g. ``render_mode``).
    :return: the wrapped environment.
    """
    raw_env = gym.make(env_name, **kwargs)
    fire_env = FireResetEnv(raw_env)

    preprocessing_options = dict(
        noop_max=0,
        frame_skip=4,
        screen_size=84,
        grayscale_obs=True,
        scale_obs=True,
    )
    preprocessed_env = gym.wrappers.AtariPreprocessing(
        env=fire_env, **preprocessing_options
    )

    # Don't use gym.wrappers.FrameStack with Tianshou; use the local FrameStack.
    # Optional experiment (disabled): RotateObservationWrapper(env, angle=90)
    # would be inserted here, before frame stacking.
    return FrameStack(preprocessed_env, n_frames=4)


  and should_run_async(code)


In [None]:
import torch
import torch.nn as nn
class DQN(nn.Module):
    """Convolutional Q-network: three conv layers followed by two linear layers.

    ``forward`` returns ``(q_values, state)`` so the module can be used as a
    Tianshou policy model; ``state`` is passed through untouched.

    :param input_shape: observation shape ``(channels, height, width)``.
    :param n_actions: number of discrete actions (size of the output layer).
    """

    def __init__(self, input_shape, n_actions, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)

        # NOTE(review): the second conv uses kernel_size=8; it is kept as-is
        # because existing checkpoints were saved with this layer layout.
        self.conv = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=8, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        )

        conv_out_size = self.get_conv_out_size(input_shape)

        # Or: LazyLinear
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(conv_out_size, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions)
        )

    def get_conv_out_size(self, image_dim):
        """Return the flattened feature count the conv stack yields per sample.

        :param image_dim: unbatched input shape ``(channels, height, width)``.
        :return: number of features as a plain ``int``.
        """
        # A dummy forward pass is the simplest way to get the size; no
        # gradients are needed for it.
        with torch.no_grad():
            probe = torch.zeros(1, *image_dim)
            return int(np.prod(self.conv(probe).shape[1:]))

    def forward(self, inp, state=None, **kwargs):
        """Compute Q-values for a batch of observations.

        :param inp: batch of observations (numpy array or tensor), shaped
            ``(batch, channels, height, width)``.
        :param state: opaque recurrent state, returned unchanged.
        :return: ``(q_values, state)``.
        """
        # Follow the device the weights live on instead of hard-coding CUDA,
        # so the network also runs on CPU-only machines. `as_tensor` avoids
        # re-copying inputs that are already tensors on the right device.
        device = next(self.parameters()).device
        inp = torch.as_tensor(inp, dtype=torch.float32, device=device)
        return self.fc(self.conv(inp)), state


In [None]:
from tianshou.env import DummyVectorEnv, ShmemVectorEnv
from tianshou.data import Collector, PrioritizedVectorReplayBuffer
from tianshou.policy import DQNPolicy
from tianshou.trainer import offpolicy_trainer

# ---------------------------------------------------------------------------
# Environment
# ---------------------------------------------------------------------------
# Alternative target: "PongNoFrameskip-v4"
ENV_NAME = "BreakoutNoFrameskip-v4"

# ---------------------------------------------------------------------------
# General training hyper-parameters
# ---------------------------------------------------------------------------
GAMMA = 0.99                  # discount factor handed to the policy
BATCH_SIZE = 64               # batch size handed to the trainer
LEARNING_RATE = 1e-4          # optimiser learning rate
SYNC_TARGET_FRAMES = 500      # passed to the policy as target_update_freq
NUM_STACK = 4                 # stack_num of the replay buffer
STEP_PER_EPOCH = 100_000      # trainer: step_per_epoch
STEP_PER_COLLECT = 10         # trainer: step_per_collect
UPDATE_PER_STEP = 0.1         # trainer: update_per_step
MAX_EPOCH = 100               # trainer: max_epoch
EPISODE_PER_STEP = 1          # NOTE: passed to the trainer as episode_per_test

# ---------------------------------------------------------------------------
# Prioritised replay buffer
# ---------------------------------------------------------------------------
BUFFER_SIZE = 100_000         # total_size of the buffer
ALPHA = 0.5                   # buffer alpha
BETA_START = 0.4              # buffer beta at env_step 0
BETA_FINAL = 1.0              # buffer beta once annealing is finished
BETA_ANNEAL = 5_000_000       # env steps over which beta is annealed

# ---------------------------------------------------------------------------
# Epsilon-greedy exploration
# ---------------------------------------------------------------------------
# A start value of 1.0 was used previously; this run starts lower.
EPS_START = 0.7
EPS_FINAL = 0.01
EPSILON_DECAY = 1_000_000.0   # env steps for the linear epsilon decay (float)

# ---------------------------------------------------------------------------
# Number of parallel environments
# ---------------------------------------------------------------------------
N_TRAIN_ENVS = 10
N_TEST_ENVS = 1

In [None]:
# Vectorised environments. Each lambda is a factory that builds one fully
# wrapped env through make_env(ENV_NAME); ShmemVectorEnv receives
# N_TRAIN_ENVS factories for training and N_TEST_ENVS for evaluation.
train_envs = ShmemVectorEnv([lambda: make_env(ENV_NAME) for _ in range(N_TRAIN_ENVS)])
test_env = ShmemVectorEnv([lambda: make_env(ENV_NAME) for _ in range(N_TEST_ENVS)])

  and should_run_async(code)


In [None]:
# Extract the archive holding the previously trained Pong checkpoint. It
# unpacks into ./content/logs/dqn/, which is why the checkpoint path used
# by the loading cell starts with /content/content/.
!unzip /content/logs_pong_reduced_action.zip
# Alternative archive (disabled):
#!unzip /content/logs_pong_trained_17_namitha.zip

Archive:  /content/logs_pong_reduced_action.zip
   creating: content/logs/
   creating: content/logs/dqn/
  inflating: content/logs/dqn/pong_model.pth  
  inflating: content/logs/dqn/events.out.tfevents.1733082252.a970f0eccd84.619.0  


In [None]:
# Checkpoint extracted from the uploaded archive.
pre_trained_model_path = "/content/content/logs/dqn/pong_model.pth"

# map_location="cpu": load onto the CPU regardless of the device the
#   checkpoint was saved from, so the tensors can be combined with a freshly
#   constructed (CPU) network and loading does not depend on a GPU.
# weights_only=True: restrict unpickling to tensors and plain containers;
#   the file comes from a downloaded archive, so arbitrary pickled objects
#   should not be executed. The checkpoint is indexed below as a dict of
#   state-dict entries, which this mode supports.
pre_trained_weights = torch.load(
    pre_trained_model_path,
    map_location="cpu",
    weights_only=True,
)

  pre_trained_weights = torch.load(pre_trained_model_path)


In [None]:
# Fresh, randomly initialised network sized for the current environment.
net = DQN(
    train_envs.observation_space[0].shape,
    train_envs.action_space[0].n
)

print("before")
for name, param in net.named_parameters():
    if param.requires_grad:
        print(f"{name}: {param.data.mean()}, {param.data.std()}")

# Confidence in the pretrained weights: blended = alpha * pretrained
# + (1 - alpha) * random init. Drawn at random for every run.
import random
alpha = random.uniform(0.1, 0.9)

model_state_dict = net.state_dict()  # default (random) weights
pretrained_state_dict = pre_trained_weights["pong_model"]

# The checkpoint was written from a policy object, so its keys may carry a
# "model." prefix (e.g. "model.conv.0.weight") while the bare network uses
# "conv.0.weight". Looking up only the bare name matched nothing, which left
# the network unchanged. Try both spellings.
blended_names, skipped_names = [], []
for name, param in model_state_dict.items():
    source = pretrained_state_dict.get(name)
    if source is None:
        source = pretrained_state_dict.get(f"model.{name}")

    # Skip tensors that are absent or whose shape differs (e.g. an output
    # layer trained for a different number of actions).
    if source is None or source.shape != param.shape:
        skipped_names.append(name)
        continue

    # Align device/dtype so checkpoints saved from another device blend cleanly.
    source = source.to(device=param.device, dtype=param.dtype)
    model_state_dict[name] = alpha * source + (1 - alpha) * param
    blended_names.append(name)

print(
    f"alpha={alpha:.3f}: blended {len(blended_names)} tensor(s), "
    f"kept random init for {skipped_names}"
)

# Load the blended weights back into the model
net.load_state_dict(model_state_dict)

print("after")
for name, param in net.named_parameters():
    if param.requires_grad:
        print(f"{name}: {param.data.mean()}, {param.data.std()}")

# Use the shared hyper-parameter instead of repeating the literal.
optim = torch.optim.Adam(net.parameters(), lr=LEARNING_RATE)


before
conv.0.weight: -0.0006027188501320779, 0.03606986254453659
conv.0.bias: 0.0051870387978851795, 0.0404130183160305
conv.2.weight: 3.924756674678065e-05, 0.012737413868308067
conv.2.bias: 0.0014037569053471088, 0.012110497802495956
conv.4.weight: 9.227669215761125e-05, 0.0240715853869915
conv.4.bias: -0.0018060975708067417, 0.02371685393154621
fc.1.weight: -1.468199798182468e-06, 0.01443453412503004
fc.1.bias: -1.854225411079824e-05, 0.01433774083852768
fc.3.weight: 0.0002975070383399725, 0.025480741634964943
fc.3.bias: 0.01074017584323883, 0.022106081247329712
after
conv.0.weight: -0.0006027188501320779, 0.03606986254453659
conv.0.bias: 0.0051870387978851795, 0.0404130183160305
conv.2.weight: 3.924756674678065e-05, 0.012737413868308067
conv.2.bias: 0.0014037569053471088, 0.012110497802495956
conv.4.weight: 9.227669215761125e-05, 0.0240715853869915
conv.4.bias: -0.0018060975708067417, 0.02371685393154621
fc.1.weight: -1.468199798182468e-06, 0.01443453412503004
fc.1.bias: -1.854225

In [None]:
# DQN policy around the network and optimiser built above: one-step returns,
# is_double switched off, target network refreshed every SYNC_TARGET_FRAMES
# updates.
policy = DQNPolicy(
    model=net,
    optim=optim,
    discount_factor=GAMMA,
    estimation_step=1,
    target_update_freq=SYNC_TARGET_FRAMES,
    is_double=False,
)
# Move the whole policy (and with it the network) onto the GPU.
policy = policy.cuda()

  and should_run_async(code)


In [None]:
# Prioritised replay buffer with one sub-buffer per training environment.
# Only the last frame of each observation is stored and next observations
# are not kept; stack_num tells the buffer how many frames to stack.
# beta starts at BETA_START and is updated later through buffer.set_beta.
buffer = PrioritizedVectorReplayBuffer(
    total_size=BUFFER_SIZE,
    buffer_num=len(train_envs),
    stack_num=NUM_STACK,
    save_only_last_obs=True,
    ignore_obs_next=True,
    alpha=ALPHA,
    beta=BETA_START,
)

# Training collector: writes transitions into the buffer, with exploration
# noise enabled.
train_collector = Collector(
    policy=policy,
    env=train_envs,
    buffer=buffer,
    exploration_noise=True,
)

# For test_collector, we don't need buffer nor exploration (random actions)
test_collector = Collector(policy=policy, env=test_env)

  and should_run_async(code)


In [None]:
from torch.utils.tensorboard import SummaryWriter
from tianshou.utils import TensorboardLogger
import os

# TensorBoard logging: event files go to logs/dqn (relative to the working
# directory); the same directory is reused for model checkpoints.
log_path = os.path.join('logs', "dqn")
writer = SummaryWriter(log_path)
# Tianshou logger wrapping the TensorBoard writer; handed to the trainer.
logger = TensorboardLogger(writer)

def train_fn(epoch, env_step):
    """Per-collect training hook: anneal epsilon and the buffer's beta.

    Epsilon decays linearly from EPS_START towards EPS_FINAL over
    EPSILON_DECAY env steps and is floored at EPS_FINAL. Beta moves linearly
    from BETA_START to BETA_FINAL over BETA_ANNEAL env steps and then stays
    at BETA_FINAL. Both values are logged whenever ``env_step`` is a multiple
    of 1000.

    :param epoch: current epoch (unused).
    :param env_step: total number of environment steps collected so far.
    """
    eps_span = EPS_START - EPS_FINAL
    eps = max(EPS_FINAL, EPS_START - env_step / EPSILON_DECAY * eps_span)
    policy.set_eps(eps)

    beta = BETA_FINAL
    if env_step <= BETA_ANNEAL:
        beta = BETA_START - env_step / BETA_ANNEAL * (BETA_START - BETA_FINAL)
    buffer.set_beta(beta)

    if env_step % 1000 != 0:
        return
    for tag, value in (("train/eps", eps), ("train/beta", beta)):
        logger.write("train/env_step", env_step, {tag: value})

In [None]:
def save_checkpoint_fn(epoch: int, env_step: int, gradient_step: int) -> str:
    """Checkpoint hook for the trainer: persist the policy weights.

    The policy ``state_dict`` is written to ``<log_path>/breakout_model.pth``
    under the key ``"breakout_model"`` whenever ``env_step`` is a multiple of
    1000. The file is overwritten on every save.

    :param epoch: current epoch (unused).
    :param env_step: total number of environment steps collected so far.
    :param gradient_step: total number of gradient updates (unused).
    :return: path of the checkpoint file, so callers that expect the hook to
        report the checkpoint location receive it (previously ``None``).
    """
    ckpt_path = os.path.join(log_path, "breakout_model.pth")
    if env_step % 1000 == 0:
        torch.save({"breakout_model": policy.state_dict()}, ckpt_path)
    return ckpt_path

In [None]:
!pip install tensorboard
!pip install jupyter-tensorboard


Collecting jupyter-tensorboard
  Downloading jupyter_tensorboard-0.2.0.tar.gz (15 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jedi>=0.16 (from ipython>=5.0.0->ipykernel->notebook>=5.0->jupyter-tensorboard)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: jupyter-tensorboard
  Building wheel for jupyter-tensorboard (setup.py) ... [?25l[?25hdone
  Created wheel for jupyter-tensorboard: filename=jupyter_tensorboard-0.2.0-py2.py3-none-any.whl size=15247 sha256=1faa71105cf730997afa6dab92773c88b2e79d479f940d64695a33b9b07fc3e9
  Stored in directory: /root/.cache/pip/wheels/6a/e8/ee/64103cb21821f6a747a19b93a417b7e925857daab8ec5c7f50
Successfully built jupyter-tensorboard
Installing collected packages: jedi, jupyter-tensorboard
Successfully 

In [None]:
#%load_ext tensorboard
#%tensorboard --logdir logs/dqn

  and should_run_async(code)


In [None]:
# Run off-policy training. Keyword arguments are grouped by concern; the
# call is otherwise a plain hand-off of the objects built in earlier cells.
result = offpolicy_trainer(
    # what to train and how to gather data
    policy=policy,
    train_collector=train_collector,
    test_collector=test_collector,
    # schedule
    max_epoch=MAX_EPOCH,
    step_per_epoch=STEP_PER_EPOCH,
    step_per_collect=STEP_PER_COLLECT,
    update_per_step=UPDATE_PER_STEP,
    batch_size=BATCH_SIZE,
    # evaluation; test episodes run greedily (eps = 0)
    episode_per_test=EPISODE_PER_STEP,
    test_in_train=False,
    test_fn=lambda epoch, env_step: policy.set_eps(0.0),
    # NOTE(review): the threshold 17 may be a leftover from the Pong run;
    # confirm it is the intended stopping reward for ENV_NAME.
    stop_fn=lambda mean_rewards: mean_rewards >= 17,
    # hooks and logging
    train_fn=train_fn,
    save_checkpoint_fn=save_checkpoint_fn,
    logger=logger,
)


Epoch #1: 100001it [09:39, 172.43it/s, env_step=100000, len=143, loss=0.002, n/ep=0, n/st=10, rew=0.00]                            


Epoch #1: test_reward: 1.000000 ± 0.000000, best_reward: 1.000000 ± 0.000000 in #1


Epoch #2: 100001it [09:30, 175.33it/s, env_step=200000, len=361, loss=0.005, n/ep=0, n/st=10, rew=7.00]


Epoch #2: test_reward: 11.000000 ± 0.000000, best_reward: 11.000000 ± 0.000000 in #2


Epoch #3: 100001it [09:31, 174.97it/s, env_step=300000, len=156, loss=0.007, n/ep=0, n/st=10, rew=1.00]                            


Epoch #3: test_reward: 8.000000 ± 0.000000, best_reward: 11.000000 ± 0.000000 in #2


Epoch #4: 100001it [09:40, 172.23it/s, env_step=400000, len=521, loss=0.010, n/ep=1, n/st=10, rew=11.00]                            


Epoch #4: test_reward: 9.000000 ± 0.000000, best_reward: 11.000000 ± 0.000000 in #2


Epoch #5: 100001it [09:32, 174.64it/s, env_step=500000, len=671, loss=0.012, n/ep=0, n/st=10, rew=21.00]                            


Epoch #5: test_reward: 11.000000 ± 0.000000, best_reward: 11.000000 ± 0.000000 in #2


Epoch #6: 100001it [09:32, 174.65it/s, env_step=600000, len=597, loss=0.019, n/ep=0, n/st=10, rew=23.00]


Epoch #6: test_reward: 44.000000 ± 0.000000, best_reward: 44.000000 ± 0.000000 in #6


In [None]:
# Bundle the TensorBoard events and the saved checkpoint into logs.zip.
!zip -r logs.zip /content/logs


  adding: content/logs/ (stored 0%)
  adding: content/logs/dqn/ (stored 0%)
  adding: content/logs/dqn/events.out.tfevents.1733140164.600fd5718129.438.0 (deflated 69%)
  adding: content/logs/dqn/breakout_model.pth (deflated 7%)


In [None]:
from google.colab import files
# Trigger a browser download of the zipped logs from the Colab runtime.
files.download('/content/logs.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
def make_video(output_path='video/'):
    """Record the current policy playing one collected episode.

    Builds a single-environment ``DummyVectorEnv`` whose env is created with
    ``render_mode="rgb_array"`` and wrapped in ``gym.wrappers.RecordVideo``,
    switches the policy to eval mode, seeds the env with 7 and collects one
    episode without exploration noise.

    :param output_path: folder handed to ``RecordVideo`` as ``video_folder``.
    """
    def _recording_env():
        # Frame stacking is already handled inside make_env.
        base_env = make_env(ENV_NAME, render_mode="rgb_array")
        return gym.wrappers.RecordVideo(base_env, video_folder=output_path)

    policy.eval()
    rec_env = DummyVectorEnv([_recording_env])
    rec_env.seed(7)

    video_collector = Collector(policy, rec_env, exploration_noise=False)
    video_collector.collect(n_episode=1)

make_video()


  """


In [None]:
# Bundle the recorded episode videos and download them from the Colab runtime.
!zip -r video.zip /content/video
from google.colab import files
files.download('/content/video.zip')

  adding: content/video/ (stored 0%)
  adding: content/video/rl-video-episode-0.mp4 (deflated 30%)
  adding: content/video/rl-video-episode-1.mp4 (deflated 24%)


  and should_run_async(code)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>