In [None]:
!pip install gym gym[other] tensorflow keras autorom gym[accept-rom-license] gym[atari] torch

In [2]:
import numpy as np
from collections import deque
import gym
import random

In [4]:
# Build the Pong environment as a chain of wrappers; `env` is the outermost one
# and is what the rest of the notebook interacts with.
base_env = gym.make(
    "Pong-v4",
    obs_type="grayscale",
    render_mode="rgb_array",
    full_action_space=False,
)
# NOTE(review): "Pong-v4" may already skip frames internally on top of
# frame_skip=1 here and the action repeat in the training loop — confirm the
# effective frame skip is the intended one.
preprocessed_env = gym.wrappers.AtariPreprocessing(env=base_env, frame_skip=1)
# Each observation becomes a stack of the 4 most recent preprocessed frames.
stacked_env = gym.wrappers.FrameStack(env=preprocessed_env, num_stack=4)


def _is_recorded_episode(episode_index):
    """Select every 30th episode (0, 30, 60, ...) for video recording."""
    return episode_index % 30 == 0


# Videos of the selected episodes are written to the 'videos' directory.
env = gym.wrappers.RecordVideo(stacked_env, "videos", episode_trigger=_is_recorded_episode)


A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]
  logger.warn(
  logger.warn(


In [5]:
import torch
import torch.nn as nn
import torch.optim as optim

class DQN(nn.Module):
    """Convolutional Q-network mapping a stack of frames to one Q-value per action.

    The network is three convolutions (each followed by ReLU), then a flatten,
    a 512-unit hidden linear layer with ReLU, and a linear output layer.

    Args:
        action_size: Number of discrete actions, i.e. width of the output layer.
        in_channels: Number of input channels (stacked frames). Defaults to 4.
        input_size: Spatial size of the input frames. Either a single int for
            square frames or a ``(height, width)`` pair. Defaults to 84.

    Raises:
        ValueError: If the input frames are too small for the three
            convolutions to produce any output.
    """

    def __init__(self, action_size, in_channels=4, input_size=84):
        super(DQN, self).__init__()

        # Accept either one int (square input) or a (height, width) pair.
        try:
            height, width = input_size
        except TypeError:
            height = width = input_size

        # Layer attribute names are kept stable so saved state_dicts stay loadable.
        self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)

        def conv2d_size_out(size, kernel_size=3, stride=1):
            """Output length of an unpadded, undilated convolution along one axis."""
            return (size - (kernel_size - 1) - 1) // stride + 1

        def conv_stack_size_out(size):
            """Output length along one axis after conv1 -> conv2 -> conv3."""
            return conv2d_size_out(conv2d_size_out(conv2d_size_out(size, 8, 4), 4, 2), 3, 1)

        convh = conv_stack_size_out(height)
        convw = conv_stack_size_out(width)
        if convh <= 0 or convw <= 0:
            raise ValueError(
                f"input_size {input_size!r} is too small for the convolutional stack"
            )
        linear_input_size = convw * convh * 64  # 64 = channels produced by conv3

        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(linear_input_size, 512),
            nn.ReLU(),
            nn.Linear(512, action_size)
        )

    def forward(self, x):
        """Return Q-values of shape (batch, action_size) for a (batch, C, H, W) input."""
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = torch.relu(self.conv3(x))
        return self.fc(x)

def update_target_network(target, source):
    """Load the current state_dict of `source` into `target`.

    After the call `target` holds the same parameter values as `source`;
    nothing is returned.
    """
    source_weights = source.state_dict()
    target.load_state_dict(source_weights)


In [6]:
import copy
# --- Hyperparameters -------------------------------------------------------
N = 60000  # Capacity of the replay memory D (oldest entries are dropped first)
M = 10000  # Number of training episodes
T = 10000  # Upper bound on agent steps within one episode
C = 40  # Q_hat is synchronised with Q every C episodes
epsilon = 1  # Initial exploration rate for epsilon-greedy action selection
epsilon_decay = 0.99  # Multiplicative decay applied to epsilon after each episode
epsilon_min = 0.1  # Decay stops once epsilon is no longer above this value
gamma = 0.99  # Discount factor used in the Q-learning target
action_size = env.action_space.n  # Number of discrete actions
state_size = env.observation_space.shape[0]  # First dimension of the observation shape

# --- Device selection ------------------------------------------------------
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    print("Using CPU")

# --- Networks and replay memory --------------------------------------------
# Q is the online network being trained; Q_hat starts as an exact copy of it
# and serves as the target network.
Q = DQN(action_size).to(device)
Q_hat = copy.deepcopy(Q)
D = deque(maxlen=N)

# --- Optimisation ----------------------------------------------------------
# NOTE(review): lr=0.0025 is large for Adam on a DQN — confirm it is not a
# typo for 0.00025.
optimizer = optim.Adam(Q.parameters(), lr=0.0025)
criterion = nn.MSELoss()



def preprocess_state(state):
    """Turn a single observation into a float batch of size one on `device`.

    The observation is converted to a NumPy array, wrapped in a tensor, cast to
    float and divided by 255 (which maps 8-bit pixel values to [0, 1]); a
    leading batch dimension is then added before moving the result to `device`.
    """
    pixels = np.asarray(state)
    scaled = torch.tensor(pixels).float().div(255)
    batched = scaled.unsqueeze(0)
    return batched.to(device)


Using GPU: NVIDIA GeForce RTX 2060


In [7]:
from tqdm import tqdm

frames = 0  # Total number of env.step calls made so far
actions_q = []  # Greedy actions chosen by Q (random exploratory actions are not logged)
rewards_all = []  # Total reward of every episode, in order

action_repeat = 4  # Environment steps taken for each agent decision
batch_size = 32  # Transitions sampled per gradient step
learning_starts = 5000  # Replay size that must be exceeded before training begins


def _to_batch(observations):
    """Stack raw observations into a float batch divided by 255 on `device`.

    Applies the same scaling as `preprocess_state`, but to a whole minibatch,
    and only at sampling time so the replay memory can hold compact NumPy
    arrays on the host instead of float tensors on the GPU.
    """
    stacked = torch.from_numpy(np.stack(observations))
    return stacked.to(device).float().div(255)


# Training loop
for episode in tqdm(range(M)):
    total_reward = 0
    # Observations are kept as NumPy arrays; they are converted to tensors only
    # when fed to a network.
    obs = np.asarray(env.reset()[0])
    for t in range(T):
        # Epsilon-greedy action selection
        if np.random.rand() <= epsilon:
            action = random.randrange(action_size)
        else:
            with torch.no_grad():  # No need to track gradients here
                action = Q(preprocess_state(obs)).max(1)[1].item()
            actions_q.append(action)

        # Repeat the chosen action, summing the reward over the repeated steps
        # so no reward is lost, and stop as soon as the episode ends so the
        # environment is never stepped after termination.
        step_reward = 0.0
        for _repeat in range(action_repeat):
            next_obs, reward, terminated, truncated, _info = env.step(action)
            frames += 1
            step_reward += reward
            if terminated or truncated:
                break
        total_reward += step_reward
        next_obs = np.asarray(next_obs)

        # Store the transition from the state the action was chosen in to the
        # state reached after the repeats. Only a true termination disables
        # bootstrapping; a truncated (time-limited) episode still bootstraps.
        D.append((obs, action, step_reward, next_obs, bool(terminated)))
        obs = next_obs

        # Check if the episode is done
        if terminated or truncated:
            break

        # Train using a random minibatch from D
        if len(D) > learning_starts:
            minibatch = random.sample(D, batch_size)
            obs_b, action_b, reward_b, next_obs_b, terminated_b = zip(*minibatch)
            states = _to_batch(obs_b)
            actions = torch.tensor(action_b, device=device).long()
            rewards = torch.tensor(reward_b, device=device).float()
            next_states = _to_batch(next_obs_b)
            dones = torch.tensor(terminated_b, device=device).float()

            # Q-values of the actions that were actually taken
            Q_values = Q(states).gather(1, actions.unsqueeze(1)).squeeze(1)

            # Bootstrapped targets from the target network; terminal
            # transitions contribute only their reward.
            with torch.no_grad():
                next_state_values = Q_hat(next_states).max(1)[0]
                target_Q_values = rewards + gamma * next_state_values * (1.0 - dones)

            optimizer.zero_grad()
            loss = criterion(Q_values, target_Q_values)
            loss.backward()
            optimizer.step()

    # Record every episode's score; print a progress line every 20 episodes.
    rewards_all.append(total_reward)
    if episode % 20 == 0:
        print(f"Episode: {episode}/{M}, Score: {total_reward}, Nb_frames : {frames}")

    # Decay epsilon, never letting it drop below epsilon_min
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    # Update target network
    if episode % C == 0:
        update_target_network(Q_hat, Q)

  if not isinstance(terminated, (bool, np.bool8)):
  logger.warn(
  0%|          | 0/10000 [00:01<?, ?it/s]

Moviepy - Building video /home/meliioko/dqn-breakout/videos/rl-video-episode-0.mp4.
Moviepy - Writing video /home/meliioko/dqn-breakout/videos/rl-video-episode-0.mp4



  0%|          | 1/10000 [00:01<5:18:01,  1.91s/it]

Moviepy - Done !
Moviepy - video ready /home/meliioko/dqn-breakout/videos/rl-video-episode-0.mp4
Episode: 0/10000, Score: -20.0, Nb_frames : 1172


  0%|          | 21/10000 [01:43<13:18:49,  4.80s/it]

Episode: 20/10000, Score: -19.0, Nb_frames : 26972


  0%|          | 30/10000 [02:08<7:24:48,  2.68s/it] 

Moviepy - Building video /home/meliioko/dqn-breakout/videos/rl-video-episode-30.mp4.
Moviepy - Writing video /home/meliioko/dqn-breakout/videos/rl-video-episode-30.mp4



  0%|          | 31/10000 [02:09<7:55:04,  2.86s/it]

Moviepy - Done !
Moviepy - video ready /home/meliioko/dqn-breakout/videos/rl-video-episode-30.mp4


  0%|          | 35/10000 [02:19<7:44:34,  2.80s/it]

: 

In [None]:
# Persist the training artefacts: the logged greedy actions, the weights of the
# online network, and the recorded episode rewards.
np.save('actions.npy', np.asarray(actions_q))
torch.save(Q.state_dict(), 'q.pt')
# np.save returns None, so its result is not bound to a variable.
np.save('rewards.npy', np.asarray(rewards_all))
