<a href="https://colab.research.google.com/github/magalaReuben/practicaldeepreinforcementlearning/blob/main/Lecture3/DeepQLearningPytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install swig
!pip install gymnasium[box2d]

[31mERROR: Operation cancelled by user[0m[31m
[31mERROR: Operation cancelled by user[0m[31m
[0mTraceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 3108, in _dep_map
    return self.__dep_map
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 2901, in __getattr__
    raise AttributeError(attr)
AttributeError: _DistInfoDistribution__dep_map

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/base_command.py", line 169, in exc_logging_wrapper
    status = run_func(*args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/req_command.py", line 242, in wrapper
    return func(self, options, args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/commands/install.py", line 441, in run
    conflicts = self._determine_conflicts

In [1]:
import gymnasium as gym
import torch
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random


In [2]:
env = gym.make('LunarLander-v2', render_mode='rgb_array')
print("observation_space", *env.observation_space.shape)
print("action_space", env.action_space.n)

observation_space 8
action_space 4


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
import numpy as np

MEM_SIZE = 10000
BATCH_SIZE = 64

class ReplayMemory:
    def __init__(self):
        self.mem_count = 0

        self.states = np.zeros((MEM_SIZE, *env.observation_space.shape),dtype=np.float32)
        self.actions = np.zeros(MEM_SIZE, dtype=np.int64)
        self.rewards = np.zeros(MEM_SIZE, dtype=np.float32)
        self.states_ = np.zeros((MEM_SIZE, *env.observation_space.shape),dtype=np.float32)
        self.dones = np.zeros(MEM_SIZE, dtype=bool)

    def add(self, state, action, reward, state_, done):
        mem_index = self.mem_count % MEM_SIZE

        self.states[mem_index]  = state
        self.actions[mem_index] = action
        self.rewards[mem_index] = reward
        self.states_[mem_index] = state_
        self.dones[mem_index] =  1 - done

        self.mem_count += 1

    def sample(self):
        MEM_MAX = min(self.mem_count, MEM_SIZE)
        batch_indices = np.random.choice(MEM_MAX, BATCH_SIZE, replace=True)

        states  = self.states[batch_indices]
        actions = self.actions[batch_indices]
        rewards = self.rewards[batch_indices]
        states_ = self.states_[batch_indices]
        dones   = self.dones[batch_indices]

        return states, actions, rewards, states_, dones

In [5]:
LEARNING_RATE = 0.0001

class DQN(nn.Module):
    def __init__(self):
        super().__init__()
        # to cater for envs of different dimensions
        self.input_shape = env.observation_space.shape
        # must be a finite discrete action space
        self.action_space = env.action_space.n

        self.layer1 = nn.Linear(*self.input_shape, 1024)
        self.layer2 = nn.Linear(1024, 1024)
        self.layer3 = nn.Linear(1024, self.action_space)

        self.optimizer = optim.Adam(self.parameters(), lr=LEARNING_RATE)
        self.loss = nn.SmoothL1Loss()
        self.to(device)

    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        x = self.layer3(x)

        return x

In [6]:
max_epsilon = 1.0
min_epsilon = 0.001
decay_rate = 0.9999

gamma = 0.95

class DqnAgent:
    def __init__(self):
        self.memory = ReplayMemory()
        self.exploration_rate = max_epsilon
        self.network = DQN()

    def choose_action(self, state):
        random_num = random.uniform(0, 1)
        if random_num > self.exploration_rate:
            state = torch.tensor(state).float().detach().to(device).unsqueeze(0)
            q_values = self.network(state)
            return torch.argmax(q_values).item()
        else:
            return env.action_space.sample()

    def learn(self):
        if self.memory.mem_count < BATCH_SIZE:
            return

        states, actions, rewards, next_states, dones = self.memory.sample()
        states = torch.tensor(states , dtype=torch.float32).to(device)
        actions = torch.tensor(actions, dtype=torch.long).to(device)
        rewards = torch.tensor(rewards, dtype=torch.float32).to(device)
        next_states = torch.tensor(next_states, dtype=torch.float32).to(device)
        dones = torch.tensor(dones, dtype=torch.bool).to(device)
        batch_indices = np.arange(BATCH_SIZE, dtype=np.int64)

        q_values = self.network(states)
        next_q_values = self.network(next_states)

        predicted_value_of_now = q_values[batch_indices, actions]
        predicted_value_of_future = torch.max(next_q_values, dim=1)[0]

        q_target = rewards + gamma * predicted_value_of_future * dones

        loss = self.network.loss(q_target, predicted_value_of_now)
        self.network.optimizer.zero_grad()
        loss.backward()
        self.network.optimizer.step()

        self.exploration_rate *= decay_rate
        self.exploration_rate = max(min_epsilon, self.exploration_rate)

    def returning_epsilon(self):
        return self.exploration_rate

In [7]:
EPISODES = 1500
best_reward = float('-inf')
average_reward = 0
episode_number = []
average_reward_number = []
observation_space = env.observation_space.shape[0]

agent = DqnAgent()

for i in range(1, EPISODES):
    state = env.reset()[0]
    state = np.reshape(state, [1, observation_space])
    score = 0

    while True:
        #env.render()
        action = agent.choose_action(state)
        state_, reward, terminated, truncated, info = env.step(action)
        state_ = np.reshape(state_, [1, observation_space])
        agent.memory.add(state, action, reward, state_, terminated)
        agent.learn()
        state = state_
        score += reward

        if terminated:
            if score > best_reward:
                #torch.save(agent.network.state_dict(), 'best_model_weights.pth')
                best_reward = score
            average_reward += score
            print("Episode {} Average Reward {} Best Reward {} Last Reward {} Epsilon {}".format(i, average_reward/i, best_reward, score, agent.returning_epsilon()))
            break

        episode_number.append(i)
        average_reward_number.append(average_reward/i)

plt.plot(episode_number, average_reward_number)
plt.show()

Episode 1 Average Reward -387.71680924833123 Best Reward -387.71680924833123 Last Reward -387.71680924833123 Epsilon 0.9975029977012647
Episode 2 Average Reward -284.69008565511126 Best Reward -181.66336206189123 Last Reward -181.66336206189123 Epsilon 0.9873796776559268
Episode 3 Average Reward -184.47721197559733 Best Reward 15.948535383430553 Last Reward 15.948535383430553 Epsilon 0.9799036631402607
Episode 4 Average Reward -230.67592761427815 Best Reward 15.948535383430553 Last Reward -369.27207453032065 Epsilon 0.9714150558198451
Episode 5 Average Reward -186.50548569405916 Best Reward 15.948535383430553 Last Reward -9.823718013183196 Epsilon 0.9581967607057171
Episode 6 Average Reward -174.21804860838293 Best Reward 15.948535383430553 Last Reward -112.78086318000187 Epsilon 0.9498961936772615
Episode 7 Average Reward -167.06033136499195 Best Reward 15.948535383430553 Last Reward -124.11402790464602 Epsilon 0.9393161859438268
Episode 8 Average Reward -160.18124130816005 Best Rewar

KeyboardInterrupt: 

In [9]:
import numpy as np
import imageio

images = []
obs = env.reset()[0]
img = env.render()
print(img)
max_steps = 6000
step = 0

agent = DqnAgent()
agent.network.load_state_dict(torch.load('best_model_weights.pth'))
state = env.reset()[0]
state = np.reshape(state, [1, observation_space])

while step < max_steps:
    action = agent.choose_action(state)
    state_, reward, terminated, truncated, info = env.step(action)
    img = env.render()
    images.append(img)
    if terminated:
      env.reset()
    step += 1
    print(f"Step: {step}")

print(images)
imageio.mimsave("output.mp4", [np.array(img) for i, img in enumerate(images)], fps=60)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Step: 1001
Step: 1002
Step: 1003
Step: 1004
Step: 1005
Step: 1006
Step: 1007
Step: 1008
Step: 1009
Step: 1010
Step: 1011
Step: 1012
Step: 1013
Step: 1014
Step: 1015
Step: 1016
Step: 1017
Step: 1018
Step: 1019
Step: 1020
Step: 1021
Step: 1022
Step: 1023
Step: 1024
Step: 1025
Step: 1026
Step: 1027
Step: 1028
Step: 1029
Step: 1030
Step: 1031
Step: 1032
Step: 1033
Step: 1034
Step: 1035
Step: 1036
Step: 1037
Step: 1038
Step: 1039
Step: 1040
Step: 1041
Step: 1042
Step: 1043
Step: 1044
Step: 1045
Step: 1046
Step: 1047
Step: 1048
Step: 1049
Step: 1050
Step: 1051
Step: 1052
Step: 1053
Step: 1054
Step: 1055
Step: 1056
Step: 1057
Step: 1058
Step: 1059
Step: 1060
Step: 1061
Step: 1062
Step: 1063
Step: 1064
Step: 1065
Step: 1066
Step: 1067
Step: 1068
Step: 1069
Step: 1070
Step: 1071
Step: 1072
Step: 1073
Step: 1074
Step: 1075
Step: 1076
Step: 1077
Step: 1078
Step: 1079
Step: 1080
Step: 1081
Step: 1082
Step: 1083
Step: 1084
Step: 1085


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

