In [1]:
!pip3 install gym --upgrade
!pip3 install pyglet
!pip3 install Box2D
!pip3 install box2d-py
!pip3 install gym[Box_2D]
!pip3 install gym[box2d]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [16]:
from torch.cuda import device_count
import torch
import gym
import random
import matplotlib.pyplot as plt
import numpy as np
import torch.optim as optim

In [17]:
# Run on the first CUDA GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
device  # last expression of the cell: echoes the selected device

device(type='cuda', index=0)

In [18]:
# environment
import gym

env = gym.make('LunarLander-v2')
# Read both sizes from the environment rather than hard-coding them, so they
# cannot drift from the environment id above (8 and 4 for LunarLander-v2).
state_size = int(env.observation_space.shape[0])  # length of one observation vector
action_size = int(env.action_space.n)             # number of discrete actions

In [19]:
import random
from collections import namedtuple, deque

# One stored interaction step: the state the agent saw, the action it took, the
# state that followed, the reward received and whether the episode ended.
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward', 'done'))

class ExperienceReplay():
    """Fixed-capacity FIFO store of `Transition` tuples with uniform random sampling.

    Once `buffer_size` transitions are held, appending a new one silently
    discards the oldest (behaviour of `deque(maxlen=...)`).
    """

    def __init__(self, buffer_size, action_size=None, seed=0):
        """Create an empty buffer.

        Args:
            buffer_size: Maximum number of transitions kept.
            action_size: Optional number of actions; only stored on the instance.
                It used to be read from a notebook-level global, which made the
                class unusable outside the notebook.
            seed: Value passed to `random.seed`. This reseeds Python's *global*
                `random` generator (the same one used by `sample`). Pass `None`
                to leave the generator untouched.
        """
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.experience = Transition
        # Keep the seed itself; `random.seed()` returns None, so storing its
        # result (as before) recorded nothing useful.
        self.seed = seed
        if seed is not None:
            random.seed(seed)

    def store_trans(self, state, action, next_state, reward, done):
        """Append one transition to the buffer."""
        e = self.experience(state, action, next_state, reward, done)
        self.memory.append(e)

    def sample(self, batch_size):
        """Return `batch_size` distinct stored transitions chosen uniformly at random.

        Raises:
            ValueError: If `batch_size` exceeds the number of stored transitions
                (raised by `random.sample`).
        """
        experiences = random.sample(self.memory, k=batch_size)
        return experiences

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

In [21]:
import torch.nn as nn
import torch.nn.functional as F
class DeepQNetwork(nn.Module):
    """Fully connected Q-network: state vector in, one value per action out.

    Layout is state_size -> 256 -> 64 -> action_size with ReLU after the first
    two linear layers and no activation on the output.
    """

    def __init__(self, state_size, action_size):
        """Build the three linear layers.

        The torch RNG is reset to seed 0 before the layers are created, so every
        instance with the same sizes starts from the same initial weights.
        """
        super().__init__()
        torch.manual_seed(0)
        layer_widths = (state_size, 256, 64, action_size)
        # Creation order (fc1, fc2, fc3) matters: it fixes which random draws
        # initialise which layer.
        self.fc1 = nn.Linear(layer_widths[0], layer_widths[1])
        self.fc2 = nn.Linear(layer_widths[1], layer_widths[2])
        self.fc3 = nn.Linear(layer_widths[2], layer_widths[3])

    def forward(self, X):
        """Return the action values for the batch of states `X`."""
        hidden = F.relu(self.fc1(X))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

In [22]:
class DQNAgent():
    """Deep Q-learning agent with an experience-replay buffer and a single Q-network.

    The class reads the notebook-level names `device`, `ExperienceReplay` and
    `DeepQNetwork`, so the cells defining them must have run first.
    """

    def __init__(self, state_size, action_size, batch_size,
                 gamma=0.99, buffer_size=250000, alpha=5e-4):
        """Create the replay buffer, the Q-network and its Adam optimizer.

        Args:
            state_size: Length of one state vector (network input width).
            action_size: Number of discrete actions (network output width).
            batch_size: Number of transitions drawn per learning step.
            gamma: Discount factor applied to the bootstrapped next-state value.
            buffer_size: Maximum number of transitions kept in the replay buffer.
            alpha: Learning rate passed to Adam.
        """
        # network parameter
        self.state_size = state_size
        self.action_size = action_size
        # hyperparameters
        self.batch_size = batch_size
        self.gamma = gamma
        self.buffer_size = buffer_size

        # Replay memory
        self.experience_replay = ExperienceReplay(self.buffer_size)

        # network
        self.value_net = DeepQNetwork(state_size, action_size).to(device)

        # optimizer
        self.optimizer = optim.Adam(self.value_net.parameters(), lr=alpha)

    def take_action(self, state, eps=0.):
        """Pick an action for `state` with an epsilon-greedy policy.

        Args:
            state: A single state as a NumPy array (`torch.from_numpy` is applied
                to it).
            eps: Probability of choosing a uniformly random action; the default
                0 is fully greedy.

        Returns:
            The chosen action index as a NumPy integer.
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.value_net.eval()
        with torch.no_grad():
            action_values = self.value_net(state)
        self.value_net.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def update_params(self):
        """Run one gradient step on a random minibatch from the replay buffer.

        Does nothing until the buffer holds at least `batch_size` transitions.
        The bootstrap target is computed with the same network that is being
        trained (there is no separate target network); `detach()` keeps
        gradients from flowing through the target.
        """
        if len(self.experience_replay) < self.batch_size:
            return
        batch = [e for e in self.experience_replay.sample(self.batch_size) if e is not None]
        # np.vstack stacks one row per transition, so scalar fields (action,
        # reward, done) become column vectors of shape (batch, 1).
        state_batch = torch.from_numpy(np.vstack([e.state for e in batch])).float().to(device)
        action_batch = torch.from_numpy(np.vstack([e.action for e in batch])).long().to(device)
        next_state_batch = torch.from_numpy(np.vstack([e.next_state for e in batch])).float().to(device)
        reward_batch = torch.from_numpy(np.vstack([e.reward for e in batch])).float().to(device)
        done_batch = torch.from_numpy(np.vstack([e.done for e in batch]).astype(np.uint8)).float().to(device)

        ### Expected value: Q(s, a) of the action that was actually taken
        q_expected = self.value_net(state_batch).gather(1, action_batch)

        ### Greedy value of the next state
        q_targets_next = self.value_net(next_state_batch).detach().max(1)[0].unsqueeze(1)
        ### Target value from the Bellman equation; (1 - done) removes the
        ### bootstrap term for transitions that ended the episode
        q_targets = reward_batch + self.gamma * q_targets_next * (1-done_batch)

        ### Loss calculation
        loss = F.mse_loss(q_expected, q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def save(self, fname):
        """Write this agent's network weights (its `state_dict`) to `fname`."""
        # Use `self`: the previous code serialised whatever object the global
        # name `agent` happened to point to.
        torch.save(self.value_net.state_dict(), fname)

    def load(self, fname, device):
        """Load network weights from `fname` and move the network to `device`.

        Args:
            fname: Path of a file written by `save`.
            device: Target device; also used as `map_location`, so a checkpoint
                saved on a GPU can be restored on a CPU-only machine.
        """
        state_dict = torch.load(fname, map_location=device)
        # load_state_dict returns a report of missing/unexpected keys, not the
        # module, so the move to `device` has to be a separate call.
        self.value_net.load_state_dict(state_dict)
        self.value_net.to(device)


In [23]:
import os
from gym.wrappers.monitoring import video_recorder
# Use SDL's dummy video driver so rendering does not require a display.
os.environ['SDL_VIDEODRIVER']='dummy'
def save_video(agent, env_name, checkpoint_path='checkpoint.pth'):
    """Record one greedy LunarLander-v2 episode of `agent` to an MP4 file.

    Args:
        agent: Object exposing `value_net` (a torch module) and
            `take_action(state)`. Its network weights are overwritten with the
            contents of `checkpoint_path`.
        env_name: Stem of the output file; the video is written to
            "<env_name>.mp4". The environment itself is always
            'LunarLander-v2'.
        checkpoint_path: File holding the network `state_dict` to play back.
    """
    env = gym.make('LunarLander-v2')
    vid = video_recorder.VideoRecorder(env, path="{}.mp4".format(env_name))
    try:
        # Map the checkpoint onto the device the network already lives on, so a
        # checkpoint saved on a GPU can be replayed on a CPU-only machine.
        net_device = next(agent.value_net.parameters()).device
        agent.value_net.load_state_dict(torch.load(checkpoint_path, map_location=net_device))
        state = env.reset()
        done = False
        while not done:
            # NOTE(review): capture_frame() probably renders on its own; this
            # explicit render call is kept from the original - confirm whether
            # it is still needed for the installed gym version.
            env.render(mode='rgb_array')
            vid.capture_frame()

            action = agent.take_action(state)

            state, _, done, _ = env.step(action)
    finally:
        # Close the recorder explicitly so the video file is finalised even when
        # the episode loop raises, then release the environment.
        vid.close()
        env.close()

In [25]:
# NOTE: DON'T change values
# Training-schedule settings read by the training-loop cell.
# NOTE(review): the output recorded under the training cell runs to episode 250,
# so it was not produced with n_episodes = 100 - re-run the cell or reconcile.
n_episodes = 100  # number of training episodes
eps = 1.0  # initial epsilon: probability of taking a random action
eps_decay_rate = 0.97  # epsilon is multiplied by this factor after every episode
eps_end = 0.01  # lower bound that epsilon is clamped to

In [26]:
Batch_size = 64
agent = DQNAgent(state_size, action_size, Batch_size)

crs = np.zeros(n_episodes) # cumulative reward of every episode
crs_recent = deque(maxlen=25) # cumulative rewards of the 25 most recent episodes

# Decay a cell-local copy of the exploration rate. Mutating the global `eps`
# made a second run of this cell start from the already-decayed value instead
# of the configured one.
epsilon = eps
for i_episode in range(1, n_episodes+1):
    state = env.reset()
    score = 0
    done = False
    while not done:
        action = agent.take_action(state, epsilon)
        next_state, reward, done, _ = env.step(action)
        agent.experience_replay.store_trans(state, action, next_state, reward, done)
        # one learning step per environment step
        agent.update_params()
        state = next_state
        score += reward

    # decrease epsilon
    epsilon = max(eps_end, eps_decay_rate*epsilon)
    crs[i_episode-1] = score
    crs_recent.append(score)
    if i_episode % 50  == 0:
        agent.save('checkpoint.pth')
    # '\r' rewrites the same console line every episode; every 25th episode the
    # line is printed again with a newline so it stays in the log.
    progress = 'Episode {}\tAverage Reward: {:.2f}\tEpsilon: {:.2f}'.format(i_episode, np.mean(crs_recent), epsilon)
    print('\r' + progress, end="")
    if i_episode % 25 == 0:
        print('\r' + progress)

    # if np.mean(crs_recent)>=200.0:
    #     print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, np.mean(crs_recent)))
    #     agent.save('checkpoint.pth')
    #     break

# The periodic save only fires on multiples of 50; also store the final weights
# when the run did not end on one, so 'checkpoint.pth' matches the trained agent.
if n_episodes % 50 != 0:
    agent.save('checkpoint.pth')

Episode 25	Average Reward: -157.25	Epsilon: 0.47
Episode 50	Average Reward: -117.95	Epsilon: 0.22
Episode 75	Average Reward: -98.39	Epsilon: 0.10
Episode 100	Average Reward: -59.58	Epsilon: 0.05
Episode 125	Average Reward: -71.28	Epsilon: 0.02
Episode 150	Average Reward: -55.18	Epsilon: 0.01
Episode 175	Average Reward: -44.59	Epsilon: 0.01
Episode 200	Average Reward: -72.43	Epsilon: 0.01
Episode 225	Average Reward: -13.68	Epsilon: 0.01
Episode 250	Average Reward: 127.06	Epsilon: 0.01


In [None]:
# `crs` holds one cumulative reward per episode, so the x axis counts episodes
# (1-based, matching the episode numbers printed during training).
fig, ax = plt.subplots()
ax.plot(np.arange(1, len(crs) + 1), crs)
ax.set_ylabel('Reward')
ax.set_xlabel('Episode')
ax.set_title('Cumulative reward per training episode')
plt.show()

In [28]:
# Fresh agent built from the notebook's configuration values instead of repeated
# literals; save_video() replaces its weights with those in 'checkpoint.pth'.
agent = DQNAgent(state_size=state_size, action_size=action_size, batch_size=Batch_size)
save_video(agent,'DQN_64_250_2')