In [1]:
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) # error only
import numpy as np
import random
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import math
import glob
import io
import base64
from IPython.display import HTML
from collections import deque
import torch
from torch import nn
from torch import optim
from torch.nn import functional as F

from IPython import display as ipythondisplay

In [2]:
"""
Utility functions to enable video recording of gym environment and displaying it
To enable video, just do "env = wrap_env(env)""
"""

def show_video():
  mp4list = glob.glob('./video/*.mp4')
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    video = io.open(mp4, 'r+b').read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else: 
    print("Could not find video")
    

def wrap_env(env):
  env = Monitor(env, './video', force=True)
  return env

In [3]:
env = gym.make('LunarLander-v2')
env.reset()
for i in range(500):
    env.render()
    state,reward,done,_ = env.step(env.action_space.sample())

    if done: break
env.close()

The state vector of length 7 contains the following information in this order:
```
[
  posistion x, 
  position y, 
  velocity in x direction, 
  velocity in y direction, 
  lander angle, 
  left leg makes contact (0/1), 
  right leg makes contact (0/1)
]
```

In [113]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [114]:
class Model(nn.Module):
    def __init__(self, state_size, num_actions):
        super(Model, self).__init__()
        self.state_size = state_size
        self.num_actions = num_actions
        self.layer1 = nn.Linear(state_size, 64)
        self.layer2 = nn.Linear(64,64)
        self.output = nn.Linear(64, self.num_actions)

    def forward(self, x):
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        return self.output(x)    

class ReplayBuffer():
    def __init__(self, max_size):
        self.max_size = max_size
        self.buffer = deque(maxlen = self.max_size)
  
    def add(self, item):
        self.buffer.append(item)

    def sample(self, n):
        samples = random.sample(self.buffer,k=n)
        state = np.array([item[0] for item  in samples])
        action = np.array([item[1] for item  in samples])
        reward = np.array([item[2] for item  in samples])
        next_state = np.array([item[3] for item  in samples])
        done = np.array([item[4] for item  in samples])
        return state,action,reward,next_state,done
    

    def __len__(self):
        return len(self.buffer)

class DQN_Agent():
    def __init__(self, state_size=8, action_size=4):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = .95
        self.epsilon = 1.0 # exploration rate
        self.replay_buffer = ReplayBuffer(max_size=1000)
        self.epsilon_min = 0.001
        self.epsilon_decay = 0.995
        self.learning_rate = 0.01
        self.model = Model(self.state_size, self.action_size).to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr = 1e-4)
        self.rewards_per_episode = []
        self.current_episode = 0
        self.eval = False

    def act(self, state):
        '''
        agent acts with an e-greedy policy.  If self.eval is True, then agent
        will chose action with largest Q value.
        '''
        if self.eval == False and np.random.rand() < self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model(torch.tensor(state)).detach().numpy()
        return np.argmax(act_values)
  
    def decay_epsilon(self):
        self.epsilon = max(self.epsilon_min, self.gamma*self.epsilon)

    def load_agent_model(self, path):
        self.model = torch.load(path).to(device)


    def train(self, episodes, env, batch_size=32):
        self.eval = False
        for e in range(episodes):
            episode_rewards = 0
            state = env.reset()
            done = False
            total_reward = 0
            t=0
            # start playing Lunar Lander
            while not done:
                # get action 
                action = self.act(state)
                next_state, reward, done, _ = env.step(action)
                total_reward += reward
                self.replay_buffer.add((state,action,reward,next_state,done))
                if len(self.replay_buffer) >= batch_size and ((t % 4) == 0) :
                    # vectorized replay
                    state, action, reward, next_state_replay, replay_done = self.replay_buffer.sample(batch_size)
                    y = torch.tensor(reward).float().to(device)
                    y += self.gamma * \
                       self.model(torch.tensor(next_state_replay)).detach().max(1).values *\
                       (1-done)
                    y = y.unsqueeze(1)
                    state = torch.tensor(state).to(device)
                    action = torch.tensor(action).to(device)
                    
                    Q_exp = self.model(state).gather(1,action.unsqueeze(1))
                    loss = F.mse_loss(Q_exp, y)
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()
                state = next_state
            torch.save(self.model.state_dict(), "model.pt")
            t+=1

            self.decay_epsilon()
            self.current_episode+=1
            self.rewards_per_episode.append(total_reward)

            window = 25
            if ((self.current_episode) % window) == 0:
                avg_e = np.mean(self.rewards_per_episode[self.current_episode-window:self.current_episode+1])
                print(f"Episode {self.current_episode} | Reward: {avg_e} | Epsilon: {self.epsilon}")

In [115]:
my_agent = DQN_Agent()

In [116]:
my_agent.train(5000,env,batch_size=32)

Episode 25 | Reward: -242.9460503069809 | Epsilon: 0.27738957312183365


KeyboardInterrupt: 

In [None]:
env = wrap_env(env)
state = env.reset()
done = False
my_agent.eval = True
while not done:
  env.render()
  action = my_agent.act(state)
  state, reward, done, _  = env.step(action)
env.close()
show_video()
