In [None]:
# Print the GPU name, driver version and total memory as CSV for the runtime this notebook is attached to.
!nvidia-smi --query-gpu=gpu_name,driver_version,memory.total --format=csv

In [None]:
!pip install box2d

In [5]:
# coding: utf-8

__author__ = 'zhenhang.sun@gmail.com'
__version__ = '1.0.0'

import gym
import math
import random
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class Net(nn.Module):
    """Three-layer fully connected network: input -> 256 -> 128 -> output.

    ReLU is applied after the first two linear layers; the last layer is
    returned without an activation.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        first_width, second_width = 256, 128
        self.linear1 = nn.Linear(input_size, first_width)
        self.linear2 = nn.Linear(first_width, second_width)
        self.linear3 = nn.Linear(second_width, output_size)

    def forward(self, x):
        """Map a batch of inputs `x` to one output row per input row."""
        hidden = F.relu(self.linear1(x))
        hidden = F.relu(self.linear2(hidden))
        return self.linear3(hidden)

class Agent(object):
    """DQN agent with an online network, a target network and a replay buffer.

    Every keyword argument passed to the constructor is stored as an attribute
    of the same name. The methods of this class read the following ones:

        state_space_dim, action_space_dim -- input / output sizes of both networks
        lr               -- learning rate handed to Adam
        gamma            -- discount factor of the TD target
        epsi_high        -- initial exploration rate
        epsi_low, decay  -- floor and per-episode multiplicative decay of epsilon
        capacity         -- maximum number of transitions kept in the buffer
        batch_size       -- number of transitions used per learning step
        update_interval  -- the target network is synced in `update_episode`
                            whenever `episodes` is a multiple of this value
    """

    def __init__(self, **kwargs):
        for key, value in kwargs.items():
            setattr(self, key, value)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.eval_net = Net(self.state_space_dim, self.action_space_dim).to(self.device)
        self.target_net = Net(self.state_space_dim, self.action_space_dim).to(self.device)
        self.target_net.load_state_dict(self.eval_net.state_dict())
        self.optimizer = optim.Adam(self.eval_net.parameters(), lr=self.lr)
        self.loss_fn = nn.MSELoss()
        # Per-step epsilon table; only the commented-out "adaptive epsilon"
        # variant in update_episode() reads it.
        self.epsi = [self.epsi_high] * 1000
        self.curr_epsi = self.epsi_high
        self.buffer = []
        # Slot that put() overwrites next once the buffer has reached capacity.
        self._write_pos = 0
        self.steps = 0
        self.episodes = 0

    def act(self, s0):
        """Return an action index for state `s0` using epsilon-greedy selection.

        With probability `curr_epsi` a uniformly random action is returned,
        otherwise the argmax of the online network's output. `steps` is
        incremented on every call.
        """
        if random.random() < self.curr_epsi:
            # Select next action randomly
            a0 = random.randrange(self.action_space_dim)
        else:
            # Select next action according to policy
            self.eval_net.eval()
            with torch.no_grad():
                state = torch.as_tensor(np.asarray(s0, dtype=np.float32), device=self.device)
                a0 = torch.argmax(self.eval_net(state.reshape(1, -1))).item()

        self.steps += 1
        return a0

    def put(self, s0, a0, r1, s1, done):
        """Store one transition and count finished episodes.

        The buffer is used as a ring: while it is below `capacity` transitions
        are appended; afterwards the oldest slot is overwritten in place. This
        keeps insertion O(1) instead of shifting the whole list with pop(0).
        As a consequence the list is not ordered by age once it is full.
        """
        transition = (s0, a0, r1, s1, done)
        if len(self.buffer) < self.capacity:
            self.buffer.append(transition)
        else:
            self.buffer[self._write_pos] = transition
            self._write_pos = (self._write_pos + 1) % self.capacity

        if done:
            self.episodes += 1

    def _to_tensors(self, samples):
        """Convert a list of transitions into (s0, a0, r1, s1, d1) tensors on `device`.

        Each field is stacked into a single NumPy array first; handing a
        sequence of separate arrays straight to a tensor constructor converts
        them element by element and is much slower. a0, r1 and d1 are returned
        as column vectors with one row per transition.
        """
        s0, a0, r1, s1, d1 = zip(*samples)
        rows = len(samples)
        s0 = torch.as_tensor(np.asarray(s0, dtype=np.float32), device=self.device)
        a0 = torch.as_tensor(np.asarray(a0, dtype=np.int64), device=self.device).view(rows, -1)
        r1 = torch.as_tensor(np.asarray(r1, dtype=np.float32), device=self.device).view(rows, -1)
        s1 = torch.as_tensor(np.asarray(s1, dtype=np.float32), device=self.device)
        d1 = torch.as_tensor(np.asarray(d1, dtype=np.float32), device=self.device).view(rows, -1)
        return s0, a0, r1, s1, d1

    def _td_target(self, r1, s1, d1):
        """Return r1 + (1 - d1) * gamma * max_a target_net(s1), without tracking gradients."""
        with torch.no_grad():
            next_q = torch.max(self.target_net(s1), dim=1)[0].view(-1, 1)
            return r1 + (1 - d1) * self.gamma * next_q

    def sample(self):
        """Return `batch_size` transitions, half of them chosen by TD error.

        A candidate batch of `batch_size` transitions is drawn uniformly and
        ranked by absolute TD error; its top `batch_size // 2` entries are
        appended to a second, independent uniform draw that supplies the
        remaining transitions.
        """
        error_samples = random.sample(self.buffer, self.batch_size)
        s0, a0, r1, s1, d1 = self._to_tensors(error_samples)

        self.eval_net.eval()
        with torch.no_grad():
            y_pred = self.eval_net(s0).gather(1, a0)
            y_true = self._td_target(r1, s1, d1)

        errors = (y_pred - y_true).abs()
        indices = torch.sort(errors, dim=0, descending=True)[1]
        indices = torch.flatten(indices).tolist()
        n_by_error = self.batch_size // 2
        indices = indices[:n_by_error]

        # The uniform part takes the larger half when batch_size is odd.
        samples = random.sample(self.buffer, self.batch_size - n_by_error)
        samples.extend(error_samples[idx] for idx in indices)

        # TODO: Fix overlap transitions -- the two draws are independent, so the
        # same transition can end up in the returned batch twice.

        return samples

    def learn(self):
        """Run one optimisation step on a batch from the replay buffer.

        Does nothing while the buffer holds fewer than `batch_size` transitions.
        """
        if len(self.buffer) < self.batch_size:
            return

        # Sample a batch of transitions from replay buffer
        #--- uncomment for prioritized replay ---#
        # samples = self.sample()
        #----------------------------------------#

        #--- uncomment for normal replay ---#
        samples = random.sample(self.buffer, self.batch_size)
        #-----------------------------------#

        s0, a0, r1, s1, d1 = self._to_tensors(samples)

        # Compute current and target Q-values
        self.eval_net.train()
        y_pred = self.eval_net(s0).gather(1, a0)
        y_true = self._td_target(r1, s1, d1)

        # Optimize model
        loss = self.loss_fn(y_pred, y_true)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_episode(self, episode_rewards):
        """End-of-episode bookkeeping: decay epsilon, reset `steps`, sync the target net.

        `episode_rewards` is only used by the commented-out adaptive batch size
        variant below.
        """
        # Increase batch size
        #--- uncomment for adaptive batch size ---#
        # mean = np.mean(episode_rewards[-50:])
        # if len(episode_rewards) >= 50 and mean > 170:
        #     self.batch_size = min(256, self.batch_size+1)
        # else:
        #     self.batch_size = min(128, self.batch_size)
        #-----------------------------------------#

        # Update epsilons
        #--- uncomment for adaptive epsilon ---#
        # for n in range(len(self.epsi)):
        #     self.epsi_low =  (0.01/1000) * n
        #     self.decay = 0.995 - (0.01*(1000-n))/1000
        #     self.epsi[n] = max(self.epsi_low, self.epsi[n] * self.decay)
        # self.curr_epsi = self.epsi[self.steps-1]
        #--------------------------------------#

        #--- uncomment for normal epsilon ---#
        self.curr_epsi = max(self.epsi_low, self.curr_epsi * self.decay)
        #------------------------------------#

        self.steps = 0

        # Update target network
        if self.episodes % self.update_interval == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
            self.target_net.eval()


In [None]:
# coding: utf-8

__author__ = 'zhenhang.sun@gmail.com'
__version__ = '1.0.0'

# cd LunarLander-v2
# activate gym
# python lunarlander-0.py

import gym
import torch
from IPython import display
import matplotlib.pyplot as plt

# from dqn_lunarlander import Agent


def plot_rewards(rewards=None):
    """Redraw the reward curve and report whether the solved threshold is reached.

    Args:
        rewards: sequence of per-episode total rewards. When omitted, the
            module-level `episode_rewards` list is used.

    Returns:
        True when at least 100 episodes are available and the mean of the
        last 100 is >= 200, otherwise False.
    """
    if rewards is None:
        rewards = episode_rewards

    plt.figure(2)
    plt.clf()
    rewards_t = torch.FloatTensor(rewards)
    plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Total Rewards')
    plt.plot(rewards_t.numpy())

    solved = False
    # take 100 episode averages and plot them too
    if len(rewards_t) >= 100:
        means = rewards_t.unfold(0, 100, 1).mean(1).view(-1)
        solved = means[-1].item() >= 200
        # Pad the first 99 episodes (no full window yet) with -200 so the
        # averaged curve lines up with the episode axis. A float fill value
        # keeps the padding in the same dtype as `means`.
        padding = torch.full((99,), -200.0)
        plt.plot(torch.cat((padding, means)).numpy())

    plt.pause(0.001)  # pause a bit so that plots are updated
    if solved:
        plt.title('Solved!')
        plt.pause(0.001)
    return solved


if __name__ == '__main__':
    # Train 10 independent agents on LunarLander-v2, each for at most 1000
    # episodes, and record the episode index at which every trial stopped.

    trial_episodes = []
    for trial in range(10):
        env = gym.make('LunarLander-v2')
        try:
            params = {
                'gamma': 0.995, #0.995
                'epsi_high': 0.5,   #0.7, 0.5
                'epsi_low': 0.00,   #0.01
                'decay': 0.995,  #0.99, 0.995
                'lr': 0.0001,   #0.0001
                'capacity': 400000, #100000
                'batch_size': 128,  #64
                'error_batch_size': 128,   #64  rename: per_batch_size (not read by Agent in this file)
                'update_interval': 1,   # unit: episode
                'state_space_dim': env.observation_space.shape[0],
                'action_space_dim': env.action_space.n
            }
            agent = Agent(**params)

            # plot_rewards() reads this module-level list.
            episode_rewards = []
            for episode in range(1000):
                # Newer gym releases return (observation, info) from reset();
                # older ones return the observation only.
                reset_result = env.reset()
                s0 = reset_result[0] if isinstance(reset_result, tuple) else reset_result
                tot_rewards = 0

                while True:
                    # env.render()
                    a0 = agent.act(s0)
                    # Newer gym releases return a 5-tuple with separate
                    # terminated / truncated flags; older ones a 4-tuple
                    # with a single done flag.
                    step_result = env.step(a0)
                    if len(step_result) == 5:
                        s1, r1, terminated, truncated, _ = step_result
                        done = terminated or truncated
                    else:
                        s1, r1, done, _ = step_result

                    agent.put(s0, a0, r1, s1, done)
                    tot_rewards += r1

                    if done:
                        episode_rewards.append(tot_rewards)
                        success = plot_rewards()
                        break

                    s0 = s1
                    agent.learn()

                agent.update_episode(episode_rewards)

                print("Episode: {} Rewards: {}".format(episode, tot_rewards))
                if success:
                    print("Solved!")
                    break
        finally:
            # Each trial creates its own environment, so release it here
            # instead of closing only the last one after all trials.
            env.close()
        trial_episodes.append(episode)
        print(trial_episodes)
    plt.ioff()
    plt.show()


In [None]:
import numpy as np 
# Show the episode index at which each trial stopped, then their mean on the next line.
print(trial_episodes, np.mean(trial_episodes), sep='\n')