In [1]:
%%capture
!pip install box2d-py

In [2]:
import os
import cv2
import gym
import time
import collections

import numpy as np
import torch as T
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm

%matplotlib inline

In [20]:
# Show which accelerator is in use. Guarded so that a fresh "Run All" on a
# CPU-only machine displays 'cpu' instead of raising.
T.cuda.get_device_name() if T.cuda.is_available() else 'cpu'

'Tesla K80'

## **Network**

In [26]:
class ActorCriticNetwork(nn.Module):
    """Two-headed MLP with a shared trunk: policy logits and a state value.

    Args:
        lr: Learning rate for the Adam optimizer owned by this network.
        observation_shape: Shape of one observation; only ``observation_shape[0]``
            is used, as the input feature count.
        n_actions: Number of discrete actions (size of the policy head).
        fc1_dims: Width of the first hidden layer (default 2048).
        fc2_dims: Width of the second hidden layer (default 1536).

    Attributes:
        device: ``cuda:0`` when CUDA is available, otherwise ``cpu``.
        optimizer: Adam optimizer over all parameters of this module.
    """

    def __init__(self, lr, observation_shape, n_actions, fc1_dims=2048, fc2_dims=1536):
        super().__init__()
        # ANN
        self.fc1 = nn.Linear(observation_shape[0], fc1_dims)
        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self.pi = nn.Linear(fc2_dims, n_actions)
        self.V = nn.Linear(fc2_dims, 1)
        # UTILS
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        # Move the parameters to their final device first, then build the
        # optimizer over them.
        self.to(self.device)
        self.optimizer = optim.Adam(self.parameters(), lr=lr)

    def forward(self, state):
        """Return ``(pi, V)`` for a batch of states.

        ``pi`` holds the raw (pre-softmax) outputs of the policy head and
        ``V`` the output of the value head; both are computed from the same
        two ReLU hidden layers.
        """
        hidden = F.relu(self.fc1(state))
        hidden = F.relu(self.fc2(hidden))
        pi = self.pi(hidden)
        V = self.V(hidden)
        return pi, V

## **Agent**

In [27]:
class ActorCriticAgent:
    """One-step (TD(0)) actor-critic agent for a discrete action space.

    Args:
        observation_shape: Shape of one observation, forwarded to the network.
        n_actions: Number of discrete actions.
        lr: Learning rate of the network's optimizer.
        gamma: Discount factor applied to the bootstrapped next-state value.
    """

    def __init__(self, observation_shape, n_actions, lr, gamma):
        self.observation_shape = observation_shape
        self.n_actions = n_actions
        self.LR = lr
        self.GAMMA = gamma
        # AC
        self.actor_critic = ActorCriticNetwork(lr, observation_shape, n_actions)

    def _as_batch(self, observation):
        """Convert one observation into a float32 batch of size 1 on the network device."""
        # Go through a single ndarray: building a tensor from a Python list of
        # ndarrays (T.tensor([obs])) is very slow and emits a warning.
        array = np.asarray(observation, dtype=np.float32)
        batch = T.tensor(array, dtype=T.float32, device=self.actor_critic.device)
        return T.unsqueeze(batch, 0)

    def get_action(self, observation):
        """Sample an action from the current policy.

        Returns:
            ``(action, logprob)`` where ``action`` is a Python int and
            ``logprob`` is the log-probability tensor of that action, still
            attached to the graph so ``learn`` can backpropagate through it.
        """
        state = self._as_batch(observation)
        # PROB DISTRIBUTION (built from logits: same distribution as
        # softmax + probs, but normalised in log-space)
        logits, _ = self.actor_critic(state)
        action_distribution = T.distributions.Categorical(logits=logits)
        # ACTION & LOG_PROB
        action = action_distribution.sample()
        logprob = action_distribution.log_prob(action)
        return action.item(), logprob

    def learn(self, observation, reward, observation_, logprob, done):
        """Perform one actor-critic update from a single transition.

        Args:
            observation: State the action was taken in.
            reward: Scalar reward received.
            observation_: Resulting next state.
            logprob: Log-probability tensor returned by ``get_action``.
            done: Whether the transition ended the episode.
        """
        state = self._as_batch(observation)
        next_state = self._as_batch(observation_)
        # GET V and V_ from CRITIC
        _, V = self.actor_critic(state)
        # The bootstrap target is treated as a constant (semi-gradient TD):
        # no gradient flows through V(s').
        with T.no_grad():
            _, V_ = self.actor_critic(next_state)
            # target = R + gamma * V_ if not done else R
            target = self.GAMMA * V_ * (1.0 - float(done)) + float(reward)
        # CALC LOSS & BACKPROP
        delta = target - V
        # The TD error only weights the policy gradient; detach it so the
        # actor term does not push gradients into the critic.
        actor_loss = -(delta.detach() * logprob)
        critic_loss = delta ** 2
        loss = (actor_loss + critic_loss).sum()
        self.actor_critic.optimizer.zero_grad()
        loss.backward()
        self.actor_critic.optimizer.step()

## **Training**

In [32]:
# Environment id passed to gym.make; presumably the reason box2d-py is
# installed at the top of the notebook (TODO confirm).
env_name = "LunarLander-v2"
env = gym.make(env_name)

# Number of episodes the training loop runs.
N_EPISODES = 3000

In [33]:
# Build the agent from the environment's observation shape and its number of
# discrete actions; gamma is the discount factor used in the TD target.
agent = ActorCriticAgent(observation_shape=env.observation_space.shape,
                         n_actions=env.action_space.n,
                         lr=5e-6,                        # SMALL TO PREVENT LARGE CHANGES IN POLICY
                         gamma=0.99)

In [None]:
# Per-episode statistics; mean_rewards is what the final cell plots.
episode_rewards = []
episode_lengths = []
mean_rewards = []
best_reward = -np.inf

for episode_n in tqdm(range(N_EPISODES)):
    total_reward = 0
    total_moves = 0

    observation = env.reset()
    done = False

    # Play one episode, updating the agent after every single transition.
    while True:
        action, logprob = agent.get_action(observation)
        observation_, reward, done, _ = env.step(action)

        total_moves += 1
        total_reward += reward

        agent.learn(observation, reward, observation_, logprob, done)

        observation = observation_
        if done:
            break

    episode_lengths.append(total_moves)
    episode_rewards.append(total_reward)

    # Running mean over (at most) the 100 most recent episodes.
    mean_reward = np.mean(episode_rewards[-100:])
    mean_rewards.append(mean_reward)
    if mean_reward > best_reward:
        best_reward = mean_reward
        print("BEST: ",best_reward)

    print("ITER: ",episode_n,"\tRWD: ",round(total_reward,2),"\tM_RWD: ",round(mean_reward,2),"\tLEN: ",total_moves)

HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))

BEST:  -122.70520181008123
ITER:  0 	RWD:  -122.71 	M_RWD:  -122.71 	LEN:  96
BEST:  -103.98701572541232
ITER:  1 	RWD:  -85.27 	M_RWD:  -103.99 	LEN:  88
ITER:  2 	RWD:  -251.53 	M_RWD:  -153.17 	LEN:  83
ITER:  3 	RWD:  -393.76 	M_RWD:  -213.32 	LEN:  179
ITER:  4 	RWD:  -234.85 	M_RWD:  -217.62 	LEN:  99
ITER:  5 	RWD:  -79.43 	M_RWD:  -194.59 	LEN:  90
ITER:  6 	RWD:  -244.33 	M_RWD:  -201.7 	LEN:  78
ITER:  7 	RWD:  -311.5 	M_RWD:  -215.42 	LEN:  156
ITER:  8 	RWD:  -17.03 	M_RWD:  -193.38 	LEN:  126
ITER:  9 	RWD:  -164.33 	M_RWD:  -190.47 	LEN:  140
ITER:  10 	RWD:  -8.0 	M_RWD:  -173.88 	LEN:  121
ITER:  11 	RWD:  -48.77 	M_RWD:  -163.46 	LEN:  89
ITER:  12 	RWD:  -66.29 	M_RWD:  -155.98 	LEN:  115
ITER:  13 	RWD:  -91.34 	M_RWD:  -151.37 	LEN:  192
ITER:  14 	RWD:  -342.58 	M_RWD:  -164.11 	LEN:  129
ITER:  15 	RWD:  -207.35 	M_RWD:  -166.82 	LEN:  138
ITER:  16 	RWD:  -266.83 	M_RWD:  -172.7 	LEN:  209
ITER:  17 	RWD:  -325.44 	M_RWD:  -181.18 	LEN:  110
ITER:  18 	RWD:  -184

In [None]:
# Learning curve: running mean reward (window of up to 100 episodes) per episode.
fig, ax = plt.subplots()
ax.plot(mean_rewards)
ax.set(title="Actor-critic training progress",
       xlabel="Episode",
       ylabel="Mean reward (last 100 episodes)")
plt.show()