<a href="https://colab.research.google.com/github/moodlep/MLC_A3C/blob/main/a3c_pm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip3 install box2d-py
!pip3 install gym[Box_2D]

Collecting box2d-py
  Downloading box2d_py-2.3.8-cp37-cp37m-manylinux1_x86_64.whl (448 kB)
[?25l[K     |▊                               | 10 kB 25.8 MB/s eta 0:00:01[K     |█▌                              | 20 kB 26.8 MB/s eta 0:00:01[K     |██▏                             | 30 kB 30.0 MB/s eta 0:00:01[K     |███                             | 40 kB 30.4 MB/s eta 0:00:01[K     |███▋                            | 51 kB 33.1 MB/s eta 0:00:01[K     |████▍                           | 61 kB 32.8 MB/s eta 0:00:01[K     |█████▏                          | 71 kB 30.8 MB/s eta 0:00:01[K     |█████▉                          | 81 kB 31.7 MB/s eta 0:00:01[K     |██████▋                         | 92 kB 32.2 MB/s eta 0:00:01[K     |███████▎                        | 102 kB 34.3 MB/s eta 0:00:01[K     |████████                        | 112 kB 34.3 MB/s eta 0:00:01[K     |████████▊                       | 122 kB 34.3 MB/s eta 0:00:01[K     |█████████▌                      | 133 

In [2]:
import os
import Box2D
import pyglet
import imageio
os.system("Xvfb :1 -screen 0 1024x768x24 &")
os.environ['DISPLAY'] = ':1'


In [16]:
import gym
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F


In [4]:
# The env - quick test: 

env = gym.make("LunarLander-v2")

s = env.reset()

for _ in range(5): 
  a = env.action_space.sample()
  next_state, reward, done, info = env.step(a)
  print(next_state, reward, a)


[-0.00403366  1.4108136  -0.20916715 -0.01503031  0.00628436  0.07962076
  0.          0.        ] -1.2594954679889543 1
[-0.00601196  1.4104248  -0.20221329 -0.01730717  0.01062587  0.08683817
  0.          0.        ] -0.020585698559887067 2
[-0.00807648  1.4094313  -0.21303622 -0.04421211  0.01713547  0.13020416
  0.          0.        ] -2.0449457888962557 1
[-0.01022396  1.4078481  -0.22343937 -0.07048976  0.02572237  0.17175414
  0.          0.        ] -2.4036715007549376 1
[-0.01228151  1.4056733  -0.2121342  -0.09678624  0.03203259  0.12621632
  0.          0.        ] -0.3327739370253642 3


In [14]:
env.action_space,env.action_space.n, env.observation_space, env.observation_space.shape[0], env.observation_space.sample()

(Discrete(4),
 4,
 Box(-inf, inf, (8,), float32),
 8,
 array([-1.0567338 , -0.9767524 , -0.11754707,  0.04958226, -1.0946387 ,
        -0.60420877, -0.39772463,  0.6373612 ], dtype=float32))

### Actor - policy NN and value NN 
### data collection -> batch
### train: calculate loss 

In [46]:

class Policy(nn.Module):
	def __init__(self, state_dim,action_dim,hidden=100):
		super(Policy, self).__init__()

		self.l1 = nn.Linear(state_dim, hidden)
		self.l2 = nn.Linear(hidden, hidden)
		self.l3 = nn.Linear(hidden,action_dim)

	def forward(self, state):
		q = F.leaky_relu(self.l1(state))
		q = F.leaky_relu(self.l2(q))
		return F.softmax(self.l3(q), dim = 1)
	
	def get_action(self, state):
		# check no grad; confirmed (https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/main.py#L114)
		with torch.no_grad():  
			pol = self.forward(state)
			dist = torch.distributions.Categorical(pol)
		return dist.sample() #returns a batch of values 
	
	def log_prob(self, state, actions):
		# Part of the loss term
		pol = self.forward(state)
		print("policy: ", pol)
		print("categoricals: ", torch.distributions.Categorical(pol))
		print("log_probs: action 1 ", torch.distributions.Categorical(pol).log_prob(actions[0]))
		print("log_probs: action 2 ", torch.distributions.Categorical(pol).log_prob(actions[1]))
		log_prob = torch.distributions.Categorical(pol).log_prob(actions)
		return log_prob
	
	def entropy(self, state):
		pol = self.forward(state)
		return torch.distributions.Categorical(pol).entropy()
    


In [47]:
# create batch of states 
batch_states = torch.rand(5, env.observation_space.shape[0])

policy = Policy(env.observation_space.shape[0], env.action_space.n)
policy(batch_states).data

tensor([[0.2653, 0.2191, 0.2110, 0.3045],
        [0.2722, 0.2180, 0.2224, 0.2874],
        [0.2656, 0.2196, 0.2169, 0.2979],
        [0.2675, 0.2265, 0.2190, 0.2870],
        [0.2662, 0.2250, 0.2050, 0.3038]])

In [48]:
batch_actions = policy.get_action(batch_states)
batch_actions

tensor([0, 0, 3, 1, 0])

In [49]:
policy.log_prob(batch_states, batch_actions)

policy:  tensor([[0.2653, 0.2191, 0.2110, 0.3045],
        [0.2722, 0.2180, 0.2224, 0.2874],
        [0.2656, 0.2196, 0.2169, 0.2979],
        [0.2675, 0.2265, 0.2190, 0.2870],
        [0.2662, 0.2250, 0.2050, 0.3038]], grad_fn=<SoftmaxBackward>)
categoricals:  Categorical(probs: torch.Size([5, 4]))
log_probs: action 1  tensor([-1.3268, -1.3011, -1.3258, -1.3187, -1.3235],
       grad_fn=<SqueezeBackward1>)
log_probs: action 2  tensor([-1.3268, -1.3011, -1.3258, -1.3187, -1.3235],
       grad_fn=<SqueezeBackward1>)


tensor([-1.3268, -1.3011, -1.2111, -1.4851, -1.3235],
       grad_fn=<SqueezeBackward1>)

In [None]:
class Critic(nn.Module):
  	def __init__(self, state_dim,hidden=100):
		  super(Critic, self).__init__()
    
      self.l1 = nn.Linear(state_dim, hidden)
      self.l2 = nn.Linear(hidden, hidden)
      self.l3 = nn.Linear(hidden,1)

    def forward(self, state):
      q = F.leaky_relu(self.l1(state))
      q = F.leaky_relu(self.l2(q))
      return self.l3(q)



In [None]:
class ActorCritic():
  	def __init__(self, env, pol):
    
    
    


In [None]:
# worker process
# Input: A2C network, env, no of steps, 

# 1. 