<a href="https://colab.research.google.com/github/moodlep/MLC_A3C/blob/main/a3c.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip3 install box2d-py
!pip3 install gym[Box_2D]

Collecting box2d-py
  Downloading box2d_py-2.3.8-cp37-cp37m-manylinux1_x86_64.whl (448 kB)
[?25l[K     |▊                               | 10 kB 17.9 MB/s eta 0:00:01[K     |█▌                              | 20 kB 11.2 MB/s eta 0:00:01[K     |██▏                             | 30 kB 9.3 MB/s eta 0:00:01[K     |███                             | 40 kB 8.1 MB/s eta 0:00:01[K     |███▋                            | 51 kB 4.3 MB/s eta 0:00:01[K     |████▍                           | 61 kB 4.5 MB/s eta 0:00:01[K     |█████▏                          | 71 kB 4.8 MB/s eta 0:00:01[K     |█████▉                          | 81 kB 5.4 MB/s eta 0:00:01[K     |██████▋                         | 92 kB 3.9 MB/s eta 0:00:01[K     |███████▎                        | 102 kB 4.2 MB/s eta 0:00:01[K     |████████                        | 112 kB 4.2 MB/s eta 0:00:01[K     |████████▊                       | 122 kB 4.2 MB/s eta 0:00:01[K     |█████████▌                      | 133 kB 4.2 MB/

In [None]:
import os
import Box2D
import pyglet
import imageio
# Launch a virtual X framebuffer on display :1 as a background shell job.
os.system("Xvfb :1 -screen 0 1024x768x24 &")
# Point DISPLAY at that same display number for anything started from this kernel.
os.environ.update({'DISPLAY': ':1'})


In [None]:
import gym
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.multiprocessing as mp


In [None]:
# Smoke test: build the environment and take a handful of random steps,
# printing the resulting observation, the reward and the sampled action.
env = gym.make("LunarLander-v2")
s = env.reset()

n_random_steps = 5
for _ in range(n_random_steps):
    a = env.action_space.sample()
    next_state, reward, done, info = env.step(a)
    print(next_state, reward, a)


[-0.00671301  1.3930244  -0.3336074  -0.4105778   0.00580508  0.0372713
  0.          0.        ] -0.5840980788976526 3
[-0.00994568  1.3831863  -0.3229907  -0.43725243  0.00553722 -0.00535743
  0.          0.        ] -0.47987389698741256 3
[-0.01317825  1.372748   -0.32298952 -0.46392414  0.00527007 -0.0053438
  0.          0.        ] -1.0996906444878505 0
[-0.01624842  1.3630856  -0.30755308 -0.4294401   0.00580306  0.01066116
  0.          0.        ] 4.3169633396255565 2
[-0.01931849  1.3528231  -0.3075549  -0.45611992  0.00633539  0.01064755
  0.          0.        ] -1.2222191633063915 0


In [None]:
class SharedAdam(torch.optim.Adam):
    """Adam optimizer whose per-parameter state is allocated in shared memory.

    The optimizer state (step counter and both moment estimates) is created
    eagerly in the constructor and moved to shared memory, so that processes
    which receive this optimizer update the same state tensors instead of
    private copies.

    Args:
        params: iterable of parameters (or parameter groups) to optimize.
        lr: learning rate.
        betas: coefficients for the running averages of the gradient and of
            its square. Note the second default is 0.99, not Adam's usual 0.999.
        eps: term added to the denominator for numerical stability.
        weight_decay: L2 penalty.
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.99), eps=1e-8,
                 weight_decay=0):
        super(SharedAdam, self).__init__(params, lr=lr, betas=betas, eps=eps,
                                         weight_decay=weight_decay)
        # Eager state initialization (Adam normally does this lazily on the
        # first call to step()).
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                # The step counter must be a tensor: recent PyTorch releases
                # reject non-tensor step entries, and a plain int could not be
                # placed in shared memory anyway.
                state['step'] = torch.zeros(())
                state['exp_avg'] = torch.zeros_like(p.data)
                state['exp_avg_sq'] = torch.zeros_like(p.data)

                # Share every piece of state, including the step counter.
                for key in ('step', 'exp_avg', 'exp_avg_sq'):
                    state[key].share_memory_()

In [None]:
# Display the environment's action space.
env.action_space

Discrete(4)

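### Actor-critic components: a policy network (actor) and a value network (critic)
### Data collection: roll out the policy and gather the transitions into a batch
### Training: calculate the loss from the collected batch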

In [None]:

class Policy(nn.Module):
    """Actor network: maps a state to a categorical distribution over actions.

    Three fully connected layers with leaky-ReLU activations followed by a
    softmax over the last dimension, so both a batch of states
    ``(batch, state_dim)`` and a single state ``(state_dim,)`` are accepted.

    Args:
        state_dim: size of the state vector.
        action_dim: number of discrete actions.
        hidden: width of the two hidden layers.
    """

    def __init__(self, state_dim, action_dim, hidden=100):
        super(Policy, self).__init__()

        self.l1 = nn.Linear(state_dim, hidden)
        self.l2 = nn.Linear(hidden, hidden)
        self.l3 = nn.Linear(hidden, action_dim)

    def forward(self, state):
        """Return the action probabilities for ``state``."""
        q = F.leaky_relu(self.l1(state))
        q = F.leaky_relu(self.l2(q))
        # Normalize over the action dimension (the last one) so that the
        # network works with or without a leading batch dimension.
        return F.softmax(self.l3(q), dim=-1)

    def _distribution(self, state):
        """Build the categorical action distribution for ``state``."""
        return torch.distributions.Categorical(self.forward(state))

    def get_action(self, state):
        """Sample one action per state; no gradient is tracked."""
        with torch.no_grad():
            return self._distribution(state).sample()

    def log_prob(self, state, actions):
        """Return log pi(actions | state); differentiable, used in the loss."""
        return self._distribution(state).log_prob(actions)

    def entropy(self, state):
        """Return the entropy of the action distribution for each state."""
        return self._distribution(state).entropy()
    


In [None]:
# create batch of states 
# Feed a random batch of five states through a freshly built policy and
# display the resulting action probabilities.
obs_dim = env.observation_space.shape[0]
batch_states = torch.rand(5, obs_dim)

policy = Policy(obs_dim, env.action_space.n)
policy(batch_states).data

tensor([[0.2868, 0.2679, 0.2083, 0.2371],
        [0.2706, 0.2647, 0.2139, 0.2509],
        [0.2659, 0.2731, 0.2127, 0.2483],
        [0.2693, 0.2704, 0.2137, 0.2466],
        [0.2707, 0.2673, 0.2149, 0.2472]])

In [None]:
# Sample one action per state in the batch (get_action samples under torch.no_grad()).
batch_actions = policy.get_action(batch_states)
batch_actions

tensor([1, 1, 2, 2, 0])

In [None]:
# Log-probability of each sampled action under the policy for its state.
policy.log_prob(batch_states, batch_actions)

tensor([-1.3171, -1.3293, -1.5480, -1.5429, -1.3069],
       grad_fn=<SqueezeBackward1>)

In [None]:
class Critic(nn.Module):
    """Value network: maps a state to a single scalar output.

    Two hidden layers with leaky-ReLU activations followed by a linear
    output layer of width one.

    Args:
        state_dim: size of the state vector.
        hidden: width of the two hidden layers.
    """

    def __init__(self, state_dim, hidden=100):
        super().__init__()

        # Layers are created in input-to-output order.
        self.l1 = nn.Linear(state_dim, hidden)
        self.l2 = nn.Linear(hidden, hidden)
        self.l3 = nn.Linear(hidden, 1)

    def forward(self, state):
        """Return the value output for ``state`` (last dimension has size 1)."""
        features = state
        for layer in (self.l1, self.l2):
            features = F.leaky_relu(layer(features))
        return self.l3(features)



In [None]:
# Sanity check: build a critic and inspect the output shape for a single,
# unbatched state (the first row of batch_states).
critic = Critic(env.observation_space.shape[0])
critic(batch_states[0]).shape


torch.Size([1])

In [None]:
class ActorCriticWorker(mp.Process):
    """One asynchronous advantage actor-critic (A3C) worker process.

    The worker owns its own environment and local copies of the actor and the
    critic. One update cycle is:

    1. copy the global weights into the local networks,
    2. collect up to ``max_step`` transitions (fewer if the episode ends),
    3. compute the discounted n-step returns, bootstrapping from the local
       critic when the rollout stopped before the episode ended,
    4. back-propagate the losses through the local networks and apply the
       resulting gradients to the global networks with the given optimizers.

    Args:
        env_name: id passed to ``gym.make``. The environment is expected to
            follow the classic gym API (``reset()`` returns the observation,
            ``step()`` returns ``(obs, reward, done, info)``).
        glb_critic: global critic network whose weights are read and updated.
        glb_policy: global policy network whose weights are read and updated.
        opt_crt: optimizer built over ``glb_critic``'s parameters.
        opt_pol: optimizer built over ``glb_policy``'s parameters.
        T: global step counter exposing ``.value``; incremented once per
            environment step while holding ``lock``.
        lock: context manager guarding updates of ``T``.
        gamma: discount factor.
        max_step: maximum number of environment steps per rollout (>= 1).
        T_max: total number of global steps after which the worker stops.
            ``None`` (default) performs exactly one rollout and one update.
        entropy_coef: weight of the entropy bonus in the policy loss.

    Raises:
        ValueError: if ``max_step`` is smaller than 1.
    """

    def __init__(self, env_name, glb_critic, glb_policy, opt_crt, opt_pol, T, lock,
                 gamma=0.99, max_step=100, T_max=None, entropy_coef=0.01):
        # Process.__init__ must run, otherwise start() cannot work.
        super(ActorCriticWorker, self).__init__()
        if max_step < 1:
            raise ValueError("max_step must be >= 1")

        self.env = gym.make(env_name)
        self.t = 0  # local step counter of this worker
        self.max_step = max_step
        self.T = T
        self.T_max = T_max
        self.lock = lock
        self.gamma = gamma
        self.entropy_coef = entropy_coef

        state_dim = self.env.observation_space.shape[0]
        self.actor = Policy(state_dim, self.env.action_space.n)
        self.critic = Critic(state_dim)
        # Use the networks/optimizers that were passed in, not notebook globals.
        self.global_critic = glb_critic
        self.global_policy = glb_policy
        self.opt_crt = opt_crt
        self.opt_pol = opt_pol

    def run(self):
        """Run update cycles until ``T_max`` global steps (or once if it is None)."""
        state = self.env.reset()
        while True:
            # 1. Sync local from global
            self.actor.load_state_dict(self.global_policy.state_dict())
            self.critic.load_state_dict(self.global_critic.state_dict())

            # 2. Create a rollout
            states, actions, rewards, state, done = self._collect_rollout(state)

            # 3. Calculate returns and losses, then update the global networks
            bootstrap = self._bootstrap_value(state, done)
            returns = self._discounted_returns(rewards, bootstrap, self.gamma)
            self._update_global(states, actions, returns)

            if self.T_max is None or self.T.value >= self.T_max:
                break
            if done:
                state = self.env.reset()

    def _collect_rollout(self, state):
        """Step the environment from ``state`` for at most ``max_step`` steps."""
        states, actions, rewards = [], [], []
        done = False
        t_start = self.t
        while not done and self.t - t_start < self.max_step:
            state_t = torch.as_tensor(state, dtype=torch.float32)
            # Add a batch dimension for the policy; env.step needs a plain int.
            action = int(self.actor.get_action(state_t.unsqueeze(0)).item())
            next_state, reward, done, _info = self.env.step(action)
            states.append(state_t)
            actions.append(action)
            rewards.append(float(reward))
            state = next_state
            self.t += 1
            # The global counter is shared between workers: update under the lock.
            with self.lock:
                self.T.value += 1
        return states, actions, rewards, state, done

    def _bootstrap_value(self, state, done):
        """Return 0 after a finished episode, else the critic's value of ``state``."""
        if done:
            return 0.0
        with torch.no_grad():
            obs = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
            return float(self.critic(obs).item())

    @staticmethod
    def _discounted_returns(rewards, bootstrap, gamma):
        """Return ``R_i = r_i + gamma * R_{i+1}`` for every step, with ``R_n = bootstrap``."""
        returns = []
        running = bootstrap
        for reward in reversed(rewards):
            running = reward + gamma * running
            returns.append(running)
        returns.reverse()
        return returns

    def _update_global(self, states, actions, returns):
        """Compute actor/critic losses on the rollout and step the global optimizers."""
        state_batch = torch.stack(states)
        action_batch = torch.tensor(actions, dtype=torch.int64)
        return_batch = torch.tensor(returns, dtype=torch.float32)

        values = self.critic(state_batch).squeeze(-1)
        advantage = return_batch - values
        critic_loss = advantage.pow(2).mean()

        log_probs = self.actor.log_prob(state_batch, action_batch)
        entropy = self.actor.entropy(state_batch)
        # The advantage is a constant for the actor: no gradient into the critic.
        policy_loss = (-(log_probs * advantage.detach()).mean()
                       - self.entropy_coef * entropy.mean())

        self._push_gradients(critic_loss, self.critic, self.global_critic, self.opt_crt)
        self._push_gradients(policy_loss, self.actor, self.global_policy, self.opt_pol)

    @staticmethod
    def _push_gradients(loss, local_net, global_net, optimizer):
        """Back-propagate ``loss`` locally and apply the gradients to ``global_net``."""
        local_net.zero_grad()
        loss.backward()
        for local_p, global_p in zip(local_net.parameters(), global_net.parameters()):
            global_p.grad = local_p.grad
        optimizer.step()



# T is a global counter
# Tmax is total steps overall
# t is the local counter per process
    
    


In [None]:
# Launch the worker processes.
# The global critic and policy are moved to shared memory so that every worker
# process reads and updates the same parameter tensors; SharedAdam does the
# same for the optimizer's moment estimates.

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n


global_critic = Critic(state_dim)
global_policy = Policy(state_dim, action_dim)
global_critic.share_memory()
global_policy.share_memory()

global_opt_crt = SharedAdam(global_critic.parameters())
global_opt_pol = SharedAdam(global_policy.parameters())


# Global step counter shared by all workers; `lock` guards its updates.
global_ctr = mp.Value('i', 0)
lock = mp.Lock()

# Each worker is itself an mp.Process. Arguments follow the constructor order:
# env name, global critic, global policy, critic optimizer, policy optimizer,
# global counter, lock.
pr = [
    ActorCriticWorker("LunarLander-v2", global_critic, global_policy,
                      global_opt_crt, global_opt_pol, global_ctr, lock)
    for _ in range(5)
]

for p in pr:
    p.start()


for p in pr:
    p.join()


NameError: ignored