# robot environment

- balance an inverted pendulum by moving the cart
- state   : 4 values : positions + velocities
- actions : 2 discrete : move cart left, move cart right

![image](../doc/images/cart_pole.png)
![image](../doc/images/diagrams-rl_principle.png)

# parallel environments

- on-policy agents require multiple parallel environments to reduce data correlation
- we create a list of environments

![image](../doc/images/diagrams-envs.png)

In [3]:
import os
import time

import gymnasium as gym
import numpy
import torch

'''
    multiple environments wrapper
'''
class EnvsList:
    """Hold several independent environments and step them together.

    Parameters
    ----------
    env_name : str or callable
        A string is passed to ``gym.make``; anything else is called as
        ``env_name(render_mode=render_mode)`` and must return an environment.
    n_envs : int
        Number of environments to create, must be at least 1.
    render_mode : optional
        Forwarded to the environment constructor.
    Wrapper : callable, optional
        When given, every created environment is wrapped as ``Wrapper(env)``.
    max_steps : int
        An environment is reported as done once it has taken this many steps
        since its last reset, even if the environment itself did not finish.
    """

    def __init__(self, env_name, n_envs, render_mode = None, Wrapper = None, max_steps = 1000):
        if n_envs < 1:
            raise ValueError("n_envs must be at least 1, got " + str(n_envs))

        self.max_steps = max_steps
        # steps taken by each environment since its last reset
        self.steps     = numpy.zeros(n_envs)

        self.envs = []
        for _ in range(n_envs):
            if isinstance(env_name, str):
                env = gym.make(env_name, render_mode=render_mode)
            else:
                env = env_name(render_mode=render_mode)

            if Wrapper is not None:
                env = Wrapper(env)

            self.envs.append(env)

        # all environments are built the same way, so the first one is used
        # as the reference for both spaces
        self.observation_space = self.envs[0].observation_space
        self.action_space      = self.envs[0].action_space

    def __len__(self):
        """Return the number of environments."""
        return len(self.envs)

    def step(self, actions):
        """Step every environment with its own action.

        ``actions[i]`` is passed to environment ``i``. Returns the stacked
        states, rewards and done flags, plus the list of info objects.

        An environment is flagged as done when it reports ``terminated`` or
        ``truncated``, or when it reached ``max_steps`` steps since its last
        reset. Environments are NOT reset here; the caller is expected to call
        ``reset(env_id)`` for every done environment.
        """
        states  = []
        rewards = []
        dones   = []
        infos   = []

        self.steps += 1
        for i, env in enumerate(self.envs):
            state, reward, terminated, truncated, info = env.step(actions[i])

            # truncation (e.g. a time limit) ends the episode as well, and the
            # step cap is inclusive: max_steps is the longest allowed episode
            done = bool(terminated or truncated or self.steps[i] >= self.max_steps)

            states.append(state)
            rewards.append(reward)
            dones.append(done)
            infos.append(info)

        return numpy.stack(states), numpy.stack(rewards), numpy.stack(dones), infos

    def reset_all(self):
        """Reset every environment; return stacked initial states and the infos list."""
        states = []
        infos  = []

        for env in self.envs:
            state, info = env.reset()
            states.append(state)
            infos.append(info)

        self.steps[:] = 0

        return numpy.stack(states), infos

    def reset(self, env_id):
        """Reset one environment and its step counter; return what its ``reset()`` returns."""
        self.steps[env_id] = 0
        return self.envs[env_id].reset()

    def render(self, env_id):
        """Render the environment with index ``env_id``."""
        return self.envs[env_id].render()

    def __getitem__(self, index):
        """Return the underlying environment at ``index``."""
        return self.envs[index]


In [4]:
# build the batch of parallel CartPole environments used for training
env_name = "CartPole-v1"

print("creating envs")
envs = EnvsList(env_name, n_envs=32)

# reset every environment once to obtain the initial batch of observations
states, _ = envs.reset_all()
print("states shape ", states.shape)

creating envs
states shape  (32, 4)


# PPO agent





In [5]:
  
class AgentPPO():
    """PPO agent with a clipped surrogate loss for discrete actions.

    Parameters
    ----------
    envs :
        Batch of environments. The agent uses ``len(envs)``,
        ``envs.observation_space.shape``, ``envs.action_space.n`` and
        ``envs.step(actions)`` returning ``(states, rewards, dones, infos)``.
    Model :
        Callable ``Model(state_shape, actions_count)`` returning a torch module
        whose forward pass maps a batch of states to ``(logits, values)``,
        with ``values`` of shape ``(batch, 1)``.

    The agent collects ``trajectory_steps`` steps from all environments into
    an internal buffer; once the buffer is full it computes GAE returns and
    runs ``training_epochs`` passes of mini-batch updates, then clears it.
    """

    def __init__(self, envs, Model):
        self.envs = envs

        # auto select device
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # agent hyperparameters
        self.gamma              = 0.99      # reward discount
        self.entropy_beta       = 0.001     # entropy regularisation weight
        self.eps_clip           = 0.1       # PPO ratio clipping range
        self.adv_coeff          = 1.0       # advantages scale
        self.val_coeff          = 0.5       # critic loss weight

        self.trajectory_steps   = 128
        self.batch_size         = 256

        self.training_epochs    = 4
        self.envs_count         = len(envs)
        self.learning_rate      = 0.0001

        self.state_shape    = self.envs.observation_space.shape
        self.actions_count  = self.envs.action_space.n

        # trajectory buffer, allocated on the selected device
        self._buffer_init()

        # create model
        self.model = Model(self.state_shape, self.actions_count)
        self.model.to(self.device)
        print(self.model)

        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)

    # agent main step
    def step(self, states, training_enabled = False):
        """Sample actions for ``states``, step the environments, optionally train.

        Returns ``(states_new, rewards, dones, infos)`` exactly as produced by
        ``envs.step``. With ``training_enabled`` the transition is stored, and
        when the buffer becomes full one training round is executed.
        """
        states_t = torch.tensor(states, dtype=torch.float32, device=self.device)

        # everything stored from the rollout is detached, so the forward pass
        # does not need an autograd graph
        with torch.no_grad():
            logits_t, values_t = self.model(states_t)

        # sample actions from the categorical distribution given by logits
        action_distribution_t = torch.distributions.Categorical(logits=logits_t)
        action_t              = action_distribution_t.sample()
        actions               = action_t.detach().to("cpu").numpy()

        # environment step
        states_new, rewards, dones, infos = self.envs.step(actions)

        # put into trajectory buffer
        if training_enabled:
            self._buffer_add(states_t, logits_t, values_t, actions, rewards, dones)

            # if buffer is full, run training loop and clear buffer after
            if self.buffer_ptr >= self.trajectory_steps:
                # value of the states following the last stored step, used to
                # bootstrap the returns of trajectories cut by the buffer end;
                # for finished episodes it is masked out by the done flag
                with torch.no_grad():
                    states_new_t = torch.tensor(states_new, dtype=torch.float32, device=self.device)
                    _, values_next_t = self.model(states_new_t)

                self._compute_returns(self.gamma, last_values=values_next_t.squeeze(1))
                self._train()
                self._buffer_init()

        return states_new, rewards, dones, infos

    def save(self, result_path):
        """Store model weights into ``result_path/model.pt``, creating the directory if needed."""
        if result_path:
            os.makedirs(result_path, exist_ok=True)
        torch.save(self.model.state_dict(), os.path.join(result_path, "model.pt"))

    def load(self, result_path):
        """Load model weights from ``result_path/model.pt`` onto the agent device."""
        model_file = os.path.join(result_path, "model.pt")
        self.model.load_state_dict(torch.load(model_file, map_location = self.device))

    def _train(self):
        """Run ``training_epochs`` passes of mini-batch updates over the filled buffer."""
        samples_count = self.trajectory_steps*self.envs_count
        # at least one batch, otherwise a buffer smaller than batch_size
        # would silently skip training
        batch_count = max(1, samples_count//self.batch_size)

        for _ in range(self.training_epochs):
            for _ in range(batch_count):
                # sample batch
                states, logits, actions, returns, advantages = self._sample_batch(self.batch_size)

                # compute main PPO loss
                loss_ppo = self.loss_ppo(states, logits, actions, returns, advantages)

                self.optimizer.zero_grad()
                loss_ppo.backward()

                # gradient clip for stabilising training
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=0.5)
                self.optimizer.step()

    def loss_ppo(self, states, logits, actions, returns, advantages):
        """Return the scalar PPO loss for one batch.

        ``logits`` are the policy outputs recorded when the actions were taken
        (old policy); the model is evaluated again on ``states`` to get the
        current policy and values. The result is
        ``val_coeff*critic_loss + policy_loss + entropy_loss``.
        """
        logits_new, values_new = self.model(states)

        log_probs_old = torch.nn.functional.log_softmax(logits, dim = 1).detach()

        probs_new     = torch.nn.functional.softmax(logits_new,     dim = 1)
        log_probs_new = torch.nn.functional.log_softmax(logits_new, dim = 1)

        # critic loss, as MSE : L = (T - V(s))^2
        values_new = values_new.squeeze(1)
        loss_value = ((returns.detach() - values_new)**2).mean()

        # actor loss, clipped surrogate on batch-normalised advantages
        advantages = self.adv_coeff*advantages.detach()
        advantages = (advantages - torch.mean(advantages))/(torch.std(advantages) + 1e-10)

        # log probabilities of the actions which were actually taken
        rows            = torch.arange(actions.shape[0], device=actions.device)
        log_probs_new_  = log_probs_new[rows, actions]
        log_probs_old_  = log_probs_old[rows, actions]

        ratio       = torch.exp(log_probs_new_ - log_probs_old_)
        p1          = ratio*advantages
        p2          = torch.clamp(ratio, 1.0 - self.eps_clip, 1.0 + self.eps_clip)*advantages
        loss_policy = -torch.min(p1, p2).mean()

        # entropy loss, to avoid greedy strategy
        # sum(pi*log(pi)) is the negative entropy, so minimising
        # L = beta*sum(pi(s)*log(pi(s))) = -beta*H(pi(s)) keeps entropy high
        loss_entropy = (probs_new*log_probs_new).sum(dim = 1)
        loss_entropy = self.entropy_beta*loss_entropy.mean()

        return self.val_coeff*loss_value + loss_policy + loss_entropy

    # trajectory buffer methods
    # trajectory buffer init
    def _buffer_init(self):
        """Allocate empty ``(trajectory_steps, envs_count, ...)`` buffers and rewind the write pointer."""
        steps_envs = (self.trajectory_steps, self.envs_count)

        self.states     = torch.zeros(steps_envs + tuple(self.state_shape), dtype=torch.float32, device=self.device)
        self.logits     = torch.zeros(steps_envs + (self.actions_count, ), dtype=torch.float32, device=self.device)
        self.values     = torch.zeros(steps_envs, dtype=torch.float32, device=self.device)
        self.actions    = torch.zeros(steps_envs, dtype=torch.int64, device=self.device)
        self.reward     = torch.zeros(steps_envs, dtype=torch.float32, device=self.device)
        self.dones      = torch.zeros(steps_envs, dtype=torch.float32, device=self.device)

        self.buffer_ptr = 0

    # add new items into buffer
    def _buffer_add(self, states, logits, values, actions, rewards, dones):
        """Store one step of all environments at the current write position.

        ``states``, ``logits`` and ``values`` are tensors (``values`` with a
        trailing dimension of 1); ``actions``, ``rewards`` and ``dones`` are
        numpy arrays with one entry per environment.
        """
        ptr = self.buffer_ptr

        # tensors are written directly, without a round trip through the CPU
        self.states[ptr]    = states.detach()
        self.logits[ptr]    = logits.detach()
        self.values[ptr]    = values.squeeze(1).detach()

        self.actions[ptr]   = torch.as_tensor(actions, dtype=torch.int64, device=self.device)
        self.reward[ptr]    = torch.as_tensor(rewards, dtype=torch.float32, device=self.device)
        self.dones[ptr]     = torch.as_tensor(dones, dtype=torch.float32, device=self.device)

        self.buffer_ptr = ptr + 1

    def _compute_returns(self, gamma, lam = 0.95, last_values = None):
        """Compute GAE returns and advantages, then flatten all buffers.

        ``last_values`` holds, per environment, the value estimate of the
        state following the last stored step; see ``_gae``. After this call
        every buffer has a single leading dimension of
        ``trajectory_steps*envs_count``, ready for batch sampling.
        """
        self.returns, self.advantages = self._gae(self.reward, self.values, self.dones, gamma, lam, last_values)

        # reshape buffer for faster batch sampling
        samples_count = self.trajectory_steps*self.envs_count

        self.states     = self.states.reshape((samples_count, ) + tuple(self.state_shape))
        self.logits     = self.logits.reshape((samples_count, self.actions_count))

        self.values     = self.values.reshape((samples_count, ))
        self.actions    = self.actions.reshape((samples_count, ))
        self.reward     = self.reward.reshape((samples_count, ))
        self.dones      = self.dones.reshape((samples_count, ))

        self.returns    = self.returns.reshape((samples_count, ))
        self.advantages = self.advantages.reshape((samples_count, ))

    # sample random batch from buffer
    def _sample_batch(self, batch_size):
        """Return ``batch_size`` samples drawn uniformly, with replacement, from the flattened buffer."""
        indices = torch.randint(0, self.envs_count*self.trajectory_steps, size=(batch_size, ), device=self.states.device)

        states      = self.states[indices]
        logits      = self.logits[indices]
        actions     = self.actions[indices]
        returns     = self.returns[indices]
        advantages  = self.advantages[indices]

        return states, logits, actions, returns, advantages

    # gae returns computing - more stable than basic returns computation
    def _gae(self, rewards, values, dones, gamma, lam, last_values = None):
        """Generalised advantage estimation over a ``(steps, envs)`` trajectory.

        Walks the trajectory backwards over every step, including the last
        one. ``dones[n]`` equal to 1 stops both the bootstrap and the
        accumulated advantage from leaking across an episode boundary.

        ``last_values`` is the value estimate, one per environment, of the
        state after the final step; when ``None`` it is taken as zero.
        Results are allocated like ``rewards``, so they share its device.

        Returns ``(returns, advantages)``, both shaped like ``rewards``.
        """
        buffer_size = rewards.shape[0]

        returns     = torch.zeros_like(rewards)
        advantages  = torch.zeros_like(rewards)

        if last_values is None:
            next_values = values.new_zeros(values.shape[1:])
        else:
            next_values = torch.as_tensor(last_values, dtype=values.dtype, device=values.device)

        last_gae = rewards.new_zeros(rewards.shape[1:])

        for n in reversed(range(buffer_size)):
            not_done    = 1.0 - dones[n]

            delta       = rewards[n] + gamma*next_values*not_done - values[n]
            last_gae    = delta + gamma*lam*last_gae*not_done

            returns[n]      = last_gae + values[n]
            advantages[n]   = last_gae

            next_values = values[n]

        return returns, advantages


# actor critic model

- simple fully connected model
- 128 hidden units in two layers
- two separate heads for actor and critic

![image](../doc/images/diagrams-fc_model.png)
 


In [6]:
'''
    two hidden layers FC model for actor critic architecture
'''
class ModelFC(torch.nn.Module):
    """Fully connected actor-critic network.

    Two hidden layers of ``n_hidden`` units with SiLU activations feed two
    linear heads: the actor head gives ``n_actions`` logits, the critic head
    gives a single value. ``input_shape[0]`` is used as the input size.
    """

    def __init__(self, input_shape, n_actions, n_hidden = 128):
        super().__init__()

        features_in = input_shape[0]

        # shared trunk
        self.lin0 = torch.nn.Linear(features_in, n_hidden)
        self.act0 = torch.nn.SiLU()
        self.lin1 = torch.nn.Linear(n_hidden, n_hidden)
        self.act1 = torch.nn.SiLU()

        # output heads
        self.lin_actor  = torch.nn.Linear(n_hidden, n_actions)
        self.lin_critic = torch.nn.Linear(n_hidden, 1)

        # orthogonal weights and zero biases for every layer;
        # both output heads use a lower gain than the hidden layers
        init_gains = (
            (self.lin0, 0.5),
            (self.lin1, 0.5),
            (self.lin_actor, 0.01),
            (self.lin_critic, 0.1),
        )
        for layer, gain in init_gains:
            torch.nn.init.orthogonal_(layer.weight, gain)
            torch.nn.init.zeros_(layer.bias)

    def forward(self, state):
        """Return ``(logits, value)`` for a batch of states."""
        # shared features
        hidden = self.act0(self.lin0(state))
        hidden = self.act1(self.lin1(hidden))

        # actor and critic outputs
        return self.lin_actor(hidden), self.lin_critic(hidden)
    

# main training loop

In [None]:

agent = AgentPPO(envs, ModelFC)

# per-environment statistics
n_envs          = len(envs)
episodes_count  = numpy.zeros(n_envs)   # finished episodes
rewards_sum     = numpy.zeros(n_envs)   # reward collected in the running episode
rewards_episode = numpy.zeros(n_envs)   # total reward of the last finished episode

n_steps = 100000

states, _ = envs.reset_all()
for n in range(n_steps):
    # agent main step, with training enabled
    states_new, rewards, dones, infos = agent.step(states, training_enabled=True)

    # accumulate rewards for stats
    rewards_sum += rewards

    # reset environments which finished episode
    dones_idx = numpy.flatnonzero(dones)
    for i in dones_idx:
        states_new[i], _ = envs.reset(i)

        episodes_count[i] += 1
        rewards_episode[i] = rewards_sum[i]
        rewards_sum[i]     = 0

    states = states_new.copy()

    # periodic progress log : step, mean episodes, mean and std of episode reward
    if n % 1000 == 0:
        episodes_mean = round(episodes_count.mean(), 2)
        rewards_mean  = round(rewards_episode.mean(), 3)
        rewards_std   = round(rewards_episode.std(), 3)
        print(n, episodes_mean, rewards_mean, rewards_std)


agent.save("CartPole/")

print("\n\n")
print("training done")


ModelFC(
  (lin0): Linear(in_features=4, out_features=128, bias=True)
  (act0): SiLU()
  (lin1): Linear(in_features=128, out_features=128, bias=True)
  (act1): SiLU()
  (lin_actor): Linear(in_features=128, out_features=2, bias=True)
  (lin_critic): Linear(in_features=128, out_features=1, bias=True)
)
0 0.0 0.0 0.0
1000 43.59 20.062 10.04
2000 87.19 21.156 7.754
3000 131.97 23.75 13.679
4000 175.53 21.5 11.264
5000 212.03 33.75 15.067
6000 236.88 54.781 36.038
7000 254.59 65.531 33.213
8000 267.47 89.719 44.32
9000 277.69 106.406 52.544
10000 284.69 154.531 105.741
11000 289.91 279.219 229.223
12000 295.62 195.75 142.813


# inference

In [None]:
# single rendered environment for watching the trained agent
envs = EnvsList(env_name, 1, render_mode="human")

# fresh agent with the trained weights loaded
agent = AgentPPO(envs, ModelFC)
agent.load("CartPole/")

n_steps = 1000000

states, _ = envs.reset_all()
for n in range(n_steps):
    # agent main step, training stays disabled
    states_new, rewards, dones, infos = agent.step(states)

    # reset environments which finished episode
    dones_idx = numpy.flatnonzero(dones)
    for i in dones_idx:
        states_new[i], _ = envs.reset(i)

    states = states_new.copy()


ModelFC(
  (lin0): Linear(in_features=4, out_features=128, bias=True)
  (act0): SiLU()
  (lin1): Linear(in_features=128, out_features=128, bias=True)
  (act1): SiLU()
  (lin_actor): Linear(in_features=128, out_features=2, bias=True)
  (lin_critic): Linear(in_features=128, out_features=1, bias=True)
)


