<a href="https://colab.research.google.com/github/Nguyencongdat1997/RL.TryOut/blob/developments-ppo/Simple_PPO_single_processing%2C_discrete_action_space%2C_decoupled_actor_critic_networks%2C_no_entropy_bonus_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Environment

In [23]:
import gym
import random

In [20]:
# Build the CartPole environment and cache the sizes the rest of the notebook relies on.
env = gym.make('CartPole-v0')
observation_shape = env.observation_space.shape[0]  # length of the observation vector
n_actions = env.action_space.n                      # number of discrete actions
# Fixed: the original iterated over the undefined name `actions`, which raises
# NameError on a fresh kernel; the action ids are 0 .. n_actions-1.
action_space = list(range(n_actions))

In [21]:
# Sanity check: show the space sizes and one sampled action, then reset the
# environment and take a single step with that action.
sample_action = env.action_space.sample()
print(n_actions, sample_action, observation_shape, sep='\n')

state = env.reset()
print(state)

state, reward, done, info = env.step(sample_action)
print(*(state, reward, done, info))

2
1
4
[-0.04132161 -0.02188737  0.0433473  -0.0051939 ]
[-0.04175936  0.17258698  0.04324342 -0.28389129] 1.0 False {}


In [24]:
# Baseline: play a handful of episodes with uniformly random actions and
# print the total reward collected in each one.
episodes = 5
for episode in range(1, episodes+1):
    state = env.reset()
    score = 0
    done = False

    while True:
        # env.render()
        action = random.choice(action_space)
        n_state, reward, done, info = env.step(action)
        score = score + reward
        if done:
            break
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:1 Score:21.0
Episode:2 Score:12.0
Episode:3 Score:41.0
Episode:4 Score:22.0
Episode:5 Score:21.0


# PPO (PyTorch)

## Import

In [14]:
import os
import numpy as np
import torch as T
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical

## Experience Replay

In [37]:
class ReplayBuffer():
    """Append-only store for the transitions gathered between two policy updates.

    Each field lives in its own Python list; position ``i`` of every list
    belongs to the ``i``-th stored step.
    """

    def __init__(self, max_size, input_shape):
        # `max_size` and `input_shape` are accepted but not used by this class.
        self.clear_memory()

    def store_step(self, state, action, probs, value, reward, next_state, done):
        """Append one step to the buffer; `next_state` is accepted but not stored."""
        columns_and_items = (
            (self.states, state),
            (self.actions, action),
            (self.probs, probs),
            (self.vals, value),
            (self.rewards, reward),
            (self.dones, done),
        )
        for column, item in columns_and_items:
            column.append(item)

    def sample_buffer(self, batch_size):
        """Return every stored field as a NumPy array plus shuffled index batches.

        The result is the tuple ``(states, actions, probs, vals, rewards,
        dones, batches)``. ``batches`` is a list of index arrays holding at
        most `batch_size` entries each; together they form a random
        permutation of all stored step indices.
        """
        total = len(self.states)
        starts = np.arange(0, total, batch_size)
        order = np.arange(total, dtype=np.int64)
        np.random.shuffle(order)
        batches = [order[start:start+batch_size] for start in starts]

        stored = (self.states, self.actions, self.probs, self.vals, self.rewards, self.dones)
        return tuple(np.array(column) for column in stored) + (batches,)

    def clear_memory(self):
        """Drop every stored step by rebinding each field to a new empty list."""
        self.states, self.probs, self.vals = [], [], []
        self.actions, self.rewards, self.dones = [], [], []
    

## Network

In [94]:
class Actor(nn.Module):
    """Policy network: maps observations to a categorical distribution over actions.

    The network is a two-hidden-layer MLP (256 units each, ReLU) followed by a
    softmax. The module owns its Adam optimizer and moves itself to CUDA when
    it is available, otherwise to the CPU.
    """

    def __init__(self, n_actions, input_dims, learning_rate):
        """
        Args:
            n_actions: number of discrete actions (size of the output layer).
            input_dims: iterable unpacked into the first `nn.Linear`, e.g. ``(4,)``.
            learning_rate: learning rate passed to Adam.
        """
        super(Actor, self).__init__()
        fc1_dims = 256
        fc2_dims = 256

        self.actor = nn.Sequential(
            nn.Linear(*input_dims, fc1_dims),
            nn.ReLU(),
            nn.Linear(fc1_dims, fc2_dims),
            nn.ReLU(),
            nn.Linear(fc2_dims, n_actions),
            nn.Softmax(dim= -1),
        )
        self.optimizer = optim.Adam(self.parameters(), lr = learning_rate)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, state):
        """Return a `Categorical` built from the softmax action probabilities."""
        return Categorical(self.actor(state))

    def save_model(self, checkpoint_dir='./trained_models/ppo/torch'):
        """Write the state dict to ``<checkpoint_dir>/actor/m``."""
        file_name = checkpoint_dir + '/actor/m'
        # Fixed: create the target folder first; T.save fails when the parent
        # directory does not exist (e.g. on a fresh runtime).
        os.makedirs(os.path.dirname(file_name), exist_ok=True)
        T.save(self.state_dict(), file_name)

    def load_model(self, checkpoint_dir='./trained_models/ppo/torch'):
        """Load the state dict from ``<checkpoint_dir>/actor/m``."""
        file_name = checkpoint_dir + '/actor/m'
        # map_location lets a checkpoint written on GPU be restored on a CPU-only machine.
        self.load_state_dict(T.load(file_name, map_location=self.device))

In [95]:
class Critic(nn.Module):
    """Value network: maps observations to a single scalar value estimate.

    The network is a two-hidden-layer MLP (256 units each, ReLU) with one
    linear output. The module owns its Adam optimizer and moves itself to CUDA
    when it is available, otherwise to the CPU.
    """

    def __init__(self, input_dims, learning_rate):
        """
        Args:
            input_dims: iterable unpacked into the first `nn.Linear`, e.g. ``(4,)``.
            learning_rate: learning rate passed to Adam.
        """
        super(Critic, self).__init__()
        fc1_dims = 256
        fc2_dims = 256

        self.critic = nn.Sequential(
            nn.Linear(*input_dims, fc1_dims),
            nn.ReLU(),
            nn.Linear(fc1_dims, fc2_dims),
            nn.ReLU(),
            nn.Linear(fc2_dims, 1)
        )
        self.optimizer = optim.Adam(self.parameters(), lr = learning_rate)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, state):
        """Return the value estimate; the last dimension of the output has size 1."""
        return self.critic(state)

    def save_model(self, checkpoint_dir='./trained_models/ppo/torch'):
        """Write the state dict to ``<checkpoint_dir>/critic/m``."""
        file_name = checkpoint_dir + '/critic/m'
        # Fixed: create the target folder first; T.save fails when the parent
        # directory does not exist (e.g. on a fresh runtime).
        os.makedirs(os.path.dirname(file_name), exist_ok=True)
        T.save(self.state_dict(), file_name)

    def load_model(self, checkpoint_dir='./trained_models/ppo/torch'):
        """Load the state dict from ``<checkpoint_dir>/critic/m``."""
        file_name = checkpoint_dir + '/critic/m'
        # map_location lets a checkpoint written on GPU be restored on a CPU-only machine.
        self.load_state_dict(T.load(file_name, map_location=self.device))

## Agent

In [104]:
class Agent:
    """PPO agent with separate actor and critic networks (discrete actions, no entropy bonus).

    Transitions are accumulated through `store_step`. `learn` then runs
    `learn_epochs` passes of clipped-surrogate updates over everything stored
    and empties the buffer afterwards.
    """

    def __init__(self, n_actions, input_dims, gamma=0.99, actor_learning_rate=0.0003, critic_learning_rate=0.0003,
               policy_clip=0.2, learn_batch_size=64, learn_epochs=10, gae_lambda=0.95, mem_size=100000):
        """
        Args:
            n_actions: number of discrete actions.
            input_dims: observation shape tuple, forwarded to the networks and the buffer.
            gamma: reward discount factor.
            actor_learning_rate / critic_learning_rate: Adam learning rates.
            policy_clip: epsilon of the clipped probability ratio.
            learn_batch_size: mini-batch size used inside `learn`.
            learn_epochs: number of passes over the stored steps per `learn` call.
            gae_lambda: lambda of the generalized advantage estimator.
            mem_size: forwarded to `ReplayBuffer`.
        """
        self.gamma = gamma
        self.policy_clip = policy_clip
        self.learn_epochs = learn_epochs
        self.learn_batch_size = learn_batch_size
        self.gae_lambda= gae_lambda

        self.actor = Actor(n_actions, input_dims, actor_learning_rate)
        self.critic = Critic(input_dims, critic_learning_rate)
        self.memory = ReplayBuffer(mem_size, input_dims)

    def store_step(self, state, action, probs, value, reward, next_state, done):
        """Forward one transition to the buffer."""
        self.memory.store_step(state, action, probs, value, reward, next_state, done)

    def save_models(self, checkpoint_dir='./trained_models/ppo/torch'):
        """Save actor and critic under `checkpoint_dir`."""
        self.actor.save_model(checkpoint_dir)
        self.critic.save_model(checkpoint_dir)

    def load_models(self, checkpoint_dir='./trained_models/ppo/torch'):
        """Load actor and critic from `checkpoint_dir`."""
        self.actor.load_model(checkpoint_dir)
        self.critic.load_model(checkpoint_dir)

    def choose_action(self, observation):
        """Sample an action for a single observation.

        Returns:
            ``(action, log_prob, value)`` as plain Python numbers. The second
            element is the log-probability of the sampled action, which is
            what `learn` expects as the stored "probs".
        """
        state = T.as_tensor(np.asarray(observation), dtype=T.float).unsqueeze(0).to(self.actor.device)

        # Only plain numbers leave this method, so no autograd graph is needed.
        with T.no_grad():
            dist = self.actor(state)
            value = self.critic(state)
            action = dist.sample()
            log_prob = dist.log_prob(action)

        return action.item(), log_prob.item(), value.item()

    def _compute_advantages(self, rewards, values, dones):
        """Generalized advantage estimates for the stored steps, in one backward pass.

        Uses ``A_t = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}`` with
        ``delta_t = r_t + gamma * V_{t+1} * (1 - done_t) - V_t``. The
        ``(1 - done_t)`` factor on the running sum stops residuals of a later
        episode from leaking into an earlier one, which the original nested
        loop did not do. The final stored step has no stored successor value:
        its advantage stays 0 unless it is terminal, in which case
        ``delta = r - V`` needs no bootstrap.
        """
        n_steps = len(rewards)
        advantages = np.zeros(n_steps, dtype=np.float32)
        running = 0.0
        for t in reversed(range(n_steps)):
            not_done = 1.0 - float(dones[t])
            if t == n_steps - 1:
                if not_done:
                    continue  # successor value unknown, keep advantage at 0
                next_value = 0.0
            else:
                next_value = values[t+1]
            delta = rewards[t] + self.gamma*next_value*not_done - values[t]
            running = delta + self.gamma*self.gae_lambda*not_done*running
            advantages[t] = running
        return advantages

    def _rollout_tensors(self, states, actions, old_probs, values, rewards, dones):
        """Convert the buffer arrays, plus the computed advantages, into tensors on the actor's device."""
        device = self.actor.device
        advantages = self._compute_advantages(rewards, values, dones)
        return (
            T.as_tensor(states, dtype=T.float, device=device),
            T.as_tensor(actions, dtype=T.long, device=device),
            T.as_tensor(old_probs, dtype=T.float, device=device),
            T.as_tensor(values, dtype=T.float, device=device),
            T.as_tensor(advantages, dtype=T.float, device=device),
        )

    def learn(self):
        """Run `learn_epochs` passes of PPO updates over the buffer, then clear it."""
        device = self.actor.device
        rollout = None
        for _ in range(self.learn_epochs):
            # The buffer is sampled every epoch to get freshly shuffled batches.
            *arrays, batches = self.memory.sample_buffer(self.learn_batch_size)
            if rollout is None:
                # The stored data does not change between epochs, so the
                # advantages and tensors are built once.
                rollout = self._rollout_tensors(*arrays)
            states, actions, old_probs, values, advantages = rollout

            for batch in batches:
                # Fixed: index with the current `batch` (the original used the
                # whole `batches` list) and keep the full-rollout tensors
                # intact so later mini-batches still index the original data.
                idx = T.as_tensor(batch, dtype=T.long, device=device)
                batch_states = states[idx]
                batch_advantages = advantages[idx]

                dist = self.actor(batch_states)
                # squeeze(-1) keeps a 1-D result even for a mini-batch of size one.
                predicted_values = self.critic(batch_states).squeeze(-1)

                new_probs = dist.log_prob(actions[idx])
                prob_ratio = (new_probs - old_probs[idx]).exp()  # ratio of new to old action probability
                clipped_ratio = T.clamp(prob_ratio, 1-self.policy_clip, 1+self.policy_clip)
                actor_loss = -T.min(
                    prob_ratio*batch_advantages,
                    clipped_ratio*batch_advantages
                ).mean()

                returns = batch_advantages + values[idx]
                critic_loss = ((returns - predicted_values)**2).mean()

                total_loss = actor_loss + 0.5*critic_loss

                self.actor.optimizer.zero_grad()
                self.critic.optimizer.zero_grad()
                total_loss.backward()
                self.actor.optimizer.step()
                self.critic.optimizer.step()
        self.memory.clear_memory()

## Training

In [106]:
# Train for `n_games` episodes. A policy update runs every `break_to_learn`
# environment steps, counted across episode boundaries.
n_games = 300
agent = Agent(n_actions=env.action_space.n, input_dims=env.observation_space.shape, 
              learn_batch_size=64, actor_learning_rate=0.0003, critic_learning_rate=0.0003, learn_epochs=10)
break_to_learn = 20  # number of collected steps between two calls to agent.learn()

avg_score = 0
time_steps = 0
learn_steps = 0
best_score = env.reward_range[0]  # lower bound of the reward range, so the first average always beats it
score_history = []
for i in range(n_games):
    observation = env.reset()
    done = False
    score = 0
    while not done:
        action, prob, value = agent.choose_action(observation)
        next_observation, reward, done, info = env.step(action)
        time_steps += 1
        score += reward
        agent.store_step(observation, action, prob, value, reward, next_observation, done)
        # Fixed off-by-one: `time_steps` has already been incremented above, so
        # the original `(time_steps+1) % break_to_learn` ran the first update
        # after only 19 steps instead of `break_to_learn`.
        if time_steps % break_to_learn == 0:
            agent.learn()
            learn_steps += 1
        observation = next_observation

    score_history.append(score)
    avg_score = np.mean(score_history[-10:])  # moving average over the last 10 episodes
    if avg_score > best_score:
        # Checkpoint whenever the moving average reaches a new best.
        best_score = avg_score
        agent.save_models('./trained_models')

    print('episode', i, 'time_steps', time_steps, 'learning_steps', learn_steps, 'score %.1f' % score, 'avg score %.1f' % avg_score)

episode 0 time_steps 16 learning_steps 0 score 16.0 avg score 16.0
episode 1 time_steps 33 learning_steps 1 score 17.0 avg score 16.5
episode 2 time_steps 56 learning_steps 2 score 23.0 avg score 18.7




episode 3 time_steps 103 learning_steps 5 score 47.0 avg score 25.8
episode 4 time_steps 146 learning_steps 7 score 43.0 avg score 29.2
episode 5 time_steps 224 learning_steps 11 score 78.0 avg score 37.3
episode 6 time_steps 242 learning_steps 12 score 18.0 avg score 34.6
episode 7 time_steps 296 learning_steps 14 score 54.0 avg score 37.0
episode 8 time_steps 317 learning_steps 15 score 21.0 avg score 35.2
episode 9 time_steps 330 learning_steps 16 score 13.0 avg score 33.0
episode 10 time_steps 349 learning_steps 17 score 19.0 avg score 33.3
episode 11 time_steps 374 learning_steps 18 score 25.0 avg score 34.1
episode 12 time_steps 388 learning_steps 19 score 14.0 avg score 33.2
episode 13 time_steps 473 learning_steps 23 score 85.0 avg score 37.0
episode 14 time_steps 504 learning_steps 25 score 31.0 avg score 35.8
episode 15 time_steps 589 learning_steps 29 score 85.0 avg score 36.5
episode 16 time_steps 658 learning_steps 32 score 69.0 avg score 41.6
episode 17 time_steps 692 lea

## Testing

In [107]:
# Evaluate the trained agent: play a few episodes, sampling actions from the
# current policy, and print the total reward of each episode.
episodes = 5
for episode in range(episodes):
    state = env.reset()
    score = 0
    done = False
    while not done:
        action, _, _ = agent.choose_action(state)
        state, reward, done, info = env.step(action)
        #env.render()
        score = score + reward
    print('Episode: {} score: {}'.format(episode, score))

Episode: 0 score: 70.0
Episode: 1 score: 124.0
Episode: 2 score: 75.0
Episode: 3 score: 128.0
Episode: 4 score: 122.0


In [None]:
# Scratch check: build a fresh (untrained) agent and print the action it
# samples for three consecutive states. The environment is always advanced
# with action 0, not with the sampled action.
agent = Agent(
    n_actions=env.action_space.n,
    input_dims=env.observation_space.shape,
    actor_learning_rate=0.0003,
    critic_learning_rate=0.0003,
    learn_batch_size=64,
    learn_epochs=10,
)

state = env.reset()
action, prob, _ = agent.choose_action(state)
print(state, action, end='\n\n')

state, reward, done, info = env.step(0)
action, prob, _ = agent.choose_action(state)
print(state, action, end='\n\n')

state, reward, done, info = env.step(0)
action, prob, _ = agent.choose_action(state)
print(state, action, end='\n\n')

[ 0.01823638 -0.00567572 -0.03260454  0.01748912] 0

[ 0.01812287 -0.20031529 -0.03225476  0.29970926] 1

[ 0.01411656 -0.39496299 -0.02626057  0.58204769] 0



# PPO (TensorFlow) — TODO: not implemented yet; the sections below are placeholders

## Import

## Experience Replay

## Network

## Agent

## Training

## Testing

In [None]:
#