<a href="https://colab.research.google.com/github/Nguyencongdat1997/RL.TryOut/blob/developments-ppo/Simple_DQN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installation & Imports

In [None]:
!pip install tensorflow==2.3.1 gym keras-rl2 gym[atari]

  Found existing installation: h5py 3.1.0
    Uninstalling h5py-3.1.0:
      Successfully uninstalled h5py-3.1.0
  Found existing installation: tensorflow-estimator 2.5.0
    Uninstalling tensorflow-estimator-2.5.0:
      Successfully uninstalled tensorflow-estimator-2.5.0
  Found existing installation: gast 0.4.0
    Uninstalling gast-0.4.0:
      Successfully uninstalled gast-0.4.0
  Found existing installation: tensorflow 2.5.0
    Uninstalling tensorflow-2.5.0:
      Successfully uninstalled tensorflow-2.5.0
Successfully installed gast-0.3.3 h5py-2.10.0 keras-rl2-1.0.5 numpy-1.18.5 tensorflow-2.3.1 tensorflow-estimator-2.3.0


In [1]:
import gym 
import random
import time

import numpy as np
import tensorflow as tf
from tensorflow import keras 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Convolution2D
from tensorflow.keras.optimizers import Adam

# Environment

In [2]:
# Shared CartPole environment used by every section of this notebook.
env = gym.make('CartPole-v0')
# Length of the (1-D) observation vector and number of discrete actions.
observations = env.observation_space.shape[0]
actions = env.action_space.n
# Explicit list of action ids, used below for uniform random sampling.
action_space = list(range(actions))

In [3]:
# Probe the environment API: draw one random action, reset, then take one step.
sample_action = env.action_space.sample()
print(actions, sample_action, observations, sep='\n')
state = env.reset()
print(state)
# step() returns the 4-tuple (observation, reward, done, info).
state, reward, done, info = env.step(sample_action)
print(state, reward, done, info)

2
1
4
[0.02315373 0.04200671 0.00834748 0.013585  ]
[ 0.02399386  0.23700796  0.00861918 -0.27645256] 1.0 False {}


In [4]:
# Baseline: play a few episodes with uniformly random actions and report the return.
episodes = 5
for episode in range(1, episodes+1):
    state = env.reset()
    score = 0

    while True:
        # env.render()
        action = random.choice(action_space)
        n_state, reward, done, info = env.step(action)
        score += reward
        if done:
            break
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:1 Score:25.0
Episode:2 Score:23.0
Episode:3 Score:34.0
Episode:4 Score:11.0
Episode:5 Score:13.0


# KerasRL's DQN

## Import

In [5]:
from rl.agents import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import LinearAnnealedPolicy, EpsGreedyQPolicy, BoltzmannQPolicy

ModuleNotFoundError: ignored

## Model

In [None]:
def build_model(observations, actions):
    """Build the fully connected Q-network handed to the keras-rl DQN agent.

    Args:
        observations: size of the flat observation vector (int), or the full
            observation shape as a tuple/list.
        actions: number of discrete actions; sets the width of the linear
            output layer (one value per action).

    Returns:
        An uncompiled ``Sequential`` model.
    """
    # Use the argument rather than the notebook-global `env`, so the function
    # works for whatever observation size the caller passes in.
    if isinstance(observations, (tuple, list)):
        obs_shape = tuple(observations)
    else:
        obs_shape = (observations,)

    model = Sequential()
    # The leading 1 matches window_length=1 of the SequentialMemory in build_agent.
    model.add(Flatten(input_shape=(1,) + obs_shape))
    model.add(Dense(24, activation='tanh'))
    model.add(Dense(48, activation='tanh'))
    model.add(Dense(actions, activation='linear'))
    return model

In [None]:
model = build_model(observations, actions)

In [None]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 4)                 0         
_________________________________________________________________
dense (Dense)                (None, 24)                120       
_________________________________________________________________
dense_1 (Dense)              (None, 48)                1200      
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 98        
Total params: 1,418
Trainable params: 1,418
Non-trainable params: 0
_________________________________________________________________


## DQN

In [None]:
def build_agent(model, actions):
    """Wrap `model` in a compiled keras-rl DQNAgent.

    Args:
        model: Keras model producing one output per action.
        actions: number of discrete actions (passed as ``nb_actions``).

    Returns:
        A compiled ``DQNAgent`` using a 50000-step ``SequentialMemory`` and a
        ``BoltzmannQPolicy``.
    """
    memory = SequentialMemory(limit=50000, window_length=1)
    policy = BoltzmannQPolicy()
    dqn = DQNAgent(model=model, nb_actions=actions, memory=memory, nb_steps_warmup=2000,
                   target_model_update=1e-2, policy=policy)
    # `learning_rate` is the current keyword (`lr` is only a backward-compat alias);
    # it also matches the Adam(...) calls in the from-scratch agents below.
    dqn.compile(Adam(learning_rate=0.01, decay=0.01), metrics=['mse'])
    return dqn

In [None]:
dqn = build_agent(model, actions)

In [None]:
dqn.fit(env, nb_steps=10000, visualize=False, verbose=1)

Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 76.169 seconds


<tensorflow.python.keras.callbacks.History at 0x1b85b1d8388>

In [None]:
dqn.save_weights('./trained_models/CartPole/KeraRL/model_10000')

## Test

In [None]:
dqn.load_weights('./trained_models/CartPole/KeraRL/model_10000')

In [None]:
scores = dqn.test(env, nb_episodes=5, visualize=True)
print(np.mean(scores.history['episode_reward']))

Testing for 5 episodes ...
Episode 1: reward: 200.000, steps: 200
Episode 2: reward: 200.000, steps: 200
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 200.000, steps: 200
Episode 5: reward: 200.000, steps: 200
200.0


In [None]:
# Manual rollout of the keras-rl agent: feed each observation to dqn.forward()
# and step the environment until the episode reports done.
episodes = 5
for episode in range(episodes):
    state = env.reset()
    score = 0
    done = False
    while not done:
        action = dqn.forward(state)
        state, reward, done, info = env.step(action)
        env.render()
        score += reward
    print('Episode: {} score: {}'.format(episode, score))

Episode: 0 score: 200.0
Episode: 1 score: 200.0
Episode: 2 score: 200.0
Episode: 3 score: 200.0
Episode: 4 score: 200.0


# Stable Baselines3

## Import

In [None]:
from stable_baselines3.common.cmd_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack, DummyVecEnv
from stable_baselines3 import PPO, DQN, A2C
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.evaluation import evaluate_policy
import numpy as np 
import os## Import

## Callback

In [None]:
class SavingBestTrainingRewardCallback(BaseCallback):
    """Periodic checkpoint callback.

    Despite the name, no reward is inspected: the model is saved
    unconditionally every `check_freq` callback calls.

    Args:
        check_freq: save whenever `n_calls` is a multiple of this value.
        save_path: directory that receives the `model_<n_calls>` checkpoints.
        verbose: forwarded to `BaseCallback`.
    """
    def __init__(self, check_freq:int, save_path: str, verbose=1):
        super().__init__(verbose)
        self.save_path = save_path
        self.check_freq = check_freq

    def _init_callback(self):
        # Create the checkpoint directory up front (skipped for an empty/None path).
        if not self.save_path:
            return
        os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        # Always return True so training is never stopped by this callback.
        if self.n_calls % self.check_freq != 0:
            return True
        checkpoint_name = 'model_{}'.format(self.n_calls)
        self.model.save(os.path.join(self.save_path, checkpoint_name))
        return True

In [None]:
CHECKPOINT_DIR = './trained_models/CartPole/StableBaselines/'
LOG_DIR = './logs/CartPole/StableBaselines/'
callback = SavingBestTrainingRewardCallback(check_freq=1000, save_path=CHECKPOINT_DIR)

## Train

In [None]:
agent = A2C('MlpPolicy', env, verbose=0, tensorboard_log=LOG_DIR)
# agent = DQN('MlpPolicy', env, verbose=0, tensorboard_log=LOG_DIR)
#agent = ACER('CnnPolicy', env, verbose=1, tensorboard_log=LOG_DIR)
#agent = PPO2('CnnPolicy', env, minibaches=2, verbose=1, tensorboard_log=LOG_DIR)
#agent = DQN('CnnPolicy', env, verbose=1, tensorboard_log=LOG_DIR)

In [None]:
# trained_agent = A2C.load('./train/model_10000', env=env, tensorboard_log=LOG_DIR)

In [None]:
agent.learn(total_timesteps= 20000, callback= callback)

<stable_baselines3.a2c.a2c.A2C at 0x1bb5fae4908>

## Test

In [None]:
agent = A2C.load(CHECKPOINT_DIR + '/model_20000', env=env)

In [None]:
evaluate_policy(agent, env, n_eval_episodes=10, render=True)

(200.0, 0.0)

In [None]:
# Manual rollout of the Stable Baselines agent.
episodes = 5
for episode in range(episodes):
    # The reset observation must be bound to `obs`, the name predict() reads.
    # Binding it to `state` made the first prediction of every episode use the
    # previous episode's last observation (or raise NameError on a fresh kernel).
    obs = env.reset()
    score = 0
    while True:
        action, states = agent.predict(obs)
        obs, reward, done, info = env.step(action)
        env.render()
        score+=reward
        if done:
            break
    print('Episode: {} score: {}'.format(episode, score))

Episode: 0 score: 197.0
Episode: 1 score: 37.0
Episode: 2 score: 191.0
Episode: 3 score: 118.0
Episode: 4 score: 199.0


# From scratch - Double Dueling Deep Q - Keras

## Replay Buffer

In [28]:
class ReplayBuffer():
  """Fixed-size circular store of (state, action, reward, next_state, done) transitions.

  All arrays are pre-allocated with `max_size` rows; once full, the oldest
  rows are overwritten.
  """
  def __init__(self, max_size, input_shape):
    """
    Args:
      max_size: number of transitions the buffer can hold.
      input_shape: shape (tuple) of a single observation.
    """
    self.mem_size = max_size
    # Total number of transitions ever stored (keeps growing past mem_size).
    self.mem_counter = 0

    self.states = np.zeros((self.mem_size, *input_shape), dtype=np.float64)
    self.next_states = np.zeros((self.mem_size, *input_shape), dtype=np.float64)
    self.rewards = np.zeros(self.mem_size, dtype=np.float64)
    self.actions = np.zeros(self.mem_size, dtype=np.int32)
    # np.bool_ instead of the deprecated `np.bool` alias, which newer NumPy no longer provides.
    self.done = np.zeros(self.mem_size, dtype=np.bool_)

  def store_step(self, state, action, reward, next_state, done):
    """Write one transition into the next slot, wrapping around when full."""
    index = self.mem_counter % self.mem_size
    self.states[index] = state
    self.next_states[index] = next_state
    self.actions[index] = action
    self.rewards[index] = reward
    self.done[index] = done
    self.mem_counter += 1

  def sample_buffer(self, batch_size):
    """Return `batch_size` distinct stored transitions chosen uniformly at random.

    Returns:
      Tuple (states, actions, rewards, next_states, done) of arrays with
      `batch_size` rows each.

    Raises:
      ValueError: from np.random.choice when `batch_size` exceeds the number
        of stored transitions (sampling is without replacement).
    """
    # Only the filled part of the buffer is eligible for sampling.
    max_mem = min(self.mem_counter, self.mem_size)
    batch = np.random.choice(max_mem, batch_size, replace=False)

    states = self.states[batch]
    next_states = self.next_states[batch]
    rewards = self.rewards[batch]
    actions = self.actions[batch]
    done = self.done[batch]

    return states, actions, rewards, next_states, done
    

## Q Network

In [29]:
class DuelingDeepQNetwork(keras.Model):
  """Dueling Q-network: two shared ReLU layers feeding a value head and an advantage head."""
  def __init__(self, n_actions):
    super().__init__()

    hidden_units = 128
    self.dense1 = keras.layers.Dense(hidden_units, activation='relu')
    self.dense2 = keras.layers.Dense(hidden_units, activation='relu')
    # V: one scalar per state; A: one value per action.
    self.V = keras.layers.Dense(1, activation=None)
    self.A = keras.layers.Dense(n_actions, activation=None)

  def _shared_features(self, state):
    """Run the two hidden layers common to both heads."""
    return self.dense2(self.dense1(state))

  def call(self, state):
    """Return Q = V + (A - mean of A over the action axis)."""
    features = self._shared_features(state)
    value = self.V(features)
    advantages = self.A(features)

    centered = advantages - tf.math.reduce_mean(advantages, axis=1, keepdims=True)
    return (value + centered)

  def advantage(self, state):
    """Return only the advantage head's output for `state`."""
    return self.A(self._shared_features(state))


## Agent

In [30]:
class Agent():
  """Epsilon-greedy double-DQN agent built on two DuelingDeepQNetwork copies.

  `q_active` is trained on every learning step; `q_frozen` is overwritten with
  its weights every `replace` learning steps and supplies the bootstrap values.
  """
  def __init__(self, lr, gamma, n_actions, epsilon, batch_size, input_dims, epsilon_dec=1e-3, epsilon_end=0.01, mem_size=1000000, replace=100):
    """
    Args:
      lr: Adam learning rate for both networks.
      gamma: discount factor applied to the bootstrapped next-state value.
      n_actions: number of discrete actions.
      epsilon: initial probability of taking a random action.
      batch_size: number of transitions sampled per learning step.
      input_dims: shape (tuple) of one observation.
      epsilon_dec: amount subtracted from epsilon after each learning step.
      epsilon_end: lower bound for epsilon.
      mem_size: capacity of the replay buffer.
      replace: sync q_frozen from q_active every this many learning steps.
    """
    self.action_space = [i for i in range(n_actions)]
    self.gamma =gamma
    self.epsilon = epsilon
    self.epsilon_dec = epsilon_dec
    self.epsilon_end = epsilon_end
    self.replace = replace
    self.batch_size = batch_size

    self.learned_step_counter = 0
    self.memory = ReplayBuffer(mem_size, input_dims)
    self.q_active =  DuelingDeepQNetwork(n_actions)
    self.q_frozen =  DuelingDeepQNetwork(n_actions)

    self.q_active.compile(optimizer=Adam(learning_rate=lr), loss='mean_squared_error')
    self.q_frozen.compile(optimizer=Adam(learning_rate=lr), loss='mean_squared_error')

  def store_step(self, state, action, reward, next_state, done):
    """Append one transition to the replay buffer."""
    self.memory.store_step(state, action, reward, next_state, done)

  def choose_action(self, observation):
    """Pick a random action with probability epsilon, otherwise the greedy one."""
    if np.random.random() < self.epsilon:
      action = np.random.choice(self.action_space)
    else:
      state = np.array([observation])
      # Q = V + (A - mean(A)), so the argmax over advantages is also the argmax over Q.
      actions = self.q_active.advantage(state)
      action = tf.math.argmax(actions, axis=1).numpy()[0]
    return action

  def learn(self):
    """Run one double-DQN update on a sampled mini-batch.

    Does nothing until the buffer holds at least `batch_size` transitions.
    Afterwards epsilon is decayed by `epsilon_dec` (floored at `epsilon_end`)
    and `learned_step_counter` is incremented.
    """
    if self.memory.mem_counter < self.batch_size:
      return

    if self.learned_step_counter % self.replace == 0:
      self.q_frozen.set_weights(self.q_active.get_weights())

    states, actions, rewards, next_states, dones = self.memory.sample_buffer(self.batch_size)

    # Start from the current predictions so only the taken action's entry
    # contributes to the MSE loss.
    q_target = self.q_active(states).numpy()
    q_next = self.q_frozen(next_states).numpy()
    # Double DQN: the active network selects the next action, the frozen one scores it.
    best_next_actions = np.argmax(self.q_active(next_states).numpy(), axis=1)

    # Vectorized over the batch (replaces a per-sample Python loop).
    rows = np.arange(len(actions))
    not_done = 1.0 - dones.astype(np.float64)  # terminal transitions do not bootstrap
    q_target[rows, actions] = rewards + self.gamma * q_next[rows, best_next_actions] * not_done

    self.q_active.train_on_batch(states, q_target)

    self.epsilon = max(self.epsilon - self.epsilon_dec, self.epsilon_end)
    self.learned_step_counter += 1

  def train(self, env, n_games):
    """Play `n_games` episodes on `env`, learning after every environment step.

    Prints one line per episode with the cumulative step count, the episode
    score and the mean score of the last 10 episodes.
    """
    scores = []
    steps = 0
    for i in range(n_games):
      done = False
      score = 0
      observation = env.reset()
      while not done:
        steps += 1
        action = self.choose_action(observation)
        next_observation, reward, done, info = env.step(action)
        score += reward
        self.store_step(observation, action, reward, next_observation, done)
        observation = next_observation
        self.learn()
      scores.append(score)
      avg_score = np.mean(scores[-10:])
      print('Episode', i, '- trained steps', steps, '- score %.1f'%score, '- avg_score %.1f ' % avg_score)

  def save_model(self, train_dir):
    """Save q_active's weights under `<train_dir>/d3qn_<learned_step_counter>/model`."""
    file_name = train_dir + '/d3qn_' + str(self.learned_step_counter) + '/model'
    self.q_active.save_weights(file_name, save_format='tf')

  def load_model(self, train_dir, learned_steps = 100):
    """Load weights saved by save_model for the given step count into q_active,
    then copy q_active's weights into q_frozen."""
    file_name = train_dir + '/d3qn_' + str(learned_steps) + '/model' 
    self.q_active.load_weights(file_name)
    # NOTE(review): on a freshly constructed agent the networks may not have
    # created their variables yet, so this copy might be a no-op; learn()
    # re-syncs on its first step regardless. Confirm if q_frozen is needed earlier.
    self.q_frozen.set_weights(self.q_active.get_weights())


## Train

In [31]:
d3qn = Agent(lr=0.005, gamma=0.99, n_actions=env.action_space.n, epsilon=1.0, batch_size=64, input_dims=env.observation_space.shape)

In [32]:
n_games = 100
d3qn.train(env, n_games)

Episode 0 - trained steps 18 - score 18.0 - avg_score 18.0 
Episode 1 - trained steps 38 - score 20.0 - avg_score 19.0 
Episode 2 - trained steps 75 - score 37.0 - avg_score 25.0 
Episode 3 - trained steps 91 - score 16.0 - avg_score 22.8 
Episode 4 - trained steps 103 - score 12.0 - avg_score 20.6 
Episode 5 - trained steps 117 - score 14.0 - avg_score 19.5 
Episode 6 - trained steps 134 - score 17.0 - avg_score 19.1 
Episode 7 - trained steps 196 - score 62.0 - avg_score 24.5 
Episode 8 - trained steps 211 - score 15.0 - avg_score 23.4 
Episode 9 - trained steps 225 - score 14.0 - avg_score 22.5 
Episode 10 - trained steps 248 - score 23.0 - avg_score 23.0 
Episode 11 - trained steps 310 - score 62.0 - avg_score 27.2 
Episode 12 - trained steps 322 - score 12.0 - avg_score 24.7 
Episode 13 - trained steps 392 - score 70.0 - avg_score 30.1 
Episode 14 - trained steps 447 - score 55.0 - avg_score 34.4 
Episode 15 - trained steps 460 - score 13.0 - avg_score 34.3 
Episode 16 - trained s

KeyboardInterrupt: ignored

In [11]:
# Greedy evaluation: with epsilon at 0 choose_action never takes its random branch.
d3qn.epsilon = 0.0
episodes = 5
for episode in range(episodes):
    state = env.reset()
    score = 0
    done = False
    while not done:
        action = d3qn.choose_action(state)
        state, reward, done, info = env.step(action)
        #env.render()
        score += reward
    print('Episode: {} score: {}'.format(episode, score))

Episode: 0 score: 44.0
Episode: 1 score: 51.0
Episode: 2 score: 61.0
Episode: 3 score: 77.0
Episode: 4 score: 59.0


In [12]:
#train_dir = './trained_models/CartPole/DuelingDeepQ'
train_dir = '.'
d3qn.epsilon = 0.0
d3qn.save_model(train_dir)

## Test

In [14]:
# Reload the checkpoint into a fresh agent. save_model() embeds the learning-step
# count in the directory name, so take it from the trained agent instead of
# hard-coding a number that only matches one particular (interrupted) run.
trained_d3qn = Agent(lr=0.005, gamma=0.99, n_actions=env.action_space.n, epsilon=0.0, batch_size=64, input_dims=env.observation_space.shape)
trained_d3qn.load_model(train_dir, learned_steps=d3qn.learned_step_counter)

In [15]:
# Evaluate the reloaded agent (constructed with epsilon=0.0, so actions are greedy).
episodes = 5
for episode in range(episodes):
    state = env.reset()
    score = 0
    done = False
    while not done:
        action = trained_d3qn.choose_action(state)
        state, reward, done, info = env.step(action)
        #env.render()
        score += reward
    print('Episode: {} score: {}'.format(episode, score))

Episode: 0 score: 53.0
Episode: 1 score: 57.0
Episode: 2 score: 58.0
Episode: 3 score: 61.0
Episode: 4 score: 54.0


# From scratch - Deep Q - Keras

## Replay Buffer

In [None]:
class ReplayBuffer():
  """Fixed-size circular store of (state, action, reward, next_state, done) transitions.

  All arrays are pre-allocated with `max_size` rows; once full, the oldest
  rows are overwritten.
  """
  def __init__(self, max_size, input_shape):
    """
    Args:
      max_size: number of transitions the buffer can hold.
      input_shape: shape (tuple) of a single observation.
    """
    self.mem_size = max_size
    # Total number of transitions ever stored (keeps growing past mem_size).
    self.mem_counter = 0

    self.states = np.zeros((self.mem_size, *input_shape), dtype=np.float64)
    self.next_states = np.zeros((self.mem_size, *input_shape), dtype=np.float64)
    self.rewards = np.zeros(self.mem_size, dtype=np.float64)
    self.actions = np.zeros(self.mem_size, dtype=np.int32)
    # np.bool_ instead of the deprecated `np.bool` alias, which newer NumPy no longer provides.
    self.done = np.zeros(self.mem_size, dtype=np.bool_)

  def store_step(self, state, action, reward, next_state, done):
    """Write one transition into the next slot, wrapping around when full."""
    index = self.mem_counter % self.mem_size
    self.states[index] = state
    self.next_states[index] = next_state
    self.actions[index] = action
    self.rewards[index] = reward
    self.done[index] = done
    self.mem_counter += 1

  def sample_buffer(self, batch_size):
    """Return `batch_size` distinct stored transitions chosen uniformly at random.

    Returns:
      Tuple (states, actions, rewards, next_states, done) of arrays with
      `batch_size` rows each.

    Raises:
      ValueError: from np.random.choice when `batch_size` exceeds the number
        of stored transitions (sampling is without replacement).
    """
    # Only the filled part of the buffer is eligible for sampling.
    max_mem = min(self.mem_counter, self.mem_size)
    batch = np.random.choice(max_mem, batch_size, replace=False)

    states = self.states[batch]
    next_states = self.next_states[batch]
    rewards = self.rewards[batch]
    actions = self.actions[batch]
    done = self.done[batch]

    return states, actions, rewards, next_states, done
    

## Q Network

In [18]:
class DeepQNetwork(keras.Model):
  """Plain Q-network: two ReLU hidden layers followed by a linear layer with one output per action."""
  def __init__(self, n_actions):
    super().__init__()

    hidden_units = 128
    self.dense1 = keras.layers.Dense(hidden_units, activation='relu')
    self.dense2 = keras.layers.Dense(hidden_units, activation='relu')
    self.Q = keras.layers.Dense(n_actions, activation=None)

  def call(self, state):
    """Return the Q-values computed for `state`."""
    hidden = self.dense2(self.dense1(state))
    return self.Q(hidden)

## Agent

In [20]:
class Agent():
  """Epsilon-greedy DQN agent using a single DeepQNetwork (no target network)."""
  def __init__(self, lr, gamma, n_actions, epsilon, batch_size, input_dims, epsilon_dec=1e-3, epsilon_end=0.01, mem_size=1000000, replace=100):
    """
    Args:
      lr: Adam learning rate.
      gamma: discount factor applied to the bootstrapped next-state value.
      n_actions: number of discrete actions.
      epsilon: initial probability of taking a random action.
      batch_size: number of transitions sampled per learning step.
      input_dims: shape (tuple) of one observation.
      epsilon_dec: amount subtracted from epsilon after each learning step.
      epsilon_end: lower bound for epsilon.
      mem_size: capacity of the replay buffer.
      replace: stored on the instance but not read anywhere in this class.
    """
    self.action_space = [i for i in range(n_actions)]
    self.gamma =gamma
    self.epsilon = epsilon
    self.epsilon_dec = epsilon_dec
    self.epsilon_end = epsilon_end
    self.replace = replace
    self.batch_size = batch_size

    self.learned_step_counter = 0
    self.memory = ReplayBuffer(mem_size, input_dims)
    self.q =  DeepQNetwork(n_actions)

    self.q.compile(optimizer=Adam(learning_rate=lr), loss='mean_squared_error')

  def store_step(self, state, action, reward, next_state, done):
    """Append one transition to the replay buffer."""
    self.memory.store_step(state, action, reward, next_state, done)

  def choose_action(self, observation):
    """Pick a random action with probability epsilon, otherwise the greedy one."""
    if np.random.random() < self.epsilon:
      action = np.random.choice(self.action_space)
    else:
      state = np.array([observation])
      actions = self.q(state)
      action = tf.math.argmax(actions, axis=1).numpy()[0]
    return action

  def learn(self):
    """Run one Q-learning update on a sampled mini-batch.

    Does nothing until the buffer holds at least `batch_size` transitions.
    Afterwards epsilon is decayed by `epsilon_dec` (floored at `epsilon_end`)
    and `learned_step_counter` is incremented.
    """
    if self.memory.mem_counter < self.batch_size:
      return

    states, actions, rewards, next_states, dones = self.memory.sample_buffer(self.batch_size)

    # Start from the current predictions so only the taken action's entry
    # contributes to the MSE loss.
    q_target = self.q(states).numpy()
    q_next = self.q(next_states).numpy()

    # Vectorized over the batch (replaces a per-sample Python loop); taking the
    # row-wise max equals indexing q_next at its own argmax.
    rows = np.arange(len(actions))
    not_done = 1.0 - dones.astype(np.float64)  # terminal transitions do not bootstrap
    q_target[rows, actions] = rewards + self.gamma * q_next.max(axis=1) * not_done

    self.q.train_on_batch(states, q_target)

    self.epsilon = max(self.epsilon - self.epsilon_dec, self.epsilon_end)
    self.learned_step_counter += 1

  def train(self, env, n_games):
    """Play `n_games` episodes on `env`, learning after every environment step.

    Prints one line per episode with the cumulative step count, the episode
    score and the mean score of the last 10 episodes.
    """
    scores = []
    steps = 0
    for i in range(n_games):
      done = False
      score = 0
      observation = env.reset()
      while not done:
        steps += 1
        action = self.choose_action(observation)
        next_observation, reward, done, info = env.step(action)
        score += reward
        self.store_step(observation, action, reward, next_observation, done)
        observation = next_observation
        self.learn()
      scores.append(score)
      avg_score = np.mean(scores[-10:])
      print('Episode', i, '- trained steps', steps, '- score %.1f'%score, '- avg_score %.1f ' % avg_score)

  def save_model(self, train_dir):
    """Save the network weights under `<train_dir>/dqn_<learned_step_counter>/model`."""
    file_name = train_dir + '/dqn_' + str(self.learned_step_counter) + '/model'
    self.q.save_weights(file_name, save_format='tf')

  def load_model(self, train_dir, learned_steps = 100):
    """Load weights saved by save_model for the given step count."""
    file_name = train_dir + '/dqn_' + str(learned_steps) + '/model' 
    self.q.load_weights(file_name)


## Train

In [21]:
dqn = Agent(lr=0.005, gamma=0.99, n_actions=env.action_space.n, epsilon=1.0, batch_size=64, input_dims=env.observation_space.shape)

In [22]:
n_games = 100
dqn.train(env, n_games)

Episode 0 - trained steps 21 - score 21.0 - avg_score 21.0 
Episode 1 - trained steps 45 - score 24.0 - avg_score 22.5 
Episode 2 - trained steps 63 - score 18.0 - avg_score 21.0 
Episode 3 - trained steps 135 - score 72.0 - avg_score 33.8 
Episode 4 - trained steps 147 - score 12.0 - avg_score 29.4 
Episode 5 - trained steps 172 - score 25.0 - avg_score 28.7 
Episode 6 - trained steps 183 - score 11.0 - avg_score 26.1 
Episode 7 - trained steps 193 - score 10.0 - avg_score 24.1 
Episode 8 - trained steps 213 - score 20.0 - avg_score 23.7 
Episode 9 - trained steps 232 - score 19.0 - avg_score 23.2 
Episode 10 - trained steps 256 - score 24.0 - avg_score 23.5 
Episode 11 - trained steps 279 - score 23.0 - avg_score 23.4 
Episode 12 - trained steps 304 - score 25.0 - avg_score 24.1 
Episode 13 - trained steps 314 - score 10.0 - avg_score 17.9 
Episode 14 - trained steps 325 - score 11.0 - avg_score 17.8 
Episode 15 - trained steps 341 - score 16.0 - avg_score 16.9 
Episode 16 - trained 

KeyboardInterrupt: ignored

In [23]:
# Greedy evaluation: with epsilon at 0 choose_action never takes its random branch.
dqn.epsilon = 0.0
episodes = 5
for episode in range(episodes):
    state = env.reset()
    score = 0
    done = False
    while not done:
        action = dqn.choose_action(state)
        state, reward, done, info = env.step(action)
        #env.render()
        score += reward
    print('Episode: {} score: {}'.format(episode, score))

Episode: 0 score: 200.0
Episode: 1 score: 200.0
Episode: 2 score: 200.0
Episode: 3 score: 200.0
Episode: 4 score: 200.0


In [24]:
train_dir = '.'
dqn.epsilon = 0.0
dqn.save_model(train_dir)

## Test

In [26]:
# Reload the checkpoint into a fresh agent. save_model() embeds the learning-step
# count in the directory name, so take it from the trained agent instead of
# hard-coding a number that only matches one particular (interrupted) run.
trained_dqn = Agent(lr=0.005, gamma=0.99, n_actions=env.action_space.n, epsilon=0.0, batch_size=64, input_dims=env.observation_space.shape)
trained_dqn.load_model(train_dir, learned_steps=dqn.learned_step_counter)

In [27]:
# Evaluate the reloaded agent (constructed with epsilon=0.0, so actions are greedy).
episodes = 5
for episode in range(episodes):
    state = env.reset()
    score = 0
    done = False
    while not done:
        action = trained_dqn.choose_action(state)
        state, reward, done, info = env.step(action)
        #env.render()
        score += reward
    print('Episode: {} score: {}'.format(episode, score))

Episode: 0 score: 200.0
Episode: 1 score: 200.0
Episode: 2 score: 200.0
Episode: 3 score: 200.0
Episode: 4 score: 200.0
