In [11]:
# -*- coding: utf-8 -*-
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Input, Dense, Flatten
from tensorflow.keras.optimizers import Adam
import gym
import numpy as np
import random as rand
from bayes_opt import BayesianOptimization

class Agent(object):
    
    def __init__(self, config_data):
        self.env = gym.make('CartPole-v1')
        self.state_size = self.env.observation_space.shape[0]
        self.action_size = self.env.action_space.n
        self.value_size = 1
        
        self.layer_num_actor = int(round(config_data['layer_num_actor'],0))
        self.node_num_actor = int(round(config_data['node_num_actor'],0))
        self.epochs_actor = int(round(config_data['epochs_actor'],0))
        self.layer_num_critic = int(round(config_data['layer_num_critic'],0))
        self.node_num_critic = int(round(config_data['node_num_critic'],0))
        self.epochs_critic = int(round(config_data['epochs_critic'],0))
        
        self.learning_rate_actor = config_data['learning_rate_actor']
        self.learning_rate_critic = config_data['learning_rate_critic']
        self.discount_rate = config_data['discount_rate']
        self.smooth_rate = config_data['smooth_rate']
        self.penalty = int(round(config_data['penalty'],0))
        self.mini_batch_step_size = int(round(config_data['mini_batch_step_size'],0))
        self.loss_clipping = config_data['loss_clipping']

        self.episode_num = 100
        self.moving_avg_size = 20
        
        self.model_actor = self.build_model_actor()
        self.model_critic = self.build_model_critic()
 
        self.states, self.states_next, self.action_matrixs, self.dones, self.action_probs, self.rewards = [],[],[],[],[],[]
        self.DUMMY_ACTION_MATRIX, self.DUMMY_ADVANTAGE = np.zeros((1,1,self.action_size)), np.zeros((1,1,self.value_size))
    
        self.reward_list= []
        self.count_list = []
        self.moving_avg_list = []
        
    class MyModel(tf.keras.Model):
        def train_step(self, data):
            in_datas, out_action_probs = data
            states, action_matrixs, advantages, loss_clipping = in_datas[0], in_datas[1], in_datas[2], in_datas[3]

            with tf.GradientTape() as tape:
                y_pred = self(states, training=True)
                new_policy = K.max(action_matrixs*y_pred, axis=-1)   
                old_policy = K.max(action_matrixs*out_action_probs, axis=-1)    
                r = new_policy/(old_policy)
                
                LOSS_CLIPPING = K.mean(loss_clipping)
                
                loss = -K.minimum(r*advantages, K.clip(r, 1-LOSS_CLIPPING, 1+LOSS_CLIPPING)*advantages)

            trainable_vars = self.trainable_variables
            gradients = tape.gradient(loss, trainable_vars)
            self.optimizer.apply_gradients(zip(gradients, trainable_vars))
            
    def build_model_actor(self):
        input_states = Input(shape=(1,self.state_size), name='input_states')
        input_action_matrixs = Input(shape=(1,self.action_size), name='input_action_matrixs')
        input_advantages = Input(shape=(1,self.value_size), name='input_advantages')
        input_loss_clipping = Input(shape=(1,self.value_size), name='input_loss_clipping')        
        
        x = (input_states)
        for i in range(1,self.layer_num_actor+1):            
            x = Dense(self.node_num_actor, activation="relu", kernel_initializer='glorot_normal')(x)
        out_actions = Dense(self.action_size, activation='softmax', name='output')(x)
        
        model = self.MyModel(inputs=[input_states, input_action_matrixs, input_advantages], outputs=out_actions)
        model.compile(optimizer=Adam(lr=self.learning_rate_actor))
        
        return model
    
    def build_model_critic(self):
        input_states = Input(shape=(1,self.state_size), name='input_states')
        
        x = (input_states)
        for i in range(1,self.layer_num_critic+1):
            x = Dense(self.node_num_critic, activation="relu", kernel_initializer='glorot_normal')(x)
        out_values = Dense(self.value_size, activation='linear', name='output')(x)
        
        model = tf.keras.models.Model(inputs=[input_states], outputs=[out_values])
        model.compile(optimizer=Adam(lr=self.learning_rate_critic),
#                       loss='mean_squared_error'
                      loss = "binary_crossentropy"
                     )
        return model

    def train(self):
        for episode in range(self.episode_num):

            state = self.env.reset()

            count, reward_tot = self.make_memory(episode, state)
            self.train_mini_batch()
            self.clear_memory()
            
            if count < 500:
                reward_tot = reward_tot-self.penalty
            
            self.reward_list.append(reward_tot)
            self.count_list.append(count)
            self.moving_avg_list.append(self.moving_avg(self.count_list,self.moving_avg_size))                
            

    def moving_avg(self, data, size=10):
        if len(data) > size:
            c = np.array(data[len(data)-size:len(data)]) 
        else:
            c = np.array(data) 
        return np.mean(c)
    
    def clear_memory(self):
        self.states, self.states_next, self.action_matrixs, self.done, self.action_probs, self.rewards = [],[],[],[],[],[]
        
    def make_memory(self, episode, state):
        reward_tot = 0
        count = 0
        reward = np.zeros(self.value_size)
        advantage = np.zeros(self.value_size)
        target = np.zeros(self.value_size)
        action_matrix = np.zeros(self.action_size)
        done = False
        
        while not done:
            count+=1

            state_t = np.reshape(self.normalize(state),[1, 1, self.state_size])
            action_matrix_t = np.reshape(action_matrix,[1, 1, self.action_size])
            
            action_prob = self.model_actor.predict([state_t, self.DUMMY_ACTION_MATRIX, self.DUMMY_ADVANTAGE])
            action = np.random.choice(self.action_size, 1, p=action_prob[0][0])[0]
            action_matrix = np.zeros(self.action_size) #초기화
            action_matrix[action] = 1

            state_next, reward, done, none = self.env.step(action)
            
            state_next_t = np.reshape(self.normalize(state_next),[1, 1, self.state_size])
            
            if count < 500 and done:
                reward = self.penalty 
        
            self.states.append(np.reshape(state_t, [1,self.state_size]))
            self.states_next.append(np.reshape(state_next_t, [1,self.state_size]))
            self.action_matrixs.append(np.reshape(action_matrix, [1,self.action_size]))
            self.dones.append(np.reshape(0 if done else 1, [1,self.value_size]))
            self.action_probs.append(np.reshape(action_prob, [1,self.action_size]))
            self.rewards.append(np.reshape(reward, [1,self.value_size]))
            
            if(count % self.mini_batch_step_size == 0):
                self.train_mini_batch()
                self.clear_memory()

            reward_tot += reward
            state = state_next
            
        return count, reward_tot
    
    def make_gae(self, values, values_next, rewards, dones):
        delta_adv, delta_tar, adv, target = 0, 0, 0, 0
        advantages = np.zeros(np.array(values).shape)
        targets = np.zeros(np.array(values).shape)
        for t in reversed(range(0, len(rewards))):
            delta_adv = rewards[t] + self.discount_rate * values_next[t] * dones[t] - values[t]
            delta_tar = rewards[t] + self.discount_rate * values_next[t] * dones[t]
            adv = delta_adv + self.smooth_rate *  self.discount_rate * dones[t] * adv
            target = delta_tar + self.smooth_rate * self.discount_rate * dones[t] * target
            advantages[t] = adv
            targets[t] = target
        return advantages, targets

    def normalize(self, x):
        norm = np.linalg.norm(x)
        if norm == 0: 
            return x
        return x / norm


    def train_mini_batch(self):
        
        if len(self.states) == 0:
            return
        
        states_t = np.array(self.states)
        states_next_t = np.array(self.states_next)
        action_matrixs_t = np.array(self.action_matrixs)
        action_probs_t = np.array(self.action_probs)
        loss_clipping = [self.loss_clipping for j in range(len(self.states))]
        loss_clipping_t = np.reshape(loss_clipping, [len(self.states),1,1])
        
        values = self.model_critic.predict(states_t)
        values_next = self.model_critic.predict(states_next_t)
        
        advantages, targets = self.make_gae(values, values_next, self.rewards, self.dones)
        advantages_t = np.array(advantages)
        targets_t = np.array(targets)
        
        self.model_actor.fit([states_t, action_matrixs_t, advantages_t, loss_clipping_t], [action_probs_t], 
                             epochs=self.epochs_actor, verbose=0)
        self.model_critic.fit(states_t, targets_t, 
                              epochs=self.epochs_critic, verbose=0)       
        

if __name__ == "__main__":

    def black_box_function(layer_num_actor, node_num_actor, epochs_actor, 
                           layer_num_critic, node_num_critic, epochs_critic,
                           learning_rate_actor, learning_rate_critic,
                           discount_rate, smooth_rate, 
                           penalty, mini_batch_step_size, loss_clipping
                          ):
        config_data = {
            'layer_num_actor':layer_num_actor,
            'node_num_actor':node_num_actor,
            'epochs_actor':epochs_actor,
            'layer_num_critic':layer_num_critic,
            'node_num_critic':node_num_critic,
            'epochs_critic':epochs_critic,
            
            'learning_rate_actor' :learning_rate_actor,
            'learning_rate_critic':learning_rate_critic,
            'discount_rate'       :discount_rate,
            'smooth_rate'       :smooth_rate,
            'penalty'             :penalty,
            'mini_batch_step_size':mini_batch_step_size,
            'loss_clipping'       :loss_clipping
        }
        agent = Agent(config_data)
        agent.train()
        return np.mean(agent.reward_list)
        
    pbounds = {
                'layer_num_actor':(1,2),
                'node_num_actor':(12,128),
                'epochs_actor':(3,6),
                'layer_num_critic':(1,2),
                'node_num_critic':(12,128),
                'epochs_critic':(3,6),

                'learning_rate_actor' :(0.0001,0.001),
                'learning_rate_critic':(0.0001,0.001),
                'discount_rate'       :(0.9,0.99),
                'smooth_rate'       :(0.9,0.99),
                'penalty'             :(-500,-10),
                'mini_batch_step_size':(4,80),
                'loss_clipping'       :(0.1,0.3)
              }

    optimizer = BayesianOptimization(
        f=black_box_function,
        pbounds=pbounds,
        random_state=1,
    )

    optimizer.maximize(
        init_points=5,
        n_iter=20
    )
    

|   iter    |  target   | discou... | epochs... | epochs... | layer_... | layer_... | learni... | learni... | loss_c... | mini_b... | node_n... | node_n... |  penalty  | smooth... |
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 37.65   [0m | [0m 0.9375  [0m | [0m 5.161   [0m | [0m 3.0     [0m | [0m 1.302   [0m | [0m 1.147   [0m | [0m 0.000183[0m | [0m 0.000267[0m | [0m 0.1691  [0m | [0m 34.15   [0m | [0m 74.5    [0m | [0m 60.63   [0m | [0m-164.2   [0m | [0m 0.9184  [0m |
| [0m 2       [0m | [0m 23.1    [0m | [0m 0.979   [0m | [0m 3.082   [0m | [0m 5.011   [0m | [0m 1.417   [0m | [0m 1.559   [0m | [0m 0.000226[0m | [0m 0.000278[0m | [0m 0.2601  [0m | [0m 77.59   [0m | [0m 48.36   [0m | [0m 92.31   [0m | [0m-70.57   [0m | [0m 0.9805  [0m |
| [0m 3       [0m | [0m 32.31

In [12]:
target_list = []
i=0
for res in optimizer.res:
    target_list.append([res["target"], i])
    i=i+1
target_list.sort(reverse=True)    
target_list

[[334.62, 20],
 [260.99, 7],
 [207.33, 19],
 [165.97, 24],
 [142.99, 5],
 [112.53, 18],
 [90.43, 22],
 [50.81, 11],
 [42.52, 10],
 [37.65, 0],
 [34.92, 4],
 [33.81, 23],
 [33.65, 6],
 [32.31, 2],
 [31.68, 17],
 [29.04, 8],
 [28.45, 16],
 [28.11, 12],
 [25.11, 3],
 [23.1, 1],
 [23.04, 9],
 [20.63, 15],
 [17.65, 14],
 [15.76, 21],
 [15.64, 13]]

In [13]:
print("*result:" , optimizer.res[20]['params'])

*result: {'discount_rate': 0.9138068228055699, 'epochs_actor': 4.135796340297432, 'epochs_critic': 3.068824820615902, 'layer_num_actor': 1.8127168005702576, 'layer_num_critic': 1.2899196661865222, 'learning_rate_actor': 0.0007044665544668867, 'learning_rate_critic': 0.0007325323236616151, 'loss_clipping': 0.2671282081035625, 'mini_batch_step_size': 18.73240705651665, 'node_num_actor': 119.19096504720964, 'node_num_critic': 74.68079589490598, 'penalty': -347.12015260105, 'smooth_rate': 0.9321116290822046}


In [15]:
count_100 = 0
for res in optimizer.res:
    if(res["target"] >= 100):
        count_100 = count_100+1
print("count_100:", count_100)

count_100: 6
