In [None]:
# -*- coding: utf-8 -*-
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Input, Dense, Flatten
from tensorflow.keras.optimizers import Adam
import gym
import numpy as np
import random as rand

class Agent(object):
    """PPO-style actor-critic agent for gym's ``CartPole-v1``.

    The actor is a softmax policy network trained with the clipped surrogate
    objective (see ``MyModel.train_step``); the critic is a state-value network
    trained by regression on lambda-return targets produced by ``make_gae``.

    Parameters
    ----------
    config_data : dict
        Hyper-parameters. Keys read here: ``layer_num_actor``,
        ``node_num_actor``, ``epochs_actor``, ``layer_num_critic``,
        ``node_num_critic``, ``epochs_critic``, ``learning_rate_actor``,
        ``learning_rate_critic``, ``discount_rate``, ``smooth_rate``,
        ``penalty``, ``mini_batch_step_size`` and ``loss_clipping``.
    """

    def __init__(self, config_data):
        self.env = gym.make('CartPole-v1')
        self.state_size = self.env.observation_space.shape[0]
        self.action_size = self.env.action_space.n
        self.value_size = 1

        self.layer_num_actor = config_data['layer_num_actor']
        self.node_num_actor = config_data['node_num_actor']
        self.epochs_actor = config_data['epochs_actor']
        self.layer_num_critic = config_data['layer_num_critic']
        self.node_num_critic = config_data['node_num_critic']
        self.epochs_critic = config_data['epochs_critic']

        self.learning_rate_actor = config_data['learning_rate_actor']
        self.learning_rate_critic = config_data['learning_rate_critic']
        self.discount_rate = config_data['discount_rate']
        self.smooth_rate = config_data['smooth_rate']
        self.penalty = config_data['penalty']
        self.mini_batch_step_size = config_data['mini_batch_step_size']
        self.loss_clipping = config_data['loss_clipping']

        self.episode_num = 200
        self.moving_avg_size = 20

        self.model_actor = self.build_model_actor()
        self.model_critic = self.build_model_critic()

        # Rollout buffers; every list holds one entry per collected step.
        self.states, self.states_next, self.action_matrixs, self.dones, self.action_probs, self.rewards = [],[],[],[],[],[]
        # Placeholders for the actor's training-only inputs, used when the
        # actor is only queried for action probabilities.
        self.DUMMY_ACTION_MATRIX, self.DUMMY_ADVANTAGE = np.zeros((1,1,self.action_size)), np.zeros((1,1,self.value_size))
        self.DUMMY_LOSS_CLIPPING = np.zeros((1,1,self.value_size))

        # Per-episode statistics filled in by train().
        self.reward_list= []
        self.count_list = []
        self.moving_avg_list = []

    class MyModel(tf.keras.Model):
        """Actor model whose ``train_step`` applies the PPO clipped objective."""

        def train_step(self, data):
            """Run one gradient step on the clipped surrogate loss.

            ``data`` is ``(inputs, old_action_probs)`` where ``inputs`` is
            ``[states, action_matrixs, advantages, loss_clipping]`` as passed
            to ``fit``. Returns a dict with the scalar ``loss`` so Keras has
            metrics to log.
            """
            in_datas, out_action_probs = data
            states, action_matrixs, advantages, loss_clipping = in_datas[0], in_datas[1], in_datas[2], in_datas[3]
            # The targets were passed to fit() as a one-element list, so they
            # may arrive wrapped in a list/tuple; unwrap to the bare tensor.
            if isinstance(out_action_probs, (list, tuple)):
                out_action_probs = out_action_probs[0]

            with tf.GradientTape() as tape:
                # Feed every declared input; only `states` reaches the output.
                y_pred = self([states, action_matrixs, advantages, loss_clipping], training=True)
                # keepdims=True keeps the ratio at the same rank as
                # `advantages`; without it the product below broadcasts into a
                # batch-by-batch outer product instead of pairing each sample
                # with its own advantage.
                new_policy = K.max(action_matrixs*y_pred, axis=-1, keepdims=True)
                old_policy = K.max(action_matrixs*out_action_probs, axis=-1, keepdims=True)
                r = new_policy/(old_policy)

                # Every row carries the same clipping value; collapse to a scalar.
                LOSS_CLIPPING = K.mean(loss_clipping)

                surrogate = K.minimum(r*advantages, K.clip(r, 1-LOSS_CLIPPING, 1+LOSS_CLIPPING)*advantages)
                loss = -K.mean(surrogate)

            trainable_vars = self.trainable_variables
            gradients = tape.gradient(loss, trainable_vars)
            self.optimizer.apply_gradients(zip(gradients, trainable_vars))
            return {"loss": loss}

    def build_model_actor(self):
        """Build and compile the policy network.

        The model takes four inputs (states, one-hot actions, advantages and
        the clipping value); only the states feed the dense stack, the rest
        are consumed by ``MyModel.train_step``. The output is a softmax over
        ``action_size`` actions.
        """
        input_states = Input(shape=(1,self.state_size), name='input_states')
        input_action_matrixs = Input(shape=(1,self.action_size), name='input_action_matrixs')
        input_advantages = Input(shape=(1,self.value_size), name='input_advantages')
        input_loss_clipping = Input(shape=(1,self.value_size), name='input_loss_clipping')

        x = (input_states)
        for _ in range(self.layer_num_actor):
            x = Dense(self.node_num_actor, activation="relu", kernel_initializer='glorot_normal')(x)
        out_actions = Dense(self.action_size, activation='softmax', name='output')(x)

        model = self.MyModel(inputs=[input_states, input_action_matrixs, input_advantages, input_loss_clipping],
                             outputs=out_actions)
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        model.compile(optimizer=Adam(learning_rate=self.learning_rate_actor))

        return model

    def build_model_critic(self):
        """Build and compile the state-value network (linear scalar output)."""
        input_states = Input(shape=(1,self.state_size), name='input_states')

        x = (input_states)
        for _ in range(self.layer_num_critic):
            x = Dense(self.node_num_critic, activation="relu", kernel_initializer='glorot_normal')(x)
        out_values = Dense(self.value_size, activation='linear', name='output')(x)

        model = tf.keras.models.Model(inputs=[input_states], outputs=[out_values])
        # The critic regresses unbounded return targets through a linear
        # output, so squared error is the matching loss; cross-entropy assumes
        # targets and predictions in [0, 1].
        model.compile(optimizer=Adam(learning_rate=self.learning_rate_critic),
                      loss='mean_squared_error')
        return model

    def train(self):
        """Run ``episode_num`` episodes, training after each one.

        Appends to ``reward_list``, ``count_list`` and ``moving_avg_list``
        once per episode.
        """
        for episode in range(self.episode_num):

            # reset() returns (observation, info); only the observation is used.
            state = self.env.reset()[0]

            count, reward_tot = self.make_memory(episode, state)
            # Train on whatever is left in the buffers after the last
            # in-episode mini-batch.
            self.train_mini_batch()
            self.clear_memory()

            if count < 500:
                # make_memory added the penalty to the total for an early
                # failure; take it back out of the reported total.
                reward_tot = reward_tot-self.penalty

            self.reward_list.append(reward_tot)
            self.count_list.append(count)
            self.moving_avg_list.append(self.moving_avg(self.count_list,self.moving_avg_size))

    def moving_avg(self, data, size=10):
        """Return the mean of the last ``size`` items of ``data`` (or of all
        of ``data`` when it has at most ``size`` items)."""
        if len(data) > size:
            window = data[len(data)-size:]
        else:
            window = data
        return np.mean(np.array(window))

    def clear_memory(self):
        """Empty all rollout buffers."""
        # `dones` must be reset together with the other buffers, otherwise the
        # done-mask list keeps growing and no longer lines up with the rewards.
        self.states, self.states_next, self.action_matrixs, self.dones, self.action_probs, self.rewards = [],[],[],[],[],[]

    def make_memory(self, episode, state):
        """Play one episode, filling the rollout buffers step by step.

        Every ``mini_batch_step_size`` steps the collected steps are trained
        on and the buffers are cleared.

        Parameters
        ----------
        episode : int
            Episode index (not used inside this method).
        state : array-like
            Initial observation returned by ``env.reset()``.

        Returns
        -------
        (count, reward_tot) : number of steps taken and the sum of the
        per-step rewards (including the penalty, if it was applied).
        """
        reward_tot = 0
        count = 0
        done = False

        while not done:
            count+=1

            state_t = np.reshape(self.normalize(state),[1, 1, self.state_size])

            action_prob = self.model_actor.predict(
                [state_t, self.DUMMY_ACTION_MATRIX, self.DUMMY_ADVANTAGE, self.DUMMY_LOSS_CLIPPING],
                verbose=0)
            action = np.random.choice(self.action_size, 1, p=action_prob[0][0])[0]
            action_matrix = np.zeros(self.action_size)  # reset, then one-hot encode the action
            action_matrix[action] = 1

            state_next, reward, terminated, truncated, _info = self.env.step(action)
            # The episode is over on failure (terminated) or when the
            # environment's time limit cuts it off (truncated).
            done = terminated or truncated

            state_next_t = np.reshape(self.normalize(state_next),[1, 1, self.state_size])

            if count < 500 and terminated:
                reward = self.penalty

            self.states.append(np.reshape(state_t, [1,self.state_size]))
            self.states_next.append(np.reshape(state_next_t, [1,self.state_size]))
            self.action_matrixs.append(np.reshape(action_matrix, [1,self.action_size]))
            # Mask is 0 on a terminal transition (no bootstrapping), 1 otherwise.
            self.dones.append(np.reshape(0 if terminated else 1, [1,self.value_size]))
            self.action_probs.append(np.reshape(action_prob, [1,self.action_size]))
            self.rewards.append(np.reshape(reward, [1,self.value_size]))

            if(count % self.mini_batch_step_size == 0):
                self.train_mini_batch()
                self.clear_memory()

            reward_tot += reward
            state = state_next

        return count, reward_tot

    def make_gae(self, values, values_next, rewards, dones):
        """Compute generalized advantage estimates and critic targets.

        Parameters
        ----------
        values, values_next : array-like
            Critic predictions for each state and its successor.
        rewards, dones : sequence
            Per-step rewards and continuation masks (0 on a terminal step,
            1 otherwise), indexed like ``values``.

        Returns
        -------
        (advantages, targets) : arrays shaped like ``values``. ``targets`` is
        ``advantages + values``, i.e. the lambda-return the critic regresses on.
        """
        values_arr = np.array(values)
        advantages = np.zeros(values_arr.shape)
        adv = 0
        for t in reversed(range(0, len(rewards))):
            # One-step TD error; the mask removes the bootstrap on terminal steps.
            delta_adv = rewards[t] + self.discount_rate * values_next[t] * dones[t] - values[t]
            adv = delta_adv + self.smooth_rate *  self.discount_rate * dones[t] * adv
            advantages[t] = adv
        # Recursing on (reward + bootstrap value) would add the bootstrap
        # value again at every step; the lambda-return is advantage + value.
        targets = advantages + values_arr
        return advantages, targets

    def normalize(self, x):
        """Scale ``x`` to unit L2 norm; an all-zero ``x`` is returned unchanged."""
        norm = np.linalg.norm(x)
        if norm == 0:
            return x
        return x / norm

    def train_mini_batch(self):
        """Fit actor and critic on the buffered steps; no-op when the buffers
        are empty."""
        if len(self.states) == 0:
            return

        states_t = np.array(self.states)
        states_next_t = np.array(self.states_next)
        action_matrixs_t = np.array(self.action_matrixs)
        action_probs_t = np.array(self.action_probs)
        # One copy of the clipping value per sample, shaped like the actor's
        # `input_loss_clipping` input.
        loss_clipping_t = np.full((len(self.states), 1, 1), self.loss_clipping)

        values = self.model_critic.predict(states_t, verbose=0)
        values_next = self.model_critic.predict(states_next_t, verbose=0)

        advantages, targets = self.make_gae(values, values_next, self.rewards, self.dones)
        advantages_t = np.array(advantages)
        targets_t = np.array(targets)

        self.model_actor.fit([states_t, action_matrixs_t, advantages_t, loss_clipping_t], [action_probs_t],
                             epochs=self.epochs_actor, verbose=0)
        self.model_critic.fit(states_t, targets_t,
                              epochs=self.epochs_critic, verbose=0)
        

if __name__ == "__main__":

    def random_select():
        """Draw one random hyper-parameter configuration for ``Agent``.

        Integer settings are drawn with ``rand.randint`` (inclusive bounds)
        and real-valued ones with ``rand.uniform``.
        """
        return {
            'layer_num_actor':rand.randint(1,2),
            'node_num_actor':rand.randint(12,128),
            'epochs_actor':rand.randint(3,6),
            'layer_num_critic':rand.randint(1,2),
            'node_num_critic':rand.randint(12,128),
            'epochs_critic':rand.randint(3,6),

            'learning_rate_actor' :rand.uniform(0.0001,0.001),
            'learning_rate_critic':rand.uniform(0.0001,0.001),
            'discount_rate'       :rand.uniform(0.9,0.99),
            'smooth_rate'       :rand.uniform(0.9,0.99),
            'penalty'             :rand.randint(-500,-10),
            'mini_batch_step_size':rand.randint(4,80),
            'loss_clipping'       :rand.uniform(0.1,0.3)
        }

    # Each entry of `results` is [config, final moving average, mean reward].
    results = []
    print("***** start random search *****")
    for i in range(10):
        config_data = random_select()
        agent = Agent(config_data)
        print("*config:", config_data)
        agent.train()

        # Summarise the run: last moving-average step count and mean reward.
        final_moving_avg = agent.moving_avg_list[-1]
        mean_reward = np.mean(agent.reward_list)

        result = [config_data, final_moving_avg, mean_reward]
        results.append(result)
        print("*result:", i, final_moving_avg, mean_reward)
        print("-"*100)
    print("***** end random search *****")
    

***** start random search *****
*config: {'layer_num_actor': 1, 'node_num_actor': 115, 'epochs_actor': 4, 'layer_num_critic': 2, 'node_num_critic': 37, 'epochs_critic': 5, 'learning_rate_actor': 0.0007144842671363032, 'learning_rate_critic': 0.0007104724029696896, 'discount_rate': 0.9688835618788458, 'smooth_rate': 0.9176809055856051, 'penalty': -156, 'mini_batch_step_size': 71, 'loss_clipping': 0.13103136042461513}
*result: 0 181.65 54.315
----------------------------------------------------------------------------------------------------
*config: {'layer_num_actor': 1, 'node_num_actor': 105, 'epochs_actor': 4, 'layer_num_critic': 1, 'node_num_critic': 84, 'epochs_critic': 4, 'learning_rate_actor': 0.00011337755140372159, 'learning_rate_critic': 0.000732430979644101, 'discount_rate': 0.927844296059655, 'smooth_rate': 0.9590013979191802, 'penalty': -264, 'mini_batch_step_size': 57, 'loss_clipping': 0.2397335589501082}
*result: 1 32.25 28.085
--------------------------------------------

In [None]:
# Rank every finished trial by its mean reward (results[i][2]), best first.
# Iterating over `results` itself instead of a fixed range(0, 100) keeps the
# cell valid for any number of trials.
avg_list = sorted(
    ([trial[2], idx] for idx, trial in enumerate(results)),
    reverse=True,
)
avg_list

In [None]:
# Show the trials picked for inspection. The indices refer to one particular
# search run, so skip any that do not exist in the current `results` instead
# of raising IndexError.
for idx in (27, 31, 96):
    if idx < len(results):
        print(results[idx])
    else:
        print("result", idx, "not available: only", len(results), "trials were run")

In [None]:
# Count the trials whose mean reward (results[i][2]) reached 300 or more.
# Uses every available trial rather than assuming exactly 100 of them, and
# the printed label now matches the 300 threshold.
count_300 = sum(1 for trial in results if trial[2] >= 300)
print("count_300:", count_300)