In [2]:
# -*- coding: utf-8 -*-
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Input, Dense, Flatten
from tensorflow.keras.optimizers import Adam
import gym
import numpy as np
import random as rand

class Agent(object):
    def __init__(self, config_data):
        self.env = gym.make('CartPole-v1')
        self.state_size = self.env.observation_space.shape[0]
        self.action_size = self.env.action_space.n
        self.value_size = 1
        
        self.layer_num_actor = config_data['layer_num_actor']
        self.node_num_actor = config_data['node_num_actor']
        self.epochs_actor = config_data['epochs_actor']
        self.layer_num_critic = config_data['layer_num_critic']
        self.node_num_critic = config_data['node_num_critic']
        self.epochs_critic = config_data['epochs_critic']
        
        self.learning_rate_actor = config_data['learning_rate_actor']
        self.learning_rate_critic = config_data['learning_rate_critic']
        self.discount_rate = config_data['discount_rate']
        self.smooth_rate = config_data['smooth_rate']
        self.penalty = config_data['penalty']
        self.mini_batch_step_size = config_data['mini_batch_step_size']
        self.loss_clipping = config_data['loss_clipping']

        self.episode_num = 200
        self.moving_avg_size = 20
        
        self.model_actor = self.build_model_actor()
        self.model_critic = self.build_model_critic()
 
        self.states, self.states_next, self.action_matrixs, self.dones, self.action_probs, self.rewards = [],[],[],[],[],[]
        self.DUMMY_ACTION_MATRIX, self.DUMMY_ADVANTAGE = np.zeros((1,1,self.action_size)), np.zeros((1,1,self.value_size))
    
        self.reward_list= []
        self.count_list = []
        self.moving_avg_list = []
        
    class MyModel(tf.keras.Model):
        def train_step(self, data):
            in_datas, out_action_probs = data
            states, action_matrixs, advantages, loss_clipping = in_datas[0], in_datas[1], in_datas[2], in_datas[3]

            with tf.GradientTape() as tape:
                y_pred = self(states, training=True)
                new_policy = K.max(action_matrixs*y_pred, axis=-1)   
                old_policy = K.max(action_matrixs*out_action_probs, axis=-1)    
                r = new_policy/(old_policy)
                
                LOSS_CLIPPING = K.mean(loss_clipping)
                
                loss = -K.minimum(r*advantages, K.clip(r, 1-LOSS_CLIPPING, 1+LOSS_CLIPPING)*advantages)

            trainable_vars = self.trainable_variables
            gradients = tape.gradient(loss, trainable_vars)
            self.optimizer.apply_gradients(zip(gradients, trainable_vars))
            
    def build_model_actor(self):
        input_states = Input(shape=(1,self.state_size), name='input_states')
        input_action_matrixs = Input(shape=(1,self.action_size), name='input_action_matrixs')
        input_advantages = Input(shape=(1,self.value_size), name='input_advantages')
        input_loss_clipping = Input(shape=(1,self.value_size), name='input_loss_clipping')        
        
        x = (input_states)
        for i in range(1,self.layer_num_actor+1):            
            x = Dense(self.node_num_actor, activation="relu", kernel_initializer='glorot_normal')(x)
        out_actions = Dense(self.action_size, activation='softmax', name='output')(x)
        
        model = self.MyModel(inputs=[input_states, input_action_matrixs, input_advantages, input_loss_clipping],
                             outputs=out_actions)
        model.compile(optimizer=Adam(lr=self.learning_rate_actor))
        
        return model
    
    def build_model_critic(self):
        input_states = Input(shape=(1,self.state_size), name='input_states')
        
        x = (input_states)
        for i in range(1,self.layer_num_critic+1):
            x = Dense(self.node_num_actor, activation="relu", kernel_initializer='glorot_normal')(x)
        out_values = Dense(self.value_size, activation='linear', name='output')(x)
        
        model = tf.keras.models.Model(inputs=[input_states], outputs=[out_values])
        model.compile(optimizer=Adam(lr=self.learning_rate_critic),
#                       loss='mean_squared_error'
                      loss = "binary_crossentropy"
                     )
        return model

    def train(self):
        for episode in range(self.episode_num):

            state = self.env.reset()

            count, reward_tot = self.make_memory(episode, state)
            self.train_mini_batch()
            self.clear_memory()
            
            if count < 500:
                reward_tot = reward_tot-self.penalty
            
            self.reward_list.append(reward_tot)
            self.count_list.append(count)
            self.moving_avg_list.append(self.moving_avg(self.count_list,self.moving_avg_size))                
            

    def moving_avg(self, data, size=10):
        if len(data) > size:
            c = np.array(data[len(data)-size:len(data)]) 
        else:
            c = np.array(data) 
        return np.mean(c)
    
    def clear_memory(self):
        self.states, self.states_next, self.action_matrixs, self.done, self.action_probs, self.rewards = [],[],[],[],[],[]
        
    def make_memory(self, episode, state):
        reward_tot = 0
        count = 0
        reward = np.zeros(self.value_size)
        advantage = np.zeros(self.value_size)
        target = np.zeros(self.value_size)
        action_matrix = np.zeros(self.action_size)
        done = False
        
        while not done:
            count+=1

            state_t = np.reshape(self.normalize(state),[1, 1, self.state_size])
            action_matrix_t = np.reshape(action_matrix,[1, 1, self.action_size])
            
            action_prob = self.model_actor.predict([state_t, self.DUMMY_ACTION_MATRIX, self.DUMMY_ADVANTAGE])
            action = np.random.choice(self.action_size, 1, p=action_prob[0][0])[0]
            action_matrix = np.zeros(self.action_size)
            action_matrix[action] = 1

            state_next, reward, done, none = self.env.step(action)
            
            state_next_t = np.reshape(self.normalize(state_next),[1, 1, self.state_size])
            
            if count < 500 and done:
                reward = self.penalty 
        
            self.states.append(np.reshape(state_t, [1,self.state_size]))
            self.states_next.append(np.reshape(state_next_t, [1,self.state_size]))
            self.action_matrixs.append(np.reshape(action_matrix, [1,self.action_size]))
            self.dones.append(np.reshape(0 if done else 1, [1,self.value_size]))
            self.action_probs.append(np.reshape(action_prob, [1,self.action_size]))
            self.rewards.append(np.reshape(reward, [1,self.value_size]))
            
            if(count % self.mini_batch_step_size == 0):
                self.train_mini_batch()
                self.clear_memory()

            reward_tot += reward
            state = state_next
            
        return count, reward_tot
    
    def make_gae(self, values, values_next, rewards, dones):
        delta_adv, delta_tar, adv, target = 0, 0, 0, 0
        advantages = np.zeros(np.array(values).shape)
        targets = np.zeros(np.array(values).shape)
        for t in reversed(range(0, len(rewards))):
            delta_adv = rewards[t] + self.discount_rate * values_next[t] * dones[t] - values[t]
            delta_tar = rewards[t] + self.discount_rate * values_next[t] * dones[t]
            adv = delta_adv + self.smooth_rate *  self.discount_rate * dones[t] * adv
            target = delta_tar + self.smooth_rate * self.discount_rate * dones[t] * target
            advantages[t] = adv
            targets[t] = target
        return advantages, targets

    def normalize(self, x):
#         current_min = np.min(x)
#         current_max = np.max(x)
#         x_normed = (x - current_min) / (current_max - current_min)
#         current_mean = np.mean(x)
#         current_std = np.std(x)
#         x_normed = (x - current_mean) / current_std
#         return x_normed
        norm = np.linalg.norm(x)
        if norm == 0: 
            return x
        return x / norm

    def train_mini_batch(self):
        
        if len(self.states) == 0:
            return
        
        states_t = np.array(self.states)
        states_next_t = np.array(self.states_next)
        action_matrixs_t = np.array(self.action_matrixs)
        action_probs_t = np.array(self.action_probs)
        loss_clipping = [self.loss_clipping for j in range(len(self.states))]
        loss_clipping_t = np.reshape(loss_clipping, [len(self.states),1,1])
        
        values = self.model_critic.predict(states_t)
        values_next = self.model_critic.predict(states_next_t)
        
        advantages, targets = self.make_gae(values, values_next, self.rewards, self.dones)
        advantages_t = np.array(advantages)
        targets_t = np.array(targets)
        
        self.model_actor.fit([states_t, action_matrixs_t, advantages_t, loss_clipping_t], [action_probs_t], 
                             epochs=self.epochs_actor, verbose=0)
        self.model_critic.fit(states_t, targets_t, 
                              epochs=self.epochs_critic, verbose=0)       
        

if __name__ == "__main__":

    def random_select():
        config_data = {
            'layer_num_actor':rand.randint(1,2),
            'node_num_actor':rand.randint(12,128),
            'epochs_actor':rand.randint(3,6),
            'layer_num_critic':rand.randint(1,2),
            'node_num_critic':rand.randint(12,128),
            'epochs_critic':rand.randint(3,6),
            
            'learning_rate_actor' :rand.uniform(0.0001,0.001),
            'learning_rate_critic':rand.uniform(0.0001,0.001),
            'discount_rate'       :rand.uniform(0.9,0.99),
            'smooth_rate'       :rand.uniform(0.9,0.99),
            'penalty'             :rand.randint(-500,-10),
            'mini_batch_step_size':rand.randint(4,80),
            'loss_clipping'       :rand.uniform(0.1,0.3)
        }
        return config_data

    results = []
    print("***** start random search *****")        
    for i in range(100):
        config_data = random_select()
        agent = Agent(config_data)
        print("*config:", config_data)
        agent.train()
        result = []
        result.append(config_data)
        result.append(agent.moving_avg_list[len(agent.moving_avg_list)-1])
        result.append(np.mean(agent.reward_list))
        results.append(result)
        print("*result:", i, agent.moving_avg_list[len(agent.moving_avg_list)-1], np.mean(agent.reward_list))
        print("-"*100)
    print("***** end random search *****")    
    

***** start random search *****
*config: {'layer_num_actor': 2, 'node_num_actor': 95, 'epochs_actor': 6, 'layer_num_critic': 2, 'node_num_critic': 124, 'epochs_critic': 5, 'learning_rate_actor': 0.00018213203036520845, 'learning_rate_critic': 0.0005814962731170509, 'discount_rate': 0.962034926159223, 'smooth_rate': 0.9155998432226305, 'penalty': -101, 'mini_batch_step_size': 48, 'loss_clipping': 0.10779084820831981}
*result: 0 321.8 155.71
----------------------------------------------------------------------------------------------------
*config: {'layer_num_actor': 2, 'node_num_actor': 36, 'epochs_actor': 4, 'layer_num_critic': 1, 'node_num_critic': 92, 'epochs_critic': 6, 'learning_rate_actor': 0.00014829843010839537, 'learning_rate_critic': 0.0009351140107065621, 'discount_rate': 0.9558029876140309, 'smooth_rate': 0.9556677031316966, 'penalty': -146, 'mini_batch_step_size': 64, 'loss_clipping': 0.2653178577438546}
*result: 1 19.6 18.11
----------------------------------------------

*result: 16 91.85 33.255
----------------------------------------------------------------------------------------------------
*config: {'layer_num_actor': 2, 'node_num_actor': 63, 'epochs_actor': 4, 'layer_num_critic': 1, 'node_num_critic': 39, 'epochs_critic': 4, 'learning_rate_actor': 0.0002824055347893398, 'learning_rate_critic': 0.0008988229258848825, 'discount_rate': 0.9245027675088912, 'smooth_rate': 0.9398157596871778, 'penalty': -78, 'mini_batch_step_size': 25, 'loss_clipping': 0.19948638955011896}
*result: 17 459.0 262.805
----------------------------------------------------------------------------------------------------
*config: {'layer_num_actor': 2, 'node_num_actor': 109, 'epochs_actor': 3, 'layer_num_critic': 1, 'node_num_critic': 96, 'epochs_critic': 6, 'learning_rate_actor': 0.00039962495194675445, 'learning_rate_critic': 0.0004101401034927788, 'discount_rate': 0.9362233690024067, 'smooth_rate': 0.9523845142002928, 'penalty': -486, 'mini_batch_step_size': 20, 'loss_clip

*result: 32 32.85 27.67
----------------------------------------------------------------------------------------------------
*config: {'layer_num_actor': 2, 'node_num_actor': 32, 'epochs_actor': 6, 'layer_num_critic': 2, 'node_num_critic': 93, 'epochs_critic': 4, 'learning_rate_actor': 0.0006029067888925772, 'learning_rate_critic': 0.0002117606702459449, 'discount_rate': 0.9881063693960008, 'smooth_rate': 0.9494194449528965, 'penalty': -51, 'mini_batch_step_size': 39, 'loss_clipping': 0.225177882405735}
*result: 33 297.8 193.83
----------------------------------------------------------------------------------------------------
*config: {'layer_num_actor': 1, 'node_num_actor': 122, 'epochs_actor': 5, 'layer_num_critic': 1, 'node_num_critic': 34, 'epochs_critic': 4, 'learning_rate_actor': 0.0004384254974661672, 'learning_rate_critic': 0.0009945110711169125, 'discount_rate': 0.9005456621661532, 'smooth_rate': 0.9176040087832467, 'penalty': -455, 'mini_batch_step_size': 18, 'loss_clipping'

*result: 48 229.5 108.12
----------------------------------------------------------------------------------------------------
*config: {'layer_num_actor': 2, 'node_num_actor': 67, 'epochs_actor': 4, 'layer_num_critic': 2, 'node_num_critic': 26, 'epochs_critic': 4, 'learning_rate_actor': 0.0008485728263420081, 'learning_rate_critic': 0.0002028421833667094, 'discount_rate': 0.9544931573734896, 'smooth_rate': 0.9816531556762458, 'penalty': -329, 'mini_batch_step_size': 71, 'loss_clipping': 0.2699759752431376}
*result: 49 497.95 196.175
----------------------------------------------------------------------------------------------------
*config: {'layer_num_actor': 1, 'node_num_actor': 55, 'epochs_actor': 3, 'layer_num_critic': 1, 'node_num_critic': 28, 'epochs_critic': 5, 'learning_rate_actor': 0.00033768685947003254, 'learning_rate_critic': 0.0002600793312649501, 'discount_rate': 0.9207795736344904, 'smooth_rate': 0.9185014480859585, 'penalty': -230, 'mini_batch_step_size': 74, 'loss_clip

*result: 64 129.1 187.215
----------------------------------------------------------------------------------------------------
*config: {'layer_num_actor': 2, 'node_num_actor': 57, 'epochs_actor': 6, 'layer_num_critic': 2, 'node_num_critic': 69, 'epochs_critic': 6, 'learning_rate_actor': 0.0004336469620949757, 'learning_rate_critic': 0.0008601727894876552, 'discount_rate': 0.9462921568545178, 'smooth_rate': 0.9112296018279792, 'penalty': -140, 'mini_batch_step_size': 59, 'loss_clipping': 0.2017328057460958}
*result: 65 177.85 198.115
----------------------------------------------------------------------------------------------------
*config: {'layer_num_actor': 2, 'node_num_actor': 122, 'epochs_actor': 3, 'layer_num_critic': 1, 'node_num_critic': 61, 'epochs_critic': 5, 'learning_rate_actor': 0.000341420296934941, 'learning_rate_critic': 0.0006624090809603055, 'discount_rate': 0.9677256601169689, 'smooth_rate': 0.9458933236365561, 'penalty': -41, 'mini_batch_step_size': 61, 'loss_clipp

*result: 80 467.3 240.515
----------------------------------------------------------------------------------------------------
*config: {'layer_num_actor': 2, 'node_num_actor': 114, 'epochs_actor': 3, 'layer_num_critic': 1, 'node_num_critic': 109, 'epochs_critic': 3, 'learning_rate_actor': 0.0002442572864511702, 'learning_rate_critic': 0.0002286450716248216, 'discount_rate': 0.9104103860801598, 'smooth_rate': 0.9383046451821222, 'penalty': -286, 'mini_batch_step_size': 50, 'loss_clipping': 0.24597685400081165}
*result: 81 482.6 163.1
----------------------------------------------------------------------------------------------------
*config: {'layer_num_actor': 1, 'node_num_actor': 118, 'epochs_actor': 5, 'layer_num_critic': 2, 'node_num_critic': 51, 'epochs_critic': 4, 'learning_rate_actor': 0.0005501557308490698, 'learning_rate_critic': 0.00039253997844456096, 'discount_rate': 0.9097336360596163, 'smooth_rate': 0.983642311745383, 'penalty': -330, 'mini_batch_step_size': 31, 'loss_cli

*result: 96 493.25 326.11
----------------------------------------------------------------------------------------------------
*config: {'layer_num_actor': 1, 'node_num_actor': 19, 'epochs_actor': 5, 'layer_num_critic': 1, 'node_num_critic': 30, 'epochs_critic': 3, 'learning_rate_actor': 0.0005884861891987373, 'learning_rate_critic': 0.00043778988165748937, 'discount_rate': 0.9380430582342352, 'smooth_rate': 0.9065591762670321, 'penalty': -402, 'mini_batch_step_size': 19, 'loss_clipping': 0.29622934908690624}
*result: 97 406.8 224.735
----------------------------------------------------------------------------------------------------
*config: {'layer_num_actor': 2, 'node_num_actor': 29, 'epochs_actor': 5, 'layer_num_critic': 2, 'node_num_critic': 81, 'epochs_critic': 4, 'learning_rate_actor': 0.0002286773961160532, 'learning_rate_critic': 0.0005821629265675623, 'discount_rate': 0.9578698706725295, 'smooth_rate': 0.9309315729377983, 'penalty': -424, 'mini_batch_step_size': 32, 'loss_cli

In [24]:
avg_list = []
for i in range(0, 100):
    avg_list.append([results[i][2], i])
avg_list.sort(reverse=True)    
avg_list

[[336.015, 27],
 [331.3, 31],
 [326.11, 96],
 [320.325, 34],
 [318.81, 41],
 [315.055, 38],
 [311.18, 73],
 [310.87, 26],
 [304.38, 19],
 [304.09, 40],
 [299.015, 4],
 [296.545, 86],
 [292.725, 82],
 [287.06, 87],
 [282.545, 68],
 [280.42, 2],
 [280.075, 23],
 [275.755, 43],
 [271.585, 29],
 [270.84, 67],
 [264.11, 54],
 [262.805, 17],
 [259.73, 10],
 [258.58, 59],
 [257.99, 22],
 [257.515, 89],
 [248.595, 47],
 [240.515, 80],
 [239.625, 72],
 [237.525, 3],
 [234.955, 13],
 [229.89, 24],
 [228.15, 5],
 [224.735, 97],
 [220.05, 9],
 [218.57, 66],
 [218.525, 21],
 [217.59, 46],
 [216.41, 6],
 [215.82, 76],
 [210.805, 95],
 [203.82, 83],
 [201.94, 62],
 [200.145, 30],
 [198.115, 65],
 [196.175, 49],
 [193.83, 33],
 [187.29, 91],
 [187.215, 64],
 [186.41, 18],
 [178.47, 75],
 [174.9, 94],
 [168.69, 11],
 [164.615, 12],
 [163.1, 81],
 [155.71, 0],
 [151.515, 93],
 [147.025, 60],
 [139.82, 20],
 [139.64, 92],
 [134.945, 90],
 [128.47, 51],
 [120.77, 57],
 [110.985, 37],
 [108.12, 48],
 [99.9

In [25]:
print(results[27])
print(results[31])
print(results[96])

[{'layer_num_actor': 1, 'node_num_actor': 103, 'epochs_actor': 4, 'layer_num_critic': 2, 'node_num_critic': 125, 'epochs_critic': 3, 'learning_rate_actor': 0.0005344386158424651, 'learning_rate_critic': 0.00016820136716122927, 'discount_rate': 0.9257023950429729, 'smooth_rate': 0.9524677200660574, 'penalty': -241, 'mini_batch_step_size': 16, 'loss_clipping': 0.11072764945067409}, 457.1, 336.015]
[{'layer_num_actor': 1, 'node_num_actor': 126, 'epochs_actor': 3, 'layer_num_critic': 2, 'node_num_critic': 53, 'epochs_critic': 5, 'learning_rate_actor': 0.0007398376276548852, 'learning_rate_critic': 0.000782161690928647, 'discount_rate': 0.9380889146797763, 'smooth_rate': 0.9575402594948629, 'penalty': -337, 'mini_batch_step_size': 21, 'loss_clipping': 0.10198071982559201}, 458.5, 331.3]
[{'layer_num_actor': 1, 'node_num_actor': 104, 'epochs_actor': 5, 'layer_num_critic': 2, 'node_num_critic': 70, 'epochs_critic': 4, 'learning_rate_actor': 0.0007633079262687019, 'learning_rate_critic': 0.000

In [30]:
count_300 = 0
for i in range(0, 100):
    if(results[i][2] >= 300):
        count_300 = count_300+1
print("count_400:", count_300)

count_400: 10
