## Importação de Bibliotecas

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras import Sequential
from keras.layers import Dense
from keras.layers import Input
from keras.optimizers import Adam
assert tf.__version__ >= "2.0"

# Common imports
import numpy as np
import os
import os.path
import random
import time
import math
import subprocess
import json
import socket
import matplotlib.pyplot as plt
import shutil
import pandas as pd
from collections import deque
import pickle

# Importando Address do socket UDP
UDP_IP = "0.0.0.0"
UDP_PORT = 2223
addrinfo = socket.getaddrinfo(
  UDP_IP, UDP_PORT,
  socket.AF_INET, socket.SOCK_DGRAM)[0]

2023-05-23 15:06:29.270067: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Criação do Ambiente

In [2]:
# Criação do ambiente 
class PerseguidorEnv():

    def __init__(self):
        self.state_space = np.array(np.zeros(13))
        self.action_space = np.array(np.zeros(4))
        self.width = 1333
        self.height = 735
        self.high_x = 1333
        self.low_x = 35
        self.high_y = 735
        self.low_y = 114
        self.seconds_done = 60
        self.score_done = 5
        self.max_distance = 1000

        
    # carrega observações do ambiente
    def _get_states(self):
        # carrega dados recebidas do servidor
        info = self._get_info()
        
        self._direction = np.array([
            (1 if info['player']['atual_action']==0 else 0), # direction atual
            (1 if info['player']['atual_action']==1 else 0), # direction atual
            (1 if info['player']['atual_action']==2 else 0), # direction atual
            (1 if info['player']['atual_action']==3 else 0), # direction atual
        ])
        
        self._location = np.array(np.zeros(4))
        self._danger = np.array(np.zeros(4))
        self._walls = np.array(np.zeros(4))
        
        direct = ['upper','right','bottom','left']
        for i in range(len(direct)):
            if info['player']['danger_{}'.format(direct[i])]:
                self._danger[i] = 1
            if info['player']['wall_{}'.format(direct[i])]:
                self._walls[i] = 1
            self._location[i] = info['player']['location_{}'.format(direct[i])]
        
        self._coins = np.array(np.zeros(4))
        if (info["coins"][0]["has"]):
            self._coins[0] = info['coins'][0]['y']< info['player']['y']  # coin up
            self._coins[1] = info['coins'][0]['x'] > info['player']['x']  # coin right
            self._coins[2] = info['coins'][0]['y'] > info['player']['y']  # coin down
            self._coins[3] = info['coins'][0]['x'] < info['player']['x']  # coin left
        if (info["coins"][1]["has"]):
            self._coins[0] = info['coins'][1]['y']< info['player']['y'] if self._coins[0] == 0 else 1  # coin up
            self._coins[1] = info['coins'][1]['x'] > info['player']['x'] if self._coins[1] == 0 else 1  # coin right
            self._coins[2] = info['coins'][1]['y'] > info['player']['y'] if self._coins[2] == 0 else 1  # coin down
            self._coins[3] = info['coins'][1]['x'] < info['player']['x'] if self._coins[3] == 0 else 1  # coin left
       
        # define array game
        self._game = np.array([
            #info['player']['wall'], # wall
            int(info['player']['distance']), # danger distance
            #info["player"]['seconds'],
            #info["player"]['score'],
        ])
        
        i_d = int(info['player']['distance']) 
        self._distance = np.array([
            1 if i_d < 400 and i_d >= 300 else 0,
            1 if i_d < 300 and i_d >= 200 else 0,
            1 if i_d < 200 and i_d >= 100 else 0,
            1 if i_d < 100 and i_d >= 0 else 0,
        ])
        
        self._persons = np.array([
            int(info['player']['x']),
            int(info['player']['y']),
            int(info['enemy']['x']),
            int(info['enemy']['y']),
        ])
        
        states = np.concatenate((self._danger, self._direction, self._walls, self._game), axis=None)
        states = np.array(states)
        # retorno
        return states, info
    
    # carrega dados do servidor
    def _get_info(self):
        with socket.socket(*addrinfo[:3]) as sock:
            sock.connect(addrinfo[4])
            # pecorre ate receber socket do servidor
            while True:
                # envio de socket para servidor
                sock.sendto(str.encode(''), (UDP_IP, UDP_PORT))
                # define dados recebidos
                data, addr = sock.recvfrom(1024)
                # se existir dados
                if data:
                    # conversao de json para python
                    res = json.loads(data)
                    # retorno
                    return res['game']
    
    # recarrega dados do servidor e informaçõs do ambiente
    def reset(self): 
        #define observação do ambiente e dados do servidor
        obs, info = self._get_states() 
        
        # retorno
        return obs, info
    
    def _restart_game(self):
        agent_step = str.encode(json.dumps({'action': -1,'restart': True, 'socket': 'agent'}))
        with socket.socket(*addrinfo[:3]) as sock:
            sock.connect(addrinfo[4])
            sock.sendto(agent_step, (UDP_IP, UDP_PORT))

    # envia para o servidor socket ação recebida e carrega observações do ambiente recente
    def step(self, action):
        # conversão da ação recebida para enviar no servidor socket
        agent_step = str.encode(json.dumps({'action': int(action),'restart': False, 'socket': 'agent'}))
        # envia para servidor socket a ação recebida e preditada.
        with socket.socket(*addrinfo[:3]) as sock:
            sock.connect(addrinfo[4])
            sock.sendto(agent_step, (UDP_IP, UDP_PORT))
        
        #define observação do ambiente e dados do servidor
        obs, info = self._get_states()
        done, reward = False, 0
           
      
        #if info['player']['seconds'] >= self.seconds_done and info['player']['score'] >= self.score_done:
        #    done=True 
        #    self._restart_game()
            
        #if (info['coins'][0]['has'] and info['coins'][0]['distance']<90) or (info['coins'][1]['colision'] and info['coins'][1]['distance']<90):
        #    reward = 200
        #else:
        calc_distance = (info['player']['distance'] / self.max_distance)
        # se a velocidade for 0 e se a distancia for menor que 70
        if (info['player']['distance'] < 300):
            if (info['player']['colision']):
                reward = -100
                done = True
                self._restart_game()
            else:
                reward = -100 * (1-calc_distance)
        #se não
        else:
            reward = int(calc_distance * 15)
                        
        # retorno
        return obs, reward, done, info
    
    # função destinada para iniciar o server socket e o jogo
    def start(self):
        # iniciar processo do servidor socket
        server = subprocess.Popen(['node ~/posgraduacao/tcc/projeto/server/index.js'], shell=True)
        # aguardar 1 segundo
        time.sleep(1)
        # iniciar processo do jogo
        game = subprocess.Popen(['love ~/posgraduacao/tcc/projeto/game'], shell=True)

    # função destinada para parar o server socket e o jogo
    def stop(self):
        try:
            # check se existe o processo do jogo
            pid = list(map(int,subprocess.check_output(["pidof",'love']).split()))[0]
            # condição de existe
            if pid != 0:
                # encerra o processo do jogo.
                os.system('kill '+str(pid))
        except:
            pid = 0
            
        # aguardar 1 segundo
        time.sleep(1)
        try:
            # check se existe o processo do servidor socket
            pid = list(map(int,subprocess.check_output(["pidof",'node']).split()))[0]
            # condição de existe
            if pid != 0:
                # encerra o processo do servidor socket
                os.system('kill '+str(pid))
        except:
            pid = 0


## Funções Auxiliares

In [3]:
def write_list(name, data):
    with open(name, 'wb') as filehandle:
        pickle.dump(data, filehandle)
        filehandle.close()
        
def read_list(name):
    data = []
    with open(name, 'rb') as filehandle:
        data = pickle.load(filehandle)
        filehandle.close()
    return data

In [4]:
def get_hitory_list(algorithm, load_model):
    history = dict()
    history['ep_reward'] = []
    history['avg_reward'] = []
    history['obs_history'] = []
    history['action_history'] = []
    history['reward_history'] = []
    history['loss_history'] = []
    history['memory_history'] = []
    
    for l in history:
        filename = f'{algorithm}/{l}.dat'
        if load_model:
            history[l] = read_list(filename)
            #history[l] = np.loadtxt(filename, dtype=float)
        else:
            if(os.path.isfile(filename)):
                os.remove(filename)
                
    return history

def set_history_list(algorithm,history):
    # Save lists
    for l in history:
        filename = f'{algorithm}/{l}.dat'
        #np.savetxt(filename, np.array(history[l]))
        write_list(filename,history[l])
        
       


## Deep Q Network

In [5]:
class DQN:
    def __init__(self, params, load_model=False):
        self.action_space = params['action_space']
        self.state_space = params['state_space']
        self.epsilon = params['epsilon'] 
        self.gamma = params['gamma'] 
        self.batch_size = params['batch_size'] 
        self.epsilon_min = params['epsilon_min'] 
        self.epsilon_decay = params['epsilon_decay'] 
        self.learning_rate = params['learning_rate']
        self.layer_sizes = params['layer_sizes']
        self.memory = deque(maxlen=100000)
        if load_model:
            self.model = self.load_model()
        else:
            self.model = self.build_model()
            
        if (len(history['memory_history'])>0):
            for i in range(len(history['memory_history'])):
                self.memory.append(history['memory_history'][i])
                
        print('load memory len -> ({})'.format(len(history['memory_history'])))
    
    def compile_model(self, model):
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model
    
    def load_model(self):
        model = keras.models.load_model('dqn/perseguidor.tf')
        model.load_weights('dqn/perseguidor.h5')
        return model
        
    
    def build_model(self):
        try:
            os.remove("dqn/perseguidor.tf")
            os.remove("dqn/perseguidor.h5")
        except:
            print("Não existe arquivos e diretorio para exclusão")
        
        model = Sequential()
        for i in range(len(self.layer_sizes)):
            if i == 0:
                model.add(Dense(self.layer_sizes[i], input_shape=(self.state_space,), activation='relu'))
            else:
                model.add(Dense(self.layer_sizes[i], activation='relu'))
        model.add(Dense(self.action_space, activation='linear'))
        return self.compile_model(model)
        

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        history['memory_history'].append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_space)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])

    def replay(self):
        if len(self.memory) < self.batch_size:
            return

        minibatch = random.sample(self.memory, self.batch_size)
        states = np.array([i[0] for i in minibatch])
        actions = np.array([i[1] for i in minibatch])
        rewards = np.array([i[2] for i in minibatch])
        next_states = np.array([i[3] for i in minibatch])
        dones = np.array([i[4] for i in minibatch])

        states = np.squeeze(states)
        next_states = np.squeeze(next_states)

        #targets = rewards + self.gamma * (np.amax(self.model.predict_on_batch(next_states), axis=1)) * (1-dones)
        targets = rewards + self.gamma * (np.amax(self.model.predict_on_batch(next_states), axis=1))
        targets_full = self.model.predict_on_batch(states)

        ind = np.array([i for i in range(self.batch_size)])
        targets_full[[ind], [actions]] = targets

        loss = self.model.fit(states, targets_full, epochs=1, verbose=2)
        history['loss_history'].append(loss.history['loss'][0])   

        #loss = self.model.train_on_batch(states, targets_full)
        #history['loss_history'].append(loss)   
        
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
            
        #self.memory.clear()
        
def train_dqn(agent, env, params):
    for e in range(params['episode']):
        state, info = env.reset()
        state = np.reshape(state, (1, params['state_space']))
        done, score = False, 0
        
        for s in range(100):
            gamestate = env._get_info()['info']['gamestate']
            if gamestate==1:
                action = agent.act(state)
                history['action_history'].append(action)

                prev_state = state
                next_state, reward, done, info = env.step(action)
                history['obs_history'].append(next_state)
                history['reward_history'].append(reward)
                score += reward

                next_state = np.reshape(next_state, (1, params['state_space']))
                agent.remember(state, action, reward, next_state, done)
                state = next_state
                
                if params['batch_size'] > 1:
                    agent.replay()
                
                print("Episode==>{}, Step==>{}, Reward==>{}, Action ==> {}, State == {}".format(e,s, reward, action,0))
                if done:
                    print(f'final state before dying: {str(prev_state)}')
                    print(f'episode: {e+1}/{e}, score: {score}')
                    break
            else:
                while (gamestate!=1):
                    gamestate = env._get_info()['info']['gamestate']
                
                
                
        history['ep_reward'].append(score)
        avg_reward = np.mean(history['ep_reward'][-40:])
        print("Episode * {} * Avg Reward is ==> {}".format(e, avg_reward))
        history['avg_reward'].append(avg_reward)
        
    

def start_dqn(params, env, load_model=False):
    try:
        agent = DQN(params, load_model)
         # Iniciar Server Socket e Game Love lua
        env.start()
        time.sleep(2)
        
        params['result'] = train_dqn(agent, env, params)

    finally:
        # Fechar Server Socket e Game Love lua
        env.stop()
        
        # Save the models
        agent.model.save('dqn/perseguidor.tf')
        agent.model.save_weights('dqn/perseguidor.h5')
        
        # Save the lists
        set_history_list('dqn',history)
        
    return params

## Deep Deterministic Policy Gradient (DDPG) 

In [6]:
class OUActionNoise:
    def __init__(self, mean, std_deviation, theta=0.15, dt=1e-2, x_initial=None):
        self.theta = theta
        self.mean = mean
        self.std_dev = std_deviation
        self.dt = dt
        self.x_initial = x_initial
        self.reset()

    def __call__(self):
        # Formula taken from https://www.wikipedia.org/wiki/Ornstein-Uhlenbeck_process.
        x = (
            self.x_prev
            + self.theta * (self.mean - self.x_prev) * self.dt
            + self.std_dev * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape)
        )
        # Store x into x_prev
        # Makes next noise dependent on current one
        self.x_prev = x
        return x

    def reset(self):
        if self.x_initial is not None:
            self.x_prev = self.x_initial
        else:
            self.x_prev = np.zeros_like(self.mean)
            
            
def get_actor(params):
        # Initialize weights between -3e-3 and 3-e3
        last_init = tf.random_uniform_initializer(minval=0, maxval=1)  # kernel_initializer=last_init
        inputs = layers.Input(shape=(params['state_space'],))
        out = layers.Dense(params['layer_sizes'][1], activation="relu")(inputs)
        out = layers.Dense(params['layer_sizes'][2], activation="relu")(out)
        outputs = layers.Dense(params['action_space'], activation="tanh")(out)
        model = tf.keras.Model(inputs, outputs)
        return model

def get_critic(params):
    # State as input
    state_input = layers.Input(shape=(params['state_space']))
    state_out = layers.Dense(16, activation="relu")(state_input)
    state_out = layers.Dense(32, activation="relu")(state_out)

    # Action as input
    action_input = layers.Input(shape=(params['action_space']))
    action_out = layers.Dense(32, activation="relu")(action_input)

    # Both are passed through seperate layer before concatenating
    concat = layers.Concatenate()([state_out, action_out])

    out = layers.Dense(params['layer_sizes'][1], activation="relu")(concat)
    out = layers.Dense(params['layer_sizes'][2], activation="relu")(out)
    outputs = layers.Dense(params['action_space'])(out)

    # Outputs single value for give state-action
    model = tf.keras.Model([state_input, action_input], outputs)
    return model

class DDPG:
    def __init__(self, params, load_model=False, buffer_capacity=50000, batch_size=500, ):
        
        self.num_states = params['state_space']
        self.num_actions = params['action_space'] #params['action_space']
        self.upper_bound = params['upper_bound']
        self.lower_bound = params['lower_bound']
        self.gamma = params['gamma']
        self.critic_optimizer = tf.keras.optimizers.Adam(params['learning_rate'])
        self.actor_optimizer = tf.keras.optimizers.Adam(params['learning_rate'])
        self.actor_model = []
        self.critic_model = []
        self.target_actor = []
        self.target_critic = []
        
        # Making the weights equal initially
        if load_model:
            self.actor_model = tf.keras.models.load_model('ddpg/perseguidor_actor')
            self.critic_model = tf.keras.models.load_model('ddpg/perseguidor_critic')
            self.target_actor = tf.keras.models.load_model('ddpg/perseguidor_target_actor')
            self.target_critic = tf.keras.models.load_model('ddpg/perseguidor_target_critic')

            self.actor_model.load_weights("ddpg/perseguidor_actor.h5")
            self.critic_model.load_weights("ddpg/perseguidor_critic.h5")
            self.target_actor.load_weights("ddpg/perseguidor_target_actor.h5")
            self.target_critic.load_weights("ddpg/perseguidor_target_critic.h5")
        else:
            try:
                shutil.rmtree("ddpg/perseguidor_actor")
                shutil.rmtree("ddpg/perseguidor_critic")
                shutil.rmtree("ddpg/perseguidor_target_actor")
                shutil.rmtree("ddpg/perseguidor_target_critic")

                os.remove("ddpg/perseguidor_actor.h5")
                os.remove("ddpg/perseguidor_critic.h5")
                os.remove("ddpg/perseguidor_target_actor.h5")
                os.remove("ddpg/perseguidor_target_critic.h5")
            except:
                print("Não existe arquivos e diretorio para exclusão")

            self.actor_model = get_actor(params)
            self.critic_model = get_critic(params)
            self.target_actor = get_actor(params)
            self.target_critic = get_critic(params)

            self.target_actor.set_weights(self.actor_model.get_weights())
            self.target_critic.set_weights(self.critic_model.get_weights())
        
        # Number of "experiences" to store at max
        self.buffer_capacity = buffer_capacity
        # Num of tuples to train on.
        self.batch_size = batch_size

        # Its tells us num of times record() was called.
        self.buffer_counter = 0

        # Instead of list of tuples as the exp.replay concept go
        # We use different np.arrays for each tuple element
        self.state_buffer = np.zeros((self.buffer_capacity, self.num_states))
        self.action_buffer = np.zeros((self.buffer_capacity, self.num_actions))
        self.reward_buffer = np.zeros((self.buffer_capacity, 1))
        self.next_state_buffer = np.zeros((self.buffer_capacity, self.num_states))

    # Takes (s,a,r,s') obervation tuple as input
    def record(self, obs_tuple):
        # Set index to zero if buffer_capacity is exceeded,
        # replacing old records
        index = self.buffer_counter % self.buffer_capacity

        self.state_buffer[index] = obs_tuple[0]
        self.action_buffer[index] = obs_tuple[1]
        self.reward_buffer[index] = obs_tuple[2]
        self.next_state_buffer[index] = obs_tuple[3]

        self.buffer_counter += 1

    # Eager execution is turned on by default in TensorFlow 2. Decorating with tf.function allows
    # TensorFlow to build a static graph out of the logic and computations in our function.
    # This provides a large speed up for blocks of code that contain many small TensorFlow operations such as this one.
    @tf.function
    def update(
        self, state_batch, action_batch, reward_batch, next_state_batch,
    ):
        # Training and updating Actor & Critic networks.
        # See Pseudo Code.
        with tf.GradientTape() as tape:
            target_actions = self.target_actor(next_state_batch, training=True)
            y = reward_batch + self.gamma * self.target_critic(
                [next_state_batch, target_actions], training=True
            )
            critic_value = self.critic_model([state_batch, action_batch], training=True)
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))

        critic_grad = tape.gradient(critic_loss, self.critic_model.trainable_variables)
        self.critic_optimizer.apply_gradients(
            zip(critic_grad, self.critic_model.trainable_variables)
        )

        with tf.GradientTape() as tape:
            actions = self.actor_model(state_batch, training=True)
            critic_value = self.critic_model([state_batch, actions], training=True)
            # Used `-value` as we want to maximize the value given
            # by the critic for our actions
            actor_loss = -tf.math.reduce_mean(critic_value)

        actor_grad = tape.gradient(actor_loss, self.actor_model.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_grad, self.actor_model.trainable_variables)
        )

    # We compute the loss and update parameters
    def learn(self):
        # Get sampling range
        record_range = min(self.buffer_counter, self.buffer_capacity)
        # Randomly sample indices
        batch_indices = np.random.choice(record_range, self.batch_size)

        # Convert to tensors
        state_batch = tf.convert_to_tensor(self.state_buffer[batch_indices])
        action_batch = tf.convert_to_tensor(self.action_buffer[batch_indices])
        reward_batch = tf.convert_to_tensor(self.reward_buffer[batch_indices])
        reward_batch = tf.cast(reward_batch, dtype=tf.float32)
        next_state_batch = tf.convert_to_tensor(self.next_state_buffer[batch_indices])

        self.update(state_batch, action_batch, reward_batch, next_state_batch)
        

    def policy(self, state, noise_object):
        sampled_actions = tf.squeeze(self.actor_model(state))
        noise = noise_object()
        # Adding noise to action
        sampled_actions = sampled_actions.numpy() + noise
        # We make sure action is within bounds
        print(sampled_actions)
        legal_action = np.argmax(sampled_actions)
        return [np.squeeze(legal_action)]


# This update target parameters slowly
# Based on rate `tau`, which is much less than one.
@tf.function
def update_target(target_weights, weights, tau):
    for (a, b) in zip(target_weights, weights):
        a.assign(b * tau + a * (1 - tau))
        

def start_ddpg(params, env, load_model=False):

    ou_noise = OUActionNoise(mean=np.zeros(1), std_deviation=float(params['std_dev']) * np.ones(1))

    agent = DDPG(params, load_model)
    
    try:
        # Iniciar Server Socket e Game Love lua
        env.start()
        time.sleep(2)

        # Takes about 4 min to train
        for ep in range(params['episode']):
            prev_state, info = env.reset()
            episodic_reward = 0
            done = False
            while not done:
                tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state), 0)
                action_policy = agent.policy(tf_prev_state, ou_noise)
                action = action_policy[0]
                history['action_history'].append(action)
                    
                # Recieve state and reward from environment.
                state, reward, done, info = env.step(action)
                history['obs_history'].append(state)
                history['reward_history'].append(reward)
                
                print("Episode * {} * Avg Reward is ==> {}, Action ==> {}".format(ep, reward, action_policy[0]))

                agent.record((prev_state, action, reward, state))
                episodic_reward += reward

                agent.learn()
                update_target(agent.target_actor.variables, agent.actor_model.variables, params['tau'])
                update_target(agent.target_critic.variables, agent.critic_model.variables, params['tau'])

                # End this episode when `done` is True
                if done:
                    break

                prev_state = state

            history['ep_reward'].append(episodic_reward)
            # Mean of last 40 episodes
            avg_reward = np.mean(history['ep_reward'][-40:])
            print("Episode * {} * Avg Reward is ==> {}".format(ep, avg_reward))
            history['avg_reward'].append(avg_reward)
            

        # Plotting graph
        # Episodes versus Avg. Rewards
        plt.plot(history['avg_reward'])
        plt.xlabel("Episode")
        plt.ylabel("Avg. Epsiodic Reward")
        plt.show()

    finally:
        # Fechar Server Socket e Game Love lua
        env.stop()

        # Save the models
        agent.actor_model.save("ddpg/perseguidor_actor")
        agent.critic_model.save("ddpg/perseguidor_critic")
        agent.target_actor.save("ddpg/perseguidor_target_actor")
        agent.target_critic.save("ddpg/perseguidor_target_critic")

        # Save the weights
        agent.actor_model.save_weights("ddpg/perseguidor_actor.h5")
        agent.critic_model.save_weights("ddpg/perseguidor_critic.h5")
        agent.target_actor.save_weights("ddpg/perseguidor_target_actor.h5")
        agent.target_critic.save_weights("ddpg/perseguidor_target_critic.h5")
        
        # Save the list 
        set_history_list('ddpg',history)

    

In [7]:
env = PerseguidorEnv()
algorithm = "ddpg" # "dqn | ddpg"
load_model=False
hidden_layer_neuron = int(((2/3) * env.state_space.shape[0]) + env.action_space.shape[0])

params = dict()
params['action_space'] = env.action_space.shape[0]
params['state_space'] = env.state_space.shape[0]
params['name'] = 'Peserguidor'
params['episode'] = 1000
params['result'] = []
params['epsilon'] = 0 if load_model else 1
params['gamma'] = .99
params['batch_size'] = 500
params['epsilon_min'] = .05
params['epsilon_decay'] = .995
params['learning_rate'] = 0.00025
params['layer_sizes'] = [env.action_space.shape[0], hidden_layer_neuron, hidden_layer_neuron]
params['tau'] = 0.5
params['std_dev'] = 1
params['upper_bound'] = 3
params['lower_bound'] = 0

print('hidden neurons ({})'.format(hidden_layer_neuron))
history = get_hitory_list(algorithm, load_model)
if algorithm=="dqn":
    start_dqn(params, env, load_model)
elif algorithm=="ddpg":
    start_ddpg(params, env, load_model)



hidden neurons (12)


2023-05-23 15:06:31.767365: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-23 15:06:31.768494: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Listining to  Address:  0.0.0.0 Port:  2223
[0.93267281 0.93267281 0.93267281 0.93267281]
Episode * 0 * Avg Reward is ==> 8, Action ==> 0
[0.73846813 0.73846813 0.73846813 0.73846813]
Episode * 0 * Avg Reward is ==> 7, Action ==> 0
[0.879917 0.879917 0.879917 0.879917]
Episode * 0 * Avg Reward is ==> 7, Action ==> 0
[0.9124478 0.9124478 0.9124478 0.9124478]
Episode * 0 * Avg Reward is ==> 7, Action ==> 0
[0.94256921 0.94256921 0.94256921 0.94256921]
Episode * 0 * Avg Reward is ==> 7, Action ==> 0
[0.89212047 0.89212047 0.89212047 0.89212047]
Episode * 0 * Avg Reward is ==> 6, Action ==> 0
[0.93942644 0.93942644 0.93942644 0.93942644]
Episode * 0 * Avg Reward is ==> 6, Action ==> 0
[0.61254229 0.61254229 0.61254229 0.61254229]
Episode * 0 * Avg Reward is ==> 6, Action ==> 0
[0.57929224 0.57929224 0.57929224 0.57929224]
Episode * 0 * Avg Reward is ==> 6, Action ==> 0
[0.72276267 0.72276267 0.72276267 0.72276267]
Episode * 0 * Avg Reward is ==> 6, Action ==> 0
[0.81049204 0.81049204 0.810

[1.02270918 1.02270918 1.02270918 1.02270918]
Episode * 1 * Avg Reward is ==> 10, Action ==> 0
[0.7824376 0.7824376 0.7824376 0.7824376]
Episode * 1 * Avg Reward is ==> 10, Action ==> 0
[0.84739704 0.84739704 0.84739704 0.84739704]
Episode * 1 * Avg Reward is ==> 10, Action ==> 0
[0.97361648 0.97361648 0.97361648 0.97361648]
Episode * 1 * Avg Reward is ==> 10, Action ==> 0
[0.94119834 0.94119834 0.94119834 0.94119834]
Episode * 1 * Avg Reward is ==> 10, Action ==> 0
[0.98838331 0.98838331 0.98838331 0.98838331]
Episode * 1 * Avg Reward is ==> 10, Action ==> 0
[1.00676662 1.00676662 1.00676662 1.00676662]
Episode * 1 * Avg Reward is ==> 10, Action ==> 0
[0.93932188 0.93932188 0.93932188 0.93932188]
Episode * 1 * Avg Reward is ==> 10, Action ==> 0
[0.79486734 0.79486734 0.79486734 0.79486734]
Episode * 1 * Avg Reward is ==> 10, Action ==> 0
[0.85052335 0.85052335 0.85052335 0.85052335]
Episode * 1 * Avg Reward is ==> 10, Action ==> 0
[1.0800587 1.0800587 1.0800587 1.0800587]
Episode * 1 

Episode * 1 * Avg Reward is ==> -72.690053929796, Action ==> 0
[1.11577338 1.11577338 1.11577338 1.11577338]
Episode * 1 * Avg Reward is ==> -73.5799932112, Action ==> 0
[1.19130099 1.19130099 1.19130099 1.19130099]
Episode * 1 * Avg Reward is ==> -74.141228427922, Action ==> 0
[1.15698649 1.15698649 1.15698649 1.15698649]
Episode * 1 * Avg Reward is ==> -74.817368061167, Action ==> 0
[1.26327054 1.26327054 1.26327054 1.26327054]
Episode * 1 * Avg Reward is ==> -75.372029418297, Action ==> 0
[1.26294966 1.26294966 1.26294966 1.26294966]
Episode * 1 * Avg Reward is ==> -76.23717231529, Action ==> 0
[1.208036 1.208036 1.208036 1.208036]
Episode * 1 * Avg Reward is ==> -76.95505089787599, Action ==> 0
[0.95613641 0.95613641 0.95613641 0.95613641]
Episode * 1 * Avg Reward is ==> -77.303446408466, Action ==> 0
[0.9045536 0.9045536 0.9045536 0.9045536]
Episode * 1 * Avg Reward is ==> -77.672323520452, Action ==> 0
[1.00424558 1.00424558 1.00424558 1.00424558]
Episode * 1 * Avg Reward is ==> 

[0.27239579 0.27239579 0.27239579 0.27239579]
Episode * 3 * Avg Reward is ==> 8, Action ==> 0
[0.19048856 0.19048856 0.19048856 0.19048856]
Episode * 3 * Avg Reward is ==> 8, Action ==> 0
[0.24231202 0.24231202 0.24231202 0.24231202]
Episode * 3 * Avg Reward is ==> 7, Action ==> 0
[0.33438634 0.33438634 0.33438634 0.33438634]
Episode * 3 * Avg Reward is ==> 7, Action ==> 0
[0.32975932 0.32975932 0.32975932 0.32975932]
Episode * 3 * Avg Reward is ==> 7, Action ==> 0
[0.4308265 0.4308265 0.4308265 0.4308265]
Episode * 3 * Avg Reward is ==> 7, Action ==> 0
[0.31671114 0.31671114 0.31671114 0.31671114]
Episode * 3 * Avg Reward is ==> 7, Action ==> 0
[0.28907598 0.28907598 0.28907598 0.28907598]
Episode * 3 * Avg Reward is ==> 7, Action ==> 0
[0.19772935 0.19772935 0.19772935 0.19772935]
Episode * 3 * Avg Reward is ==> 7, Action ==> 0
[0.24718072 0.24718072 0.24718072 0.24718072]
Episode * 3 * Avg Reward is ==> 7, Action ==> 0
[0.17622332 0.17622332 0.17622332 0.17622332]
Episode * 3 * Avg 

Episode * 4 * Avg Reward is ==> -91.27321830573611, Action ==> 0
[-0.57369217 -0.57369217 -0.57369217 -0.57369217]
Episode * 4 * Avg Reward is ==> -92.1570202123526, Action ==> 0
[-0.52162561 -0.52162561 -0.52162561 -0.52162561]
Episode * 4 * Avg Reward is ==> -93.04449585365249, Action ==> 0
[-0.61692156 -0.61692156 -0.61692156 -0.61692156]
Episode * 4 * Avg Reward is ==> -100, Action ==> 0
Episode * 4 * Avg Reward is ==> -2224.1685586330595
[-0.61875792 -0.61875792 -0.61875792 -0.61875792]
Episode * 5 * Avg Reward is ==> 11, Action ==> 0
[-0.59741997 -0.59741997 -0.59741997 -0.59741997]
Episode * 5 * Avg Reward is ==> 11, Action ==> 0
[-0.80616796 -0.80616796 -0.80616796 -0.80616796]
Episode * 5 * Avg Reward is ==> 11, Action ==> 0
[-0.81707939 -0.81707939 -0.81707939 -0.81707939]
Episode * 5 * Avg Reward is ==> 10, Action ==> 0
[-0.77971604 -0.77971604 -0.77971604 -0.77971604]
Episode * 5 * Avg Reward is ==> 10, Action ==> 0
[-0.93633699 -0.93633699 -0.93633699 -0.93633699]
Episode 

Episode * 7 * Avg Reward is ==> -78.19894962564999, Action ==> 0
[-0.70257178 -0.70257178 -0.70257178 -0.70257178]
Episode * 7 * Avg Reward is ==> -81.507374472747, Action ==> 0
[-0.6879802 -0.6879802 -0.6879802 -0.6879802]
Episode * 7 * Avg Reward is ==> -83.111537066329, Action ==> 0
[-0.68047203 -0.68047203 -0.68047203 -0.68047203]
Episode * 7 * Avg Reward is ==> -84.363310232353, Action ==> 0
[-0.63568692 -0.63568692 -0.63568692 -0.63568692]
Episode * 7 * Avg Reward is ==> -83.703796795716, Action ==> 0
[-0.68758463 -0.68758463 -0.68758463 -0.68758463]
Episode * 7 * Avg Reward is ==> -82.07382533012499, Action ==> 0
[-0.81883599 -0.81883599 -0.81883599 -0.81883599]
Episode * 7 * Avg Reward is ==> -80.312688642723, Action ==> 0
[-0.68273939 -0.68273939 -0.68273939 -0.68273939]
Episode * 7 * Avg Reward is ==> -81.864049095101, Action ==> 0
[-0.58044575 -0.58044575 -0.58044575 -0.58044575]
Episode * 7 * Avg Reward is ==> -83.579287432206, Action ==> 0
[-0.60824624 -0.60824624 -0.60824

Episode * 9 * Avg Reward is ==> -84.43708377943399, Action ==> 0
[0.28562136 0.28562136 0.28562136 0.28562136]
Episode * 9 * Avg Reward is ==> -84.967304438054, Action ==> 0
[0.35598444 0.35598444 0.35598444 0.35598444]
Episode * 9 * Avg Reward is ==> -85.85511886024, Action ==> 0
[0.32781804 0.32781804 0.32781804 0.32781804]
Episode * 9 * Avg Reward is ==> -87.875816883675, Action ==> 0
[0.28228245 0.28228245 0.28228245 0.28228245]
Episode * 9 * Avg Reward is ==> -89.57918400627, Action ==> 0
[0.17209239 0.17209239 0.17209239 0.17209239]
Episode * 9 * Avg Reward is ==> -91.5320852826735, Action ==> 0
[0.04700623 0.04700623 0.04700623 0.04700623]
Episode * 9 * Avg Reward is ==> -92.5962934655435, Action ==> 0
[0.08703497 0.08703497 0.08703497 0.08703497]
Episode * 9 * Avg Reward is ==> -100, Action ==> 0
Episode * 9 * Avg Reward is ==> -1737.4015246938136
[0.09154265 0.09154265 0.09154265 0.09154265]
Episode * 10 * Avg Reward is ==> 11, Action ==> 0
[0.11192969 0.11192969 0.11192969 0.

Episode * 11 * Avg Reward is ==> -84.867433122754, Action ==> 0
[0.83386836 0.83386836 0.83386836 0.83386836]
Episode * 11 * Avg Reward is ==> -86.11139589423699, Action ==> 0
[0.7045731 0.7045731 0.7045731 0.7045731]
Episode * 11 * Avg Reward is ==> -88.414330295181, Action ==> 0
[0.59248437 0.59248437 0.59248437 0.59248437]
Episode * 11 * Avg Reward is ==> -91.2566208266238, Action ==> 0
[0.55714815 0.55714815 0.55714815 0.55714815]
Episode * 11 * Avg Reward is ==> -100, Action ==> 0
Episode * 11 * Avg Reward is ==> -1641.8156289803976
[0.58039804 0.58039804 0.58039804 0.58039804]
Episode * 12 * Avg Reward is ==> 5, Action ==> 0
[0.59867794 0.59867794 0.59867794 0.59867794]
Episode * 12 * Avg Reward is ==> 5, Action ==> 0
[0.50469868 0.50469868 0.50469868 0.50469868]
Episode * 12 * Avg Reward is ==> 5, Action ==> 0
[0.59525918 0.59525918 0.59525918 0.59525918]
Episode * 12 * Avg Reward is ==> 5, Action ==> 0
[0.61929768 0.61929768 0.61929768 0.61929768]
Episode * 12 * Avg Reward is =

Episode * 14 * Avg Reward is ==> 5, Action ==> 0
[1.0963094 1.0963094 1.0963094 1.0963094]
Episode * 14 * Avg Reward is ==> 5, Action ==> 0
[1.11359775 1.11359775 1.11359775 1.11359775]
Episode * 14 * Avg Reward is ==> 5, Action ==> 0
[0.91943162 0.91943162 0.91943162 0.91943162]
Episode * 14 * Avg Reward is ==> 4, Action ==> 0
[0.96824063 0.96824063 0.96824063 0.96824063]
Episode * 14 * Avg Reward is ==> 4, Action ==> 0
[1.11374023 1.11374023 1.11374023 1.11374023]
Episode * 14 * Avg Reward is ==> -70.739063278369, Action ==> 0
[0.97390359 0.97390359 0.97390359 0.97390359]
Episode * 14 * Avg Reward is ==> -72.33250594059399, Action ==> 0
[1.06105502 1.06105502 1.06105502 1.06105502]
Episode * 14 * Avg Reward is ==> -74.291023002913, Action ==> 0
[1.00027377 1.00027377 1.00027377 1.00027377]
Episode * 14 * Avg Reward is ==> -75.171168597822, Action ==> 0
[1.14054636 1.14054636 1.14054636 1.14054636]
Episode * 14 * Avg Reward is ==> -76.412371004335, Action ==> 0
[1.10295117 1.10295117 

Episode * 17 * Avg Reward is ==> -73.734771228476, Action ==> 0
[0.77344156 0.77344156 0.77344156 0.77344156]
Episode * 17 * Avg Reward is ==> -75.863501632494, Action ==> 0
[0.95499652 0.95499652 0.95499652 0.95499652]
Episode * 17 * Avg Reward is ==> -77.65287973404, Action ==> 0
[0.87828013 0.87828013 0.87828013 0.87828013]
Episode * 17 * Avg Reward is ==> -80.476113254385, Action ==> 0
[0.99226467 0.99226467 0.99226467 0.99226467]
Episode * 17 * Avg Reward is ==> -83.49046876076001, Action ==> 0
[0.81879437 0.81879437 0.81879437 0.81879437]
Episode * 17 * Avg Reward is ==> -85.26476534150301, Action ==> 0
[0.84796997 0.84796997 0.84796997 0.84796997]
Episode * 17 * Avg Reward is ==> -88.116327637332, Action ==> 0
[0.8108671 0.8108671 0.8108671 0.8108671]
Episode * 17 * Avg Reward is ==> -91.65323036346021, Action ==> 0
[0.70643919 0.70643919 0.70643919 0.70643919]
Episode * 17 * Avg Reward is ==> -100, Action ==> 0
Episode * 17 * Avg Reward is ==> -1445.8378115449746
[0.81372516 0.

Episode * 20 * Avg Reward is ==> 7, Action ==> 0
[0.16681821 0.16681821 0.16681821 0.16681821]
Episode * 20 * Avg Reward is ==> 7, Action ==> 0
[0.08131188 0.08131188 0.08131188 0.08131188]
Episode * 20 * Avg Reward is ==> 7, Action ==> 0
[0.22395075 0.22395075 0.22395075 0.22395075]
Episode * 20 * Avg Reward is ==> 7, Action ==> 0
[0.18835448 0.18835448 0.18835448 0.18835448]
Episode * 20 * Avg Reward is ==> 6, Action ==> 0
[0.14446211 0.14446211 0.14446211 0.14446211]
Episode * 20 * Avg Reward is ==> 6, Action ==> 0
[0.29531868 0.29531868 0.29531868 0.29531868]
Episode * 20 * Avg Reward is ==> 6, Action ==> 0
[0.30549661 0.30549661 0.30549661 0.30549661]
Episode * 20 * Avg Reward is ==> 6, Action ==> 0
[0.36164573 0.36164573 0.36164573 0.36164573]
Episode * 20 * Avg Reward is ==> 6, Action ==> 0
[0.55841934 0.55841934 0.55841934 0.55841934]
Episode * 20 * Avg Reward is ==> 5, Action ==> 0
[0.49165065 0.49165065 0.49165065 0.49165065]
Episode * 20 * Avg Reward is ==> 5, Action ==> 0
[

sh: 1: kill: No such process



INFO:tensorflow:Assets written to: ddpg/perseguidor_actor/assets
INFO:tensorflow:Assets written to: ddpg/perseguidor_critic/assets
INFO:tensorflow:Assets written to: ddpg/perseguidor_target_actor/assets
INFO:tensorflow:Assets written to: ddpg/perseguidor_target_critic/assets


KeyboardInterrupt: 

In [None]:
history