In [1]:
import tensorflow as tf
tf.compat.v1.enable_eager_execution()
tf.config.run_functions_eagerly(True)
import ray

import time
import random

import numpy as np

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten, Input
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K


from rl.agents.dqn import DQNAgent
from rl.policy import LinearAnnealedPolicy, BoltzmannQPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.core import Processor
from rl.callbacks import FileLogger, ModelIntervalCheckpoint

In [2]:
# Each observation is a flat vector of 34 values; the network sees a window of
# WINDOW_LENGTH consecutive observations.
INPUT_SHAPE = (34,)
WINDOW_LENGTH = 1
nb_actions = 5

input_shape = (WINDOW_LENGTH,) + INPUT_SHAPE

# Q-network handed to DQNAgent: flatten the (window, features) input, pass it
# through two ReLU hidden layers, and emit one linear output per action.
model = Sequential()
model.add(Input(shape=input_shape))
model.add(Flatten())
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dense(32))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))

# summary() prints the table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the cell output.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 34)                0         
                                                                 
 dense (Dense)               (None, 128)               4480      
                                                                 
 activation (Activation)     (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 32)                4128      
                                                                 
 activation_1 (Activation)   (None, 32)                0         
                                                                 
 dense_2 (Dense)             (None, 5)                 165       
                                                                 
 activation_2 (Activation)   (None, 5)                 0

In [3]:
class myProcessor(Processor):
    """Processor hooks that pass observations and rewards on as NumPy arrays."""

    def process_observation(self, observation):
        """Echo the raw observation to stdout and return it via ``np.array``."""
        # NOTE(review): this prints on every call; looks like a debugging
        # trace -- confirm whether it should stay.
        print("obs", observation)
        as_array = np.array(observation)
        return as_array

    def process_state_batch(self, batch):
        """Return the state batch unchanged (no scaling or dtype conversion)."""
        return batch

    def process_reward(self, reward):
        """Return the reward wrapped with ``np.array``; the value is not clipped."""
        wrapped = np.array(reward)
        return wrapped


processor = myProcessor()

In [4]:
# Replay memory: keeps up to one million entries and uses the same window
# length as the network input.
memory = SequentialMemory(
    limit=1000000,
    window_length=WINDOW_LENGTH,
)

# Epsilon-greedy exploration whose `eps` attribute is driven by
# LinearAnnealedPolicy between value_max and value_min over nb_steps;
# value_test is the separate value configured for testing.
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(),
    attr='eps',
    value_max=1.,
    value_min=.1,
    value_test=.05,
    nb_steps=1000000,
)

# The agent ties together the Q-network, exploration policy, replay memory
# and observation/reward processor defined in the cells above.
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    policy=policy,
    memory=memory,
    processor=processor,
    nb_steps_warmup=50000,
    gamma=.99,
    target_model_update=10000,
    train_interval=4,
    delta_clip=1.,
)
dqn.compile(Adam(learning_rate=.00025), metrics=['mae'])

In [5]:
cycles = 200


def env_creator(render_mode="rgb_array", cycles=200):
    """Create the project environment through ``world_utils.env``.

    Args:
        render_mode: Passed through unchanged as ``render_mode``.
        cycles: Passed through as ``max_cycles``.

    Returns:
        The object returned by ``world_utils.env``.
    """
    # The project module is imported at call time, not at notebook start.
    from src.world import world_utils

    return world_utils.env(render_mode=render_mode, max_cycles=cycles)

# Without Ray

In [None]:
def test():
    """Roll out the environment in-process and log every step.

    ``agent_0`` always plays action ``0``. Every other agent is driven by the
    module-level ``dqn``: ``dqn.backward`` receives the reward and termination
    flag reported by ``env.last()``, then ``dqn.forward`` picks the action.
    The rollout stops after ``cycles * agent_count`` steps or when
    ``env.agent_iter()`` is exhausted, whichever comes first.

    Returns:
        list[str]: One ``"<agent>_a<action>_r<reward>"`` entry per step taken.
    """
    agent_count = 4
    max_steps = cycles * agent_count
    full_report = []

    env = env_creator(render_mode="rgb_array", cycles=cycles)
    # try/finally instead of `for ... else: env.close()`: a for-loop's else
    # clause is skipped when the loop exits through `break`, so the step cap
    # below used to leave the environment open.
    try:
        env.reset()
        for step_count, agent in enumerate(env.agent_iter()):
            if step_count >= max_steps:
                break

            env.render()
            # Truncation flag and info are read but not used by this rollout.
            observation, reward, termination, truncation, info = env.last()

            if agent == "agent_0":
                action = 0
            else:
                # NOTE(review): one shared `dqn` instance serves every
                # non-agent_0 agent, and backward() runs before forward() --
                # confirm this is the intended update order.
                dqn.backward(reward, termination)
                action = dqn.forward(observation)

            env.step(action)
            full_report.append(agent + "_a" + str(action) + "_r" + str(reward))
    finally:
        env.close()
    return full_report

full_report = test()
print(full_report)

# With Ray

In [6]:
@ray.remote
def test_run_env():
    """Ray task: run one short rollout with a freshly built DQN agent.

    ``agent_0`` acts uniformly at random over the five actions; every other
    agent takes the action returned by ``dqn.forward``. The rollout stops after
    ``cycles * agent_count`` steps or when ``env.agent_iter()`` is exhausted.

    Returns:
        tuple: ``(first 8 log entries, dqn)`` where each log entry is a
        ``"<agent>_a<action>_r<reward>"`` string and ``dqn`` is the agent
        built inside this task.
    """
    # Ray runs this function in a separate worker process, so the TF runtime
    # flag set in the notebook's first cell is not applied here; set it again.
    # NOTE(review): the NotImplementedError recorded under the launcher cell
    # may also stem from serializing the captured Keras objects -- confirm
    # that this setting resolves it.
    tf.config.run_functions_eagerly(True)

    dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory,
                   processor=processor, nb_steps_warmup=50000, gamma=.99,
                   target_model_update=10000, train_interval=4, delta_clip=1.)
    dqn.compile(Adam(learning_rate=.00025), metrics=['mae'])

    cycles = 20
    agent_count = 4
    max_steps = cycles * agent_count
    full_report = []

    env = env_creator(render_mode="rgb_array", cycles=cycles)
    # try/finally instead of `for ... else: env.close()`: a for-loop's else
    # clause is skipped when the loop exits through `break`, so the step cap
    # below used to leave the environment open.
    try:
        env.reset()
        for step_count, agent in enumerate(env.agent_iter()):
            if step_count >= max_steps:
                break

            env.render()
            # Termination, truncation and info are read but not used here.
            observation, reward, termination, truncation, info = env.last()

            if agent == "agent_0":
                action = random.choice([0, 1, 2, 3, 4])
            else:
                action = dqn.forward(observation)

            env.step(action)
            full_report.append(agent + "_a" + str(action) + "_r" + str(reward))
    finally:
        env.close()
    return full_report[:8], dqn

In [7]:
# Submit every rollout as a Ray task first, then block until all of them
# have produced a result.
batches_num = 10
task_handles = [test_run_env.remote() for _ in range(batches_num)]

output = ray.get(task_handles)
print(output)

2023-03-22 17:59:22,434	INFO worker.py:1553 -- Started a local Ray instance.


NotImplementedError: numpy() is only available when eager execution is enabled.