In [1]:
from PIL import Image
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Convolution2D, Permute
from keras.optimizers import Adam
import keras.backend as K

from rl.agents.dqn import DQNAgent
from rl.policy import LinearAnnealedPolicy, BoltzmannQPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.core import Processor
from rl.callbacks import FileLogger, ModelIntervalCheckpoint


# Shape of the down-scaled grayscale frames that are stored in replay memory and
# fed to the network. An earlier `(84, 84)` assignment was overwritten on the
# very next line and never took effect, so only the effective value is kept.
INPUT_SHAPE = (54, 54)
# Number of consecutive frames stacked together to form one network input.
WINDOW_LENGTH = 4


class AtariProcessor(Processor):
    """Observation, batch and reward pre-processing for the Atari DQN agent.

    Frames are shrunk to ``INPUT_SHAPE`` and converted to grayscale ``uint8``
    before being stored; they are only scaled to ``float32`` in ``[0, 1]``
    when a batch is handed to the network. Rewards are clipped to ``[-1, 1]``.
    """

    def process_observation(self, observation):
        """Resize a raw frame to ``INPUT_SHAPE`` and convert it to grayscale.

        Args:
            observation: 3-D array laid out as (height, width, channel).

        Returns:
            A ``uint8`` array whose shape equals ``INPUT_SHAPE``.

        Raises:
            AssertionError: if ``observation`` is not 3-D, or if the processed
                frame does not have shape ``INPUT_SHAPE``.
        """
        assert observation.ndim == 3  # (height, width, channel)
        rows, cols = INPUT_SHAPE
        # PIL's `resize` takes (width, height), whereas the NumPy shape checked
        # below is (rows, cols) == (height, width). Passing INPUT_SHAPE as-is
        # only works for square shapes; swapping the order keeps the shape
        # assertion valid for rectangular shapes as well.
        img = Image.fromarray(observation).resize((cols, rows))
        img = img.convert('L')  # grayscale
        processed_observation = np.array(img)
        assert processed_observation.shape == INPUT_SHAPE
        # uint8 saves storage in experience memory
        return processed_observation.astype('uint8')

    def process_state_batch(self, batch):
        """Convert a batch of stored frames to ``float32`` scaled by 1/255.

        This could be done in `process_observation`, but then a `float32`
        array would have to be stored instead, which is 4x more memory
        intensive than `uint8`. This matters if 1M observations are stored.
        """
        processed_batch = batch.astype('float32') / 255.
        return processed_batch

    def process_reward(self, reward):
        """Clip ``reward`` to the closed interval ``[-1, 1]``."""
        return np.clip(reward, -1., 1.)


# Create the Breakout environment and read the number of discrete actions,
# which determines the width of the network's output layer.
env = gym.make('ALE/Breakout-v5')
nb_actions = env.action_space.n

# Next, we build our model. We use the same model that was described by Mnih et al. (2015).
# Each input is a stack of WINDOW_LENGTH processed frames: (window, rows, cols).
input_shape = (WINDOW_LENGTH,) + INPUT_SHAPE
model = Sequential()

# Move the frame-stack axis to the end so the convolutions see a channels-last
# tensor: (window, rows, cols) -> (rows, cols, window).
model.add(Permute((2, 3, 1), input_shape=input_shape))

# Three convolutional layers, each followed by a ReLU.
model.add(Convolution2D(32, (8, 8), strides=(4, 4)))
model.add(Activation('relu'))
model.add(Convolution2D(64, (4, 4), strides=(2, 2)))
model.add(Activation('relu'))
model.add(Convolution2D(64, (3, 3), strides=(1, 1)))
model.add(Activation('relu'))

# Fully connected head producing one linear output per action.
model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))

# `summary()` prints the table itself and returns None; wrapping it in
# `print()` would emit an extra "None" line after the table.
model.summary()

# Agent configuration: pre-processing hooks, replay memory, exploration policy
# and the DQN agent itself.
processor = AtariProcessor()

# Replay memory capped at one million entries; it uses the same window length
# as the network input.
memory = SequentialMemory(
    limit=1000000,
    window_length=WINDOW_LENGTH,
)

# Exploration: eps-greedy, i.e. a random action is taken with probability eps.
# eps is annealed linearly from 1.0 down to 0.1 over 1M steps, so the agent
# explores heavily at first and gradually relies on what it has learned.
# A separate eps of 0.05 is used during testing so that a few random actions
# remain and the agent cannot get stuck.
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(),
    attr='eps',
    value_max=1.,
    value_min=.1,
    value_test=.05,
    nb_steps=1000000,
)

# Balancing exploration against exploitation is an open research question.
# To experiment, tune the parameters above or swap in another policy, for
# example Boltzmann-style exploration:
# policy = BoltzmannQPolicy(tau=1.)

# Assemble the agent and compile it with Adam, tracking mean absolute error.
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    policy=policy,
    memory=memory,
    processor=processor,
    nb_steps_warmup=50000,
    gamma=.99,
    target_model_update=10000,
    train_interval=4,
    delta_clip=1.,
)
dqn.compile(
    Adam(learning_rate=.00025),
    metrics=['mae'],
)

# Train the agent for 1.75M environment steps, reporting progress every
# 10000 steps. Note that this cell does not write any weights to disk;
# saving happens in a separate cell.
dqn.fit(
    env,
    nb_steps=1750000,
    log_interval=10000,
)

# Evaluate the trained agent for 10 episodes without rendering.
dqn.test(
    env,
    nb_episodes=10,
    visualize=False,
)


  import distutils as _distutils
2022-12-14 00:11:48.243588: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-14 00:11:49.247582: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-14 00:11:49.247621: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-12-14 00:11:51.251350: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 permute (Permute)           (None, 54, 54, 4)         0         
                                                                 
 conv2d (Conv2D)             (None, 12, 12, 32)        8224      
                                                                 
 activation (Activation)     (None, 12, 12, 32)        0         
                                                                 
 conv2d_1 (Conv2D)           (None, 5, 5, 64)          32832     
                                                                 
 activation_1 (Activation)   (None, 5, 5, 64)          0         
                                                                 
 conv2d_2 (Conv2D)           (None, 3, 3, 64)          36928     
                                                                 
 activation_2 (Activation)   (None, 3, 3, 64)          0

2022-12-14 00:11:53.626183: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-12-14 00:11:53.626347: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-14 00:11:53.626408: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2022-12-14 00:11:53.626451: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2022-12-14 00:11:53.626493: W tensorf

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089
Training for 1750000 steps ...
Interval 1 (0 steps performed)
    1/10000 [..............................] - ETA: 26:14 - reward: 0.0000e+00

  updates=self.state_updates,
2022-12-14 00:11:54.199263: W tensorflow/c/c_api.cc:291] Operation '{name:'activation_4/activation_4/Identity' id:126 op device:{requested: '', assigned: ''} def:{{{node activation_4/activation_4/Identity}} = Identity[T=DT_FLOAT, _has_manual_control_dependencies=true](dense_1/BiasAdd)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.
2022-12-14 00:11:54.215959: W tensorflow/c/c_api.cc:291] Operation '{name:'total_3/Assign' id:392 op device:{requested: '', assigned: ''} def:{{{node total_3/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](total_3, total_3/Initializer/zeros)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes aft

51 episodes - episode_reward: 1.490 [0.000, 5.000] - lives: 2.996 - episode_frame_number: 420.289 - frame_number: 19855.493

Interval 2 (10000 steps performed)
53 episodes - episode_reward: 1.302 [0.000, 4.000] - lives: 2.808 - episode_frame_number: 405.009 - frame_number: 59525.052

Interval 3 (20000 steps performed)
52 episodes - episode_reward: 1.442 [0.000, 5.000] - lives: 2.894 - episode_frame_number: 413.001 - frame_number: 99217.832

Interval 4 (30000 steps performed)
56 episodes - episode_reward: 1.125 [0.000, 5.000] - lives: 3.001 - episode_frame_number: 381.897 - frame_number: 138881.641

Interval 5 (40000 steps performed)
55 episodes - episode_reward: 1.164 [0.000, 4.000] - lives: 3.024 - episode_frame_number: 381.944 - frame_number: 178544.373

Interval 6 (50000 steps performed)
    1/10000 [..............................] - ETA: 2:52 - reward: 0.0000e+00

2022-12-14 00:15:37.732179: W tensorflow/c/c_api.cc:291] Operation '{name:'activation_4_1/activation_4/Identity' id:251 op device:{requested: '', assigned: ''} def:{{{node activation_4_1/activation_4/Identity}} = Identity[T=DT_FLOAT, _has_manual_control_dependencies=true](dense_1_1/BiasAdd)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.
2022-12-14 00:15:38.039313: W tensorflow/c/c_api.cc:291] Operation '{name:'loss_3/AddN' id:493 op device:{requested: '', assigned: ''} def:{{{node loss_3/AddN}} = AddN[N=2, T=DT_FLOAT, _has_manual_control_dependencies=true](loss_3/mul, loss_3/mul_1)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.
2022-12-14 00:15:38.076734: W te

50 episodes - episode_reward: 1.560 [0.000, 5.000] - loss: 0.003 - mae: 0.074 - mean_q: 0.102 - mean_eps: 0.951 - lives: 2.956 - episode_frame_number: 428.209 - frame_number: 218214.775

Interval 7 (60000 steps performed)
54 episodes - episode_reward: 1.315 [0.000, 6.000] - loss: 0.001 - mae: 0.075 - mean_q: 0.102 - mean_eps: 0.942 - lives: 2.993 - episode_frame_number: 406.295 - frame_number: 257890.472

Interval 8 (70000 steps performed)
52 episodes - episode_reward: 1.365 [0.000, 5.000] - loss: 0.001 - mae: 0.081 - mean_q: 0.109 - mean_eps: 0.933 - lives: 2.870 - episode_frame_number: 417.510 - frame_number: 297552.528

Interval 9 (80000 steps performed)
50 episodes - episode_reward: 1.520 [0.000, 4.000] - loss: 0.001 - mae: 0.085 - mean_q: 0.114 - mean_eps: 0.924 - lives: 2.892 - episode_frame_number: 423.955 - frame_number: 337244.024

Interval 10 (90000 steps performed)
56 episodes - episode_reward: 1.179 [0.000, 4.000] - loss: 0.001 - mae: 0.089 - mean_q: 0.120 - mean_eps: 0.915

<keras.callbacks.History at 0x7f6ffe936620>

In [2]:
# Persist the agent's current weights, replacing any existing file of the
# same name.
weights_filename = 'dqn_weights.h5f'
dqn.save_weights(
    weights_filename,
    overwrite=True,
)

In [6]:
# NOTE: render_mode=human is hard-coded (where exactly is not visible in this
# notebook), and visualisation could not be hooked up, so the evaluation
# below runs with visualize=False.
history = dqn.test(
    env,
    nb_episodes=10,
    visualize=False,
)


Testing for 10 episodes ...
Episode 1: reward: 25.000, steps: 980
Episode 2: reward: 26.000, steps: 1020
Episode 3: reward: 24.000, steps: 977
Episode 4: reward: 29.000, steps: 1134
Episode 5: reward: 36.000, steps: 1344
Episode 6: reward: 28.000, steps: 1082
Episode 7: reward: 32.000, steps: 1171
Episode 8: reward: 34.000, steps: 1270
Episode 9: reward: 19.000, steps: 759
Episode 10: reward: 28.000, steps: 1061
