In [1]:
## ------------------------------------------------------------------------------

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import pickle

import abc
import os
os.environ["CUDA_VISIBLE_DEVICES"]="-1" 
import sys

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
import numpy as np
from numpy.random import exponential
import random
import pandas as pd



from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
from tf_agents.environments import suite_gym
from tf_agents.trajectories import time_step as ts
from tf_agents.environments.wrappers import ActionRepeat
from tf_agents.environments import batched_py_environment
from tf_agents.environments import parallel_py_environment
from tensorflow.keras.layers import Reshape
from tf_agents.networks import categorical_q_network
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.metrics import tf_metrics
from tf_agents.eval.metric_utils import log_metrics
from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver
from tf_agents.policies.random_tf_policy import RandomTFPolicy
from tf_agents.trajectories.trajectory import to_transition
from tf_agents.utils.common import function
from tf_agents.policies.policy_saver import PolicySaver
from tf_agents.agents.categorical_dqn import categorical_dqn_agent
from tf_agents.utils import common

import logging

from collections import deque

import sys
sys.path.insert(0, '../Snake_Emulator')
from Snake_v6 import Snake
from Snake_v6 import SnakeEnv

In [2]:
class ShowProgress:
    def __init__(self, total):
        self.counter = 0
        self.total = total
    def __call__(self, trajectory):
        if not trajectory.is_boundary():
            self.counter += 1
        if self.counter % 100 == 0:
            print("\r{}/{}".format(self.counter, self.total), end="")
            
trial = {
"board_size": 8,
"food_reward": 1,
"death_reward": -1,
"step_reward": 0,
"reward_type": 3,
"food_reward_type": 1,
"counter_mode": 1,
"max_counter": 3,
"health_cutoff": 20,
"DRF_MC_high": 1,
"DRF_MC_low": 1,
"immortal": False,
"wall_penalty": -0.25,
"kill_step_reward": 0,
"food_spawn_mode": 1,
"conv_layer_params": None,
"fc_layer_params": [100, 100, 100],
"dropout_layer_params": None,
"optimizer_learning_rate": 2.5e-4,
"optimizer_decay": 0.95,
"optimizer_momentum": 0,
"optimizer_epsilon": 0.00001,
"epsilon_decay_steps": 500000,
"epsilon_final": 0.01,
"target_update_period": 20000,
"discount_factor": 0.99,
"init_replay_buffer": 200,
"dataset_sample_batch_size": 64,
"dataset_num_steps": 2,
"dataset_num_parallel_calls":3,
"n_iterations": 10000000
}
locals().update(trial)

critic_joint_fc_layer_params = (256, 256)

In [3]:
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------
            
tf.random.set_seed(888)
env = SnakeEnv(BOARD_SIZE=int(board_size),
               MAX_HEALTH=100,
               FOOD_REWARD=food_reward,
               STEP_REWARD=step_reward,
               DEATH_REWARD=death_reward,
               KILL_STEP_REWARD = kill_step_reward,
               REWARD_TYPE = reward_type,
               FOOD_REWARD_TYPE = food_reward_type, 
               FOOD_SPAWN_MODE=food_spawn_mode,
               MAX_COUNTER=int(max_counter),
               HEALTH_CUTOFF=health_cutoff,
               DRF_MC_HIGH=DRF_MC_high,
               DRF_MC_LOW=DRF_MC_low,
               IMMORTAL=immortal,
               WALL_PENALTY=wall_penalty)
tf_env = tf_py_environment.TFPyEnvironment(env)

## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------

In [4]:
# Preprocessing layers
board_preprocessing = Sequential([
    keras.layers.Lambda(lambda obs: tf.cast(obs, np.float32)),
    keras.layers.Flatten()
])
health_preprocessing =  keras.layers.Flatten()

# Layers params are specified by local variables ovtained from DataFrame
categorical_q_net = categorical_q_network.CategoricalQNetwork(
    tf_env.observation_spec(),
    tf_env.action_spec(),
    preprocessing_layers=(board_preprocessing, health_preprocessing),
    preprocessing_combiner = tf.keras.layers.Concatenate(axis=-1),
    conv_layer_params=conv_layer_params,
    fc_layer_params=fc_layer_params,
    num_atoms=51)
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------

# Create variable that counts the number of training steps
train_step = tf.Variable(0)
# Create optimizer 
optimizer = tf.compat.v1.train.RMSPropOptimizer(learning_rate=optimizer_learning_rate,
                                                decay=optimizer_decay, momentum=optimizer_momentum,
                                                epsilon=optimizer_epsilon, centered=True)

agent = categorical_dqn_agent.CategoricalDqnAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    categorical_q_network=categorical_q_net,
    optimizer=optimizer,
    min_q_value=-20,
    max_q_value=20,
    n_step_update=2,
    td_errors_loss_fn=common.element_wise_squared_loss,
    gamma=0.99,
    train_step_counter=train_step)

agent.initialize()

agent.initialize()
# Speed up as tensorflow function
agent.train = function(agent.train)

In [5]:
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=tf_env.batch_size,
    max_length=n_iterations)

# Create the observer that adds trajectories to the replay buffer
replay_buffer_observer = replay_buffer.add_batch
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------

train_metrics = [
    tf_metrics.NumberOfEpisodes(),
    tf_metrics.EnvironmentSteps(),
    tf_metrics.AverageReturnMetric(),
    tf_metrics.AverageEpisodeLengthMetric(),
]

logging.getLogger().setLevel(logging.INFO)

## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------

collect_driver = DynamicStepDriver(
    tf_env, # Env to play with
    agent.collect_policy, # Collect policy of the agent
    observers=[replay_buffer_observer] + train_metrics, # pass to all observers
    num_steps=1) 
# Speed up as tensorflow function
collect_driver.run = function(collect_driver.run)

initial_collect_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                        tf_env.action_spec())
init_driver = DynamicStepDriver(
    tf_env,
    initial_collect_policy,
    observers=[replay_buffer.add_batch, ShowProgress(init_replay_buffer)],
    num_steps=init_replay_buffer) 
final_time_step, final_policy_state = init_driver.run()

## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------

dataset = replay_buffer.as_dataset(
    sample_batch_size=dataset_sample_batch_size,
    num_steps=dataset_num_steps+1,
    num_parallel_calls=dataset_num_parallel_calls).prefetch(dataset_num_parallel_calls)

## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------

200/200

In [6]:

# a) For storing data
training_info = [[], [], [], []]
def add_metrics(arr, train_metrics):
    arr[0].append(train_metrics[0].result().numpy())
    arr[1].append(train_metrics[1].result().numpy())
    arr[2].append(train_metrics[2].result().numpy())
    arr[3].append(train_metrics[3].result().numpy())

# b) For training agent
def train_agent(n_iterations):
    time_step = None
    # Get initial policy state
    policy_state = agent.collect_policy.get_initial_state(tf_env.batch_size)
    # Create iterator over dataset and loop
    iterator = iter(dataset)

    for iteration in range(n_iterations):
        # Pass current time step and policy state to get next time step and policy state
        time_step, policy_state = collect_driver.run(time_step, policy_state)
        # Sample a batch of trajectories from the dataset, pass to the train method
        trajectories, buffer_info = next(iterator)
        train_loss = agent.train(trajectories)
        if iteration % 10000 == 0:
            # This adds training data
            add_metrics(training_info, train_metrics)

train_agent(n_iterations=20000)

# c) For storing frames
def get_vid_frames(policy, filename, num_episodes=100, fps=2):
    frames = []
    for _ in range(num_episodes):
        time_step = tf_env.reset()
        frames.append(np.abs(env.get_board()))
        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = tf_env.step(action_step.action)
            frames.append(np.abs(env.get_board()))
    return frames

# Store Data
df = pd.DataFrame(np.array(training_info).T, columns=['N_Ep', 'Env_Steps', 'Avf_RM', 'Avg_EPLM'])

# Store Frames
frames = get_vid_frames(agent.policy, "trained-agent")

# Store Model
my_policy = agent.policy

# Extra

In [11]:
# To get smooth ani|mations
import matplotlib.pyplot as plt
plt.rcParams['animation.ffmpeg_path'] = '\\Users\\lukep\\ffmpeg\\ffmpeg-20200206-343ccfc-win64-static\\bin\\'
import matplotlib.animation as animation
import matplotlib as mpl
import ffmpeg
mpl.rc('animation', html='jshtml')

import pickle as pkl

import numpy as np

In [12]:
def update_scene(num, frames, patch):
    patch.set_data(frames[num])
    return patch,

def plot_animation(frames, repeat=False, interval=40):
    fig = plt.figure()
    patch = plt.imshow(frames[0], vmin=0, vmax=200)
    plt.axis('off')
    anim = animation.FuncAnimation(
        fig, update_scene, fargs=(frames, patch),
        frames=len(frames), repeat=repeat, interval=interval)
    plt.close()
    return anim

In [13]:
frames = np.array(frames)[0:2000]
frames = [255*np.insert(frame[0],2,0,axis=2) for frame in frames]
plot_animation(frames)

AxisError: axis 2 is out of bounds for array of dimension 2

In [14]:
plot_animation(frames)

INFO:matplotlib.animation:Animation.save using <class 'matplotlib.animation.HTMLWriter'>
